use std::iter;

use super::{Heading, span::Span};

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TokenKind {
    /// A newline that doesn't start a code block
    Newline,

    /// "#" to "######"
    Heading(Heading),

    /// Three backticks, opening or closing a code block
    CodeBlock,

    /// "`"
    Mono,

    /// "*"
    Strong,

    /// "_"
    Italic,

    /// "~"
    Strikethrough,

    /// ">"
    Quote,

    /// Two spaces
    Indentation,

    /// "- "
    ListEntry,

    /// Normal text
    Text,
}

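// Checked in order by `tokenize`, so multi-character tokens must come before
// their prefixes ("######" before "#", "```" before "`") for the first match
// to also be the longest one.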
const TOKENS: &[(&str, TokenKind)] = &[
    ("\n", TokenKind::Newline),
    ("######", TokenKind::Heading(Heading::H6)),
    ("#####", TokenKind::Heading(Heading::H5)),
    ("####", TokenKind::Heading(Heading::H4)),
    ("###", TokenKind::Heading(Heading::H3)),
    ("##", TokenKind::Heading(Heading::H2)),
    ("#", TokenKind::Heading(Heading::H1)),
    ("```", TokenKind::CodeBlock),
    ("`", TokenKind::Mono),
    ("*", TokenKind::Strong),
    ("_", TokenKind::Italic),
    ("~", TokenKind::Strikethrough),
    (">", TokenKind::Quote),
    ("  ", TokenKind::Indentation),
    ("- ", TokenKind::ListEntry),
];

/// A single lexed token: the slice of the input it covers, plus its kind.
#[derive(Debug)]
pub struct Token<'a> {
    pub span: Span<'a>,
    pub kind: TokenKind,
}

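/// Splits `s` into a stream of [`Token`]s. Bytes that don't begin any entry
/// of [`TOKENS`] are accumulated and yielded as [`TokenKind::Text`] tokens,
/// so the yielded spans cover the whole input in order.
///
/// A usage sketch (illustrative, not from the original file):
///
/// ```ignore
/// let kinds: Vec<TokenKind> = tokenize("# title").map(|t| t.kind).collect();
/// assert_eq!(kinds, [TokenKind::Heading(Heading::H1), TokenKind::Text]);
/// ```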
pub fn tokenize<'a>(s: &'a str) -> impl Iterator<Item = Token<'a>> {
    let mut s = Span::new(s);
    // Bytes scanned past without matching a token; flushed as one `Text`.
    let mut yield_n: usize = 0;

    iter::from_fn(move || {
        loop {
            if s.is_empty() {
                return None;
            }

            // Everything left is plain text: flush it and finish.
            if yield_n == s.len() {
                let (token, rest) = s.split_at(s.len()).unwrap();
                let token = Token {
                    span: token,
                    kind: TokenKind::Text,
                };
                s = rest;
                return Some(token);
            }

            // Try to match a known token at the current scan position.
            let token = TOKENS.iter().find_map(|(token_str, token_kind)| {
                s[yield_n..]
                    .starts_with(token_str)
                    .then(|| (*token_kind, token_str.len()))
            });

            // No match: step over one character and keep scanning.
            let Some((kind, len)) = token else {
                yield_n += s[yield_n..].chars().next().unwrap_or('\0').len_utf8();
                continue;
            };

            // A token starts here, but plain text accumulated before it:
            // yield the text first; the token is re-matched on the next call.
            if yield_n > 0 {
                let (token, rest) = s.split_at(yield_n).unwrap();
                let token = Token {
                    span: token,
                    kind: TokenKind::Text,
                };
                s = rest;
                yield_n = 0;
                return Some(token);
            }

            let (token, rest) = s.split_at(len).unwrap();
            let token = Token { span: token, kind };
            s = rest;
            return Some(token);
        }
    })
}
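// A property worth noting: every input byte lands in exactly one yielded
// span, so concatenating the spans reproduces the input. A hedged sketch,
// assuming `Span` exposes its underlying `&str` (e.g. via `Deref`, which
// this file doesn't show):
//
//     let input = "*bold* text";
//     let joined: String = tokenize(input).map(|t| &*t.span).collect();
//     assert_eq!(joined, input);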

#[cfg(test)]
mod tests {
    use serde::Serialize;

    use super::tokenize;

    #[test]
    fn test_tokenize() {
        let examples = [
            "just some normal text :D",
            "normal *bold* normal",
            "normal * maybe bold? * normal",
            "```lang\ncode code code\n```",
            "*_``_*",
            "*_`*_*_",
        ];

        #[derive(Serialize)]
        struct Result {
            pub string: &'static str,

            /// Debug-printed tokens
            pub tokens: Vec<String>,
        }

        // Pair each example with the debug output of its token stream and
        // snapshot the lot with insta.
        let examples = examples
            .into_iter()
            .map(|string| {
                let tokens = tokenize(string)
                    .map(|token| format!("{token:?}"))
                    .collect::<Vec<_>>();

                Result { string, tokens }
            })
            .collect::<Vec<_>>();

        insta::assert_yaml_snapshot!(examples);
    }
}