use std::iter;

use super::{Heading, span::Span};

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TokenKind {
    /// A newline that isn't part of a code block
    Newline,
    /// "#" to "######"
    Heading(Heading),
    /// A newline followed by three backticks
    CodeBlock,
    Mono,
    Strong,
    Italic,
    Strikethrough,
    /// ">"
    Quote,
    /// Two spaces
    Indentation,
    /// "- "
    ListEntry,
    /// Normal text
    Text,
}

const TOKENS: &[(&str, TokenKind)] = &[
    ("\n", TokenKind::Newline),
    ("######", TokenKind::Heading(Heading::H6)),
    ("#####", TokenKind::Heading(Heading::H5)),
    ("####", TokenKind::Heading(Heading::H4)),
    ("###", TokenKind::Heading(Heading::H3)),
    ("##", TokenKind::Heading(Heading::H2)),
    ("#", TokenKind::Heading(Heading::H1)),
    ("```", TokenKind::CodeBlock),
    ("`", TokenKind::Mono),
    ("*", TokenKind::Strong),
    ("_", TokenKind::Italic),
    ("~", TokenKind::Strikethrough),
    (">", TokenKind::Quote),
    ("  ", TokenKind::Indentation),
    ("- ", TokenKind::ListEntry),
];

#[derive(Debug)]
pub struct Token<'a> {
    pub span: Span<'a>,
    pub kind: TokenKind,
}

/// Splits `s` into a stream of tokens. The first matching entry in `TOKENS`
/// wins, so longer prefixes (e.g. "######") are listed before shorter ones.
pub fn tokenize<'a>(s: &'a str) -> impl Iterator<Item = Token<'a>> {
    let mut s = Span::new(s);
    // Number of bytes scanned so far that belong to a pending `Text` token.
    let mut yield_n: usize = 0;
    iter::from_fn(move || {
        loop {
            if s.is_empty() {
                return None;
            }
            // The entire remaining input is plain text; yield it in one go.
            if yield_n == s.len() {
                let (token, rest) = s.split_at(s.len()).unwrap();
                let token = Token {
                    span: token,
                    kind: TokenKind::Text,
                };
                s = rest;
                return Some(token);
            }
            // Does a known token start at the current offset?
            let token = TOKENS.iter().find_map(|(token_str, token_kind)| {
                s[yield_n..]
                    .starts_with(token_str)
                    .then(|| (*token_kind, token_str.len()))
            });
            let Some((kind, len)) = token else {
                // No match: extend the pending text by one character.
                yield_n += s[yield_n..].chars().next().unwrap_or('\0').len_utf8();
                continue;
            };
            // Flush any pending text before yielding the matched token.
            if yield_n > 0 {
                let (token, rest) = s.split_at(yield_n).unwrap();
                let token = Token {
                    span: token,
                    kind: TokenKind::Text,
                };
                s = rest;
                yield_n = 0;
                return Some(token);
            }
            let (token, rest) = s.split_at(len).unwrap();
            let token = Token { span: token, kind };
            s = rest;
            return Some(token);
        }
    })
}

#[cfg(test)]
mod tests {
    use serde::Serialize;

    use super::tokenize;

    #[test]
    fn test_tokenize() {
        let examples = [
            "just some normal text :D",
            "normal *bold* normal",
            "normal * maybe bold? * normal",
            "```lang\ncode code code\n```",
            "*_``_*",
            "*_`*_*_",
        ];

        #[derive(Serialize)]
        struct Result {
            pub string: &'static str,
            /// Debug-printed tokens
            pub tokens: Vec<String>,
        }

        let examples = examples
            .into_iter()
            .map(|string| {
                let tokens = tokenize(string)
                    .map(|token| format!("{token:?}"))
                    .collect::<Vec<_>>();
                Result { string, tokens }
            })
            .collect::<Vec<_>>();

        insta::assert_yaml_snapshot!(examples);
    }
}
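
// A minimal, non-snapshot sketch of a test exercising `tokenize` at the
// `TokenKind` level, added alongside the snapshot test above. It uses only
// items already defined or imported in this module (`tokenize`, `TokenKind`,
// `Heading`) and assumes `Heading` implements `PartialEq`/`Debug`, as implied
// by the derives on `TokenKind`. Per the `TOKENS` table, "#" matches
// `Heading(H1)` and the rest of the line becomes a single `Text` token.
#[cfg(test)]
mod kind_tests {
    use super::{tokenize, Heading, TokenKind};

    #[test]
    fn heading_then_text() {
        let kinds = tokenize("# Title").map(|token| token.kind).collect::<Vec<_>>();
        assert_eq!(
            kinds,
            vec![TokenKind::Heading(Heading::H1), TokenKind::Text]
        );
    }
}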