Restructure markdown highlighter

This commit is contained in:
2025-06-13 22:08:53 +02:00
parent 83ad2068e0
commit 7f93084e64
15 changed files with 683 additions and 600 deletions

156
src/markdown/tokenizer.rs Normal file
View File

@ -0,0 +1,156 @@
use std::iter;
use super::span::Span;
/// Heading level, one variant per count of leading '#' characters.
///
/// NOTE(review): variants are declared H6 down to H1, mirroring the
/// longest-marker-first ordering of the `TOKENS` table below — confirm the
/// declaration order itself is not relied upon anywhere (e.g. via `Ord`;
/// none is derived here).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Heading {
    H6,
    H5,
    H4,
    H3,
    H2,
    H1,
}
/// Classification of a single lexical token produced by `tokenize`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TokenKind {
    /// A line break ("\n").
    /// NOTE(review): the original comment said "a newline that isn't a
    /// codeblock", but the tokenizer matches every "\n" unconditionally —
    /// presumably code-block-aware handling happens in a later stage; confirm.
    Newline,
    /// "#" through "######"; the level is carried in the `Heading` payload.
    Heading(Heading),
    /// A code-fence marker: three backticks ("```").
    /// NOTE(review): despite the original "newline followed by three `"
    /// comment, the TOKENS table matches the backticks alone.
    CodeBlock,
    /// A single backtick — inline monospace delimiter.
    Mono,
    /// "*" — strong/bold delimiter.
    Strong,
    /// "_" — italic delimiter.
    Italic,
    /// "~" — strikethrough delimiter.
    Strikethrough,
    /// ">" — blockquote marker.
    Quote,
    /// Two consecutive spaces.
    Indentation,
    /// "- " — bullet-list item marker.
    ListEntry,
    /// Any run of characters matching none of the markers above.
    Text,
}
/// Marker strings and the token kind each produces.
///
/// Ordering is load-bearing: `tokenize` takes the *first* entry whose string
/// is a prefix of the remaining input, so longer markers must precede their
/// own prefixes ("######" before "#####" … before "#", and "```" before "`").
///
/// Note: `&'static` on the element strings was redundant inside a `const`
/// (clippy `redundant_static_lifetimes`) and has been elided.
const TOKENS: &[(&str, TokenKind)] = &[
    ("\n", TokenKind::Newline),
    ("######", TokenKind::Heading(Heading::H6)),
    ("#####", TokenKind::Heading(Heading::H5)),
    ("####", TokenKind::Heading(Heading::H4)),
    ("###", TokenKind::Heading(Heading::H3)),
    ("##", TokenKind::Heading(Heading::H2)),
    ("#", TokenKind::Heading(Heading::H1)),
    ("```", TokenKind::CodeBlock),
    ("`", TokenKind::Mono),
    ("*", TokenKind::Strong),
    ("_", TokenKind::Italic),
    ("~", TokenKind::Strikethrough),
    (">", TokenKind::Quote),
    ("  ", TokenKind::Indentation),
    ("- ", TokenKind::ListEntry),
];
/// A single lexed token: the matched slice of input plus its classification.
#[derive(Debug)]
pub struct Token<'a> {
    /// The exact region of the source string this token covers.
    pub span: Span<'a>,
    /// What kind of markdown element the region represents.
    pub kind: TokenKind,
}
/// Lazily splits `s` into a stream of markdown [`Token`]s.
///
/// At each position the first matching entry of `TOKENS` wins; characters
/// matching no marker are accumulated and flushed as one `Text` token just
/// before the next marker (or at end of input).
pub fn tokenize<'a>(s: &'a str) -> impl Iterator<Item = Token<'a>> {
    let mut remaining = Span::new(s);
    // Byte count of plain text scanned past without hitting a marker;
    // flushed as a single Text token before the next marker is emitted.
    let mut scanned: usize = 0;
    iter::from_fn(move || {
        loop {
            if remaining.is_empty() {
                return None;
            }
            // Everything left is plain text — flush it all in one token.
            if scanned == remaining.len() {
                let (text, rest) = remaining.split_at(remaining.len()).unwrap();
                remaining = rest;
                return Some(Token {
                    span: text,
                    kind: TokenKind::Text,
                });
            }
            // First table entry whose marker string prefixes the unscanned tail.
            let hit = TOKENS
                .iter()
                .find(|(pat, _)| remaining[scanned..].starts_with(pat))
                .map(|&(pat, kind)| (kind, pat.len()));
            match hit {
                None => {
                    // No marker here; step over one char, UTF-8 aware.
                    scanned += remaining[scanned..]
                        .chars()
                        .next()
                        .unwrap_or('\0')
                        .len_utf8();
                }
                Some(_) if scanned > 0 => {
                    // Text preceding the marker goes out first; the marker
                    // itself is re-detected on the next call.
                    let (text, rest) = remaining.split_at(scanned).unwrap();
                    remaining = rest;
                    scanned = 0;
                    return Some(Token {
                        span: text,
                        kind: TokenKind::Text,
                    });
                }
                Some((kind, width)) => {
                    let (span, rest) = remaining.split_at(width).unwrap();
                    remaining = rest;
                    return Some(Token { span, kind });
                }
            }
        }
    })
}
#[cfg(test)]
mod tests {
    use serde::Serialize;

    use super::tokenize;

    /// Snapshot-tests the tokenizer over a set of representative inputs.
    /// Renamed the local snapshot struct from `Result` (which shadowed the
    /// prelude `Result`) to `Case`; serde's YAML output does not include the
    /// struct name, so existing snapshots are unaffected.
    #[test]
    fn test_tokenize() {
        let examples = [
            "just some normal text :D",
            "normal *bold* normal",
            "normal * maybe bold? * normal",
            "```lang\ncode code code\n```",
            "*/``/*",
            "*/`*/*/",
        ];

        /// One snapshot entry: the input plus its debug-printed token stream.
        #[derive(Serialize)]
        struct Case {
            pub string: &'static str,
            /// Debug-printed tokens
            pub tokens: Vec<String>,
        }

        let examples = examples
            .into_iter()
            .map(|string| {
                // `token` binds one Token per iteration (was misnamed `tokens`).
                let tokens = tokenize(string)
                    .map(|token| format!("{token:?}"))
                    .collect::<Vec<_>>();
                Case { string, tokens }
            })
            .collect::<Vec<_>>();
        insta::assert_yaml_snapshot!(examples);
    }
}