Restructure markdown highlighter

This commit is contained in:
2025-06-13 22:08:53 +02:00
parent 83ad2068e0
commit 7f93084e64
15 changed files with 683 additions and 600 deletions

156
src/markdown/tokenizer.rs Normal file
View File

@ -0,0 +1,156 @@
use std::iter;
use super::span::Span;
/// Heading level, one variant per count of leading '#' characters.
///
/// NOTE(review): variants are declared H6 down to H1, mirroring the
/// longest-marker-first ordering of the `TOKENS` table below — confirm the
/// declaration order itself is not relied upon anywhere (e.g. via `Ord`;
/// none is derived here).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Heading {
    H6,
    H5,
    H4,
    H3,
    H2,
    H1,
}
/// Classification of a single lexical token produced by `tokenize`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TokenKind {
    /// A line break ("\n").
    /// NOTE(review): the original comment said "a newline that isn't a
    /// codeblock", but the tokenizer matches every "\n" unconditionally —
    /// presumably code-block-aware handling happens in a later stage; confirm.
    Newline,
    /// "#" through "######"; the level is carried in the `Heading` payload.
    Heading(Heading),
    /// A code-fence marker: three backticks ("```").
    /// NOTE(review): despite the original "newline followed by three `"
    /// comment, the TOKENS table matches the backticks alone.
    CodeBlock,
    /// A single backtick — inline monospace delimiter.
    Mono,
    /// "*" — strong/bold delimiter.
    Strong,
    /// "_" — italic delimiter.
    Italic,
    /// "~" — strikethrough delimiter.
    Strikethrough,
    /// ">" — blockquote marker.
    Quote,
    /// Two consecutive spaces.
    Indentation,
    /// "- " — bullet-list item marker.
    ListEntry,
    /// Any run of characters matching none of the markers above.
    Text,
}
/// Marker strings and the token kind each produces.
///
/// Ordering is load-bearing: `tokenize` takes the *first* entry whose string
/// is a prefix of the remaining input, so longer markers must precede their
/// own prefixes ("######" before "#####" … before "#", and "```" before "`").
///
/// Note: `&'static` on the element strings was redundant inside a `const`
/// (clippy `redundant_static_lifetimes`) and has been elided.
const TOKENS: &[(&str, TokenKind)] = &[
    ("\n", TokenKind::Newline),
    ("######", TokenKind::Heading(Heading::H6)),
    ("#####", TokenKind::Heading(Heading::H5)),
    ("####", TokenKind::Heading(Heading::H4)),
    ("###", TokenKind::Heading(Heading::H3)),
    ("##", TokenKind::Heading(Heading::H2)),
    ("#", TokenKind::Heading(Heading::H1)),
    ("```", TokenKind::CodeBlock),
    ("`", TokenKind::Mono),
    ("*", TokenKind::Strong),
    ("_", TokenKind::Italic),
    ("~", TokenKind::Strikethrough),
    (">", TokenKind::Quote),
    ("  ", TokenKind::Indentation),
    ("- ", TokenKind::ListEntry),
];
/// A single lexed token: the matched slice of input plus its classification.
#[derive(Debug)]
pub struct Token<'a> {
    /// The exact region of the source string this token covers.
    pub span: Span<'a>,
    /// What kind of markdown element the region represents.
    pub kind: TokenKind,
}
/// Lazily splits `s` into a stream of markdown [`Token`]s.
///
/// At each position the first matching entry of `TOKENS` wins; characters
/// matching no marker are accumulated and flushed as one `Text` token just
/// before the next marker (or at end of input).
pub fn tokenize<'a>(s: &'a str) -> impl Iterator<Item = Token<'a>> {
    let mut remaining = Span::new(s);
    // Byte count of plain text scanned past without hitting a marker;
    // flushed as a single Text token before the next marker is emitted.
    let mut scanned: usize = 0;
    iter::from_fn(move || {
        loop {
            if remaining.is_empty() {
                return None;
            }
            // Everything left is plain text — flush it all in one token.
            if scanned == remaining.len() {
                let (text, rest) = remaining.split_at(remaining.len()).unwrap();
                remaining = rest;
                return Some(Token {
                    span: text,
                    kind: TokenKind::Text,
                });
            }
            // First table entry whose marker string prefixes the unscanned tail.
            let hit = TOKENS
                .iter()
                .find(|(pat, _)| remaining[scanned..].starts_with(pat))
                .map(|&(pat, kind)| (kind, pat.len()));
            match hit {
                None => {
                    // No marker here; step over one char, UTF-8 aware.
                    scanned += remaining[scanned..]
                        .chars()
                        .next()
                        .unwrap_or('\0')
                        .len_utf8();
                }
                Some(_) if scanned > 0 => {
                    // Text preceding the marker goes out first; the marker
                    // itself is re-detected on the next call.
                    let (text, rest) = remaining.split_at(scanned).unwrap();
                    remaining = rest;
                    scanned = 0;
                    return Some(Token {
                        span: text,
                        kind: TokenKind::Text,
                    });
                }
                Some((kind, width)) => {
                    let (span, rest) = remaining.split_at(width).unwrap();
                    remaining = rest;
                    return Some(Token { span, kind });
                }
            }
        }
    })
}
#[cfg(test)]
mod tests {
    use serde::Serialize;

    use super::tokenize;

    /// Snapshot-tests the tokenizer over a set of representative inputs.
    /// Renamed the local snapshot struct from `Result` (which shadowed the
    /// prelude `Result`) to `Case`; serde's YAML output does not include the
    /// struct name, so existing snapshots are unaffected.
    #[test]
    fn test_tokenize() {
        let examples = [
            "just some normal text :D",
            "normal *bold* normal",
            "normal * maybe bold? * normal",
            "```lang\ncode code code\n```",
            "*/``/*",
            "*/`*/*/",
        ];

        /// One snapshot entry: the input plus its debug-printed token stream.
        #[derive(Serialize)]
        struct Case {
            pub string: &'static str,
            /// Debug-printed tokens
            pub tokens: Vec<String>,
        }

        let examples = examples
            .into_iter()
            .map(|string| {
                // `token` binds one Token per iteration (was misnamed `tokens`).
                let tokens = tokenize(string)
                    .map(|token| format!("{token:?}"))
                    .collect::<Vec<_>>();
                Case { string, tokens }
            })
            .collect::<Vec<_>>();
        insta::assert_yaml_snapshot!(examples);
    }
}