From 9511ae8176dd1c810756832364b0dd02592ac6ef Mon Sep 17 00:00:00 2001 From: Joakim Hulthe Date: Mon, 7 Jul 2025 00:08:36 +0200 Subject: [PATCH] wip --- build.rs | 6 - src/easy_mark/easy_mark_highlighter.rs | 133 +++++++++++++++- src/markdown/mod.rs | 3 +- ..._markdown__tokenizer__tests__tokenize.snap | 46 ++++++ src/markdown/span.rs | 43 +++++- src/markdown/tokenizer.rs | 146 ++++++++++++++++++ ...code_block__test__iter_markdown-2.snap.new | 44 ++++++ ...ing__test__serialized handwriting.snap.new | 8 + 8 files changed, 419 insertions(+), 10 deletions(-) delete mode 100644 build.rs create mode 100644 src/markdown/snapshots/inkr__markdown__tokenizer__tests__tokenize.snap create mode 100644 src/markdown/tokenizer.rs create mode 100644 src/snapshots/inkr__custom_code_block__test__iter_markdown-2.snap.new create mode 100644 src/snapshots/inkr__painting__test__serialized handwriting.snap.new diff --git a/build.rs b/build.rs deleted file mode 100644 index 8e2ef83..0000000 --- a/build.rs +++ /dev/null @@ -1,6 +0,0 @@ -fn main() { - lalrpop::Configuration::new() - .set_in_dir("./src/markdown") - .process() - .unwrap(); -} diff --git a/src/easy_mark/easy_mark_highlighter.rs b/src/easy_mark/easy_mark_highlighter.rs index 41584d9..ce23c3b 100644 --- a/src/easy_mark/easy_mark_highlighter.rs +++ b/src/easy_mark/easy_mark_highlighter.rs @@ -1,6 +1,12 @@ use egui::text::{CCursorRange, LayoutJob}; -use crate::easy_mark::easy_mark_parser; +use crate::{ + easy_mark::easy_mark_parser, + markdown::{ + span::Span, + tokenizer::{Token, TokenKind, tokenize}, + }, +}; /// Highlight easymark, memoizing previous output to save CPU. /// @@ -29,6 +35,131 @@ impl MemoizedHighlighter { } pub fn highlight_easymark( + egui_style: &egui::Style, + text: &str, + + // TODO: hide special characters where cursor isn't + _cursor: Option, +) -> LayoutJob { + let mut job = LayoutJob::default(); + let mut style = easy_mark_parser::Style::default(); + + let mut prev = TokenKind::Newline; + + let tokens: Vec<_> = tokenize(text).collect(); + let mut tokens = &tokens[..]; + + const CODE_INDENT: f32 = 10.0; + + while !tokens.is_empty() { + let token = tokens.first().unwrap(); + tokens = &tokens[1..]; + + let start_of_line = prev == TokenKind::Newline; + prev = token.kind; + + match token.kind { + TokenKind::CodeBlock if start_of_line => { + let astyle = format_from_style( + egui_style, + &easy_mark_parser::Style { + code: true, + ..Default::default() + }, + ); + + let span = collect_until( + token, + &mut tokens, + series([TokenKind::Newline, TokenKind::CodeBlock]), + ); + + job.append(&*span, CODE_INDENT, astyle.clone()); + style = Default::default(); + continue; + } + + TokenKind::Newline => style = easy_mark_parser::Style::default(), + TokenKind::Strong => style.strong ^= true, + TokenKind::Italic => style.italics ^= true, + TokenKind::Strikethrough => style.strikethrough ^= true, + TokenKind::Heading(_h) if start_of_line => style.heading = true, + TokenKind::Quote if start_of_line => style.quoted = true, + + TokenKind::CodeBlock | TokenKind::Mono => { + style.code = true; + let span = collect_until( + token, + &mut tokens, + any_of([TokenKind::Mono, TokenKind::CodeBlock, TokenKind::Newline]), + ); + job.append(&*span, 0.0, format_from_style(egui_style, &style)); + style.code = false; + continue; + } + + TokenKind::Heading(..) | TokenKind::Quote | TokenKind::Text => {} + } + + job.append(&token.span, 0.0, format_from_style(egui_style, &style)); + } + + job +} + +fn series<'a, const N: usize>(of: [TokenKind; N]) -> impl FnMut(&[Token<'a>; N]) -> bool { + move |token| { + of.iter() + .zip(token) + .all(|(kind, token)| kind == &token.kind) + } +} + +fn any_of<'a, const N: usize>(these: [TokenKind; N]) -> impl FnMut(&[Token<'a>; 1]) -> bool { + move |[token]| these.contains(&token.kind) +} + +/// Collect all tokens up to and including `pattern`, and merge them into a signle span. +/// +/// `N` determines how many specific and consecutive tokens we are looking for. +/// i.e. if we were looking for a [TokenKind::Newline] followed by a [TokenKind::Quote], `N` +/// would equal `2`. +/// +/// `pattern` is a function that accepts an array of `N` tokens and returns `true` if they match, +/// i.e. if we should stop collecting. [any_of] and [series] can help to construct this function. +/// +/// The collected tokens will be split off the head of the slice referred to by `tokens`. +/// +/// # Panic +/// Panics if `tokens` does not contain only consecutive adjacent spans. +fn collect_until<'a, const N: usize>( + token: &Token<'a>, + tokens: &mut &[Token<'a>], + pattern: impl FnMut(&[Token<'a>; N]) -> bool, +) -> Span<'a> +where + for<'b> &'b [Token<'a>; N]: TryFrom<&'b [Token<'a>]>, +{ + let mut windows = tokens + .windows(N) + .map(|slice| <&[Token<'a>; N]>::try_from(slice).ok().unwrap()); + + let split_at = match windows.position(pattern) { + Some(i) => i + N, + None => tokens.len(), // consume everything + }; + + let (consume, keep) = tokens.split_at(split_at); + *tokens = keep; + + consume + .iter() + .fold(token.span.clone(), |span: Span<'_>, token| { + span.try_merge(&token.span).unwrap() + }) +} + +pub fn highlight_easymark_old( egui_style: &egui::Style, mut text: &str, diff --git a/src/markdown/mod.rs b/src/markdown/mod.rs index 5c98f85..73b532c 100644 --- a/src/markdown/mod.rs +++ b/src/markdown/mod.rs @@ -1,4 +1,3 @@ pub mod ast; pub mod span; - -lalrpop_util::lalrpop_mod!(grammar); +pub mod tokenizer; diff --git a/src/markdown/snapshots/inkr__markdown__tokenizer__tests__tokenize.snap b/src/markdown/snapshots/inkr__markdown__tokenizer__tests__tokenize.snap new file mode 100644 index 0000000..97611ee --- /dev/null +++ b/src/markdown/snapshots/inkr__markdown__tokenizer__tests__tokenize.snap @@ -0,0 +1,46 @@ +--- +source: src/markdown/tokenizer.rs +expression: examples +--- +- string: "just some normal text :D" + tokens: + - "Token { span: Span(0..24, \"just some normal text :D\"), kind: Text }" +- string: normal *bold* normal + tokens: + - "Token { span: Span(0..7, \"normal \"), kind: Text }" + - "Token { span: Span(7..8, \"*\"), kind: Strong }" + - "Token { span: Span(8..12, \"bold\"), kind: Text }" + - "Token { span: Span(12..13, \"*\"), kind: Strong }" + - "Token { span: Span(13..20, \" normal\"), kind: Text }" +- string: normal * maybe bold? * normal + tokens: + - "Token { span: Span(0..7, \"normal \"), kind: Text }" + - "Token { span: Span(7..8, \"*\"), kind: Strong }" + - "Token { span: Span(8..21, \" maybe bold? \"), kind: Text }" + - "Token { span: Span(21..22, \"*\"), kind: Strong }" + - "Token { span: Span(22..29, \" normal\"), kind: Text }" +- string: "```lang\ncode code code\n```" + tokens: + - "Token { span: Span(0..3, \"```\"), kind: CodeBlock }" + - "Token { span: Span(3..7, \"lang\"), kind: Text }" + - "Token { span: Span(7..8, \"\\n\"), kind: Newline }" + - "Token { span: Span(8..22, \"code code code\"), kind: Text }" + - "Token { span: Span(22..23, \"\\n\"), kind: Newline }" + - "Token { span: Span(23..26, \"```\"), kind: CodeBlock }" +- string: "*/``/*" + tokens: + - "Token { span: Span(0..1, \"*\"), kind: Strong }" + - "Token { span: Span(1..2, \"/\"), kind: Italic }" + - "Token { span: Span(2..3, \"`\"), kind: Mono }" + - "Token { span: Span(3..4, \"`\"), kind: Mono }" + - "Token { span: Span(4..5, \"/\"), kind: Italic }" + - "Token { span: Span(5..6, \"*\"), kind: Strong }" +- string: "*/`*/*/" + tokens: + - "Token { span: Span(0..1, \"*\"), kind: Strong }" + - "Token { span: Span(1..2, \"/\"), kind: Italic }" + - "Token { span: Span(2..3, \"`\"), kind: Mono }" + - "Token { span: Span(3..4, \"*\"), kind: Strong }" + - "Token { span: Span(4..5, \"/\"), kind: Italic }" + - "Token { span: Span(5..6, \"*\"), kind: Strong }" + - "Token { span: Span(6..7, \"/\"), kind: Italic }" diff --git a/src/markdown/span.rs b/src/markdown/span.rs index 106fc2a..3a5e108 100644 --- a/src/markdown/span.rs +++ b/src/markdown/span.rs @@ -1,4 +1,7 @@ -use std::ops::{Deref, Range}; +use std::{ + fmt, + ops::{Deref, Range}, +}; #[derive(Clone, Eq, PartialEq)] pub struct Span<'a> { @@ -31,6 +34,35 @@ impl<'a> Span<'a> { pub fn complete_str(&self) -> Self { Self::new(self.complete_str) } + + pub fn split_at(&self, i: usize) -> Option<(Self, Self)> { + let head = self.get(0..i)?; + let tail = self.get(i..self.range.len())?; + Some((head, tail)) + } + + /// Try to merge the spans. + /// + /// This only works if spans are pointing into the same backing buffer, and are adjacent. + pub fn try_merge(&self, other: &Self) -> Option { + if self.complete_str.as_ptr() != other.complete_str.as_ptr() { + return None; + } + + if self.range.end == other.range.start { + Some(Self { + range: self.range.start..other.range.end, + ..*self + }) + } else if self.range.start == other.range.end { + Some(Self { + range: other.range.start..self.range.end, + ..*self + }) + } else { + None + } + } } impl Deref for Span<'_> { @@ -40,3 +72,12 @@ impl Deref for Span<'_> { &self.complete_str[self.range.clone()] } } + +impl<'a> fmt::Debug for Span<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_tuple("Span") + .field(&self.range) + .field(&self.deref()) + .finish() + } +} diff --git a/src/markdown/tokenizer.rs b/src/markdown/tokenizer.rs new file mode 100644 index 0000000..3c08e13 --- /dev/null +++ b/src/markdown/tokenizer.rs @@ -0,0 +1,146 @@ +use std::iter; + +use super::span::Span; + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum Heading { + H6, + H5, + H4, + H3, + H2, + H1, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum TokenKind { + /// A newline followed by three ` + CodeBlock, + + /// A newline that isn't a codeblock + Newline, + + Strong, + Italic, + Mono, + Strikethrough, + + Quote, + + Heading(Heading), + + /// Normal text + Text, +} + +#[derive(Debug)] +pub struct Token<'a> { + pub span: Span<'a>, + pub kind: TokenKind, +} + +pub fn tokenize<'a>(s: &'a str) -> impl Iterator> { + const TOKENS: &[(&'static str, TokenKind)] = &[ + ("\n", TokenKind::Newline), + ("######", TokenKind::Heading(Heading::H6)), + ("#####", TokenKind::Heading(Heading::H5)), + ("####", TokenKind::Heading(Heading::H4)), + ("###", TokenKind::Heading(Heading::H3)), + ("##", TokenKind::Heading(Heading::H2)), + ("#", TokenKind::Heading(Heading::H1)), + (">", TokenKind::Quote), + ("*", TokenKind::Strong), + ("/", TokenKind::Italic), + ("~", TokenKind::Strikethrough), + ("```", TokenKind::CodeBlock), + ("`", TokenKind::Mono), + ]; + + let mut s = Span::new(s); + let mut yield_n: usize = 0; + + iter::from_fn(move || { + loop { + if s.is_empty() { + return None; + } + + if yield_n == s.len() { + let (token, rest) = s.split_at(s.len()).unwrap(); + let token = Token { + span: token, + kind: TokenKind::Text, + }; + s = rest; + return Some(token); + } + + let token = TOKENS.iter().find_map(|(token_str, token_kind)| { + s[yield_n..] + .starts_with(token_str) + .then(|| (*token_kind, token_str.len())) + }); + + let Some((kind, len)) = token else { + yield_n += s[yield_n..].chars().next().unwrap_or('\0').len_utf8(); + continue; + }; + + if yield_n > 0 { + let (token, rest) = s.split_at(yield_n).unwrap(); + let token = Token { + span: token, + kind: TokenKind::Text, + }; + s = rest; + yield_n = 0; + return Some(token); + } + + let (token, rest) = s.split_at(len).unwrap(); + let token = Token { span: token, kind }; + s = rest; + return Some(token); + } + }) +} + +#[cfg(test)] +mod tests { + use serde::Serialize; + + use super::tokenize; + + #[test] + fn test_tokenize() { + let examples = [ + "just some normal text :D", + "normal *bold* normal", + "normal * maybe bold? * normal", + "```lang\ncode code code\n```", + "*/``/*", + "*/`*/*/", + ]; + + #[derive(Serialize)] + struct Result { + pub string: &'static str, + + /// Debug-printed tokens + pub tokens: Vec, + } + + let examples = examples + .into_iter() + .map(|string| { + let tokens = tokenize(string) + .map(|tokens| format!("{tokens:?}")) + .collect::>(); + + Result { string, tokens } + }) + .collect::>(); + + insta::assert_yaml_snapshot!(examples); + } +} diff --git a/src/snapshots/inkr__custom_code_block__test__iter_markdown-2.snap.new b/src/snapshots/inkr__custom_code_block__test__iter_markdown-2.snap.new new file mode 100644 index 0000000..b93a493 --- /dev/null +++ b/src/snapshots/inkr__custom_code_block__test__iter_markdown-2.snap.new @@ -0,0 +1,44 @@ +--- +source: src/custom_code_block.rs +assertion_line: 133 +expression: list +--- +[ + Line( + "\n", + ), + Line( + "# Hello world\n", + ), + Line( + "## Subheader\n", + ), + Line( + "- 1\n", + ), + CodeBlock { + key: "foo", + content: " whatever\n some code\n Hi mom!", + span: "```foo\n whatever\n some code\n Hi mom!\n```", + }, + Line( + " \n", + ), + Line( + "\n", + ), + CodeBlock { + key: "` # wrong number of ticks, but that's ok", + content: " ``` # indented ticks", + span: "```` # wrong number of ticks, but that's ok\n ``` # indented ticks\n```\n", + }, + Line( + "\n", + ), + Line( + "``` # no closing ticks\n", + ), + Line( + " ", + ), +] diff --git a/src/snapshots/inkr__painting__test__serialized handwriting.snap.new b/src/snapshots/inkr__painting__test__serialized handwriting.snap.new new file mode 100644 index 0000000..e138c44 --- /dev/null +++ b/src/snapshots/inkr__painting__test__serialized handwriting.snap.new @@ -0,0 +1,8 @@ +--- +source: src/painting.rs +assertion_line: 695 +expression: serialized +--- +```handwriting +BQAAvAA8AEIAPABCAEIAPgBAAAAAAAQAAEIAQgC8ADwAAAAAAEIAPA== +```