From 9511ae8176dd1c810756832364b0dd02592ac6ef Mon Sep 17 00:00:00 2001
From: Joakim Hulthe <joakim@hulthe.net>
Date: Mon, 7 Jul 2025 00:08:36 +0200
Subject: [PATCH] wip

---
 build.rs                                      |   6 -
 src/easy_mark/easy_mark_highlighter.rs        | 133 +++++++++++++++-
 src/markdown/mod.rs                           |   3 +-
 ..._markdown__tokenizer__tests__tokenize.snap |  46 ++++++
 src/markdown/span.rs                          |  43 +++++-
 src/markdown/tokenizer.rs                     | 146 ++++++++++++++++++
 ...code_block__test__iter_markdown-2.snap.new |  44 ++++++
 ...ing__test__serialized handwriting.snap.new |   8 +
 8 files changed, 419 insertions(+), 10 deletions(-)
 delete mode 100644 build.rs
 create mode 100644 src/markdown/snapshots/inkr__markdown__tokenizer__tests__tokenize.snap
 create mode 100644 src/markdown/tokenizer.rs
 create mode 100644 src/snapshots/inkr__custom_code_block__test__iter_markdown-2.snap.new
 create mode 100644 src/snapshots/inkr__painting__test__serialized handwriting.snap.new
diff --git a/build.rs b/build.rs
deleted file mode 100644
index 8e2ef83..0000000
--- a/build.rs
+++ /dev/null
@@ -1,6 +0,0 @@
-fn main() {
-    lalrpop::Configuration::new()
-        .set_in_dir("./src/markdown")
-        .process()
-        .unwrap();
-}
diff --git a/src/easy_mark/easy_mark_highlighter.rs b/src/easy_mark/easy_mark_highlighter.rs
index 41584d9..ce23c3b 100644
--- a/src/easy_mark/easy_mark_highlighter.rs
+++ b/src/easy_mark/easy_mark_highlighter.rs
@@ -1,6 +1,12 @@
 use egui::text::{CCursorRange, LayoutJob};
 
-use crate::easy_mark::easy_mark_parser;
+use crate::{
+    easy_mark::easy_mark_parser,
+    markdown::{
+        span::Span,
+        tokenizer::{Token, TokenKind, tokenize},
+    },
+};
 
 /// Highlight easymark, memoizing previous output to save CPU.
 ///
@@ -29,6 +35,131 @@ impl MemoizedHighlighter {
 }
 
 pub fn highlight_easymark(
+    egui_style: &egui::Style,
+    text: &str,
+
+    // TODO: hide special characters where cursor isn't
+    _cursor: Option<CCursorRange>,
+) -> LayoutJob {
+    let mut job = LayoutJob::default();
+    let mut style = easy_mark_parser::Style::default();
+
+    let mut prev = TokenKind::Newline;
+
+    let tokens: Vec<_> = tokenize(text).collect();
+    let mut tokens = &tokens[..];
+
+    const CODE_INDENT: f32 = 10.0;
+
+    while !tokens.is_empty() {
+        let token = tokens.first().unwrap();
+        tokens = &tokens[1..];
+
+        let start_of_line = prev == TokenKind::Newline;
+        prev = token.kind;
+
+        match token.kind {
+            TokenKind::CodeBlock if start_of_line => {
+                let astyle = format_from_style(
+                    egui_style,
+                    &easy_mark_parser::Style {
+                        code: true,
+                        ..Default::default()
+                    },
+                );
+
+                let span = collect_until(
+                    token,
+                    &mut tokens,
+                    series([TokenKind::Newline, TokenKind::CodeBlock]),
+                );
+
+                job.append(&*span, CODE_INDENT, astyle.clone());
+                style = Default::default();
+                continue;
+            }
+
+            TokenKind::Newline => style = easy_mark_parser::Style::default(),
+            TokenKind::Strong => style.strong ^= true,
+            TokenKind::Italic => style.italics ^= true,
+            TokenKind::Strikethrough => style.strikethrough ^= true,
+            TokenKind::Heading(_h) if start_of_line => style.heading = true,
+            TokenKind::Quote if start_of_line => style.quoted = true,
+
+            TokenKind::CodeBlock | TokenKind::Mono => {
+                style.code = true;
+                let span = collect_until(
+                    token,
+                    &mut tokens,
+                    any_of([TokenKind::Mono, TokenKind::CodeBlock, TokenKind::Newline]),
+                );
+                job.append(&*span, 0.0, format_from_style(egui_style, &style));
+                style.code = false;
+                continue;
+            }
+
+            TokenKind::Heading(..) | TokenKind::Quote | TokenKind::Text => {}
+        }
+
+        job.append(&token.span, 0.0, format_from_style(egui_style, &style));
+    }
+
+    job
+}
+
+fn series<'a, const N: usize>(of: [TokenKind; N]) -> impl FnMut(&[Token<'a>; N]) -> bool {
+    move |token| {
+        of.iter()
+            .zip(token)
+            .all(|(kind, token)| kind == &token.kind)
+    }
+}
+
+fn any_of<'a, const N: usize>(these: [TokenKind; N]) -> impl FnMut(&[Token<'a>; 1]) -> bool {
+    move |[token]| these.contains(&token.kind)
+}
+
+/// Collect all tokens up to and including `pattern`, and merge them into a single span.
+///
+/// `N` determines how many specific and consecutive tokens we are looking for.
+/// i.e. if we were looking for a [TokenKind::Newline] followed by a [TokenKind::Quote], `N`
+/// would equal `2`.
+///
+/// `pattern` is a function that accepts an array of `N` tokens and returns `true` if they match,
+/// i.e. if we should stop collecting. [any_of] and [series] can help to construct this function.
+///
+/// The collected tokens will be split off the head of the slice referred to by `tokens`.
+///
+/// # Panics
+/// Panics if `token` and the collected `tokens` are not adjacent spans of the same buffer.
+fn collect_until<'a, const N: usize>(
+    token: &Token<'a>,
+    tokens: &mut &[Token<'a>],
+    pattern: impl FnMut(&[Token<'a>; N]) -> bool,
+) -> Span<'a>
+where
+    for<'b> &'b [Token<'a>; N]: TryFrom<&'b [Token<'a>]>,
+{
+    let mut windows = tokens
+        .windows(N)
+        .map(|slice| <&[Token<'a>; N]>::try_from(slice).ok().unwrap());
+
+    let split_at = match windows.position(pattern) {
+        Some(i) => i + N,
+        None => tokens.len(), // consume everything
+    };
+
+    let (consume, keep) = tokens.split_at(split_at);
+    *tokens = keep;
+
+    consume
+        .iter()
+        .fold(token.span.clone(), |span: Span<'_>, token| {
+            span.try_merge(&token.span).unwrap()
+        })
+}
+
+pub fn highlight_easymark_old(
     egui_style: &egui::Style,
     mut text: &str,
 
diff --git a/src/markdown/mod.rs b/src/markdown/mod.rs
index 5c98f85..73b532c 100644
--- a/src/markdown/mod.rs
+++ b/src/markdown/mod.rs
@@ -1,4 +1,3 @@
 pub mod ast;
 pub mod span;
-
-lalrpop_util::lalrpop_mod!(grammar);
+pub mod tokenizer;
diff --git a/src/markdown/snapshots/inkr__markdown__tokenizer__tests__tokenize.snap b/src/markdown/snapshots/inkr__markdown__tokenizer__tests__tokenize.snap
new file mode 100644
index 0000000..97611ee
--- /dev/null
+++ b/src/markdown/snapshots/inkr__markdown__tokenizer__tests__tokenize.snap
@@ -0,0 +1,46 @@
+---
+source: src/markdown/tokenizer.rs
+expression: examples
+---
+- string: "just some normal text :D"
+  tokens:
+    - "Token { span: Span(0..24, \"just some normal text :D\"), kind: Text }"
+- string: normal *bold* normal
+  tokens:
+    - "Token { span: Span(0..7, \"normal \"), kind: Text }"
+    - "Token { span: Span(7..8, \"*\"), kind: Strong }"
+    - "Token { span: Span(8..12, \"bold\"), kind: Text }"
+    - "Token { span: Span(12..13, \"*\"), kind: Strong }"
+    - "Token { span: Span(13..20, \" normal\"), kind: Text }"
+- string: normal * maybe bold? * normal
+  tokens:
+    - "Token { span: Span(0..7, \"normal \"), kind: Text }"
+    - "Token { span: Span(7..8, \"*\"), kind: Strong }"
+    - "Token { span: Span(8..21, \" maybe bold? \"), kind: Text }"
+    - "Token { span: Span(21..22, \"*\"), kind: Strong }"
+    - "Token { span: Span(22..29, \" normal\"), kind: Text }"
+- string: "```lang\ncode code code\n```"
+  tokens:
+    - "Token { span: Span(0..3, \"```\"), kind: CodeBlock }"
+    - "Token { span: Span(3..7, \"lang\"), kind: Text }"
+    - "Token { span: Span(7..8, \"\\n\"), kind: Newline }"
+    - "Token { span: Span(8..22, \"code code code\"), kind: Text }"
+    - "Token { span: Span(22..23, \"\\n\"), kind: Newline }"
+    - "Token { span: Span(23..26, \"```\"), kind: CodeBlock }"
+- string: "*/``/*"
+  tokens:
+    - "Token { span: Span(0..1, \"*\"), kind: Strong }"
+    - "Token { span: Span(1..2, \"/\"), kind: Italic }"
+    - "Token { span: Span(2..3, \"`\"), kind: Mono }"
+    - "Token { span: Span(3..4, \"`\"), kind: Mono }"
+    - "Token { span: Span(4..5, \"/\"), kind: Italic }"
+    - "Token { span: Span(5..6, \"*\"), kind: Strong }"
+- string: "*/`*/*/"
+  tokens:
+    - "Token { span: Span(0..1, \"*\"), kind: Strong }"
+    - "Token { span: Span(1..2, \"/\"), kind: Italic }"
+    - "Token { span: Span(2..3, \"`\"), kind: Mono }"
+    - "Token { span: Span(3..4, \"*\"), kind: Strong }"
+    - "Token { span: Span(4..5, \"/\"), kind: Italic }"
+    - "Token { span: Span(5..6, \"*\"), kind: Strong }"
+    - "Token { span: Span(6..7, \"/\"), kind: Italic }"
diff --git a/src/markdown/span.rs b/src/markdown/span.rs
index 106fc2a..3a5e108 100644
--- a/src/markdown/span.rs
+++ b/src/markdown/span.rs
@@ -1,4 +1,7 @@
-use std::ops::{Deref, Range};
+use std::{
+    fmt,
+    ops::{Deref, Range},
+};
 
 #[derive(Clone, Eq, PartialEq)]
 pub struct Span<'a> {
@@ -31,6 +34,35 @@ impl<'a> Span<'a> {
     pub fn complete_str(&self) -> Self {
         Self::new(self.complete_str)
     }
+
+    pub fn split_at(&self, i: usize) -> Option<(Self, Self)> {
+        let head = self.get(0..i)?;
+        let tail = self.get(i..self.range.len())?;
+        Some((head, tail))
+    }
+
+    /// Try to merge the spans.
+    ///
+    /// This only works if spans are pointing into the same backing buffer, and are adjacent.
+    pub fn try_merge(&self, other: &Self) -> Option<Self> {
+        if self.complete_str.as_ptr() != other.complete_str.as_ptr() {
+            return None;
+        }
+
+        if self.range.end == other.range.start {
+            Some(Self {
+                range: self.range.start..other.range.end,
+                ..*self
+            })
+        } else if self.range.start == other.range.end {
+            Some(Self {
+                range: other.range.start..self.range.end,
+                ..*self
+            })
+        } else {
+            None
+        }
+    }
 }
 
 impl Deref for Span<'_> {
@@ -40,3 +72,12 @@ impl Deref for Span<'_> {
         &self.complete_str[self.range.clone()]
     }
 }
+
+impl<'a> fmt::Debug for Span<'a> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_tuple("Span")
+            .field(&self.range)
+            .field(&self.deref())
+            .finish()
+    }
+}
diff --git a/src/markdown/tokenizer.rs b/src/markdown/tokenizer.rs
new file mode 100644
index 0000000..3c08e13
--- /dev/null
+++ b/src/markdown/tokenizer.rs
@@ -0,0 +1,146 @@
+use std::iter;
+
+use super::span::Span;
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum Heading {
+    H6,
+    H5,
+    H4,
+    H3,
+    H2,
+    H1,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum TokenKind {
+    /// Three consecutive backticks (a code fence marker).
+    CodeBlock,
+
+    /// A single line-feed character (`\n`).
+    Newline,
+
+    Strong,
+    Italic,
+    Mono,
+    Strikethrough,
+
+    Quote,
+
+    Heading(Heading),
+
+    /// Normal text
+    Text,
+}
+
+#[derive(Debug)]
+pub struct Token<'a> {
+    pub span: Span<'a>,
+    pub kind: TokenKind,
+}
+
+pub fn tokenize<'a>(s: &'a str) -> impl Iterator<Item = Token<'a>> {
+    const TOKENS: &[(&'static str, TokenKind)] = &[
+        ("\n", TokenKind::Newline),
+        ("######", TokenKind::Heading(Heading::H6)),
+        ("#####", TokenKind::Heading(Heading::H5)),
+        ("####", TokenKind::Heading(Heading::H4)),
+        ("###", TokenKind::Heading(Heading::H3)),
+        ("##", TokenKind::Heading(Heading::H2)),
+        ("#", TokenKind::Heading(Heading::H1)),
+        (">", TokenKind::Quote),
+        ("*", TokenKind::Strong),
+        ("/", TokenKind::Italic),
+        ("~", TokenKind::Strikethrough),
+        ("```", TokenKind::CodeBlock),
+        ("`", TokenKind::Mono),
+    ];
+
+    let mut s = Span::new(s);
+    let mut yield_n: usize = 0;
+
+    iter::from_fn(move || {
+        loop {
+            if s.is_empty() {
+                return None;
+            }
+
+            if yield_n == s.len() {
+                let (token, rest) = s.split_at(s.len()).unwrap();
+                let token = Token {
+                    span: token,
+                    kind: TokenKind::Text,
+                };
+                s = rest;
+                return Some(token);
+            }
+
+            let token = TOKENS.iter().find_map(|(token_str, token_kind)| {
+                s[yield_n..]
+                    .starts_with(token_str)
+                    .then(|| (*token_kind, token_str.len()))
+            });
+
+            let Some((kind, len)) = token else {
+                yield_n += s[yield_n..].chars().next().unwrap_or('\0').len_utf8();
+                continue;
+            };
+
+            if yield_n > 0 {
+                let (token, rest) = s.split_at(yield_n).unwrap();
+                let token = Token {
+                    span: token,
+                    kind: TokenKind::Text,
+                };
+                s = rest;
+                yield_n = 0;
+                return Some(token);
+            }
+
+            let (token, rest) = s.split_at(len).unwrap();
+            let token = Token { span: token, kind };
+            s = rest;
+            return Some(token);
+        }
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use serde::Serialize;
+
+    use super::tokenize;
+
+    #[test]
+    fn test_tokenize() {
+        let examples = [
+            "just some normal text :D",
+            "normal *bold* normal",
+            "normal * maybe bold? * normal",
+            "```lang\ncode code code\n```",
+            "*/``/*",
+            "*/`*/*/",
+        ];
+
+        #[derive(Serialize)]
+        struct Result {
+            pub string: &'static str,
+
+            /// Debug-printed tokens
+            pub tokens: Vec<String>,
+        }
+
+        let examples = examples
+            .into_iter()
+            .map(|string| {
+                let tokens = tokenize(string)
+                    .map(|tokens| format!("{tokens:?}"))
+                    .collect::<Vec<_>>();
+
+                Result { string, tokens }
+            })
+            .collect::<Vec<_>>();
+
+        insta::assert_yaml_snapshot!(examples);
+    }
+}
diff --git a/src/snapshots/inkr__custom_code_block__test__iter_markdown-2.snap.new b/src/snapshots/inkr__custom_code_block__test__iter_markdown-2.snap.new
new file mode 100644
index 0000000..b93a493
--- /dev/null
+++ b/src/snapshots/inkr__custom_code_block__test__iter_markdown-2.snap.new
@@ -0,0 +1,44 @@
+---
+source: src/custom_code_block.rs
+assertion_line: 133
+expression: list
+---
+[
+    Line(
+        "\n",
+    ),
+    Line(
+        "# Hello world\n",
+    ),
+    Line(
+        "## Subheader\n",
+    ),
+    Line(
+        "- 1\n",
+    ),
+    CodeBlock {
+        key: "foo",
+        content: "  whatever\n     some code\n Hi mom!",
+        span: "```foo\n  whatever\n     some code\n Hi mom!\n```",
+    },
+    Line(
+        " \n",
+    ),
+    Line(
+        "\n",
+    ),
+    CodeBlock {
+        key: "` # wrong number of ticks, but that's ok",
+        content: " ``` # indented ticks",
+        span: "```` # wrong number of ticks, but that's ok\n ``` # indented ticks\n```\n",
+    },
+    Line(
+        "\n",
+    ),
+    Line(
+        "``` # no closing ticks\n",
+    ),
+    Line(
+        "            ",
+    ),
+]
diff --git a/src/snapshots/inkr__painting__test__serialized handwriting.snap.new b/src/snapshots/inkr__painting__test__serialized handwriting.snap.new
new file mode 100644
index 0000000..e138c44
--- /dev/null
+++ b/src/snapshots/inkr__painting__test__serialized handwriting.snap.new	
@@ -0,0 +1,8 @@
+---
+source: src/painting.rs
+assertion_line: 695
+expression: serialized
+---
+```handwriting
+BQAAvAA8AEIAPABCAEIAPgBAAAAAAAQAAEIAQgC8ADwAAAAAAEIAPA==
+```