This commit is contained in:
2025-07-07 00:08:36 +02:00
parent 138df11710
commit 9511ae8176
8 changed files with 419 additions and 10 deletions

View File

@ -1,6 +0,0 @@
/// Build script: compile the LALRPOP grammar files under `src/markdown`
/// into Rust parser modules at build time.
fn main() {
    lalrpop::Configuration::new()
        // Only grammars in the markdown module are processed.
        .set_in_dir("./src/markdown")
        .process()
        // A grammar error should fail the build loudly.
        .unwrap();
}

View File

@ -1,6 +1,12 @@
use egui::text::{CCursorRange, LayoutJob};
use crate::{
    easy_mark::easy_mark_parser,
    markdown::{
        span::Span,
        tokenizer::{Token, TokenKind, tokenize},
    },
};
/// Highlight easymark, memoizing previous output to save CPU. /// Highlight easymark, memoizing previous output to save CPU.
/// ///
@ -29,6 +35,131 @@ impl MemoizedHighlighter {
} }
pub fn highlight_easymark( pub fn highlight_easymark(
egui_style: &egui::Style,
text: &str,
// TODO: hide special characters where cursor isn't
_cursor: Option<CCursorRange>,
) -> LayoutJob {
let mut job = LayoutJob::default();
let mut style = easy_mark_parser::Style::default();
let mut prev = TokenKind::Newline;
let tokens: Vec<_> = tokenize(text).collect();
let mut tokens = &tokens[..];
const CODE_INDENT: f32 = 10.0;
while !tokens.is_empty() {
let token = tokens.first().unwrap();
tokens = &tokens[1..];
let start_of_line = prev == TokenKind::Newline;
prev = token.kind;
match token.kind {
TokenKind::CodeBlock if start_of_line => {
let astyle = format_from_style(
egui_style,
&easy_mark_parser::Style {
code: true,
..Default::default()
},
);
let span = collect_until(
token,
&mut tokens,
series([TokenKind::Newline, TokenKind::CodeBlock]),
);
job.append(&*span, CODE_INDENT, astyle.clone());
style = Default::default();
continue;
}
TokenKind::Newline => style = easy_mark_parser::Style::default(),
TokenKind::Strong => style.strong ^= true,
TokenKind::Italic => style.italics ^= true,
TokenKind::Strikethrough => style.strikethrough ^= true,
TokenKind::Heading(_h) if start_of_line => style.heading = true,
TokenKind::Quote if start_of_line => style.quoted = true,
TokenKind::CodeBlock | TokenKind::Mono => {
style.code = true;
let span = collect_until(
token,
&mut tokens,
any_of([TokenKind::Mono, TokenKind::CodeBlock, TokenKind::Newline]),
);
job.append(&*span, 0.0, format_from_style(egui_style, &style));
style.code = false;
continue;
}
TokenKind::Heading(..) | TokenKind::Quote | TokenKind::Text => {}
}
job.append(&token.span, 0.0, format_from_style(egui_style, &style));
}
job
}
/// Build a matcher that succeeds when `N` consecutive tokens have exactly
/// the kinds listed in `of`, in order.
fn series<'a, const N: usize>(of: [TokenKind; N]) -> impl FnMut(&[Token<'a>; N]) -> bool {
    move |tokens| {
        tokens
            .iter()
            .zip(of.iter())
            .all(|(token, kind)| token.kind == *kind)
    }
}
/// Build a matcher that succeeds when a single token's kind is any of `these`.
fn any_of<'a, const N: usize>(these: [TokenKind; N]) -> impl FnMut(&[Token<'a>; 1]) -> bool {
    move |[token]| these.iter().any(|kind| *kind == token.kind)
}
/// Collect all tokens up to and including `pattern`, and merge them into a single span.
///
/// `N` determines how many specific and consecutive tokens we are looking for.
/// i.e. if we were looking for a [TokenKind::Newline] followed by a [TokenKind::Quote], `N`
/// would equal `2`.
///
/// `pattern` is a function that accepts an array of `N` tokens and returns `true` if they match,
/// i.e. if we should stop collecting. [any_of] and [series] can help to construct this function.
///
/// The collected tokens will be split off the head of the slice referred to by `tokens`.
/// If `pattern` never matches, everything remaining in `tokens` is consumed.
///
/// # Panic
/// Panics if `tokens` does not contain only consecutive adjacent spans.
fn collect_until<'a, const N: usize>(
    token: &Token<'a>,
    tokens: &mut &[Token<'a>],
    pattern: impl FnMut(&[Token<'a>; N]) -> bool,
) -> Span<'a>
where
    // Lets each length-N window of the slice be viewed as a fixed-size array.
    for<'b> &'b [Token<'a>; N]: TryFrom<&'b [Token<'a>]>,
{
    let mut windows = tokens
        .windows(N)
        // Infallible: `windows(N)` always yields slices of exactly length N.
        .map(|slice| <&[Token<'a>; N]>::try_from(slice).ok().unwrap());
    let split_at = match windows.position(pattern) {
        // Include the matching window itself in the consumed prefix.
        Some(i) => i + N,
        None => tokens.len(), // consume everything
    };
    let (consume, keep) = tokens.split_at(split_at);
    *tokens = keep;
    // Grow the trigger token's span over every consumed token; `try_merge`
    // only succeeds for adjacent spans, hence the documented panic.
    consume
        .iter()
        .fold(token.span.clone(), |span: Span<'_>, token| {
            span.try_merge(&token.span).unwrap()
        })
}
pub fn highlight_easymark_old(
egui_style: &egui::Style,
mut text: &str,

View File

@ -1,4 +1,3 @@
pub mod ast; pub mod ast;
pub mod span; pub mod span;
pub mod tokenizer;
lalrpop_util::lalrpop_mod!(grammar);

View File

@ -0,0 +1,46 @@
---
source: src/markdown/tokenizer.rs
expression: examples
---
- string: "just some normal text :D"
tokens:
- "Token { span: Span(0..24, \"just some normal text :D\"), kind: Text }"
- string: normal *bold* normal
tokens:
- "Token { span: Span(0..7, \"normal \"), kind: Text }"
- "Token { span: Span(7..8, \"*\"), kind: Strong }"
- "Token { span: Span(8..12, \"bold\"), kind: Text }"
- "Token { span: Span(12..13, \"*\"), kind: Strong }"
- "Token { span: Span(13..20, \" normal\"), kind: Text }"
- string: normal * maybe bold? * normal
tokens:
- "Token { span: Span(0..7, \"normal \"), kind: Text }"
- "Token { span: Span(7..8, \"*\"), kind: Strong }"
- "Token { span: Span(8..21, \" maybe bold? \"), kind: Text }"
- "Token { span: Span(21..22, \"*\"), kind: Strong }"
- "Token { span: Span(22..29, \" normal\"), kind: Text }"
- string: "```lang\ncode code code\n```"
tokens:
- "Token { span: Span(0..3, \"```\"), kind: CodeBlock }"
- "Token { span: Span(3..7, \"lang\"), kind: Text }"
- "Token { span: Span(7..8, \"\\n\"), kind: Newline }"
- "Token { span: Span(8..22, \"code code code\"), kind: Text }"
- "Token { span: Span(22..23, \"\\n\"), kind: Newline }"
- "Token { span: Span(23..26, \"```\"), kind: CodeBlock }"
- string: "*/``/*"
tokens:
- "Token { span: Span(0..1, \"*\"), kind: Strong }"
- "Token { span: Span(1..2, \"/\"), kind: Italic }"
- "Token { span: Span(2..3, \"`\"), kind: Mono }"
- "Token { span: Span(3..4, \"`\"), kind: Mono }"
- "Token { span: Span(4..5, \"/\"), kind: Italic }"
- "Token { span: Span(5..6, \"*\"), kind: Strong }"
- string: "*/`*/*/"
tokens:
- "Token { span: Span(0..1, \"*\"), kind: Strong }"
- "Token { span: Span(1..2, \"/\"), kind: Italic }"
- "Token { span: Span(2..3, \"`\"), kind: Mono }"
- "Token { span: Span(3..4, \"*\"), kind: Strong }"
- "Token { span: Span(4..5, \"/\"), kind: Italic }"
- "Token { span: Span(5..6, \"*\"), kind: Strong }"
- "Token { span: Span(6..7, \"/\"), kind: Italic }"

View File

@ -1,4 +1,7 @@
use std::{
    fmt,
    ops::{Deref, Range},
};
#[derive(Clone, Eq, PartialEq)] #[derive(Clone, Eq, PartialEq)]
pub struct Span<'a> { pub struct Span<'a> {
@ -31,6 +34,35 @@ impl<'a> Span<'a> {
pub fn complete_str(&self) -> Self { pub fn complete_str(&self) -> Self {
Self::new(self.complete_str) Self::new(self.complete_str)
} }
/// Split the span at byte offset `i`, returning `(head, tail)`.
///
/// Returns `None` when either sub-range is rejected by [`Self::get`]
/// (e.g. `i` out of bounds).
pub fn split_at(&self, i: usize) -> Option<(Self, Self)> {
    Some((self.get(0..i)?, self.get(i..self.range.len())?))
}
/// Try to merge the spans.
///
/// This only works if spans are pointing into the same backing buffer, and are adjacent.
pub fn try_merge(&self, other: &Self) -> Option<Self> {
    // Different backing buffers can never be merged.
    if self.complete_str.as_ptr() != other.complete_str.as_ptr() {
        return None;
    }
    // Accept adjacency in either order: self-then-other or other-then-self.
    let range = if self.range.end == other.range.start {
        self.range.start..other.range.end
    } else if self.range.start == other.range.end {
        other.range.start..self.range.end
    } else {
        return None;
    };
    Some(Self { range, ..*self })
}
} }
impl Deref for Span<'_> { impl Deref for Span<'_> {
@ -40,3 +72,12 @@ impl Deref for Span<'_> {
&self.complete_str[self.range.clone()] &self.complete_str[self.range.clone()]
} }
} }
/// Renders as `Span(<range>, <text>)`, e.g. `Span(0..3, "foo")`.
impl<'a> fmt::Debug for Span<'a> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let mut tuple = f.debug_tuple("Span");
        tuple.field(&self.range);
        tuple.field(&self.deref());
        tuple.finish()
    }
}

146
src/markdown/tokenizer.rs Normal file
View File

@ -0,0 +1,146 @@
use std::iter;
use super::span::Span;
/// Heading level; the tokenizer maps `######` to [`Heading::H6`] down to
/// a single `#` for [`Heading::H1`].
///
/// NOTE(review): variants are declared smallest-first (H6..H1) — confirm
/// nothing relies on discriminant/derive ordering.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Heading {
    H6,
    H5,
    H4,
    H3,
    H2,
    H1,
}
/// The lexical class of a [`Token`].
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TokenKind {
    /// Three backticks (a code fence).
    ///
    /// NOTE(review): the earlier comment said "a newline followed by three
    /// backticks", but the tokenizer matches the fence anywhere in a line;
    /// start-of-line handling is left to the consumer.
    CodeBlock,
    /// A newline that isn't a codeblock
    Newline,
    /// `*` marker.
    Strong,
    /// `/` marker.
    Italic,
    /// A single backtick (inline code).
    Mono,
    /// `~` marker.
    Strikethrough,
    /// `>` marker.
    Quote,
    /// `#` through `######` (see [`Heading`]).
    Heading(Heading),
    /// Normal text
    Text,
}
/// One lexed token: the exact input slice it covers plus its class.
#[derive(Debug)]
pub struct Token<'a> {
    /// The slice of the source this token covers.
    pub span: Span<'a>,
    /// Lexical class of the token.
    pub kind: TokenKind,
}
/// Lazily lex `s` into a stream of [`Token`]s.
///
/// Marker characters become their own tokens; any run of characters
/// between markers is emitted as a single [`TokenKind::Text`] token.
pub fn tokenize<'a>(s: &'a str) -> impl Iterator<Item = Token<'a>> {
    // Ordered marker table: longer patterns must precede their prefixes
    // ("######" before "#", "```" before "`") because `find_map` below
    // returns the first match.
    const TOKENS: &[(&'static str, TokenKind)] = &[
        ("\n", TokenKind::Newline),
        ("######", TokenKind::Heading(Heading::H6)),
        ("#####", TokenKind::Heading(Heading::H5)),
        ("####", TokenKind::Heading(Heading::H4)),
        ("###", TokenKind::Heading(Heading::H3)),
        ("##", TokenKind::Heading(Heading::H2)),
        ("#", TokenKind::Heading(Heading::H1)),
        (">", TokenKind::Quote),
        ("*", TokenKind::Strong),
        ("/", TokenKind::Italic),
        ("~", TokenKind::Strikethrough),
        ("```", TokenKind::CodeBlock),
        ("`", TokenKind::Mono),
    ];
    // `s` shrinks from the front as tokens are emitted.
    let mut s = Span::new(s);
    // Byte count of the leading run already scanned and known to be plain
    // text, pending emission as a Text token.
    let mut yield_n: usize = 0;
    iter::from_fn(move || {
        loop {
            if s.is_empty() {
                return None;
            }
            // The entire remainder is plain text: flush it as one token.
            if yield_n == s.len() {
                let (token, rest) = s.split_at(s.len()).unwrap();
                let token = Token {
                    span: token,
                    kind: TokenKind::Text,
                };
                s = rest;
                return Some(token);
            }
            // Does a marker start at the current scan position?
            let token = TOKENS.iter().find_map(|(token_str, token_kind)| {
                s[yield_n..]
                    .starts_with(token_str)
                    .then(|| (*token_kind, token_str.len()))
            });
            let Some((kind, len)) = token else {
                // No marker: advance the scan by one char (char-boundary
                // safe). The '\0' fallback is unreachable here because
                // yield_n < s.len() was checked above.
                yield_n += s[yield_n..].chars().next().unwrap_or('\0').len_utf8();
                continue;
            };
            // Pending text precedes the marker: emit the text first; the
            // marker itself is re-found and emitted on the next call.
            if yield_n > 0 {
                let (token, rest) = s.split_at(yield_n).unwrap();
                let token = Token {
                    span: token,
                    kind: TokenKind::Text,
                };
                s = rest;
                yield_n = 0;
                return Some(token);
            }
            // Emit the marker token itself.
            let (token, rest) = s.split_at(len).unwrap();
            let token = Token { span: token, kind };
            s = rest;
            return Some(token);
        }
    })
}
#[cfg(test)]
mod tests {
    use serde::Serialize;

    use super::tokenize;

    /// Snapshot-test the tokenizer against representative inputs
    /// (plain text, inline markers, fences, pathological marker runs).
    #[test]
    fn test_tokenize() {
        let examples = [
            "just some normal text :D",
            "normal *bold* normal",
            "normal * maybe bold? * normal",
            "```lang\ncode code code\n```",
            "*/``/*",
            "*/`*/*/",
        ];

        /// One tokenized example: the input string and its tokens.
        /// (Renamed from `Result` to avoid shadowing `std::result::Result`;
        /// the struct name does not appear in the serialized YAML snapshot.)
        #[derive(Serialize)]
        struct Example {
            pub string: &'static str,
            /// Debug-printed tokens
            pub tokens: Vec<String>,
        }

        let examples = examples
            .into_iter()
            .map(|string| {
                // Closure param renamed `tokens` -> `token`: it is one token.
                let tokens = tokenize(string)
                    .map(|token| format!("{token:?}"))
                    .collect::<Vec<_>>();
                Example { string, tokens }
            })
            .collect::<Vec<_>>();
        insta::assert_yaml_snapshot!(examples);
    }
}

View File

@ -0,0 +1,44 @@
---
source: src/custom_code_block.rs
assertion_line: 133
expression: list
---
[
Line(
"\n",
),
Line(
"# Hello world\n",
),
Line(
"## Subheader\n",
),
Line(
"- 1\n",
),
CodeBlock {
key: "foo",
content: " whatever\n some code\n Hi mom!",
span: "```foo\n whatever\n some code\n Hi mom!\n```",
},
Line(
" \n",
),
Line(
"\n",
),
CodeBlock {
key: "` # wrong number of ticks, but that's ok",
content: " ``` # indented ticks",
span: "```` # wrong number of ticks, but that's ok\n ``` # indented ticks\n```\n",
},
Line(
"\n",
),
Line(
"``` # no closing ticks\n",
),
Line(
" ",
),
]

View File

@ -0,0 +1,8 @@
---
source: src/painting.rs
assertion_line: 695
expression: serialized
---
```handwriting
BQAAvAA8AEIAPABCAEIAPgBAAAAAAAQAAEIAQgC8ADwAAAAAAEIAPA==
```