Split markdown parsing and highlighting

2025-07-07 13:10:45 +02:00
parent e0fd726f02
commit 462c27e111
7 changed files with 243 additions and 440 deletions
--- a/src/easy_mark/easy_mark_highlighter.rs
+++ b/src/easy_mark/easy_mark_highlighter.rs
@@ -1,245 +0,0 @@
 use egui::text::{CCursorRange, LayoutJob};
 use crate::easy_mark::easy_mark_parser;
 /// Highlight easymark, memoizing previous output to save CPU.
 ///
 /// In practice, the highlighter is fast enough not to need any caching.
 ///
 /// The cache holds a single entry: the most recent `(style, code)` input pair together
 /// with the [`LayoutJob`] that was produced for it.
 #[derive(Default)]
 pub struct MemoizedHighlighter {
    /// The `egui::Style` that the cached `output` was generated with.
    style: egui::Style,
    /// The source text that the cached `output` was generated from.
    code: String,
    /// The layout job produced by the most recent cache miss.
    output: LayoutJob,
 }
 impl MemoizedHighlighter {
    /// Return the highlighted layout for `code`.
    ///
    /// The highlighter is only re-run when `egui_style` or `code` differ from the inputs
    /// of the previous call; otherwise a clone of the stored job is returned.
    ///
    /// `cursor` is forwarded to [`highlight_easymark`] on a cache miss, but it is not part
    /// of the cache key.
    pub fn highlight(
        &mut self,
        egui_style: &egui::Style,
        code: &str,
        cursor: Option<CCursorRange>,
    ) -> LayoutJob {
        let is_cached = self.style == *egui_style && self.code == code;
        if !is_cached {
            // Remember the inputs first, then refresh the stored job.
            self.style = egui_style.clone();
            self.code.clear();
            self.code.push_str(code);
            self.output = highlight_easymark(egui_style, code, cursor);
        }
        self.output.clone()
    }
 }
 /// Build a [`LayoutJob`] that syntax-highlights the easymark source in `text`.
 ///
 /// The input is consumed front to back. Each iteration recognises at most one marker at
 /// the head of the remaining text, updates the running style, and appends the text up to
 /// the next special character (or the end of the line) to the job. The running style is
 /// reset at every line break.
 ///
 /// # Parameters
 /// - `egui_style`: supplies the colours and fonts used for the generated text formats.
 /// - `text`: the easymark source to highlight.
 /// - `_cursor`: currently unused (see the TODO on the parameter).
 ///
 /// # Returns
 /// The layout job containing every appended section, in source order.
 pub fn highlight_easymark(
    egui_style: &egui::Style,
    mut text: &str,
    // TODO: hide special characters where cursor isn't
    _cursor: Option<CCursorRange>,
 ) -> LayoutJob {
    const CODE_INDENT: f32 = 10.0;

    let mut job = LayoutJob::default();
    let mut style = easy_mark_parser::Style::default();
    let mut start_of_line = true;

    while !text.is_empty() {
        // Fenced code block: everything up to the closing fence is rendered as code.
        if start_of_line && text.starts_with("```") {
            let astyle = format_from_style(
                egui_style,
                &easy_mark_parser::Style {
                    code: true,
                    ..Default::default()
                },
            );
            // Render the initial backticks as spaces
            text = &text[3..];
            job.append("   ", CODE_INDENT, astyle.clone());
            match text.find("\n```") {
                Some(n) => {
                    for line in text[..n + 1].lines() {
                        job.append(line, CODE_INDENT, astyle.clone());
                        job.append("\n", 0.0, astyle.clone());
                    }
                    // Render the final backticks as spaces
                    job.append("   ", CODE_INDENT, astyle);
                    text = &text[n + 4..];
                }
                None => {
                    // No closing fence: the rest of the input is treated as code.
                    job.append(text, 0.0, astyle.clone());
                    text = "";
                }
            };
            style = Default::default();
            continue;
        }

        // Inline code: runs until the closing backtick or the end of the line.
        if text.starts_with('`') {
            style.code = true;
            let end = text[1..]
                .find(&['`', '\n'][..])
                .map_or_else(|| text.len(), |i| i + 2);
            job.append(&text[..end], 0.0, format_from_style(egui_style, &style));
            text = &text[end..];
            style.code = false;
            continue;
        }

        // Number of leading bytes that belong to the marker and must not be searched for
        // further special characters.
        let skip;

        // Toggle one boolean style flag. When the flag was set, the marker closes the
        // style: it is emitted with the style still active and consumed right away.
        let mut apply_basic_style =
            |text: &mut &str,
             style: &mut easy_mark_parser::Style,
             access: fn(&mut easy_mark_parser::Style) -> &mut bool| {
                let skip = if *access(style) {
                    // Include the character that is ending this style:
                    job.append(&text[..1], 0.0, format_from_style(egui_style, style));
                    *text = &text[1..];
                    0
                } else {
                    1
                };
                *access(style) ^= true;
                skip
            };

        if text.starts_with('*') {
            skip = apply_basic_style(&mut text, &mut style, |style| &mut style.strong);
        } else if text.starts_with('/') {
            skip = apply_basic_style(&mut text, &mut style, |style| &mut style.italics);
        } else if text.starts_with('_') {
            skip = apply_basic_style(&mut text, &mut style, |style| &mut style.underline);
        } else if text.starts_with('$') {
            skip = apply_basic_style(&mut text, &mut style, |style| &mut style.small);
        } else if text.starts_with('~') {
            skip = apply_basic_style(&mut text, &mut style, |style| &mut style.strikethrough);
        } else if text.starts_with('^') {
            skip = apply_basic_style(&mut text, &mut style, |style| &mut style.raised);
        } else if text.starts_with('\\') && text.len() >= 2 {
            // Skip the backslash plus the whole escaped character. The escaped character
            // may be wider than one byte, so a fixed skip of 2 could land inside it and
            // make the slicing below panic.
            skip = 1 + text[1..].chars().next().map_or(0, char::len_utf8);
        } else if start_of_line && text.starts_with(' ') {
            // we don't preview indentation, because it is confusing
            skip = 1;
        } else if start_of_line && text.starts_with("###### ") {
            style.heading = true;
            skip = 7;
        } else if start_of_line && text.starts_with("##### ") {
            style.heading = true;
            skip = 6;
        } else if start_of_line && text.starts_with("#### ") {
            style.heading = true;
            skip = 5;
        } else if start_of_line && text.starts_with("### ") {
            style.heading = true;
            skip = 4;
        } else if start_of_line && text.starts_with("## ") {
            style.heading = true;
            skip = 3;
        } else if start_of_line && text.starts_with("# ") {
            style.heading = true;
            skip = 2;
        } else if start_of_line && text.starts_with("> ") {
            style.quoted = true;
            skip = 2;
            // we don't preview indentation, because it is confusing
        } else if start_of_line && text.trim_start().starts_with("- ") {
            // The marker may be preceded by whitespace (a tab, an empty line, ...), so it
            // is not necessarily at offset 0. Emit that whitespace unchanged, then replace
            // exactly the "- " marker with a bullet.
            let marker = text.len() - text.trim_start().len();
            let format = format_from_style(egui_style, &style);
            if marker > 0 {
                job.append(&text[..marker], 0.0, format.clone());
            }
            job.append("• ", 0.0, format);
            text = &text[marker + 2..];
            skip = 0;
            // we don't preview indentation, because it is confusing
        } else {
            skip = 0;
        }

        // Note: we don't preview underline, strikethrough and italics because it confuses things.

        // Swallow everything up to the next special character:
        let line_end = text[skip..]
            .find('\n')
            .map_or_else(|| text.len(), |i| (skip + i + 1));
        let end = text[skip..]
            .find(&['*', '`', '~', '_', '/', '$', '^', '\\', '<', '['][..])
            .map_or_else(|| text.len(), |i| (skip + i).max(1));

        if line_end <= end {
            // The line ends first: emit it including the newline and start afresh.
            job.append(
                &text[..line_end],
                0.0,
                format_from_style(egui_style, &style),
            );
            text = &text[line_end..];
            start_of_line = true;
            style = Default::default();
        } else {
            job.append(&text[..end], 0.0, format_from_style(egui_style, &style));
            text = &text[end..];
            start_of_line = false;
        }
    }

    job
 }
 /// Translate an easymark style into the egui text format used to render it.
 ///
 /// Colours and fonts are taken from `egui_style`; the flags in `emark_style` select
 /// which of them apply. Where several flags compete for one property, the order of the
 /// checks below decides (for example `code` wins the colour, `heading` wins the font).
 fn format_from_style(
    egui_style: &egui::Style,
    emark_style: &easy_mark_parser::Style,
 ) -> egui::text::TextFormat {
    use egui::{Align, Color32, Stroke, TextStyle};

    let visuals = &egui_style.visuals;

    let color = match emark_style {
        s if s.code => visuals.strong_text_color() * Color32::from_rgb(0x44, 0xff, 0x44),
        s if s.strong || s.heading => visuals.strong_text_color(),
        s if s.quoted => visuals.weak_text_color(),
        _ => visuals.text_color(),
    };

    let text_style = match emark_style {
        s if s.heading => TextStyle::Heading,
        s if s.code => TextStyle::Monospace,
        s if s.small || s.raised => TextStyle::Small,
        _ => TextStyle::Body,
    };

    // Underline and strikethrough share the same stroke; it is drawn in the text colour.
    let line_if = |enabled: bool| {
        if enabled {
            Stroke::new(1.0, color)
        } else {
            Stroke::NONE
        }
    };

    egui::text::TextFormat {
        font_id: text_style.resolve(egui_style),
        color,
        background: if emark_style.code {
            visuals.code_bg_color
        } else {
            Color32::TRANSPARENT
        },
        italics: emark_style.italics,
        underline: line_if(emark_style.underline),
        strikethrough: line_if(emark_style.strikethrough),
        valign: if emark_style.raised {
            Align::TOP
        } else {
            Align::BOTTOM
        },
        ..Default::default()
    }
 }
--- a/src/markdown/ast.rs
+++ b/src/markdown/ast.rs
@@ -40,7 +40,7 @@ pub struct Style {
    pub raised: bool,
 }
-pub enum MarkdownItem<'a> {
+pub enum Item<'a> {
    Text {
        span: Span<'a>,
        style: Style,
--- a/src/markdown/highlighter.rs
+++ b/src/markdown/highlighter.rs
@@ -1,9 +1,6 @@
 use egui::text::{CCursorRange, LayoutJob};
-use crate::markdown::{
+use super::{Item, Style, parse};
    span::Span,
    tokenizer::{Heading, Token, TokenKind, tokenize},
 };
 /// Highlight markdown, caching previous output to save CPU.
 #[derive(Default)]
@@ -13,36 +10,6 @@ pub struct MemoizedHighlighter {
    output: LayoutJob,
 }
 /// The set of text styles that is active for a piece of markdown text.
 ///
 /// Each field corresponds to one kind of markup; the per-field comments show the syntax
 /// that enables it.
 #[derive(Copy, Clone, Debug, Default, Eq, PartialEq)]
 pub struct Style {
    /// # heading (large text)
    pub heading: Option<Heading>,
    /// > quoted (slightly dimmer color or other font style)
    pub quoted: bool,
    /// `code` (monospace, some other color)
    pub code: bool,
    /// *strong* (emphasized, e.g. bold)
    pub strong: bool,
    /// _underline_
    pub underline: bool,
    /// ~strikethrough~
    pub strikethrough: bool,
    /// /italics/
    pub italics: bool,
    /// $small$
    pub small: bool,
    /// ^raised^
    pub raised: bool,
 }
 impl MemoizedHighlighter {
    pub fn highlight(
        &mut self,
@@ -67,189 +34,72 @@ pub fn highlight_markdown(
    _cursor: Option<CCursorRange>,
 ) -> LayoutJob {
    let mut job = LayoutJob::default();
    let mut style = Style::default();
-    let mut prev = TokenKind::Newline;
+    let code_style = Style {
        code: true,
        ..Default::default()
    };
-    let tokens: Vec<_> = tokenize(text).collect();
+    for item in parse(text) {
-    let mut tokens = &tokens[..];
+        match item {
-
+            Item::Text { span, style } => {
-    const CODE_INDENT: f32 = 10.0;
+                job.append(&span, 0.0, format_from_style(egui_style, &style));
    while !tokens.is_empty() {
        let token = tokens.first().unwrap();
        tokens = &tokens[1..];
        let start_of_line = prev == TokenKind::Newline;
        prev = token.kind;
        let mut basic_style: Option<fn(&mut Style) -> &mut bool> = None;
        match token.kind {
            TokenKind::CodeBlock if start_of_line => {
                let span = collect_until(
                    token,
                    &mut tokens,
                    series([TokenKind::Newline, TokenKind::CodeBlock]),
                );
                let code_style = format_from_style(
                    egui_style,
                    &Style {
                        code: true,
                        ..Default::default()
                    },
                );
                job.append(&*span, CODE_INDENT, code_style.clone());
                style = Default::default();
                continue;
            }
-
+            Item::CodeBlock {
-            TokenKind::Newline => style = Style::default(),
+                all,
-
+                language: _, // TODO
-            TokenKind::Strong => basic_style = Some(|s| &mut s.strong),
+                code: _,     // TODO
-            TokenKind::Italic => basic_style = Some(|s| &mut s.italics),
+            } => {
-            TokenKind::Strikethrough => basic_style = Some(|s| &mut s.strikethrough),
+                job.append(&all, 100.0, format_from_style(egui_style, &code_style));
            TokenKind::CodeBlock | TokenKind::Mono => {
                style.code = true;
                let span = collect_until(
                    token,
                    &mut tokens,
                    any_of([TokenKind::Mono, TokenKind::CodeBlock, TokenKind::Newline]),
                );
                job.append(&*span, 0.0, format_from_style(egui_style, &style));
                style.code = false;
                continue;
            }
            // TODO: different heading strengths
            TokenKind::Heading(h) if start_of_line => style.heading = Some(h),
            TokenKind::Quote if start_of_line => style.quoted = true,
            // TODO: indented list entries
            TokenKind::ListEntry if start_of_line => {
                job.append("• ", 0.0, format_from_style(egui_style, &style));
                continue;
            }
            TokenKind::Text
            // the following tokens are only richly rendered if encountered e.g. at start_of_line.
            | TokenKind::Indentation
            | TokenKind::ListEntry
            | TokenKind::Heading(..)
            | TokenKind::Quote => {}
        }
        // if we encountered a marker for Bold, Italic, or Strikethrough, toggle that style and
        // render the token with the style enabled.
        if let Some(basic_style) = basic_style {
            let mut tmp_style = style;
            *basic_style(&mut tmp_style) = true;
            *basic_style(&mut style) ^= true; // toggle
            job.append(&token.span, 0.0, format_from_style(egui_style, &tmp_style));
            continue;
        }
        job.append(&token.span, 0.0, format_from_style(egui_style, &style));
    }
    job
 }
-fn series<'a, const N: usize>(of: [TokenKind; N]) -> impl FnMut(&[Token<'a>; N]) -> bool {
+fn format_from_style(egui_style: &egui::Style, style: &Style) -> egui::text::TextFormat {
    move |token| {
        of.iter()
            .zip(token)
            .all(|(kind, token)| kind == &token.kind)
    }
 }
 fn any_of<'a, const N: usize>(these: [TokenKind; N]) -> impl FnMut(&[Token<'a>; 1]) -> bool {
    move |[token]| these.contains(&token.kind)
 }
 /// Collect all tokens up to and including `pattern`, and merge them into a single span.
 ///
 /// `N` determines how many specific and consecutive tokens we are looking for.
 /// i.e. if we were looking for a [TokenKind::Newline] followed by a [TokenKind::Quote], `N`
 /// would equal `2`.
 ///
 /// `pattern` is a function that accepts an array of `N` tokens and returns `true` if they match,
 /// i.e. if we should stop collecting. [any_of] and [series] can help to construct this function.
 ///
 /// The collected tokens will be split off the head of the slice referred to by `tokens`.
 ///
 /// # Panic
 /// Panics if `tokens` does not contain only consecutive adjacent spans.
 fn collect_until<'a, const N: usize>(
    token: &Token<'a>,
    tokens: &mut &[Token<'a>],
    pattern: impl FnMut(&[Token<'a>; N]) -> bool,
 ) -> Span<'a>
 where
    for<'b> &'b [Token<'a>; N]: TryFrom<&'b [Token<'a>]>,
 {
    let mut windows = tokens
        .windows(N)
        .map(|slice| <&[Token<'a>; N]>::try_from(slice).ok().unwrap());
    let split_at = match windows.position(pattern) {
        Some(i) => i + N,
        None => tokens.len(), // consume everything
    };
    let (consume, keep) = tokens.split_at(split_at);
    *tokens = keep;
    consume
        .iter()
        .fold(token.span.clone(), |span: Span<'_>, token| {
            span.try_merge(&token.span).unwrap()
        })
 }
 fn format_from_style(egui_style: &egui::Style, emark_style: &Style) -> egui::text::TextFormat {
    use egui::{Align, Color32, Stroke, TextStyle};
-    let color = if emark_style.strong || emark_style.heading.is_some() {
+    let color = if style.code {
        egui_style.visuals.strong_text_color() * Color32::GREEN
    } else if style.strong || style.heading.is_some() {
        egui_style.visuals.strong_text_color()
-    } else if emark_style.quoted {
+    } else if style.quoted {
        egui_style.visuals.weak_text_color()
    } else {
        egui_style.visuals.text_color()
    };
-    let text_style = if emark_style.heading.is_some() {
+    let text_style = if style.heading.is_some() {
        TextStyle::Heading
-    } else if emark_style.code {
+    } else if style.code {
        TextStyle::Monospace
-    } else if emark_style.small | emark_style.raised {
+    } else if style.small | style.raised {
        TextStyle::Small
    } else {
        TextStyle::Body
    };
-    let background = if emark_style.code {
+    let background = if style.code {
        egui_style.visuals.code_bg_color
    } else {
        Color32::TRANSPARENT
    };
-    let underline = if emark_style.underline {
+    let underline = if style.underline {
        Stroke::new(1.0, color)
    } else {
        Stroke::NONE
    };
-    let strikethrough = if emark_style.strikethrough {
+    let strikethrough = if style.strikethrough {
        Stroke::new(1.0, color)
    } else {
        Stroke::NONE
    };
-    let valign = if emark_style.raised {
+    let valign = if style.raised {
        Align::TOP
    } else {
        Align::BOTTOM
@@ -259,7 +109,7 @@ fn format_from_style(egui_style: &egui::Style, emark_style: &Style) -> egui::tex
        font_id: text_style.resolve(egui_style),
        color,
        background,
-        italics: emark_style.italics,
+        italics: style.italics,
        underline,
        strikethrough,
        valign,
--- a/src/markdown/mod.rs
+++ b/src/markdown/mod.rs
@@ -1,7 +1,11 @@
 mod ast;
 mod highlighter;
 mod parser;
 mod span;
 mod tokenizer;
 pub use ast::*;
 pub use highlighter::*;
 pub use parser::*;
 pub use span::*;
 pub use tokenizer::*;
--- a/src/markdown/parser.rs
+++ b/src/markdown/parser.rs
@@ -0,0 +1,172 @@
 use std::iter::{self, once};
 use crate::markdown::Style;
 use super::{Item, Span, Token, TokenKind, tokenize};
 /// Tokenize `text` and parse the resulting tokens into a list of [`Item`]s.
 ///
 /// The returned items borrow from `text`.
 pub fn parse(text: &str) -> Vec<Item<'_>> {
    parse_tokens(&tokenize(text).collect::<Vec<_>>())
 }
 pub fn parse_tokens<'a>(mut tokens: &[Token<'a>]) -> Vec<Item<'a>> {
    // pretend that the first token was preceeded by a newline.
    // means we don't have to handle the first token as a special case.
    let mut prev = TokenKind::Newline;
    let mut style = Style::default();
    let mono_style = Style {
        code: true,
        ..Default::default()
    };
    iter::from_fn(move || {
        if tokens.is_empty() {
            return None;
        }
        let token = tokens.first().unwrap();
        tokens = &tokens[1..];
        let start_of_line = prev == TokenKind::Newline;
        prev = token.kind;
        let mut basic_style: Option<fn(&mut Style) -> &mut bool> = None;
        match token.kind {
            TokenKind::CodeBlock if start_of_line => {
                let language = collect_until(
                    None,
                    &mut tokens,
                    any_of([TokenKind::Newline]),
                );
                let code = collect_until(
                    None,
                    &mut tokens,
                    series([TokenKind::Newline, TokenKind::CodeBlock]),
                );
                let all = [
                    &token.span,
                    &language,
                    &code,
                ].into_iter().fold(Span::empty(), |a, b| a.try_merge(b).unwrap());
                let language = language.trim_end_matches("\n");
                let code = code.trim_end_matches("\n```");
                return Some(Item::CodeBlock { all, language, code });
            }
            TokenKind::Newline => style = Style::default(),
            TokenKind::Strong => basic_style = Some(|s| &mut s.strong),
            TokenKind::Italic => basic_style = Some(|s| &mut s.italics),
            TokenKind::Strikethrough => basic_style = Some(|s| &mut s.strikethrough),
            TokenKind::CodeBlock | TokenKind::Mono => {
                let span = collect_until(
                    Some(token),
                    &mut tokens,
                    any_of([TokenKind::Mono, TokenKind::CodeBlock, TokenKind::Newline]),
                );
                return Some(Item::Text { span, style: mono_style});
            }
            // TODO: different heading strengths
            TokenKind::Heading(h) if start_of_line => style.heading = Some(h),
            TokenKind::Quote if start_of_line => style.quoted = true,
            // TODO: replace dashes with dots
            //// TODO: indented list entries
            //TokenKind::ListEntry if start_of_line => {
            //    job.append("• ", 0.0, format_from_style(egui_style, &style));
            //    continue;
            //}
            TokenKind::Text
            // the following tokens are only richly rendered if encountered e.g. at start_of_line.
            | TokenKind::Indentation
            | TokenKind::ListEntry
            | TokenKind::Heading(..)
            | TokenKind::Quote => {}
        }
        // if we encountered a marker for Bold, Italic, or Strikethrough, toggle that style and
        // render the token with the style enabled.
        if let Some(basic_style) = basic_style {
            let mut tmp_style = style;
            *basic_style(&mut tmp_style) = true;
            *basic_style(&mut style) ^= true; // toggle
            return Some(Item::Text {
                span: token.span.clone(),
                style: tmp_style,
            });
        }
        Some(Item::Text {
            span: token.span.clone(),
            style,
        })
    })
    .collect()
 }
 /// Build a matcher for [`collect_until`] that accepts a window of `N` tokens whose kinds
 /// equal the kinds in `of`, position by position.
 fn series<'a, const N: usize>(of: [TokenKind; N]) -> impl FnMut(&[Token<'a>; N]) -> bool {
    move |window| {
        window
            .iter()
            .zip(&of)
            .all(|(token, expected)| token.kind == *expected)
    }
 }
 /// Build a matcher for [`collect_until`] that accepts a single token whose kind is one
 /// of `these`.
 fn any_of<'a, const N: usize>(these: [TokenKind; N]) -> impl FnMut(&[Token<'a>; 1]) -> bool {
    move |window| {
        let [token] = window;
        these.iter().any(|kind| *kind == token.kind)
    }
 }
 /// Consume tokens up to and including the first match of `pattern`, and merge their
 /// spans into a single span.
 ///
 /// `N` is the number of consecutive tokens that `pattern` inspects at once. For example,
 /// to stop at a [TokenKind::Newline] followed by a [TokenKind::Quote], `N` would be `2`.
 ///
 /// `pattern` receives every window of `N` consecutive tokens in order and returns `true`
 /// for the window at which collecting should stop. [any_of] and [series] build such
 /// functions. If no window matches, all of `tokens` is consumed.
 ///
 /// `first_token`, when given, contributes its span in front of the consumed tokens; it
 /// is not tested against `pattern`.
 ///
 /// The consumed tokens are split off the head of the slice referred to by `tokens`.
 ///
 /// # Panic
 /// Panics if the collected spans cannot be merged, i.e. if they are not consecutive,
 /// adjacent spans. `slice::windows` additionally panics when `N` is `0`.
 fn collect_until<'a, const N: usize>(
    first_token: Option<&Token<'a>>,
    tokens: &mut &[Token<'a>],
    mut pattern: impl FnMut(&[Token<'a>; N]) -> bool,
 ) -> Span<'a>
 where
    // &[T; N]: TryFrom<&[T]>
    for<'b> &'b [Token<'a>; N]: TryFrom<&'b [Token<'a>]>,
 {
    // Index of the first window accepted by `pattern`, if any.
    let match_start = tokens.windows(N).position(|window| {
        let window = <&[Token<'a>; N]>::try_from(window)
            .ok()
            .expect("`windows` promises to return slices of length N");
        pattern(window)
    });

    // The matching window itself is part of what gets consumed.
    let split_at = match_start.map_or(tokens.len(), |i| i + N);
    let (consumed, rest) = tokens.split_at(split_at);
    *tokens = rest;

    let mut merged = Span::empty();
    for token in once(first_token).flatten().chain(consumed) {
        merged = merged.try_merge(&token.span).unwrap();
    }
    merged
 }
--- a/src/markdown/span.rs
+++ b/src/markdown/span.rs
@@ -3,6 +3,8 @@ use std::{
    ops::{Deref, Range},
 };
 use eyre::{bail, eyre};
 #[derive(Clone, Eq, PartialEq)]
 pub struct Span<'a> {
    complete_str: &'a str,
@@ -17,6 +19,13 @@ impl<'a> Span<'a> {
        }
    }
    pub const fn empty() -> Self {
        Span {
            complete_str: "",
            range: 0..0,
        }
    }
    pub fn get(&self, slice: Range<usize>) -> Option<Self> {
        let start = self.range.start.checked_add(slice.start)?;
        let end = self.range.start.checked_add(slice.end)?;
@@ -41,26 +50,49 @@ impl<'a> Span<'a> {
        Some((head, tail))
    }
    /// Return a copy of this span with one trailing occurrence of `p` removed.
    ///
    /// If the span does not end with `p`, the copy covers the same range as `self`.
    /// Unlike `str::trim_end_matches`, a repeated suffix is only removed once.
    pub fn trim_end_matches(&self, p: &str) -> Self {
        let end = if self.ends_with(p) {
            self.range.end - p.len()
        } else {
            self.range.end
        };
        Self {
            complete_str: self.complete_str,
            range: self.range.start..end,
        }
    }
    /// Try to merge the spans.
    ///
    /// If either span is empty, this just returns the other one.
    /// This only works if spans are pointing into the same backing buffer, and are adjacent.
-    pub fn try_merge(&self, other: &Self) -> Option<Self> {
+    pub fn try_merge(&self, other: &Self) -> eyre::Result<Self> {
        if self.is_empty() {
            return Ok(other.clone());
        }
        if other.is_empty() {
            return Ok(self.clone());
        }
        if self.complete_str.as_ptr() != other.complete_str.as_ptr() {
-            return None;
+            bail!("Can't merge different strings");
        }
        if self.range.end == other.range.start {
-            Some(Self {
+            Ok(Self {
                range: self.range.start..other.range.end,
                ..*self
            })
        } else if self.range.start == other.range.end {
-            Some(Self {
+            Ok(Self {
                range: other.range.start..self.range.end,
                ..*self
            })
        } else {
-            None
+            Err(eyre!("String: {:?}", self.complete_str)
                .wrap_err(eyre!("Span 2: {:?}", other.deref()))
                .wrap_err(eyre!("Span 1: {:?}", self.deref()))
                .wrap_err("Can't merge disjoint string spans"))
        }
    }
 }
--- a/src/markdown/tokenizer.rs
+++ b/src/markdown/tokenizer.rs
@@ -1,16 +1,6 @@
 use std::iter;
-use super::span::Span;
+use super::{Heading, span::Span};
 /// The level of a heading.
 ///
 /// Carried by `TokenKind::Heading` and stored in `Style::heading` for text that is part
 /// of a heading.
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub enum Heading {
    H6,
    H5,
    H4,
    H3,
    H2,
    H1,
 }
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub enum TokenKind {