Split markdown parsing and highlighting

This commit is contained in:
2025-07-07 13:10:45 +02:00
parent e0fd726f02
commit 462c27e111
7 changed files with 243 additions and 440 deletions

View File

@ -40,7 +40,7 @@ pub struct Style {
pub raised: bool,
}
pub enum MarkdownItem<'a> {
pub enum Item<'a> {
Text {
span: Span<'a>,
style: Style,

View File

@ -1,9 +1,6 @@
use egui::text::{CCursorRange, LayoutJob};
use crate::markdown::{
span::Span,
tokenizer::{Heading, Token, TokenKind, tokenize},
};
use super::{Item, Style, parse};
/// Highlight markdown, caching previous output to save CPU.
#[derive(Default)]
@ -13,36 +10,6 @@ pub struct MemoizedHighlighter {
output: LayoutJob,
}
#[derive(Copy, Clone, Debug, Default, Eq, PartialEq)]
pub struct Style {
/// # heading (large text)
pub heading: Option<Heading>,
/// > quoted (slightly dimmer color or other font style)
pub quoted: bool,
/// `code` (monospace, some other color)
pub code: bool,
/// *strong* (emphasized, e.g. bold)
pub strong: bool,
/// _underline_
pub underline: bool,
/// ~strikethrough~
pub strikethrough: bool,
/// /italics/
pub italics: bool,
/// $small$
pub small: bool,
/// ^raised^
pub raised: bool,
}
impl MemoizedHighlighter {
pub fn highlight(
&mut self,
@ -67,189 +34,72 @@ pub fn highlight_markdown(
_cursor: Option<CCursorRange>,
) -> LayoutJob {
let mut job = LayoutJob::default();
let mut style = Style::default();
let mut prev = TokenKind::Newline;
let code_style = Style {
code: true,
..Default::default()
};
let tokens: Vec<_> = tokenize(text).collect();
let mut tokens = &tokens[..];
const CODE_INDENT: f32 = 10.0;
while !tokens.is_empty() {
let token = tokens.first().unwrap();
tokens = &tokens[1..];
let start_of_line = prev == TokenKind::Newline;
prev = token.kind;
let mut basic_style: Option<fn(&mut Style) -> &mut bool> = None;
match token.kind {
TokenKind::CodeBlock if start_of_line => {
let span = collect_until(
token,
&mut tokens,
series([TokenKind::Newline, TokenKind::CodeBlock]),
);
let code_style = format_from_style(
egui_style,
&Style {
code: true,
..Default::default()
},
);
job.append(&*span, CODE_INDENT, code_style.clone());
style = Default::default();
continue;
for item in parse(text) {
match item {
Item::Text { span, style } => {
job.append(&span, 0.0, format_from_style(egui_style, &style));
}
TokenKind::Newline => style = Style::default(),
TokenKind::Strong => basic_style = Some(|s| &mut s.strong),
TokenKind::Italic => basic_style = Some(|s| &mut s.italics),
TokenKind::Strikethrough => basic_style = Some(|s| &mut s.strikethrough),
TokenKind::CodeBlock | TokenKind::Mono => {
style.code = true;
let span = collect_until(
token,
&mut tokens,
any_of([TokenKind::Mono, TokenKind::CodeBlock, TokenKind::Newline]),
);
job.append(&*span, 0.0, format_from_style(egui_style, &style));
style.code = false;
continue;
Item::CodeBlock {
all,
language: _, // TODO
code: _, // TODO
} => {
job.append(&all, 100.0, format_from_style(egui_style, &code_style));
}
// TODO: different heading strengths
TokenKind::Heading(h) if start_of_line => style.heading = Some(h),
TokenKind::Quote if start_of_line => style.quoted = true,
// TODO: indented list entries
TokenKind::ListEntry if start_of_line => {
job.append("", 0.0, format_from_style(egui_style, &style));
continue;
}
TokenKind::Text
// the following tokens are only richly rendered if encountered e.g. at start_of_line.
| TokenKind::Indentation
| TokenKind::ListEntry
| TokenKind::Heading(..)
| TokenKind::Quote => {}
}
// if we encountered a marker for Bold, Italic, or Strikethrough, toggle that style and
// render the token with the style enabled.
if let Some(basic_style) = basic_style {
let mut tmp_style = style;
*basic_style(&mut tmp_style) = true;
*basic_style(&mut style) ^= true; // toggle
job.append(&token.span, 0.0, format_from_style(egui_style, &tmp_style));
continue;
}
job.append(&token.span, 0.0, format_from_style(egui_style, &style));
}
job
}
fn series<'a, const N: usize>(of: [TokenKind; N]) -> impl FnMut(&[Token<'a>; N]) -> bool {
move |token| {
of.iter()
.zip(token)
.all(|(kind, token)| kind == &token.kind)
}
}
fn any_of<'a, const N: usize>(these: [TokenKind; N]) -> impl FnMut(&[Token<'a>; 1]) -> bool {
move |[token]| these.contains(&token.kind)
}
/// Collect all tokens up to and including `pattern`, and merge them into a single span.
///
/// `N` determines how many specific and consecutive tokens we are looking for.
/// i.e. if we were looking for a [TokenKind::Newline] followed by a [TokenKind::Quote], `N`
/// would equal `2`.
///
/// `pattern` is a function that accepts an array of `N` tokens and returns `true` if they match,
/// i.e. if we should stop collecting. [any_of] and [series] can help to construct this function.
///
/// The collected tokens will be split off the head of the slice referred to by `tokens`.
///
/// # Panic
/// Panics if `tokens` does not contain only consecutive adjacent spans.
fn collect_until<'a, const N: usize>(
token: &Token<'a>,
tokens: &mut &[Token<'a>],
pattern: impl FnMut(&[Token<'a>; N]) -> bool,
) -> Span<'a>
where
for<'b> &'b [Token<'a>; N]: TryFrom<&'b [Token<'a>]>,
{
let mut windows = tokens
.windows(N)
.map(|slice| <&[Token<'a>; N]>::try_from(slice).ok().unwrap());
let split_at = match windows.position(pattern) {
Some(i) => i + N,
None => tokens.len(), // consume everything
};
let (consume, keep) = tokens.split_at(split_at);
*tokens = keep;
consume
.iter()
.fold(token.span.clone(), |span: Span<'_>, token| {
span.try_merge(&token.span).unwrap()
})
}
fn format_from_style(egui_style: &egui::Style, emark_style: &Style) -> egui::text::TextFormat {
fn format_from_style(egui_style: &egui::Style, style: &Style) -> egui::text::TextFormat {
use egui::{Align, Color32, Stroke, TextStyle};
let color = if emark_style.strong || emark_style.heading.is_some() {
let color = if style.code {
egui_style.visuals.strong_text_color() * Color32::GREEN
} else if style.strong || style.heading.is_some() {
egui_style.visuals.strong_text_color()
} else if emark_style.quoted {
} else if style.quoted {
egui_style.visuals.weak_text_color()
} else {
egui_style.visuals.text_color()
};
let text_style = if emark_style.heading.is_some() {
let text_style = if style.heading.is_some() {
TextStyle::Heading
} else if emark_style.code {
} else if style.code {
TextStyle::Monospace
} else if emark_style.small | emark_style.raised {
} else if style.small | style.raised {
TextStyle::Small
} else {
TextStyle::Body
};
let background = if emark_style.code {
let background = if style.code {
egui_style.visuals.code_bg_color
} else {
Color32::TRANSPARENT
};
let underline = if emark_style.underline {
let underline = if style.underline {
Stroke::new(1.0, color)
} else {
Stroke::NONE
};
let strikethrough = if emark_style.strikethrough {
let strikethrough = if style.strikethrough {
Stroke::new(1.0, color)
} else {
Stroke::NONE
};
let valign = if emark_style.raised {
let valign = if style.raised {
Align::TOP
} else {
Align::BOTTOM
@ -259,7 +109,7 @@ fn format_from_style(egui_style: &egui::Style, emark_style: &Style) -> egui::tex
font_id: text_style.resolve(egui_style),
color,
background,
italics: emark_style.italics,
italics: style.italics,
underline,
strikethrough,
valign,

View File

@ -1,7 +1,11 @@
mod ast;
mod highlighter;
mod parser;
mod span;
mod tokenizer;
pub use ast::*;
pub use highlighter::*;
pub use parser::*;
pub use span::*;
pub use tokenizer::*;

172
src/markdown/parser.rs Normal file
View File

@ -0,0 +1,172 @@
use std::iter::{self, once};
use crate::markdown::Style;
use super::{Item, Span, Token, TokenKind, tokenize};
/// Parse markdown `text` into a flat sequence of renderable [Item]s.
///
/// Convenience wrapper: tokenizes the whole input first, then hands the
/// token slice to [parse_tokens].
pub fn parse(text: &str) -> Vec<Item<'_>> {
    let tokens = tokenize(text).collect::<Vec<_>>();
    parse_tokens(&tokens)
}
/// Turn a flat token slice into a sequence of [Item]s, resolving inline
/// style markers (strong/italic/strikethrough), headings, quotes, inline
/// code, and fenced code blocks.
///
/// Styles accumulate per line and are reset at every [TokenKind::Newline].
pub fn parse_tokens<'a>(mut tokens: &[Token<'a>]) -> Vec<Item<'a>> {
    // pretend that the first token was preceded by a newline.
    // means we don't have to handle the first token as a special case.
    let mut prev = TokenKind::Newline;
    // style currently in effect; mutated as marker tokens are encountered.
    let mut style = Style::default();
    // fixed style used for inline `code` spans.
    // NOTE(review): unlike the pre-refactor highlighter (which only toggled
    // `style.code` temporarily), this discards any other active styles
    // (strong/quote/heading) inside inline code — confirm that is intended.
    let mono_style = Style {
        code: true,
        ..Default::default()
    };
    // Lazily produce one Item per call; `move` captures `tokens`, `prev`,
    // and `style` so state persists across calls.
    iter::from_fn(move || {
        if tokens.is_empty() {
            return None;
        }
        // pop the next token off the head of the slice.
        let token = tokens.first().unwrap();
        tokens = &tokens[1..];
        let start_of_line = prev == TokenKind::Newline;
        prev = token.kind;
        // set when the token is a toggleable inline marker; the fn pointer
        // selects which bool field of `Style` it controls.
        let mut basic_style: Option<fn(&mut Style) -> &mut bool> = None;
        match token.kind {
            // fenced code block: ``` opener at start of line.
            TokenKind::CodeBlock if start_of_line => {
                // everything up to the first newline is the language tag.
                let language = collect_until(
                    None,
                    &mut tokens,
                    any_of([TokenKind::Newline]),
                );
                // everything up to a newline immediately followed by ``` is
                // the code body (the closing fence is included, trimmed below).
                let code = collect_until(
                    None,
                    &mut tokens,
                    series([TokenKind::Newline, TokenKind::CodeBlock]),
                );
                // `all` covers the entire block (opening fence + language +
                // body + closing fence) for callers that render it verbatim.
                let all = [
                    &token.span,
                    &language,
                    &code,
                ].into_iter().fold(Span::empty(), |a, b| a.try_merge(b).unwrap());
                let language = language.trim_end_matches("\n");
                let code = code.trim_end_matches("\n```");
                return Some(Item::CodeBlock { all, language, code });
            }
            // newline resets all per-line styling.
            TokenKind::Newline => style = Style::default(),
            TokenKind::Strong => basic_style = Some(|s| &mut s.strong),
            TokenKind::Italic => basic_style = Some(|s| &mut s.italics),
            TokenKind::Strikethrough => basic_style = Some(|s| &mut s.strikethrough),
            // inline code: consume until the closing marker or end of line.
            TokenKind::CodeBlock | TokenKind::Mono => {
                let span = collect_until(
                    Some(token),
                    &mut tokens,
                    any_of([TokenKind::Mono, TokenKind::CodeBlock, TokenKind::Newline]),
                );
                return Some(Item::Text { span, style: mono_style });
            }
            // TODO: different heading strengths
            TokenKind::Heading(h) if start_of_line => style.heading = Some(h),
            TokenKind::Quote if start_of_line => style.quoted = true,
            // TODO: replace dashes with dots
            //// TODO: indented list entries
            //TokenKind::ListEntry if start_of_line => {
            //    job.append("• ", 0.0, format_from_style(egui_style, &style));
            //    continue;
            //}
            TokenKind::Text
            // the following tokens are only richly rendered if encountered e.g. at start_of_line.
            | TokenKind::Indentation
            | TokenKind::ListEntry
            | TokenKind::Heading(..)
            | TokenKind::Quote => {}
        }
        // if we encountered a marker for Bold, Italic, or Strikethrough, toggle that style and
        // render the token with the style enabled.
        if let Some(basic_style) = basic_style {
            // the marker token itself is emitted WITH the style applied
            // (tmp_style), regardless of whether it opens or closes the run.
            let mut tmp_style = style;
            *basic_style(&mut tmp_style) = true;
            *basic_style(&mut style) ^= true; // toggle
            return Some(Item::Text {
                span: token.span.clone(),
                style: tmp_style,
            });
        }
        // plain token: emit with whatever style is currently in effect.
        Some(Item::Text {
            span: token.span.clone(),
            style,
        })
    })
    .collect()
}
/// Build a predicate matching a window of `N` tokens whose kinds are exactly
/// `of`, in order. Intended as the `pattern` argument of [collect_until].
fn series<'a, const N: usize>(of: [TokenKind; N]) -> impl FnMut(&[Token<'a>; N]) -> bool {
    move |window| {
        window
            .iter()
            .zip(of.iter())
            .all(|(token, kind)| token.kind == *kind)
    }
}
/// Build a predicate matching a single token whose kind is any one of
/// `these`. Intended as the `pattern` argument of [collect_until].
fn any_of<'a, const N: usize>(these: [TokenKind; N]) -> impl FnMut(&[Token<'a>; 1]) -> bool {
    move |[token]| these.iter().any(|kind| *kind == token.kind)
}
/// Collect all tokens up to and including `pattern`, and merge them into a single span.
///
/// `N` determines how many specific and consecutive tokens we are looking for.
/// i.e. if we were looking for a [TokenKind::Newline] followed by a [TokenKind::Quote], `N`
/// would equal `2`.
///
/// `pattern` is a function that accepts an array of `N` tokens and returns `true` if they match,
/// i.e. if we should stop collecting. [any_of] and [series] can help to construct this function.
///
/// The collected tokens will be split off the head of the slice referred to by `tokens`.
/// If `pattern` never matches, everything remaining in `tokens` is consumed.
///
/// If given, `first_token`'s span is prepended to the merged result (typically the token
/// that introduced the construct being collected).
///
/// # Panics
/// Panics if `tokens` does not contain only consecutive adjacent spans.
fn collect_until<'a, const N: usize>(
    first_token: Option<&Token<'a>>,
    tokens: &mut &[Token<'a>],
    pattern: impl FnMut(&[Token<'a>; N]) -> bool,
) -> Span<'a>
where
    // &[T; N]: TryFrom<&[T]> — lets each length-N window be viewed as a fixed-size array.
    for<'b> &'b [Token<'a>; N]: TryFrom<&'b [Token<'a>]>,
{
    // `.ok().expect(..)` rather than `.expect(..)` directly: the TryFrom
    // error type carries no Debug bound under this where-clause, and
    // `Result::expect` requires one.
    let mut windows = tokens.windows(N).map(|slice| {
        <&[Token<'a>; N]>::try_from(slice)
            .ok()
            .expect("`windows` promises to return slices of length N")
    });
    let split_at = match windows.position(pattern) {
        Some(i) => i + N, // include the matched pattern itself
        None => tokens.len(), // consume everything
    };
    let (consume, keep) = tokens.split_at(split_at);
    *tokens = keep;
    // Merge the optional leading token plus all consumed tokens into one span.
    // `try_merge(..).unwrap()` is the panic documented under # Panics.
    once(first_token)
        .flatten()
        .chain(consume)
        .fold(Span::empty(), |span: Span<'_>, token| {
            span.try_merge(&token.span).unwrap()
        })
}

View File

@ -3,6 +3,8 @@ use std::{
ops::{Deref, Range},
};
use eyre::{bail, eyre};
#[derive(Clone, Eq, PartialEq)]
pub struct Span<'a> {
complete_str: &'a str,
@ -17,6 +19,13 @@ impl<'a> Span<'a> {
}
}
pub const fn empty() -> Self {
Span {
complete_str: "",
range: 0..0,
}
}
pub fn get(&self, slice: Range<usize>) -> Option<Self> {
let start = self.range.start.checked_add(slice.start)?;
let end = self.range.start.checked_add(slice.end)?;
@ -41,26 +50,49 @@ impl<'a> Span<'a> {
Some((head, tail))
}
/// Return this span with one trailing occurrence of `p` removed, or an
/// unchanged clone if the span does not end with `p`.
///
/// NOTE(review): despite the name, this removes at most ONE occurrence,
/// unlike `str::trim_end_matches`, which removes repeated occurrences —
/// confirm callers (e.g. fenced-code trimming) don't expect repetition.
pub fn trim_end_matches(&self, p: &str) -> Self {
    if !self.ends_with(p) {
        return self.clone();
    }
    Self {
        // `ends_with` guarantees the last `p.len()` bytes equal `p`, so
        // shrinking the range by that much is in-bounds.
        range: self.range.start..self.range.end - p.len(),
        complete_str: self.complete_str,
    }
}
/// Try to merge the spans.
///
/// If either span is empty, this just returns the other one.
/// This only works if spans are pointing into the same backing buffer, and are adjacent.
pub fn try_merge(&self, other: &Self) -> Option<Self> {
pub fn try_merge(&self, other: &Self) -> eyre::Result<Self> {
if self.is_empty() {
return Ok(other.clone());
}
if other.is_empty() {
return Ok(self.clone());
}
if self.complete_str.as_ptr() != other.complete_str.as_ptr() {
return None;
bail!("Can't merge different strings");
}
if self.range.end == other.range.start {
Some(Self {
Ok(Self {
range: self.range.start..other.range.end,
..*self
})
} else if self.range.start == other.range.end {
Some(Self {
Ok(Self {
range: other.range.start..self.range.end,
..*self
})
} else {
None
Err(eyre!("String: {:?}", self.complete_str)
.wrap_err(eyre!("Span 2: {:?}", other.deref()))
.wrap_err(eyre!("Span 1: {:?}", self.deref()))
.wrap_err("Can't merge disjoint string spans"))
}
}
}

View File

@ -1,16 +1,6 @@
use std::iter;
use super::span::Span;
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Heading {
H6,
H5,
H4,
H3,
H2,
H1,
}
use super::{Heading, span::Span};
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TokenKind {