Split markdown parsing and highlighting

2025-07-07 13:10:45 +02:00
parent e0fd726f02
commit 462c27e111
7 changed files with 243 additions and 440 deletions
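In short: parsing markdown into style-annotated items is pulled out of the egui highlighter, so the highlighter only maps items onto a LayoutJob. A minimal sketch of the intended call flow (not part of the commit; the wrapper function and its arguments are illustrative, the `crate::markdown` items are the ones introduced below):

// Sketch only: `parse`, `Item`, and `highlight_markdown` come from this commit;
// the `preview` wrapper and its parameters are assumed for illustration.
fn preview(egui_style: &egui::Style, text: &str) -> egui::text::LayoutJob {
    // Step 1: pure parsing, no egui types involved.
    for item in crate::markdown::parse(text) {
        match item {
            crate::markdown::Item::Text { span, style } => {
                // one run of styled text (heading, quote, bold, inline code, ...)
                let _ = (span, style);
            }
            crate::markdown::Item::CodeBlock { all, language, code } => {
                // one fenced ``` block, with the language tag split out
                let _ = (all, language, code);
            }
        }
    }
    // Step 2: highlighting maps the same items onto a LayoutJob for rendering.
    crate::markdown::highlight_markdown(egui_style, text, None)
}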

View File

@@ -1,245 +0,0 @@
use egui::text::{CCursorRange, LayoutJob};

use crate::easy_mark::easy_mark_parser;

/// Highlight easymark, memoizing previous output to save CPU.
///
/// In practice, the highlighter is fast enough not to need any caching.
#[derive(Default)]
pub struct MemoizedHighlighter {
    style: egui::Style,
    code: String,
    output: LayoutJob,
}

impl MemoizedHighlighter {
    pub fn highlight(
        &mut self,
        egui_style: &egui::Style,
        code: &str,
        cursor: Option<CCursorRange>,
    ) -> LayoutJob {
        if (&self.style, self.code.as_str()) != (egui_style, code) {
            self.style = egui_style.clone();
            code.clone_into(&mut self.code);
            self.output = highlight_easymark(egui_style, code, cursor);
        }
        self.output.clone()
    }
}

pub fn highlight_easymark(
    egui_style: &egui::Style,
    mut text: &str,
    // TODO: hide special characters where cursor isn't
    _cursor: Option<CCursorRange>,
) -> LayoutJob {
    let mut job = LayoutJob::default();
    let mut style = easy_mark_parser::Style::default();
    let mut start_of_line = true;

    const CODE_INDENT: f32 = 10.0;

    while !text.is_empty() {
        if start_of_line && text.starts_with("```") {
            let astyle = format_from_style(
                egui_style,
                &easy_mark_parser::Style {
                    code: true,
                    ..Default::default()
                },
            );

            // Render the initial backticks as spaces
            text = &text[3..];
            job.append(" ", CODE_INDENT, astyle.clone());

            match text.find("\n```") {
                Some(n) => {
                    for line in text[..n + 1].lines() {
                        job.append(line, CODE_INDENT, astyle.clone());
                        job.append("\n", 0.0, astyle.clone());
                    }
                    // Render the final backticks as spaces
                    job.append(" ", CODE_INDENT, astyle);
                    text = &text[n + 4..];
                }
                None => {
                    job.append(text, 0.0, astyle.clone());
                    text = "";
                }
            };
            style = Default::default();
            continue;
        }

        if text.starts_with('`') {
            style.code = true;
            let end = text[1..]
                .find(&['`', '\n'][..])
                .map_or_else(|| text.len(), |i| i + 2);
            job.append(&text[..end], 0.0, format_from_style(egui_style, &style));
            text = &text[end..];
            style.code = false;
            continue;
        }

        let skip;

        // zero-width space
        let _zws = "\u{200b}";

        let mut apply_basic_style =
            |text: &mut &str,
             style: &mut easy_mark_parser::Style,
             access: fn(&mut easy_mark_parser::Style) -> &mut bool| {
                let skip = if *access(style) {
                    // Include the character that is ending this style:
                    job.append(&text[..1], 0.0, format_from_style(egui_style, style));
                    *text = &text[1..];
                    0
                } else {
                    1
                };
                *access(style) ^= true;
                skip
            };

        if text.starts_with('*') {
            skip = apply_basic_style(&mut text, &mut style, |style| &mut style.strong);
        } else if text.starts_with('/') {
            skip = apply_basic_style(&mut text, &mut style, |style| &mut style.italics);
        } else if text.starts_with('_') {
            skip = apply_basic_style(&mut text, &mut style, |style| &mut style.underline);
        } else if text.starts_with('$') {
            skip = apply_basic_style(&mut text, &mut style, |style| &mut style.small);
        } else if text.starts_with('~') {
            skip = apply_basic_style(&mut text, &mut style, |style| &mut style.strikethrough);
        } else if text.starts_with('^') {
            skip = apply_basic_style(&mut text, &mut style, |style| &mut style.raised);
        } else if text.starts_with('\\') && text.len() >= 2 {
            skip = 2;
        } else if start_of_line && text.starts_with(' ') {
            // we don't preview indentation, because it is confusing
            skip = 1;
        } else if start_of_line && text.starts_with("###### ") {
            style.heading = true;
            skip = 7;
        } else if start_of_line && text.starts_with("##### ") {
            style.heading = true;
            skip = 6;
        } else if start_of_line && text.starts_with("#### ") {
            style.heading = true;
            skip = 5;
        } else if start_of_line && text.starts_with("### ") {
            style.heading = true;
            skip = 4;
        } else if start_of_line && text.starts_with("## ") {
            style.heading = true;
            skip = 3;
        } else if start_of_line && text.starts_with("# ") {
            style.heading = true;
            skip = 2;
        } else if start_of_line && text.starts_with("> ") {
            style.quoted = true;
            skip = 2;
            // we don't preview indentation, because it is confusing
        } else if start_of_line && text.trim_start().starts_with("- ") {
job.append("", 0.0, format_from_style(egui_style, &style));
            text = &text[2..];
            skip = 0;
            // we don't preview indentation, because it is confusing
        } else {
            skip = 0;
        }

        // Note: we don't preview underline, strikethrough and italics because it confuses things.

        // Swallow everything up to the next special character:
        let line_end = text[skip..]
            .find('\n')
            .map_or_else(|| text.len(), |i| (skip + i + 1));
        let end = text[skip..]
            .find(&['*', '`', '~', '_', '/', '$', '^', '\\', '<', '['][..])
            .map_or_else(|| text.len(), |i| (skip + i).max(1));

        if line_end <= end {
            job.append(
                &text[..line_end],
                0.0,
                format_from_style(egui_style, &style),
            );
            text = &text[line_end..];
            start_of_line = true;
            style = Default::default();
        } else {
            job.append(&text[..end], 0.0, format_from_style(egui_style, &style));
            text = &text[end..];
            start_of_line = false;
        }
    }

    job
}

fn format_from_style(
    egui_style: &egui::Style,
    emark_style: &easy_mark_parser::Style,
) -> egui::text::TextFormat {
    use egui::{Align, Color32, Stroke, TextStyle};

    let color = if emark_style.code {
        egui_style.visuals.strong_text_color() * Color32::from_rgb(0x44, 0xff, 0x44)
    } else if emark_style.strong || emark_style.heading {
        egui_style.visuals.strong_text_color()
    } else if emark_style.quoted {
        egui_style.visuals.weak_text_color()
    } else {
        egui_style.visuals.text_color()
    };

    let text_style = if emark_style.heading {
        TextStyle::Heading
    } else if emark_style.code {
        TextStyle::Monospace
    } else if emark_style.small | emark_style.raised {
        TextStyle::Small
    } else {
        TextStyle::Body
    };

    let background = if emark_style.code {
        egui_style.visuals.code_bg_color
    } else {
        Color32::TRANSPARENT
    };

    let underline = if emark_style.underline {
        Stroke::new(1.0, color)
    } else {
        Stroke::NONE
    };

    let strikethrough = if emark_style.strikethrough {
        Stroke::new(1.0, color)
    } else {
        Stroke::NONE
    };

    let valign = if emark_style.raised {
        Align::TOP
    } else {
        Align::BOTTOM
    };

    egui::text::TextFormat {
        font_id: text_style.resolve(egui_style),
        color,
        background,
        italics: emark_style.italics,
        underline,
        strikethrough,
        valign,
        ..Default::default()
    }
}

View File

@@ -40,7 +40,7 @@ pub struct Style {
     pub raised: bool,
 }
 
-pub enum MarkdownItem<'a> {
+pub enum Item<'a> {
     Text {
         span: Span<'a>,
         style: Style,

View File

@@ -1,9 +1,6 @@
 use egui::text::{CCursorRange, LayoutJob};
 
-use crate::markdown::{
-    span::Span,
-    tokenizer::{Heading, Token, TokenKind, tokenize},
-};
+use super::{Item, Style, parse};
 
 /// Highlight markdown, caching previous output to save CPU.
 #[derive(Default)]
@@ -13,36 +10,6 @@ pub struct MemoizedHighlighter {
     output: LayoutJob,
 }
 
-#[derive(Copy, Clone, Debug, Default, Eq, PartialEq)]
-pub struct Style {
-    /// # heading (large text)
-    pub heading: Option<Heading>,
-
-    /// > quoted (slightly dimmer color or other font style)
-    pub quoted: bool,
-
-    /// `code` (monospace, some other color)
-    pub code: bool,
-
-    /// self.strong* (emphasized, e.g. bold)
-    pub strong: bool,
-
-    /// _underline_
-    pub underline: bool,
-
-    /// ~strikethrough~
-    pub strikethrough: bool,
-
-    /// /italics/
-    pub italics: bool,
-
-    /// $small$
-    pub small: bool,
-
-    /// ^raised^
-    pub raised: bool,
-}
-
 impl MemoizedHighlighter {
     pub fn highlight(
         &mut self,
@@ -67,189 +34,72 @@ pub fn highlight_markdown(
     _cursor: Option<CCursorRange>,
 ) -> LayoutJob {
     let mut job = LayoutJob::default();
-    let mut style = Style::default();
-    let mut prev = TokenKind::Newline;
-
-    let tokens: Vec<_> = tokenize(text).collect();
-    let mut tokens = &tokens[..];
-
-    const CODE_INDENT: f32 = 10.0;
-
-    while !tokens.is_empty() {
-        let token = tokens.first().unwrap();
-        tokens = &tokens[1..];
-
-        let start_of_line = prev == TokenKind::Newline;
-        prev = token.kind;
-
-        let mut basic_style: Option<fn(&mut Style) -> &mut bool> = None;
-
-        match token.kind {
-            TokenKind::CodeBlock if start_of_line => {
-                let span = collect_until(
-                    token,
-                    &mut tokens,
-                    series([TokenKind::Newline, TokenKind::CodeBlock]),
-                );
-                let code_style = format_from_style(
-                    egui_style,
-                    &Style {
-                        code: true,
-                        ..Default::default()
-                    },
-                );
-                job.append(&*span, CODE_INDENT, code_style.clone());
-                style = Default::default();
-                continue;
-            }
-
-            TokenKind::Newline => style = Style::default(),
-
-            TokenKind::Strong => basic_style = Some(|s| &mut s.strong),
-            TokenKind::Italic => basic_style = Some(|s| &mut s.italics),
-            TokenKind::Strikethrough => basic_style = Some(|s| &mut s.strikethrough),
-
-            TokenKind::CodeBlock | TokenKind::Mono => {
-                style.code = true;
-                let span = collect_until(
-                    token,
-                    &mut tokens,
-                    any_of([TokenKind::Mono, TokenKind::CodeBlock, TokenKind::Newline]),
-                );
-                job.append(&*span, 0.0, format_from_style(egui_style, &style));
-                style.code = false;
-                continue;
-            }
-
-            // TODO: different heading strengths
-            TokenKind::Heading(h) if start_of_line => style.heading = Some(h),
-            TokenKind::Quote if start_of_line => style.quoted = true,
-
-            // TODO: indented list entries
-            TokenKind::ListEntry if start_of_line => {
-                job.append("• ", 0.0, format_from_style(egui_style, &style));
-                continue;
-            }
-
-            TokenKind::Text
-            // the following tokens are only richly rendered if encountered e.g. at start_of_line.
-            | TokenKind::Indentation
-            | TokenKind::ListEntry
-            | TokenKind::Heading(..)
-            | TokenKind::Quote => {}
-        }
-
-        // if we encountered a marker for Bold, Italic, or Strikethrough, toggle that style and
-        // render the token with the style enabled.
-        if let Some(basic_style) = basic_style {
-            let mut tmp_style = style;
-            *basic_style(&mut tmp_style) = true;
-            *basic_style(&mut style) ^= true; // toggle
-            job.append(&token.span, 0.0, format_from_style(egui_style, &tmp_style));
-            continue;
-        }
-
-        job.append(&token.span, 0.0, format_from_style(egui_style, &style));
+    let code_style = Style {
+        code: true,
+        ..Default::default()
+    };
+
+    for item in parse(text) {
+        match item {
+            Item::Text { span, style } => {
+                job.append(&span, 0.0, format_from_style(egui_style, &style));
+            }
+            Item::CodeBlock {
+                all,
+                language: _, // TODO
+                code: _, // TODO
+            } => {
+                job.append(&all, 100.0, format_from_style(egui_style, &code_style));
+            }
+        }
     }
 
     job
 }
 
-fn series<'a, const N: usize>(of: [TokenKind; N]) -> impl FnMut(&[Token<'a>; N]) -> bool {
-    move |token| {
-        of.iter()
-            .zip(token)
-            .all(|(kind, token)| kind == &token.kind)
-    }
-}
-
-fn any_of<'a, const N: usize>(these: [TokenKind; N]) -> impl FnMut(&[Token<'a>; 1]) -> bool {
-    move |[token]| these.contains(&token.kind)
-}
-
-/// Collect all tokens up to and including `pattern`, and merge them into a signle span.
-///
-/// `N` determines how many specific and consecutive tokens we are looking for.
-/// i.e. if we were looking for a [TokenKind::Newline] followed by a [TokenKind::Quote], `N`
-/// would equal `2`.
-///
-/// `pattern` is a function that accepts an array of `N` tokens and returns `true` if they match,
-/// i.e. if we should stop collecting. [any_of] and [series] can help to construct this function.
-///
-/// The collected tokens will be split off the head of the slice referred to by `tokens`.
-///
-/// # Panic
-/// Panics if `tokens` does not contain only consecutive adjacent spans.
-fn collect_until<'a, const N: usize>(
-    token: &Token<'a>,
-    tokens: &mut &[Token<'a>],
-    pattern: impl FnMut(&[Token<'a>; N]) -> bool,
-) -> Span<'a>
-where
-    for<'b> &'b [Token<'a>; N]: TryFrom<&'b [Token<'a>]>,
-{
-    let mut windows = tokens
-        .windows(N)
-        .map(|slice| <&[Token<'a>; N]>::try_from(slice).ok().unwrap());
-    let split_at = match windows.position(pattern) {
-        Some(i) => i + N,
-        None => tokens.len(), // consume everything
-    };
-    let (consume, keep) = tokens.split_at(split_at);
-    *tokens = keep;
-    consume
-        .iter()
-        .fold(token.span.clone(), |span: Span<'_>, token| {
-            span.try_merge(&token.span).unwrap()
-        })
-}
-
-fn format_from_style(egui_style: &egui::Style, emark_style: &Style) -> egui::text::TextFormat {
+fn format_from_style(egui_style: &egui::Style, style: &Style) -> egui::text::TextFormat {
     use egui::{Align, Color32, Stroke, TextStyle};
 
-    let color = if emark_style.strong || emark_style.heading.is_some() {
+    let color = if style.code {
+        egui_style.visuals.strong_text_color() * Color32::GREEN
+    } else if style.strong || style.heading.is_some() {
         egui_style.visuals.strong_text_color()
-    } else if emark_style.quoted {
+    } else if style.quoted {
         egui_style.visuals.weak_text_color()
     } else {
         egui_style.visuals.text_color()
     };
 
-    let text_style = if emark_style.heading.is_some() {
+    let text_style = if style.heading.is_some() {
         TextStyle::Heading
-    } else if emark_style.code {
+    } else if style.code {
         TextStyle::Monospace
-    } else if emark_style.small | emark_style.raised {
+    } else if style.small | style.raised {
         TextStyle::Small
     } else {
         TextStyle::Body
     };
 
-    let background = if emark_style.code {
+    let background = if style.code {
         egui_style.visuals.code_bg_color
     } else {
         Color32::TRANSPARENT
     };
 
-    let underline = if emark_style.underline {
+    let underline = if style.underline {
         Stroke::new(1.0, color)
     } else {
         Stroke::NONE
     };
 
-    let strikethrough = if emark_style.strikethrough {
+    let strikethrough = if style.strikethrough {
         Stroke::new(1.0, color)
     } else {
         Stroke::NONE
     };
 
-    let valign = if emark_style.raised {
+    let valign = if style.raised {
         Align::TOP
     } else {
         Align::BOTTOM
@@ -259,7 +109,7 @@ fn format_from_style(egui_style: &egui::Style, emark_style: &Style) -> egui::tex
         font_id: text_style.resolve(egui_style),
         color,
         background,
-        italics: emark_style.italics,
+        italics: style.italics,
         underline,
         strikethrough,
         valign,

View File

@@ -1,7 +1,11 @@
+mod ast;
 mod highlighter;
+mod parser;
 mod span;
 mod tokenizer;
 
+pub use ast::*;
 pub use highlighter::*;
+pub use parser::*;
 pub use span::*;
 pub use tokenizer::*;

src/markdown/parser.rs (new file, 172 lines added)
View File

@@ -0,0 +1,172 @@
use std::iter::{self, once};

use crate::markdown::Style;
use super::{Item, Span, Token, TokenKind, tokenize};

pub fn parse(text: &str) -> Vec<Item<'_>> {
    let tokens: Vec<_> = tokenize(text).collect();
    parse_tokens(&tokens)
}

pub fn parse_tokens<'a>(mut tokens: &[Token<'a>]) -> Vec<Item<'a>> {
    // pretend that the first token was preceded by a newline.
    // means we don't have to handle the first token as a special case.
    let mut prev = TokenKind::Newline;
    let mut style = Style::default();

    let mono_style = Style {
        code: true,
        ..Default::default()
    };

    iter::from_fn(move || {
        if tokens.is_empty() {
            return None;
        }
        let token = tokens.first().unwrap();
        tokens = &tokens[1..];

        let start_of_line = prev == TokenKind::Newline;
        prev = token.kind;

        let mut basic_style: Option<fn(&mut Style) -> &mut bool> = None;

        match token.kind {
            TokenKind::CodeBlock if start_of_line => {
                let language = collect_until(
                    None,
                    &mut tokens,
                    any_of([TokenKind::Newline]),
                );
                let code = collect_until(
                    None,
                    &mut tokens,
                    series([TokenKind::Newline, TokenKind::CodeBlock]),
                );
                let all = [
                    &token.span,
                    &language,
                    &code,
                ].into_iter().fold(Span::empty(), |a, b| a.try_merge(b).unwrap());
                let language = language.trim_end_matches("\n");
                let code = code.trim_end_matches("\n```");
                return Some(Item::CodeBlock { all, language, code });
            }

            TokenKind::Newline => style = Style::default(),

            TokenKind::Strong => basic_style = Some(|s| &mut s.strong),
            TokenKind::Italic => basic_style = Some(|s| &mut s.italics),
            TokenKind::Strikethrough => basic_style = Some(|s| &mut s.strikethrough),

            TokenKind::CodeBlock | TokenKind::Mono => {
                let span = collect_until(
                    Some(token),
                    &mut tokens,
                    any_of([TokenKind::Mono, TokenKind::CodeBlock, TokenKind::Newline]),
                );
                return Some(Item::Text { span, style: mono_style });
            }

            // TODO: different heading strengths
            TokenKind::Heading(h) if start_of_line => style.heading = Some(h),
            TokenKind::Quote if start_of_line => style.quoted = true,

            // TODO: replace dashes with dots
            //// TODO: indented list entries
            //TokenKind::ListEntry if start_of_line => {
            //    job.append("• ", 0.0, format_from_style(egui_style, &style));
            //    continue;
            //}

            TokenKind::Text
            // the following tokens are only richly rendered if encountered e.g. at start_of_line.
            | TokenKind::Indentation
            | TokenKind::ListEntry
            | TokenKind::Heading(..)
            | TokenKind::Quote => {}
        }

        // if we encountered a marker for Bold, Italic, or Strikethrough, toggle that style and
        // render the token with the style enabled.
        if let Some(basic_style) = basic_style {
            let mut tmp_style = style;
            *basic_style(&mut tmp_style) = true;
            *basic_style(&mut style) ^= true; // toggle
            return Some(Item::Text {
                span: token.span.clone(),
                style: tmp_style,
            });
        }

        Some(Item::Text {
            span: token.span.clone(),
            style,
        })
    })
    .collect()
}

fn series<'a, const N: usize>(of: [TokenKind; N]) -> impl FnMut(&[Token<'a>; N]) -> bool {
    move |token| {
        of.iter()
            .zip(token)
            .all(|(kind, token)| kind == &token.kind)
    }
}

fn any_of<'a, const N: usize>(these: [TokenKind; N]) -> impl FnMut(&[Token<'a>; 1]) -> bool {
    move |[token]| these.contains(&token.kind)
}

/// Collect all tokens up to and including `pattern`, and merge them into a single span.
///
/// `N` determines how many specific and consecutive tokens we are looking for.
/// i.e. if we were looking for a [TokenKind::Newline] followed by a [TokenKind::Quote], `N`
/// would equal `2`.
///
/// `pattern` is a function that accepts an array of `N` tokens and returns `true` if they match,
/// i.e. if we should stop collecting. [any_of] and [series] can help to construct this function.
///
/// The collected tokens will be split off the head of the slice referred to by `tokens`.
///
/// # Panic
/// Panics if `tokens` does not contain only consecutive adjacent spans.
fn collect_until<'a, const N: usize>(
    first_token: Option<&Token<'a>>,
    tokens: &mut &[Token<'a>],
    pattern: impl FnMut(&[Token<'a>; N]) -> bool,
) -> Span<'a>
where
    // &[T; N]: TryFrom<&[T]>
    for<'b> &'b [Token<'a>; N]: TryFrom<&'b [Token<'a>]>,
{
    let mut windows = tokens.windows(N).map(|slice| {
        <&[Token<'a>; N]>::try_from(slice)
            .ok()
            .expect("`windows` promises to return slices of length N")
    });
    let split_at = match windows.position(pattern) {
        Some(i) => i + N,
        None => tokens.len(), // consume everything
    };
    let (consume, keep) = tokens.split_at(split_at);
    *tokens = keep;
    once(first_token)
        .flatten()
        .chain(consume)
        .fold(Span::empty(), |span: Span<'_>, token| {
            span.try_merge(&token.span).unwrap()
        })
}
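To make the `collect_until` contract above concrete, here is a small trace (not part of the file). Only the call shapes come from the code above; the exact token boundaries produced by `tokenize` are an assumption.

// Hypothetical token stream for the input "```rs\nlet x = 1;\n```\n":
//     [CodeBlock, Text("rs"), Newline, Text("let x = 1;"), Newline, CodeBlock, Newline]
//
// In the CodeBlock arm of `parse_tokens`, with the leading CodeBlock token already consumed:
//     let language = collect_until(None, &mut tokens, any_of([TokenKind::Newline]));
//     // consumes Text("rs") and the Newline, merged span: "rs\n"
//     let code = collect_until(None, &mut tokens, series([TokenKind::Newline, TokenKind::CodeBlock]));
//     // consumes up to and including the closing fence, merged span: "let x = 1;\n```"
//
// `trim_end_matches` then strips the trailing "\n" and "\n```", so the resulting
// Item::CodeBlock carries just the language tag "rs" and the code body "let x = 1;".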

View File

@@ -3,6 +3,8 @@ use std::{
     ops::{Deref, Range},
 };
 
+use eyre::{bail, eyre};
+
 #[derive(Clone, Eq, PartialEq)]
 pub struct Span<'a> {
     complete_str: &'a str,
@@ -17,6 +19,13 @@ impl<'a> Span<'a> {
         }
     }
 
+    pub const fn empty() -> Self {
+        Span {
+            complete_str: "",
+            range: 0..0,
+        }
+    }
+
     pub fn get(&self, slice: Range<usize>) -> Option<Self> {
         let start = self.range.start.checked_add(slice.start)?;
         let end = self.range.start.checked_add(slice.end)?;
@@ -41,26 +50,49 @@
         Some((head, tail))
     }
 
+    pub fn trim_end_matches(&self, p: &str) -> Self {
+        if !self.ends_with(p) {
+            return self.clone();
+        }
+
+        Self {
+            range: self.range.start..self.range.end - p.len(),
+            complete_str: self.complete_str,
+        }
+    }
+
     /// Try to merge the spans.
     ///
+    /// If either span is empty, this just returns the other one.
     /// This only works if spans are pointing into the same backing buffer, and are adjacent.
-    pub fn try_merge(&self, other: &Self) -> Option<Self> {
+    pub fn try_merge(&self, other: &Self) -> eyre::Result<Self> {
+        if self.is_empty() {
+            return Ok(other.clone());
+        }
+
+        if other.is_empty() {
+            return Ok(self.clone());
+        }
+
         if self.complete_str.as_ptr() != other.complete_str.as_ptr() {
-            return None;
+            bail!("Can't merge different strings");
         }
 
         if self.range.end == other.range.start {
-            Some(Self {
+            Ok(Self {
                 range: self.range.start..other.range.end,
                 ..*self
             })
         } else if self.range.start == other.range.end {
-            Some(Self {
+            Ok(Self {
                 range: other.range.start..self.range.end,
                 ..*self
             })
         } else {
-            None
+            Err(eyre!("String: {:?}", self.complete_str)
+                .wrap_err(eyre!("Span 2: {:?}", other.deref()))
+                .wrap_err(eyre!("Span 1: {:?}", self.deref()))
+                .wrap_err("Can't merge disjoint string spans"))
         }
     }
 }
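For reference, the new `try_merge` contract in comment form (an illustration, not part of the diff; the spans and byte ranges are hypothetical):

// Hypothetical spans into the same backing string "hello world":
//     a = "hello"  (bytes 0..5)
//     b = " world" (bytes 5..11)
//     c = "world"  (bytes 6..11)
//
//     a.try_merge(&b)             // Ok: adjacent, yields one span "hello world" (0..11)
//     Span::empty().try_merge(&a) // Ok: empty side, just returns `a`
//     a.try_merge(&c)             // Err: disjoint (gap at byte 5), reported via eyre instead of a bare None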

View File

@@ -1,16 +1,6 @@
 use std::iter;
 
-use super::span::Span;
+use super::{Heading, span::Span};
 
-#[derive(Clone, Copy, Debug, PartialEq, Eq)]
-pub enum Heading {
-    H6,
-    H5,
-    H4,
-    H3,
-    H2,
-    H1,
-}
-
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub enum TokenKind {