523 lines
20 KiB
Rust
523 lines
20 KiB
Rust
//! Markdown to HTML rendering via pulldown-cmark with syntax highlighting.
|
|
|
|
use crate::escape::{code_escape, html_escape};
|
|
use crate::highlight::{Language, highlight_code};
|
|
use pulldown_cmark::{CodeBlockKind, Event, HeadingLevel, Options, Parser, Tag, TagEnd};
|
|
use serde::Serialize;
|
|
|
|
/// A heading anchor extracted from markdown content.
|
|
#[derive(Debug, Clone, Serialize)]
|
|
pub struct Anchor {
|
|
/// Heading ID attribute (slug)
|
|
pub id: String,
|
|
/// Heading text content
|
|
pub label: String,
|
|
/// Heading level (2-6, h1 excluded)
|
|
pub level: u8,
|
|
}
|
|
|
|
/// Render markdown content to HTML with syntax highlighting.
|
|
/// Returns the HTML output and a list of extracted heading anchors.
|
|
pub fn markdown_to_html(markdown: &str) -> (String, Vec<Anchor>) {
|
|
let options = Options::ENABLE_TABLES
|
|
| Options::ENABLE_FOOTNOTES
|
|
| Options::ENABLE_STRIKETHROUGH
|
|
| Options::ENABLE_TASKLISTS
|
|
| Options::ENABLE_MATH;
|
|
|
|
let parser = Parser::new_ext(markdown, options);
|
|
let mut html_output = String::new();
|
|
let mut anchors = Vec::new();
|
|
let mut code_block_lang: Option<String> = None;
|
|
let mut code_block_content = String::new();
|
|
let mut in_code_block = false;
|
|
|
|
// Image alt text accumulation state
|
|
let mut image_alt_content: Option<String> = None;
|
|
let mut image_attrs: Option<(String, String)> = None; // (src, title)
|
|
|
|
// Heading accumulation state
|
|
let mut heading_level: Option<HeadingLevel> = None;
|
|
let mut heading_text = String::new();
|
|
|
|
for event in parser {
|
|
match event {
|
|
Event::Start(Tag::CodeBlock(kind)) => {
|
|
// Extract language from code fence
|
|
code_block_lang = match kind {
|
|
CodeBlockKind::Fenced(lang) => {
|
|
let lang_str = lang.as_ref().split_whitespace().next().unwrap_or("");
|
|
if lang_str.is_empty() {
|
|
None
|
|
} else {
|
|
Some(lang_str.to_string())
|
|
}
|
|
},
|
|
CodeBlockKind::Indented => None,
|
|
};
|
|
in_code_block = true;
|
|
code_block_content.clear();
|
|
},
|
|
Event::Text(text) if in_code_block => {
|
|
// Accumulate code block content
|
|
code_block_content.push_str(&text);
|
|
},
|
|
Event::Text(text) if image_alt_content.is_some() => {
|
|
// Accumulate image alt text
|
|
if let Some(ref mut alt) = image_alt_content {
|
|
alt.push_str(&text);
|
|
}
|
|
},
|
|
Event::End(TagEnd::CodeBlock) => {
|
|
// Render the code block with highlighting
|
|
let lang_str = code_block_lang.as_deref().unwrap_or("");
|
|
|
|
// Mermaid diagrams: render to SVG
|
|
if lang_str == "mermaid" {
|
|
match crate::mermaid::render_diagram(&code_block_content) {
|
|
Ok(svg) => {
|
|
html_output.push_str("<div class=\"mermaid-diagram\">\n");
|
|
html_output.push_str(&svg);
|
|
html_output.push_str("\n</div>\n");
|
|
},
|
|
Err(e) => {
|
|
eprintln!("mermaid render error: {e}");
|
|
html_output.push_str("<pre class=\"mermaid-error\"><code>");
|
|
html_output.push_str(&html_escape(&code_block_content));
|
|
html_output.push_str("</code></pre>\n");
|
|
},
|
|
}
|
|
} else {
|
|
// Code blocks: syntax highlighting
|
|
html_output.push_str("<pre><code");
|
|
|
|
if let Some(lang) = Language::from_fence(lang_str) {
|
|
// Supported language: apply tree-sitter highlighting
|
|
html_output.push_str(&format!(" class=\"language-{}\">", lang_str));
|
|
html_output.push_str(&highlight_code(lang, &code_block_content));
|
|
} else {
|
|
// Unsupported language: render as plain escaped text
|
|
if !lang_str.is_empty() {
|
|
html_output.push_str(&format!(" class=\"language-{}\">", lang_str));
|
|
} else {
|
|
html_output.push('>');
|
|
}
|
|
html_output.push_str(&code_escape(&code_block_content));
|
|
}
|
|
|
|
html_output.push_str("</code></pre>\n");
|
|
}
|
|
|
|
code_block_lang = None;
|
|
in_code_block = false;
|
|
code_block_content.clear();
|
|
},
|
|
Event::Text(text) if heading_level.is_some() => {
|
|
// Accumulate heading text
|
|
heading_text.push_str(&text);
|
|
html_output.push_str(&html_escape(&text));
|
|
},
|
|
Event::Text(text) => {
|
|
// Regular text outside code blocks
|
|
html_output.push_str(&html_escape(&text));
|
|
},
|
|
Event::Code(text) => {
|
|
// Inline code
|
|
html_output.push_str("<code>");
|
|
html_output.push_str(&html_escape(&text));
|
|
html_output.push_str("</code>");
|
|
},
|
|
Event::Start(Tag::Image {
|
|
dest_url, title, ..
|
|
}) => {
|
|
// Begin accumulating alt text; defer rendering to End event
|
|
image_alt_content = Some(String::new());
|
|
image_attrs = Some((dest_url.to_string(), title.to_string()));
|
|
},
|
|
Event::Start(Tag::Heading { level, .. }) => {
|
|
// Begin accumulating heading text
|
|
heading_level = Some(level);
|
|
heading_text.clear();
|
|
let level_num = level as u8;
|
|
html_output.push_str(&format!("<h{}", level_num));
|
|
// ID will be added at End event after we have the text
|
|
},
|
|
Event::Start(tag) => {
|
|
html_output.push_str(&start_tag_to_html(&tag));
|
|
},
|
|
Event::End(TagEnd::Image) => {
|
|
// Render image with accumulated alt text
|
|
let alt = image_alt_content.take().unwrap_or_default();
|
|
if let Some((src, title)) = image_attrs.take() {
|
|
if title.is_empty() {
|
|
html_output.push_str(&format!(
|
|
"<img src=\"{}\" alt=\"{}\" />",
|
|
html_escape(&src),
|
|
html_escape(&alt)
|
|
));
|
|
} else {
|
|
html_output.push_str(&format!(
|
|
"<img src=\"{}\" alt=\"{}\" title=\"{}\" />",
|
|
html_escape(&src),
|
|
html_escape(&alt),
|
|
html_escape(&title)
|
|
));
|
|
}
|
|
}
|
|
},
|
|
Event::End(TagEnd::Heading(level)) => {
|
|
// Generate slug ID from heading text
|
|
let id = slugify(&heading_text);
|
|
let level_num = level as u8;
|
|
|
|
// We need to go back and insert the id attribute and close the tag
|
|
// The heading was opened as "<hN" - find it and complete with id and >
|
|
if let Some(pos) = html_output.rfind(&format!("<h{}", level_num)) {
|
|
let insert_pos = pos + format!("<h{}", level_num).len();
|
|
html_output.insert_str(insert_pos, &format!(" id=\"{}\">", id));
|
|
}
|
|
// Add pilcrow anchor link for deep-linking (hover-reveal via CSS)
|
|
html_output.push_str(&format!(
|
|
"<a class=\"heading-anchor\" href=\"#{}\">¶</a></h{}>\n",
|
|
id, level_num
|
|
));
|
|
|
|
// Extract anchor for h2-h6 (skip h1)
|
|
if level_num >= 2 {
|
|
anchors.push(Anchor {
|
|
id,
|
|
label: heading_text.clone(),
|
|
level: level_num,
|
|
});
|
|
}
|
|
|
|
heading_level = None;
|
|
heading_text.clear();
|
|
},
|
|
Event::End(tag) => {
|
|
html_output.push_str(&end_tag_to_html(&tag));
|
|
},
|
|
Event::SoftBreak => {
|
|
html_output.push('\n');
|
|
},
|
|
Event::HardBreak => {
|
|
html_output.push_str("<br />\n");
|
|
},
|
|
Event::Rule => {
|
|
html_output.push_str("<hr />\n");
|
|
},
|
|
Event::Html(html) | Event::InlineHtml(html) => {
|
|
html_output.push_str(&html);
|
|
},
|
|
Event::FootnoteReference(name) => {
|
|
html_output.push_str(&format!(
|
|
"<sup class=\"footnote-ref\"><a href=\"#fn-{}\">{}</a></sup>",
|
|
name, name
|
|
));
|
|
},
|
|
Event::TaskListMarker(checked) => {
|
|
let checkbox = if checked {
|
|
"<input type=\"checkbox\" checked disabled />"
|
|
} else {
|
|
"<input type=\"checkbox\" disabled />"
|
|
};
|
|
html_output.push_str(checkbox);
|
|
},
|
|
Event::InlineMath(latex) => match crate::math::render_math(&latex, false) {
|
|
Ok(rendered) => html_output.push_str(&rendered),
|
|
Err(e) => {
|
|
eprintln!("math render error: {e}");
|
|
html_output.push_str("<code class=\"math-error\">");
|
|
html_output.push_str(&html_escape(&latex));
|
|
html_output.push_str("</code>");
|
|
},
|
|
},
|
|
Event::DisplayMath(latex) => match crate::math::render_math(&latex, true) {
|
|
Ok(rendered) => {
|
|
html_output.push_str("<div class=\"math-display\">\n");
|
|
html_output.push_str(&rendered);
|
|
html_output.push_str("\n</div>\n");
|
|
},
|
|
Err(e) => {
|
|
eprintln!("math render error: {e}");
|
|
html_output.push_str("<pre class=\"math-error\">");
|
|
html_output.push_str(&html_escape(&latex));
|
|
html_output.push_str("</pre>\n");
|
|
},
|
|
},
|
|
}
|
|
}
|
|
|
|
(html_output, anchors)
|
|
}
|
|
|
|
/// Build a URL-friendly slug from heading text.
///
/// The text is lowercased, every run of non-alphanumeric characters acts as a
/// single separator, and the remaining words are joined with `-`. The result
/// never starts or ends with a hyphen and is empty when the text contains no
/// alphanumeric characters.
fn slugify(text: &str) -> String {
    let lowered = text.to_lowercase();
    let words: Vec<&str> = lowered
        .split(|ch: char| !ch.is_alphanumeric())
        .filter(|word| !word.is_empty())
        .collect();
    words.join("-")
}
|
|
|
|
fn start_tag_to_html(tag: &Tag) -> String {
|
|
match tag {
|
|
Tag::Paragraph => "<p>".to_string(),
|
|
Tag::Heading { level, .. } => format!("<h{}>", *level as u8),
|
|
Tag::BlockQuote(_) => "<blockquote>\n".to_string(),
|
|
Tag::CodeBlock(_) => String::new(), // Handled separately
|
|
Tag::List(Some(start)) => format!("<ol start=\"{}\">\n", start),
|
|
Tag::List(None) => "<ul>\n".to_string(),
|
|
Tag::Item => "<li>".to_string(),
|
|
Tag::FootnoteDefinition(name) => {
|
|
format!("<div class=\"footnote\" id=\"fn-{}\">", name)
|
|
},
|
|
Tag::Table(_) => "<table>\n".to_string(),
|
|
Tag::TableHead => "<thead>\n<tr>\n".to_string(),
|
|
Tag::TableRow => "<tr>\n".to_string(),
|
|
Tag::TableCell => "<td>".to_string(),
|
|
Tag::Emphasis => "<em>".to_string(),
|
|
Tag::Strong => "<strong>".to_string(),
|
|
Tag::Strikethrough => "<del>".to_string(),
|
|
Tag::Link {
|
|
dest_url, title, ..
|
|
} => {
|
|
if title.is_empty() {
|
|
format!("<a href=\"{}\">", html_escape(dest_url))
|
|
} else {
|
|
format!(
|
|
"<a href=\"{}\" title=\"{}\">",
|
|
html_escape(dest_url),
|
|
html_escape(title)
|
|
)
|
|
}
|
|
},
|
|
Tag::Image { .. } => String::new(), // Handled separately in main loop
|
|
Tag::HtmlBlock => String::new(),
|
|
Tag::MetadataBlock(_) => String::new(),
|
|
Tag::DefinitionListTitle => "<dt>".to_string(),
|
|
Tag::DefinitionListDefinition => "<dd>".to_string(),
|
|
Tag::DefinitionList => "<dl>".to_string(),
|
|
}
|
|
}
|
|
|
|
fn end_tag_to_html(tag: &TagEnd) -> String {
|
|
match tag {
|
|
TagEnd::Paragraph => "</p>\n".to_string(),
|
|
TagEnd::Heading(level) => format!("</h{}>\n", *level as u8),
|
|
TagEnd::BlockQuote(_) => "</blockquote>\n".to_string(),
|
|
TagEnd::CodeBlock => String::new(), // Handled separately
|
|
TagEnd::List(ordered) => {
|
|
if *ordered {
|
|
"</ol>\n".to_string()
|
|
} else {
|
|
"</ul>\n".to_string()
|
|
}
|
|
},
|
|
TagEnd::Item => "</li>\n".to_string(),
|
|
TagEnd::FootnoteDefinition => "</div>\n".to_string(),
|
|
TagEnd::Table => "</table>\n".to_string(),
|
|
TagEnd::TableHead => "</tr>\n</thead>\n".to_string(),
|
|
TagEnd::TableRow => "</tr>\n".to_string(),
|
|
TagEnd::TableCell => "</td>\n".to_string(),
|
|
TagEnd::Emphasis => "</em>".to_string(),
|
|
TagEnd::Strong => "</strong>".to_string(),
|
|
TagEnd::Strikethrough => "</del>".to_string(),
|
|
TagEnd::Link => "</a>".to_string(),
|
|
TagEnd::Image => String::new(), // Handled separately in main loop
|
|
TagEnd::HtmlBlock => String::new(),
|
|
TagEnd::MetadataBlock(_) => String::new(),
|
|
TagEnd::DefinitionListTitle => "</dt>\n".to_string(),
|
|
TagEnd::DefinitionListDefinition => "</dd>\n".to_string(),
|
|
TagEnd::DefinitionList => "</dl>\n".to_string(),
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Headings get a slug id plus a pilcrow link; emphasis renders as `<em>`.
    #[test]
    fn test_basic_markdown() {
        let md = "# Hello\n\nThis is a *test*.";
        let (html, _) = markdown_to_html(md);
        // Heading includes pilcrow anchor for deep-linking
        assert!(html.contains(
            "<h1 id=\"hello\">Hello<a class=\"heading-anchor\" href=\"#hello\">¶</a></h1>"
        ));
        assert!(html.contains("<em>test</em>"));
    }

    /// A fence with a supported language is highlighted.
    #[test]
    fn test_code_block_highlighting() {
        let md = "```rust\nfn main() {}\n```";
        let (html, _) = markdown_to_html(md);

        // Should contain highlighted code
        assert!(html.contains("<pre><code"));
        assert!(html.contains("language-rust"));
        assert!(html.contains("class=\"hl-"));
    }

    /// A fence with an unsupported language keeps its class but is not highlighted.
    #[test]
    fn test_code_block_unknown_language() {
        let md = "```unknown\nsome code\n```";
        let (html, _) = markdown_to_html(md);

        // Should contain escaped code without highlighting spans
        assert!(html.contains("<pre><code"));
        assert!(html.contains("language-unknown"));
        assert!(html.contains("some code"));
        assert!(!html.contains("class=\"hl-"));
    }

    /// Inline code spans render as `<code>`.
    #[test]
    fn test_inline_code() {
        let md = "Use `cargo run` to start.";
        let (html, _) = markdown_to_html(md);

        assert!(html.contains("<code>cargo run</code>"));
    }

    /// Image alt text, source and title all reach the `<img>` tag.
    #[test]
    fn test_image_alt_text() {
        let md = "![Beautiful sunset](sunset.jpg \"Evening sky\")";
        let (html, _) = markdown_to_html(md);

        assert!(html.contains("alt=\"Beautiful sunset\""));
        assert!(html.contains("title=\"Evening sky\""));
        assert!(html.contains("src=\"sunset.jpg\""));
    }

    /// An image without a title gets no `title` attribute.
    #[test]
    fn test_image_alt_text_no_title() {
        let md = "![Logo image](logo.png)";
        let (html, _) = markdown_to_html(md);

        assert!(html.contains("alt=\"Logo image\""));
        assert!(html.contains("src=\"logo.png\""));
        assert!(!html.contains("title="));
    }

    /// h2-h6 headings are collected as anchors in document order; h1 is skipped.
    #[test]
    fn test_anchor_extraction() {
        let md = r#"# Page Title
## Getting Started
Some intro text.
### Installation
Install steps.
## Configuration
Config details.
#### Deep Heading
"#;
        let (html, anchors) = markdown_to_html(md);

        // h1 should NOT be extracted (page title, not TOC)
        assert!(anchors.iter().all(|a| a.level >= 2));

        // Should have 4 anchors: h2, h3, h2, h4
        assert_eq!(anchors.len(), 4);

        // Check first anchor
        assert_eq!(anchors[0].id, "getting-started");
        assert_eq!(anchors[0].label, "Getting Started");
        assert_eq!(anchors[0].level, 2);

        // Check h3
        assert_eq!(anchors[1].id, "installation");
        assert_eq!(anchors[1].level, 3);

        // Check second h2
        assert_eq!(anchors[2].id, "configuration");
        assert_eq!(anchors[2].level, 2);

        // Check h4
        assert_eq!(anchors[3].id, "deep-heading");
        assert_eq!(anchors[3].level, 4);

        // Verify IDs are in HTML
        assert!(html.contains("id=\"getting-started\""));
        assert!(html.contains("id=\"installation\""));
    }

    /// Slug generation collapses separators and lowercases.
    #[test]
    fn test_slugify_edge_cases() {
        // Basic case
        assert_eq!(slugify("Hello World"), "hello-world");

        // Multiple spaces → single hyphen
        assert_eq!(slugify("Hello   World"), "hello-world");

        // Special characters → hyphen (apostrophe becomes hyphen)
        assert_eq!(slugify("What's New?"), "what-s-new");

        // Numbers preserved, dot becomes hyphen
        assert_eq!(slugify("Version 2.0"), "version-2-0");

        // Leading/trailing spaces trimmed
        assert_eq!(slugify("  Padded  "), "padded");

        // Mixed case → lowercase
        assert_eq!(slugify("CamelCase"), "camelcase");

        // Consecutive special chars → single hyphen
        assert_eq!(slugify("A -- B"), "a-b");
    }

    /// Link destinations are HTML-escaped inside the `href` attribute.
    #[test]
    fn test_link_url_escaping() {
        // Quote-breaking attack
        let md = r#"[click]("><script>alert(1)</script>)"#;
        let (html, _) = markdown_to_html(md);
        assert!(!html.contains("<script>"), "script tags should be escaped");
        assert!(html.contains("&gt;"), "angle brackets should be escaped");

        // JavaScript URL (should be escaped, not executed)
        let md = r#"[click](javascript:alert(1))"#;
        let (html, _) = markdown_to_html(md);
        assert!(html.contains("href=\"javascript:alert(1)\""));
    }

    /// Link titles are HTML-escaped inside the `title` attribute.
    #[test]
    fn test_link_title_escaping() {
        let md = r#"[text](url "title with \"quotes\"")"#;
        let (html, _) = markdown_to_html(md);
        assert!(html.contains("&quot;"), "quotes in title should be escaped");
    }

    /// Image sources are HTML-escaped inside the `src` attribute.
    #[test]
    fn test_image_src_escaping() {
        // Quote-breaking attack in image src
        let md = r#"![alt]("><script>alert(1)</script>)"#;
        let (html, _) = markdown_to_html(md);
        assert!(!html.contains("<script>"), "script tags should be escaped");
        assert!(
            html.contains("&quot;") || html.contains("&gt;"),
            "special chars in src should be escaped"
        );
    }

    /// Quotes inside code blocks are left as-is.
    #[test]
    fn test_unlabeled_code_block_preserves_quotes() {
        // Code block without language specifier should preserve quotes
        let md = "```\nContent-Security-Policy: default-src 'self';\n```";
        let (html, _) = markdown_to_html(md);

        // Should be inside <pre><code>
        assert!(html.contains("<pre><code>"), "should have code block");
        // Quotes should NOT be escaped (only <, >, & need escaping in code)
        assert!(
            html.contains("'self'"),
            "single quotes should be preserved in code blocks"
        );
        // Should NOT have escaped quotes, in either common entity spelling
        assert!(
            !html.contains("&#39;") && !html.contains("&#x27;"),
            "quotes should not be HTML-escaped in code blocks"
        );
    }
}
|