feat(sitemap): add XML sitemap generation module

Implement src/sitemap.rs with generate_sitemap() for SEO-compliant
XML sitemap generation. Follows the feed.rs pattern:

- SitemapEntry struct for URL metadata
- build_sitemap_xml() for XML construction
- xml_escape() for special character handling
- 5 unit tests covering single/multiple entries, lastmod, escaping

Module declared in main.rs but not yet integrated into pipeline.
This commit is contained in:
Timothy DeHerrera
2026-01-31 22:00:20 -07:00
parent eaf09c1c7d
commit 4d869a85f7
3 changed files with 187 additions and 0 deletions

View File

@@ -11,6 +11,7 @@ mod highlight;
mod math;
mod mermaid;
mod render;
mod sitemap;
mod template_engine;
use crate::content::{discover_nav, discover_sections, Content, ContentKind, NavItem};

185
src/sitemap.rs Normal file
View File

@@ -0,0 +1,185 @@
//! XML sitemap generation for SEO.
use crate::config::SiteConfig;
use crate::content::{Content, Section};
use std::path::Path;
/// A URL entry for the sitemap.
///
/// Each entry is rendered by `build_sitemap_xml` as one `<url>` element.
/// The common value traits are derived so entries can be debug-printed,
/// duplicated, and compared directly in assertions.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SitemapEntry {
    /// Absolute URL (e.g., "https://example.com/blog/post.html")
    pub loc: String,
    /// Optional last modification date in W3C format (YYYY-MM-DD)
    pub lastmod: Option<String>,
}
/// Generate an XML sitemap from discovered content.
///
/// Entries are emitted in this order:
/// - Homepage (`<base_url>/index.html`, without a `lastmod`)
/// - For each section: the section index, then the section's items
///   (posts, projects, etc.)
/// - Standalone pages
///
/// `lastmod` values are copied from each page's frontmatter `date` when one
/// is set. Any trailing `/` on `config.base_url` is removed before URLs are
/// assembled, so the base URL may be configured with or without it.
///
/// Returns the complete sitemap document as a string.
pub fn generate_sitemap(
    sections: &[Section],
    pages: &[Content],
    config: &SiteConfig,
    content_root: &Path,
) -> String {
    let base_url = config.base_url.trim_end_matches('/');
    let mut entries = Vec::new();

    // Homepage
    entries.push(SitemapEntry {
        loc: format!("{}/index.html", base_url),
        lastmod: None,
    });

    // Sections and their items
    for section in sections {
        // Section index
        entries.push(SitemapEntry {
            loc: format!("{}/{}/index.html", base_url, section.name),
            lastmod: section.index.frontmatter.date.clone(),
        });

        // Section items
        // NOTE(review): an `Err` from `collect_items()` is dropped here, so a
        // section whose items cannot be collected contributes only its index.
        // Confirm this best-effort behavior is intended.
        if let Ok(items) = section.collect_items() {
            for item in items {
                let relative_path = item.output_path(content_root);
                entries.push(SitemapEntry {
                    loc: page_url(base_url, relative_path.display()),
                    lastmod: item.frontmatter.date.clone(),
                });
            }
        }
    }

    // Standalone pages
    for page in pages {
        let relative_path = page.output_path(content_root);
        entries.push(SitemapEntry {
            loc: page_url(base_url, relative_path.display()),
            lastmod: page.frontmatter.date.clone(),
        });
    }

    build_sitemap_xml(&entries)
}

/// Join a site-relative output path onto `base_url` as an absolute URL.
///
/// File-system paths are rendered with the platform separator, which is `\`
/// on Windows; URLs must always use `/`, so the separator is normalized
/// before joining. Where the platform separator is already `/`, the path text
/// is left unchanged.
fn page_url(base_url: &str, relative: impl std::fmt::Display) -> String {
    let relative = relative
        .to_string()
        .replace(std::path::MAIN_SEPARATOR, "/");
    format!("{}/{}", base_url, relative)
}
/// Render sitemap entries into a complete XML document.
///
/// Every entry becomes one `<url>` element containing an escaped `<loc>` and,
/// when a date is present, an escaped `<lastmod>`. The elements are wrapped
/// in a `<urlset>` root that follows the XML declaration.
fn build_sitemap_xml(entries: &[SitemapEntry]) -> String {
    let body: String = entries
        .iter()
        .map(|entry| {
            // An absent date contributes an empty string, so no <lastmod> line is written.
            let lastmod_line = entry
                .lastmod
                .as_deref()
                .map(|date| format!(" <lastmod>{}</lastmod>\n", xml_escape(date)))
                .unwrap_or_default();
            format!(
                " <url>\n <loc>{}</loc>\n{} </url>\n",
                xml_escape(&entry.loc),
                lastmod_line
            )
        })
        .collect();

    // The template supplies its own line break before `</urlset>`, so the
    // trailing newline of the last element is trimmed away first.
    format!(
        concat!(
            "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n",
            "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n",
            "{}\n",
            "</urlset>\n",
        ),
        body.trim_end()
    )
}
/// Escape the five XML special characters in `s`.
///
/// `&`, `<`, `>`, `"` and `'` are replaced by `&amp;`, `&lt;`, `&gt;`,
/// `&quot;` and `&apos;` respectively; every other character is copied
/// through unchanged.
fn xml_escape(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&apos;"),
            other => escaped.push(other),
        }
    }
    escaped
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand constructor so each test can focus on its assertions.
    fn entry(loc: &str, lastmod: Option<&str>) -> SitemapEntry {
        SitemapEntry {
            loc: loc.to_string(),
            lastmod: lastmod.map(str::to_string),
        }
    }

    /// Each of the five special characters maps to its entity.
    #[test]
    fn test_xml_escape() {
        let cases = [
            ("Hello & World", "Hello &amp; World"),
            ("<tag>", "&lt;tag&gt;"),
            ("\"quoted\"", "&quot;quoted&quot;"),
            ("it's", "it&apos;s"),
        ];
        for &(input, expected) in &cases {
            assert_eq!(xml_escape(input), expected);
        }
    }

    /// A lone entry without a date yields a well-formed document and no `<lastmod>`.
    #[test]
    fn test_build_sitemap_xml_single_entry() {
        let xml = build_sitemap_xml(&[entry("https://example.com/index.html", None)]);

        assert!(xml.starts_with(r#"<?xml version="1.0" encoding="utf-8"?>"#));
        let required = [
            "<urlset xmlns=",
            "<url>",
            "<loc>https://example.com/index.html</loc>",
            "</urlset>",
        ];
        for fragment in &required {
            assert!(xml.contains(*fragment), "missing fragment: {}", fragment);
        }
        // No lastmod when None
        assert!(!xml.contains("<lastmod>"));
    }

    /// A dated entry renders both its location and its `<lastmod>`.
    #[test]
    fn test_build_sitemap_xml_with_lastmod() {
        let dated = entry("https://example.com/blog/post.html", Some("2026-01-31"));
        let xml = build_sitemap_xml(&[dated]);

        assert!(xml.contains("<loc>https://example.com/blog/post.html</loc>"));
        assert!(xml.contains("<lastmod>2026-01-31</lastmod>"));
    }

    /// Every entry produces exactly one `<url>` element.
    #[test]
    fn test_build_sitemap_xml_multiple_entries() {
        let locations = [
            "https://example.com/index.html",
            "https://example.com/about.html",
            "https://example.com/blog/index.html",
        ];
        let entries = vec![
            entry(locations[0], None),
            entry(locations[1], Some("2026-01-15")),
            entry(locations[2], None),
        ];
        let xml = build_sitemap_xml(&entries);

        // Count url elements
        assert_eq!(xml.matches("<url>").count(), 3);

        // Verify all URLs present
        for loc in &locations {
            assert!(xml.contains(*loc), "missing URL: {}", loc);
        }
    }

    /// Special characters inside a location are escaped in the output.
    #[test]
    fn test_build_sitemap_xml_escapes_special_chars() {
        let query_url = entry("https://example.com/search?q=foo&bar=baz", None);
        let xml = build_sitemap_xml(&[query_url]);

        // & should be escaped
        assert!(xml.contains("&amp;"));
        // Raw & should not appear
        assert!(!xml.contains("?q=foo&bar"));
    }
}