From 3b243beb7e098ca4169b1aaba2dc67b2d4704c28 Mon Sep 17 00:00:00 2001 From: Emil Lerch Date: Sun, 20 Jul 2025 19:46:39 -0700 Subject: [PATCH] html passthrough in markdown --- src/integration_tests.zig | 17 ++--- src/markdown.zig | 151 +++++++++++++++++++++++++++++++++++++- 2 files changed, 154 insertions(+), 14 deletions(-) diff --git a/src/integration_tests.zig b/src/integration_tests.zig index a922ec7..23474c7 100644 --- a/src/integration_tests.zig +++ b/src/integration_tests.zig @@ -9,6 +9,7 @@ const GitLab = @import("providers/GitLab.zig"); const Forgejo = @import("providers/Forgejo.zig"); const SourceHut = @import("providers/SourceHut.zig"); const config = @import("config.zig"); +const utils = @import("utils.zig"); fn testPrint(comptime fmt: []const u8, args: anytype) void { if (build_options.test_debug) { @@ -24,7 +25,7 @@ test "Atom feed validates against W3C validator" { Release{ .repo_name = "ziglang/zig", .tag_name = "0.14.0", - .published_at = "2024-12-19T00:00:00Z", + .published_at = try utils.parseReleaseTimestamp("2024-12-19T00:00:00Z"), .html_url = "https://github.com/ziglang/zig/releases/tag/0.14.0", .description = "Zig 0.14.0 release with many improvements", .provider = "github", @@ -33,7 +34,7 @@ test "Atom feed validates against W3C validator" { Release{ .repo_name = "example/test", .tag_name = "v1.2.3", - .published_at = "2024-12-18T12:30:00Z", + .published_at = try utils.parseReleaseTimestamp("2024-12-18T12:30:00Z"), .html_url = "https://github.com/example/test/releases/tag/v1.2.3", .description = "Bug fixes and performance improvements", .provider = "github", @@ -583,21 +584,13 @@ test "SourceHut commit date fetching" { try testing.expect(release.repo_name.len > 0); try testing.expect(release.tag_name.len > 0); try testing.expect(release.html_url.len > 0); - try testing.expect(release.published_at.len > 0); try testing.expectEqualStrings("sourcehut", release.provider); // Check if we got a real commit date vs epoch fallback - if (std.mem.eql(u8, release.published_at, "1970-01-01T00:00:00Z")) { + if (release.published_at == 0) { epoch_dates += 1; testPrint(" -> Using epoch fallback date\n", .{}); - } else { - valid_dates += 1; - testPrint(" -> Got real commit date\n", .{}); - - // Verify the date format looks reasonable (should be ISO 8601) - try testing.expect(release.published_at.len >= 19); // At least YYYY-MM-DDTHH:MM:SS - try testing.expect(std.mem.indexOf(u8, release.published_at, "T") != null); - } + } else valid_dates += 1; } testPrint("SourceHut commit date summary: {} valid dates, {} epoch fallbacks\n", .{ valid_dates, epoch_dates }); diff --git a/src/markdown.zig b/src/markdown.zig index c24bc96..6c11a29 100644 --- a/src/markdown.zig +++ b/src/markdown.zig @@ -153,6 +153,13 @@ pub fn convertMarkdownToHtml(allocator: Allocator, markdown: []const u8) !Conver list_type = null; } + // Check if this is a safe HTML line that can be passed through + if (isSafeHtmlLine(trimmed)) { + try result.appendSlice(trimmed); + try result.appendSlice("\n"); + continue; + } + // Check for complex markdown patterns that we don't handle if (hasComplexMarkdown(trimmed)) { has_fallback = true; @@ -373,8 +380,122 @@ fn hasComplexMarkdown(text: []const u8) bool { // Horizontal rules if (std.mem.eql(u8, text, "---") or std.mem.eql(u8, text, "***")) return true; - // HTML tags (already HTML, might be complex) - if (std.mem.indexOf(u8, text, "<") != null and std.mem.indexOf(u8, text, ">") != null) return true; + // Only treat as complex HTML if it contains potentially dangerous tags + if (containsDangerousHtml(text)) return true; + + return false; +} + +/// Check if a line contains only safe HTML that can be passed through as-is +fn isSafeHtmlLine(text: []const u8) bool { + // If no HTML tags, not an HTML line + if (std.mem.indexOf(u8, text, "<") == null or std.mem.indexOf(u8, text, ">") == null) { + return false; + } + + // List of safe HTML patterns that can be passed through exactly + const safe_exact_patterns = [_][]const u8{ + "
", + "
", + "", + "", + "
", + "
", + "
", + }; + + // Check if the line exactly matches a safe pattern (ignoring whitespace) + const trimmed_text = std.mem.trim(u8, text, " \t"); + for (safe_exact_patterns) |pattern| { + if (std.mem.eql(u8, trimmed_text, pattern)) { + return true; + } + } + + // Check for safe HTML with content (like text) + if (isSafeHtmlWithContent(trimmed_text)) { + return true; + } + + return false; +} + +/// Check if text is safe HTML that contains content (like text) +fn isSafeHtmlWithContent(text: []const u8) bool { + // Safe tags that can contain content + const safe_content_tags = [_][]const u8{ + "summary", + "code", + "em", + "strong", + "b", + "i", + }; + + // Check if it's a simple pattern like content + if (text.len < 7) return false; // Minimum: x + + if (text[0] != '<') return false; + + // Find the end of the opening tag + var tag_end: usize = 1; + while (tag_end < text.len and text[tag_end] != '>') { + tag_end += 1; + } + if (tag_end >= text.len) return false; + + const tag_name = text[1..tag_end]; + + // Check if this is a safe tag + var is_safe_tag = false; + for (safe_content_tags) |safe_tag| { + if (std.mem.eql(u8, tag_name, safe_tag)) { + is_safe_tag = true; + break; + } + } + + if (!is_safe_tag) return false; + + // Check if it ends with the corresponding closing tag + const expected_closing = std.fmt.allocPrint(std.heap.page_allocator, "", .{tag_name}) catch return false; + defer std.heap.page_allocator.free(expected_closing); + + return std.mem.endsWith(u8, text, expected_closing); +} + +/// Check if text contains HTML that should be treated as complex/dangerous +fn containsDangerousHtml(text: []const u8) bool { + // If no HTML tags, it's safe + if (std.mem.indexOf(u8, text, "<") == null or std.mem.indexOf(u8, text, ">") == null) { + return false; + } + + // Dangerous patterns that should trigger fallback (case-insensitive check) + const dangerous_patterns = [_][]const u8{ "script", "iframe", "object", "embed", "form", "input", "button", "select", "textarea", "style", "link", "meta" }; + + // Simple case-insensitive check for dangerous patterns + var i: usize = 0; + while (i < text.len) { + if (text[i] == '<') { + // Extract tag name + i += 1; + const tag_start = i; + while (i < text.len and text[i] != ' ' and text[i] != '>' and text[i] != '/') { + i += 1; + } + if (i > tag_start) { + const tag_name = text[tag_start..i]; + for (dangerous_patterns) |dangerous| { + if (std.ascii.eqlIgnoreCase(tag_name, dangerous)) { + return true; + } + } + } + } else { + i += 1; + } + } return false; } @@ -612,3 +733,29 @@ test "html escaping" { std.debug.print("HTML escaping test - Input: {s}\nOutput: {s}\n", .{ markdown, result.html }); } } + +test "safe HTML passthrough" { + const allocator = testing.allocator; + + // Test details/summary tags + const markdown1 = "
\nClick to expand\nContent here\n
"; + const result1 = try convertMarkdownToHtml(allocator, markdown1); + defer result1.deinit(allocator); + + try testing.expect(std.mem.indexOf(u8, result1.html, "
") != null); + try testing.expect(std.mem.indexOf(u8, result1.html, "") != null); + try testing.expect(std.mem.indexOf(u8, result1.html, "
") != null); + try testing.expect(!result1.has_fallback); + + // Test that dangerous HTML still triggers fallback + const markdown2 = ""; + const result2 = try convertMarkdownToHtml(allocator, markdown2); + defer result2.deinit(allocator); + + try testing.expect(result2.has_fallback); + try testing.expect(std.mem.indexOf(u8, result2.html, "
") != null);
+
+    if (std.process.hasEnvVar(allocator, "test-debug") catch false) {
+        std.debug.print("Safe HTML test - Input: {s}\nOutput: {s}\nHas fallback: {}\n", .{ markdown1, result1.html, result1.has_fallback });
+    }
+}