From a0592641506a9304fd7d12a0758df652c22b08d2 Mon Sep 17 00:00:00 2001 From: Emil Lerch Date: Tue, 15 Jul 2025 15:42:30 -0700 Subject: [PATCH] better output through markdown->html (then proper escaping to atom feed) --- .pre-commit-config.yaml | 2 +- src/atom.zig | 212 +++++++++++++++- src/markdown.zig | 510 +++++++++++++++++++++++++++++++++++++++ src/xml_parser.zig | 24 +- src/xml_parser_tests.zig | 11 +- 5 files changed, 740 insertions(+), 19 deletions(-) create mode 100644 src/markdown.zig diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index afa267a..cf53580 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,7 +15,7 @@ repos: - id: zig-build - repo: local hooks: - - id: zlint + - id: test name: Run zig build test entry: zig args: ["build", "--verbose", "test"] diff --git a/src/atom.zig b/src/atom.zig index f5f33f2..ef836c5 100644 --- a/src/atom.zig +++ b/src/atom.zig @@ -4,17 +4,156 @@ const ArrayList = std.ArrayList; const zeit = @import("zeit"); const Release = @import("main.zig").Release; +const markdown = @import("markdown.zig"); fn escapeXml(writer: anytype, input: []const u8) !void { - for (input) |char| { + var i: usize = 0; + var open_spans: u8 = 0; // Track number of open spans + + while (i < input.len) { + const char = input[i]; + + // Handle ANSI escape sequences + if (char == 0x1B and i + 1 < input.len and input[i + 1] == '[') { + // Found ANSI escape sequence, convert to HTML + i += 2; // Skip ESC and [ + const code_start = i; + + // Find the end of the ANSI sequence + while (i < input.len) { + const c = input[i]; + i += 1; + // ANSI sequences end with a letter (A-Z, a-z) + if ((c >= 'A' and c <= 'Z') or (c >= 'a' and c <= 'z')) { + // Extract the numeric codes + const codes = input[code_start .. i - 1]; + try convertAnsiToHtml(writer, codes, c, &open_spans); + break; + } + } + continue; + } + switch (char) { '<' => try writer.writeAll("<"), '>' => try writer.writeAll(">"), '&' => try writer.writeAll("&"), '"' => try writer.writeAll("""), '\'' => try writer.writeAll("'"), - else => try writer.writeByte(char), + // Valid XML characters: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] + 0x09, 0x0A, 0x0D => try writer.writeByte(char), // Tab, LF, CR + else => { + if (char >= 0x20 and char <= 0x7F) { + // Printable ASCII (excluding already handled special chars) + try writer.writeByte(char); + } else if (char >= 0x80) { + // Extended ASCII (will be handled as UTF-8) + try writer.writeByte(char); + } else if (char < 0x20) { + // Other control characters - replace with space to preserve spacing + try writer.writeByte(' '); + } else { + // else skip completely invalid characters + const start = if (i < 10) 0 else i - 10; + std.log.warn("invalid character 0x{x} encountered, skipping. Previous {} chars: {s}", .{ char, i - start, input[start..i] }); + } + }, } + i += 1; + } + + // Close any remaining open spans + while (open_spans > 0) { + try writer.writeAll(""); + open_spans -= 1; + } +} + +fn convertAnsiToHtml(writer: anytype, codes: []const u8, end_char: u8, open_spans: *u8) !void { + // Only handle SGR (Select Graphic Rendition) sequences that end with 'm' + if (end_char != 'm') { + return; // Skip non-color sequences + } + + // Parse semicolon-separated codes + var code_iter = std.mem.splitScalar(u8, codes, ';'); + var has_styles = false; + + // Use a fixed buffer for styles to avoid allocation + var styles_buf: [256]u8 = undefined; + var styles_len: usize = 0; + + while (code_iter.next()) |code_str| { + const code = std.fmt.parseInt(u8, std.mem.trim(u8, code_str, " "), 10) catch continue; + + switch (code) { + 0 => { + // Reset - close all open spans + while (open_spans.* > 0) { + try writer.writeAll(""); + open_spans.* -= 1; + } + return; + }, + 1 => { + // Bold + const style = if (has_styles) ";font-weight:bold" else "font-weight:bold"; + if (styles_len + style.len < styles_buf.len) { + @memcpy(styles_buf[styles_len .. styles_len + style.len], style); + styles_len += style.len; + has_styles = true; + } + }, + 22 => { + // Normal intensity (turn off bold) - close current span and open new one without bold + if (open_spans.* > 0) { + try writer.writeAll(""); + open_spans.* -= 1; + } + // Don't add font-weight:normal as a new style, just close the bold span + return; + }, + 30 => try appendColorToBuffer(&styles_buf, &styles_len, &has_styles, "color:#000000"), // Black + 31 => try appendColorToBuffer(&styles_buf, &styles_len, &has_styles, "color:#800000"), // Red + 32 => try appendColorToBuffer(&styles_buf, &styles_len, &has_styles, "color:#008000"), // Green + 33 => try appendColorToBuffer(&styles_buf, &styles_len, &has_styles, "color:#808000"), // Yellow + 34 => try appendColorToBuffer(&styles_buf, &styles_len, &has_styles, "color:#000080"), // Blue + 35 => try appendColorToBuffer(&styles_buf, &styles_len, &has_styles, "color:#800080"), // Magenta + 36 => try appendColorToBuffer(&styles_buf, &styles_len, &has_styles, "color:#008080"), // Cyan + 37 => try appendColorToBuffer(&styles_buf, &styles_len, &has_styles, "color:#c0c0c0"), // White + 39 => try appendColorToBuffer(&styles_buf, &styles_len, &has_styles, "color:inherit"), // Default foreground + 90 => try appendColorToBuffer(&styles_buf, &styles_len, &has_styles, "color:#808080"), // Bright Black (Gray) + 91 => try appendColorToBuffer(&styles_buf, &styles_len, &has_styles, "color:#ff0000"), // Bright Red + 92 => try appendColorToBuffer(&styles_buf, &styles_len, &has_styles, "color:#00ff00"), // Bright Green + 93 => try appendColorToBuffer(&styles_buf, &styles_len, &has_styles, "color:#ffff00"), // Bright Yellow + 94 => try appendColorToBuffer(&styles_buf, &styles_len, &has_styles, "color:#0000ff"), // Bright Blue + 95 => try appendColorToBuffer(&styles_buf, &styles_len, &has_styles, "color:#ff00ff"), // Bright Magenta + 96 => try appendColorToBuffer(&styles_buf, &styles_len, &has_styles, "color:#00ffff"), // Bright Cyan + 97 => try appendColorToBuffer(&styles_buf, &styles_len, &has_styles, "color:#ffffff"), // Bright White + else => {}, // Ignore unknown codes + } + } + + if (has_styles) { + try writer.writeAll(""); + open_spans.* += 1; + } +} + +fn appendColorToBuffer(styles_buf: *[256]u8, styles_len: *usize, has_styles: *bool, color: []const u8) !void { + const prefix = if (has_styles.*) ";" else ""; + const total_len = prefix.len + color.len; + + if (styles_len.* + total_len < styles_buf.len) { + if (prefix.len > 0) { + @memcpy(styles_buf[styles_len.* .. styles_len.* + prefix.len], prefix); + styles_len.* += prefix.len; + } + @memcpy(styles_buf[styles_len.* .. styles_len.* + color.len], color); + styles_len.* += color.len; + has_styles.* = true; } } @@ -69,9 +208,19 @@ pub fn generateFeed(allocator: Allocator, releases: []const Release) ![]u8 { try escapeXml(writer, release.provider); try writer.writeAll("\n"); - try writer.writeAll(" "); - try escapeXml(writer, release.description); - try writer.writeAll("\n"); + // Convert markdown to HTML + const conversion_result = try markdown.convertMarkdownToHtml(allocator, release.description); + defer conversion_result.deinit(allocator); + + // Add content with proper type attribute and XML-escaped HTML + try writer.writeAll(" "); + try escapeXml(writer, conversion_result.html); + try writer.writeAll("\n"); + + // Add fallback metadata if markdown conversion used fallback + if (conversion_result.has_fallback) { + try writer.writeAll(" \n"); + } try writer.writeAll(" ") != null); + try std.testing.expect(std.mem.indexOf(u8, result, "") != null); + try std.testing.expect(std.mem.indexOf(u8, result, "colored") != null); +} + test "XML escaping" { const allocator = std.testing.allocator; @@ -101,7 +269,7 @@ test "XML escaping" { try std.testing.expectEqualStrings(expected, result); } -test "Atom feed generation" { +test "Atom feed generation with markdown" { const allocator = std.testing.allocator; const releases = [_]Release{ @@ -110,7 +278,7 @@ test "Atom feed generation" { .tag_name = "v1.0.0", .published_at = "2024-01-01T00:00:00Z", .html_url = "https://github.com/test/repo/releases/tag/v1.0.0", - .description = "Test release", + .description = "## What's Changed\n* Fixed bug\n* Added feature", .provider = "github", }, }; @@ -121,6 +289,31 @@ test "Atom feed generation" { try std.testing.expect(std.mem.indexOf(u8, atom_content, "test/repo") != null); try std.testing.expect(std.mem.indexOf(u8, atom_content, "v1.0.0") != null); try std.testing.expect(std.mem.indexOf(u8, atom_content, "") != null); + try std.testing.expect(std.mem.indexOf(u8, atom_content, "") != null); + try std.testing.expect(std.mem.indexOf(u8, atom_content, "<h2>What&apos;s Changed</h2>") != null); + try std.testing.expect(std.mem.indexOf(u8, atom_content, "<ul>") != null); +} + +test "Atom feed with fallback markdown" { + const allocator = std.testing.allocator; + + const releases = [_]Release{ + Release{ + .repo_name = "test/repo", + .tag_name = "v1.0.0", + .published_at = "2024-01-01T00:00:00Z", + .html_url = "https://github.com/test/repo/releases/tag/v1.0.0", + .description = "```javascript\nconst x = 1;\n```", + .provider = "github", + }, + }; + + const atom_content = try generateFeed(allocator, &releases); + defer allocator.free(atom_content); + + // Should contain fallback metadata + try std.testing.expect(std.mem.indexOf(u8, atom_content, "markdown-fallback") != null); + try std.testing.expect(std.mem.indexOf(u8, atom_content, "<pre>") != null); } test "Atom feed with special characters" { @@ -140,13 +333,10 @@ test "Atom feed with special characters" { const atom_content = try generateFeed(allocator, &releases); defer allocator.free(atom_content); - // Verify special characters are properly escaped + // Verify special characters are properly escaped in title try std.testing.expect(std.mem.indexOf(u8, atom_content, "<script>") != null); try std.testing.expect(std.mem.indexOf(u8, atom_content, "& more") != null); - try std.testing.expect(std.mem.indexOf(u8, atom_content, ""release"") != null); - try std.testing.expect(std.mem.indexOf(u8, atom_content, "<special>") != null); // Verify raw special characters are not present try std.testing.expect(std.mem.indexOf(u8, atom_content, " & \"quotes\""; + const result = try convertMarkdownToHtml(allocator, markdown); + defer result.deinit(allocator); + + const expected = "

Test <script>alert('xss')</script> & "quotes"

\n"; + try testing.expectEqualStrings(expected, result.html); + try testing.expect(!result.has_fallback); + + if (std.process.hasEnvVar(allocator, "test-debug") catch false) { + std.debug.print("HTML escaping test - Input: {s}\nOutput: {s}\n", .{ markdown, result.html }); + } +} diff --git a/src/xml_parser.zig b/src/xml_parser.zig index b921bf1..1155d58 100644 --- a/src/xml_parser.zig +++ b/src/xml_parser.zig @@ -86,8 +86,11 @@ fn parseEntry(allocator: Allocator, entry_xml: []const u8) !Release { release.published_at = updated; } - // Parse summary (description) - if (extractTagContent(entry_xml, "summary", allocator)) |summary| { + // Parse content (description) - try content first, then fall back to summary + if (extractTagContent(entry_xml, "content", allocator)) |content| { + allocator.free(release.description); + release.description = content; + } else if (extractTagContent(entry_xml, "summary", allocator)) |summary| { allocator.free(release.description); release.description = summary; } @@ -115,6 +118,23 @@ fn extractTagContent(xml: []const u8, tag_name: []const u8, allocator: Allocator return unescapeXml(allocator, content) catch null; } } + + // Also try with attributes (e.g., ) + const open_tag_with_attrs = std.fmt.allocPrint(allocator, "<{s} ", .{tag_name}) catch return null; + defer allocator.free(open_tag_with_attrs); + + if (std.mem.indexOf(u8, xml, open_tag_with_attrs)) |start_pos| { + // Find the end of the opening tag + if (std.mem.indexOf(u8, xml[start_pos..], ">")) |tag_end_offset| { + const content_start = start_pos + tag_end_offset + 1; + if (std.mem.indexOf(u8, xml[content_start..], close_tag)) |end_offset| { + const content_end = content_start + end_offset; + const content = xml[content_start..content_end]; + return unescapeXml(allocator, content) catch null; + } + } + } + return null; } diff --git a/src/xml_parser_tests.zig b/src/xml_parser_tests.zig index 971a4b9..cc1bad8 100644 --- a/src/xml_parser_tests.zig +++ b/src/xml_parser_tests.zig @@ -48,14 +48,14 @@ test "round trip: generate atom feed and parse it back" { try testing.expectEqualStrings("v1.0.0", parsed_releases.items[0].tag_name); try testing.expectEqualStrings("2024-01-01T00:00:00Z", parsed_releases.items[0].published_at); try testing.expectEqualStrings("https://github.com/test/repo1/releases/tag/v1.0.0", parsed_releases.items[0].html_url); - try testing.expectEqualStrings("First release", parsed_releases.items[0].description); + try testing.expectEqualStrings("

First release

\n", parsed_releases.items[0].description); try testing.expectEqualStrings("github", parsed_releases.items[0].provider); try testing.expectEqualStrings("test/repo2", parsed_releases.items[1].repo_name); try testing.expectEqualStrings("v2.0.0", parsed_releases.items[1].tag_name); try testing.expectEqualStrings("2024-01-02T00:00:00Z", parsed_releases.items[1].published_at); try testing.expectEqualStrings("https://github.com/test/repo2/releases/tag/v2.0.0", parsed_releases.items[1].html_url); - try testing.expectEqualStrings("Second release", parsed_releases.items[1].description); + try testing.expectEqualStrings("

Second release

\n", parsed_releases.items[1].description); try testing.expectEqualStrings("github", parsed_releases.items[1].provider); } @@ -78,10 +78,11 @@ test "parse atom feed with special characters" { const atom_content = try atom.generateFeed(allocator, &original_releases); defer allocator.free(atom_content); - // Verify the XML contains escaped characters + // Verify the XML contains escaped characters in the title (not in content) try testing.expect(std.mem.indexOf(u8, atom_content, "<script>") != null); try testing.expect(std.mem.indexOf(u8, atom_content, "& more") != null); - try testing.expect(std.mem.indexOf(u8, atom_content, ""release"") != null); + // The content will be XML-escaped HTML, so quotes in HTML will be &quot; + try testing.expect(std.mem.indexOf(u8, atom_content, "&quot;release&quot;") != null); // Parse it back (this should unescape the characters) var parsed_releases = try xml_parser.parseAtomFeed(allocator, atom_content); @@ -96,7 +97,7 @@ test "parse atom feed with special characters" { try testing.expectEqual(@as(usize, 1), parsed_releases.items.len); try testing.expectEqualStrings("test/repo