Compare commits

...

5 Commits

SHA1        Message                                      Date
ca801799fc  update README with new XML information       2022-02-10 15:29:51 -08:00
            (all checks successful: continuous-integration/drone/push build passing)
f374df3fa1  update generated models for xml mappings     2022-02-10 15:13:44 -08:00
744d834cfd  add predicate option to xml parser           2022-02-10 15:07:50 -08:00
c9369504fa  re-import xml.zig from Vulkan project        2022-02-10 10:20:00 -08:00
4606205b82  update readme for prioritization plan        2022-02-10 09:45:18 -08:00
6 changed files with 717 additions and 40 deletions

View File

@@ -2,13 +2,11 @@
[![Build Status](https://drone.lerch.org/api/badges/lobo/aws-sdk-for-zig/status.svg?ref=refs/heads/master)](https://drone.lerch.org/api/badges/lobo/aws-sdk-for-zig/)
### NOTE: All tests pass, but credentials currently must be passed through the environment
This SDK currently supports all AWS services except EC2 and S3. These two
services only support XML, and zig 0.8.0 and master both trigger compile
errors while incorporating the XML parser. S3 also requires some plumbing
tweaks in the signature calculation. Examples of usage are in src/main.zig.
services only support XML, and zig 0.9.0 and master both trigger compile
errors while incorporating the XML parser in conjunction with the process
that fills in the types. S3 also requires some plumbing tweaks in the signature
calculation. Examples of usage are in src/main.zig.
Current executable size for the demo is 953k (90k of which is the AWS PEM file)
after compiling with -Drelease-safe and
@@ -43,37 +41,33 @@ supersede all other configuration. Note that an alternative endpoint may
require passing in a client option to specify an different TLS root certificate
(pass null to disable certificate verification).
Given that credential handling is still very basic, you may want to look at
the [old branch](https://github.com/elerch/aws-sdk-for-zig/tree/aws-crt) if
your needs include something more robust. Note that that branch supports
x86_64 linux only.
The [old branch](https://github.com/elerch/aws-sdk-for-zig/tree/aws-crt) exists
for posterity, and supports x86_64 linux. The current branch is recommended
moving forward.
## Limitations
There are many nuances of AWS V4 signature calculation. S3 is not supported
because it uses many of these test cases. STS tokens using a session token
are not yet implemented, though should be trivial. I have also seen a few
service errors caused by discrepancies in signatures, though I don't know yet
if this was an issue in the service itself (has not repro'd) or if there
is a latent bug.
Only environment variable based credentials can be used at the moment.
because it uses many of these test cases. WebIdentityToken is not yet
implemented.
TODO List:
* Add option to cache signature keys
* Implement credentials provider
* Implement jitter/exponential backoff
* Implement timeouts and other TODO's in the code
* To work around compiler issues, the best option may be to convert from
XML to JSON, then parse from there. This will be pursued first. It may need
to wait for zig 0.10.0, when the self-hosted compiler is likely to be completed
(zig 0.10.0 eta May 2022). If we need to wait, S3, EC2 and other restXml
protocols will be blocked. A minimal sketch of the parsing half of this idea
appears after this list.
* Implement [AWS restXml protocol](https://awslabs.github.io/smithy/1.0/spec/aws/aws-restxml-protocol.html).
Includes S3. Total service count 4. This may be blocked due to the same issue as EC2.
* Implement [AWS EC2 query protocol](https://awslabs.github.io/smithy/1.0/spec/aws/aws-ec2-query-protocol.html).
Includes EC2. Total service count 1. This may be blocked on a compiler bug,
though it has not been tested with zig 0.9.0. It may need to wait for zig 0.10.0,
when the self-hosted compiler is likely to be completed (zig 0.10.0 eta May 2022).
More details and an LLVM IR log can be found in the
though it has not been tested with zig 0.9.0. More details and an LLVM IR log can be found in the
[XML branch](https://git.lerch.org/lobo/aws-sdk-for-zig/src/branch/xml).
* Implement sigv4a signing
* Implement jitter/exponential backoff
* Implement timeouts and other TODO's in the code
* Add option to cache signature keys
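
As a concrete taste of the conversion idea above, here is a minimal sketch of
the parsing half, assuming a file that sits next to the xml.zig added in this
change. Only a leaf element wrapping character data is handled; nested
elements, attributes, and JSON escaping are out of scope, and the leafToJson
helper is hypothetical, not part of this changeset.

const std = @import("std");
const xml = @import("xml.zig");

// Convert one leaf element (an element whose only child is character data)
// into a single-pair JSON object.
fn leafToJson(element: *xml.Element, writer: anytype) !void {
    if (element.children.items.len != 1) return error.NotALeaf;
    const char_data = switch (element.children.items[0]) {
        .CharData => |cd| cd,
        else => return error.NotALeaf,
    };
    try writer.print("{{\"{s}\":\"{s}\"}}", .{ element.tag, char_data });
}

test "leaf element to json" {
    const doc = try xml.parse(std.testing.allocator, "<Name>my-bucket</Name>");
    defer doc.deinit();
    var buf = std.ArrayList(u8).init(std.testing.allocator);
    defer buf.deinit();
    try leafToJson(doc.root, buf.writer());
    try std.testing.expectEqualStrings("{\"Name\":\"my-bucket\"}", buf.items);
}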
Compiler wishlist/watchlist:

View File

@@ -544,12 +544,12 @@ fn generateSimpleTypeFor(_: anytype, type_name: []const u8, writer: anytype) !vo
}
fn generateComplexTypeFor(shape_id: []const u8, members: []smithy.TypeMember, type_type_name: []const u8, writer: anytype, state: GenerationState) anyerror!void {
_ = shape_id;
const Mapping = struct { snake: []const u8, json: []const u8 };
var json_field_name_mappings = try std.ArrayList(Mapping).initCapacity(state.allocator, members.len);
const Mapping = struct { snake: []const u8, original: []const u8 };
var field_name_mappings = try std.ArrayList(Mapping).initCapacity(state.allocator, members.len);
defer {
for (json_field_name_mappings.items) |mapping|
for (field_name_mappings.items) |mapping|
state.allocator.free(mapping.snake);
json_field_name_mappings.deinit();
field_name_mappings.deinit();
}
// There is an httpQueryParams trait as well, but nobody is using it. API GW
// pretends to, but it's an empty map
@@ -591,15 +591,19 @@ fn generateComplexTypeFor(shape_id: []const u8, members: []smithy.TypeMember, ty
switch (trait) {
.json_name => {
found_name_trait = true;
json_field_name_mappings.appendAssumeCapacity(.{ .snake = try state.allocator.dupe(u8, snake_case_member), .json = trait.json_name });
field_name_mappings.appendAssumeCapacity(.{ .snake = try state.allocator.dupe(u8, snake_case_member), .original = trait.json_name });
},
.http_query => http_query_mappings.appendAssumeCapacity(.{ .snake = try state.allocator.dupe(u8, snake_case_member), .json = trait.http_query }),
.http_header => http_header_mappings.appendAssumeCapacity(.{ .snake = try state.allocator.dupe(u8, snake_case_member), .json = trait.http_header }),
.xml_name => {
found_name_trait = true;
field_name_mappings.appendAssumeCapacity(.{ .snake = try state.allocator.dupe(u8, snake_case_member), .original = trait.xml_name });
},
.http_query => http_query_mappings.appendAssumeCapacity(.{ .snake = try state.allocator.dupe(u8, snake_case_member), .original = trait.http_query }),
.http_header => http_header_mappings.appendAssumeCapacity(.{ .snake = try state.allocator.dupe(u8, snake_case_member), .original = trait.http_header }),
else => {},
}
}
if (!found_name_trait)
json_field_name_mappings.appendAssumeCapacity(.{ .snake = try state.allocator.dupe(u8, snake_case_member), .json = member.name });
field_name_mappings.appendAssumeCapacity(.{ .snake = try state.allocator.dupe(u8, snake_case_member), .original = member.name });
defer state.allocator.free(snake_case_member);
try outputIndent(child_state, writer);
const member_name = avoidReserved(snake_case_member);
@@ -637,11 +641,11 @@ fn generateComplexTypeFor(shape_id: []const u8, members: []smithy.TypeMember, ty
//
try writer.writeByte('\n');
try outputIndent(child_state, writer);
_ = try writer.write("pub fn jsonFieldNameFor(_: @This(), comptime field_name: []const u8) []const u8 {\n");
_ = try writer.write("pub fn fieldNameFor(_: @This(), comptime field_name: []const u8) []const u8 {\n");
var grandchild_state = child_state;
grandchild_state.indent_level += 1;
// We need to force output here because we're referencing the field in the return statement below
try writeMappings(grandchild_state, "", "mappings", json_field_name_mappings, true, writer);
try writeMappings(grandchild_state, "", "mappings", field_name_mappings, true, writer);
try outputIndent(grandchild_state, writer);
_ = try writer.write("return @field(mappings, field_name);\n");
try outputIndent(child_state, writer);
@@ -667,7 +671,7 @@ fn writeStringify(state: GenerationState, fields: [][]const u8, writer: anytype)
try outputIndent(child_state, writer);
try writer.print("if (std.mem.eql(u8, \"{s}\", field_name))\n", .{field});
try outputIndent(return_state, writer);
try writer.print("return try serializeMap(self.{s}, self.jsonFieldNameFor(\"{s}\"), options, out_stream);\n", .{ field, field });
try writer.print("return try serializeMap(self.{s}, self.fieldNameFor(\"{s}\"), options, out_stream);\n", .{ field, field });
}
try outputIndent(child_state, writer);
_ = try writer.write("return false;\n");
@@ -690,7 +694,7 @@ fn writeMappings(state: GenerationState, @"pub": []const u8, mapping_name: []con
child_state.indent_level += 1;
for (mappings.items) |mapping| {
try outputIndent(child_state, writer);
try writer.print(".{s} = \"{s}\",\n", .{ avoidReserved(mapping.snake), mapping.json });
try writer.print(".{s} = \"{s}\",\n", .{ avoidReserved(mapping.snake), mapping.original });
}
try outputIndent(state, writer);
_ = try writer.write("};\n");

View File

@@ -96,6 +96,7 @@ pub const TraitType = enum {
http_label,
http_query,
json_name,
xml_name,
required,
documentation,
pattern,
@@ -118,6 +119,7 @@ pub const Trait = union(TraitType) {
aws_protocol: AwsProtocol,
ec2_query_name: []const u8,
json_name: []const u8,
xml_name: []const u8,
http: struct {
method: []const u8,
uri: []const u8,
@@ -565,6 +567,8 @@ fn getTrait(trait_type: []const u8, value: std.json.Value) SmithyParseError!?Tra
}
if (std.mem.eql(u8, trait_type, "smithy.api#jsonName"))
return Trait{ .json_name = value.String };
if (std.mem.eql(u8, trait_type, "smithy.api#xmlName"))
return Trait{ .xml_name = value.String };
if (std.mem.eql(u8, trait_type, "smithy.api#httpQuery"))
return Trait{ .http_query = value.String };
if (std.mem.eql(u8, trait_type, "smithy.api#httpHeader"))
@@ -617,7 +621,6 @@ fn getTrait(trait_type: []const u8, value: std.json.Value) SmithyParseError!?Tra
\\smithy.api#timestampFormat
\\smithy.api#xmlAttribute
\\smithy.api#xmlFlattened
\\smithy.api#xmlName
\\smithy.waiters#waitable
;
var iterator = std.mem.split(u8, list, "\n");
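
A quick sketch of the new variant in use; the smithy import path is an
assumption, and the trait value is made up:

const std = @import("std");
const smithy = @import("smithy");

test "xml_name trait variant" {
    const trait = smithy.Trait{ .xml_name = "BucketName" };
    switch (trait) {
        .xml_name => |original| try std.testing.expectEqualStrings("BucketName", original),
        else => return error.UnexpectedTrait,
    }
}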

View File

@@ -432,7 +432,7 @@ fn buildPath(allocator: std.mem.Allocator, raw_uri: []const u8, comptime ActionR
in_var = false;
const replacement_var = raw_uri[start..inx];
inline for (std.meta.fields(ActionRequest)) |field| {
if (std.mem.eql(u8, request.jsonFieldNameFor(field.name), replacement_var)) {
if (std.mem.eql(u8, request.fieldNameFor(field.name), replacement_var)) {
var replacement_buffer = try std.ArrayList(u8).initCapacity(allocator, raw_uri.len);
defer replacement_buffer.deinit();
var encoded_buffer = try std.ArrayList(u8).initCapacity(allocator, raw_uri.len);
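
The fieldNameFor match above can be exercised in isolation. GetFunctionRequest
below is a hypothetical stand-in for a generated request type:

const std = @import("std");

const GetFunctionRequest = struct {
    function_name: []const u8,
    pub fn fieldNameFor(_: @This(), comptime field_name: []const u8) []const u8 {
        const mappings = .{ .function_name = "FunctionName" };
        return @field(mappings, field_name);
    }
};

test "uri template variable matches a renamed field" {
    const req = GetFunctionRequest{ .function_name = "my-fn" };
    // buildPath would have pulled this out of a template such as
    // "/2015-03-31/functions/{FunctionName}"
    const replacement_var = "FunctionName";
    var matched = false;
    inline for (std.meta.fields(GetFunctionRequest)) |field| {
        if (std.mem.eql(u8, req.fieldNameFor(field.name), replacement_var)) matched = true;
    }
    try std.testing.expect(matched);
}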

View File

@@ -2871,8 +2871,8 @@ pub fn stringify(
field_written = try value.jsonStringifyField(Field.name, child_options, out_stream);
if (!field_written) {
if (comptime std.meta.trait.hasFn("jsonFieldNameFor")(T)) {
const name = value.jsonFieldNameFor(Field.name);
if (comptime std.meta.trait.hasFn("fieldNameFor")(T)) {
const name = value.fieldNameFor(Field.name);
try stringify(name, options, out_stream);
} else {
try stringify(Field.name, options, out_stream);
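
The duck-typing check is std.meta.trait.hasFn, which any struct exposing the
helper satisfies. A minimal sketch; the Example struct is made up:

const std = @import("std");

const Example = struct {
    name: []const u8,
    pub fn fieldNameFor(_: @This(), comptime field_name: []const u8) []const u8 {
        const mappings = .{ .name = "Name" };
        return @field(mappings, field_name);
    }
};

test "stringify detects fieldNameFor via hasFn" {
    try std.testing.expect(comptime std.meta.trait.hasFn("fieldNameFor")(Example));
    try std.testing.expect(comptime !std.meta.trait.hasFn("jsonFieldNameFor")(Example));
}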

src/xml.zig (new file, 676 lines added)
View File

@@ -0,0 +1,676 @@
// File sourced from:
// https://github.com/Snektron/vulkan-zig/blob/797ae8af88e84753af9640266de61a985b76b580/generator/xml.zig
const std = @import("std");
const mem = std.mem;
const testing = std.testing;
const Allocator = mem.Allocator;
const ArenaAllocator = std.heap.ArenaAllocator;
const ArrayList = std.ArrayList;
pub const Attribute = struct {
name: []const u8,
value: []const u8,
};
pub const Content = union(enum) {
CharData: []const u8,
Comment: []const u8,
Element: *Element,
};
pub const Element = struct {
pub const AttributeList = ArrayList(*Attribute);
pub const ContentList = ArrayList(Content);
tag: []const u8,
attributes: AttributeList,
children: ContentList,
fn init(tag: []const u8, alloc: Allocator) Element {
return .{
.tag = tag,
.attributes = AttributeList.init(alloc),
.children = ContentList.init(alloc),
};
}
pub fn getAttribute(self: *Element, attrib_name: []const u8) ?[]const u8 {
for (self.attributes.items) |child| {
if (mem.eql(u8, child.name, attrib_name)) {
return child.value;
}
}
return null;
}
pub fn getCharData(self: *Element, child_tag: []const u8) ?[]const u8 {
// findChildByTag can fail (a custom predicate may error); treat failure as not found
const child = (self.findChildByTag(child_tag) catch return null) orelse return null;
if (child.children.items.len != 1) {
return null;
}
return switch (child.children.items[0]) {
.CharData => |char_data| char_data,
else => null,
};
}
pub fn iterator(self: *Element) ChildIterator {
return .{
.items = self.children.items,
.i = 0,
};
}
pub fn elements(self: *Element) ChildElementIterator {
return .{
.inner = self.iterator(),
};
}
pub fn findChildByTag(self: *Element, tag: []const u8) !?*Element {
// next() takes a pointer, so land the iterator in a local first
var it = self.findChildrenByTag(tag);
return try it.next();
}
pub fn findChildrenByTag(self: *Element, tag: []const u8) FindChildrenByTagIterator {
return .{
.inner = self.elements(),
.tag = tag,
};
}
pub const ChildIterator = struct {
items: []Content,
i: usize,
pub fn next(self: *ChildIterator) ?*Content {
if (self.i < self.items.len) {
self.i += 1;
return &self.items[self.i - 1];
}
return null;
}
};
pub const ChildElementIterator = struct {
inner: ChildIterator,
pub fn next(self: *ChildElementIterator) ?*Element {
while (self.inner.next()) |child| {
if (child.* != .Element) {
continue;
}
return child.*.Element;
}
return null;
}
};
fn strictEqual(a: []const u8, b: []const u8, _: PredicateOptions) !bool {
return mem.eql(u8, a, b);
}
pub const FindChildrenByTagIterator = struct {
inner: ChildElementIterator,
tag: []const u8,
predicate: fn (a: []const u8, b: []const u8, options: PredicateOptions) anyerror!bool = strictEqual,
predicate_options: PredicateOptions = .{},
pub fn next(self: *FindChildrenByTagIterator) !?*Element {
while (self.inner.next()) |child| {
if (!try self.predicate(child.tag, self.tag, self.predicate_options)) {
continue;
}
return child;
}
return null;
}
};
};
pub const PredicateOptions = struct {
allocator: ?std.mem.Allocator = null,
};
pub const XmlDecl = struct {
version: []const u8,
encoding: ?[]const u8,
standalone: ?bool,
};
pub const Document = struct {
arena: ArenaAllocator,
xml_decl: ?*XmlDecl,
root: *Element,
pub fn deinit(self: Document) void {
var arena = self.arena; // Copy to stack so self can be taken by value.
arena.deinit();
}
};
const ParseContext = struct {
source: []const u8,
offset: usize,
line: usize,
column: usize,
fn init(source: []const u8) ParseContext {
return .{
.source = source,
.offset = 0,
.line = 0,
.column = 0,
};
}
fn peek(self: *ParseContext) ?u8 {
return if (self.offset < self.source.len) self.source[self.offset] else null;
}
fn consume(self: *ParseContext) !u8 {
if (self.offset < self.source.len) {
return self.consumeNoEof();
}
return error.UnexpectedEof;
}
fn consumeNoEof(self: *ParseContext) u8 {
std.debug.assert(self.offset < self.source.len);
const c = self.source[self.offset];
self.offset += 1;
if (c == '\n') {
self.line += 1;
self.column = 0;
} else {
self.column += 1;
}
return c;
}
fn eat(self: *ParseContext, char: u8) bool {
self.expect(char) catch return false;
return true;
}
fn expect(self: *ParseContext, expected: u8) !void {
if (self.peek()) |actual| {
if (expected != actual) {
return error.UnexpectedCharacter;
}
_ = self.consumeNoEof();
return;
}
return error.UnexpectedEof;
}
fn eatStr(self: *ParseContext, text: []const u8) bool {
self.expectStr(text) catch return false;
return true;
}
fn expectStr(self: *ParseContext, text: []const u8) !void {
if (self.source.len < self.offset + text.len) {
return error.UnexpectedEof;
} else if (std.mem.startsWith(u8, self.source[self.offset..], text)) {
var i: usize = 0;
while (i < text.len) : (i += 1) {
_ = self.consumeNoEof();
}
return;
}
return error.UnexpectedCharacter;
}
fn eatWs(self: *ParseContext) bool {
var ws = false;
while (self.peek()) |ch| {
switch (ch) {
' ', '\t', '\n', '\r' => {
ws = true;
_ = self.consumeNoEof();
},
else => break,
}
}
return ws;
}
fn expectWs(self: *ParseContext) !void {
if (!self.eatWs()) return error.UnexpectedCharacter;
}
fn currentLine(self: ParseContext) []const u8 {
var begin: usize = 0;
if (mem.lastIndexOfScalar(u8, self.source[0..self.offset], '\n')) |prev_nl| {
begin = prev_nl + 1;
}
var end = mem.indexOfScalarPos(u8, self.source, self.offset, '\n') orelse self.source.len;
return self.source[begin..end];
}
};
test "ParseContext" {
{
var ctx = ParseContext.init("I like pythons");
try testing.expectEqual(@as(?u8, 'I'), ctx.peek());
try testing.expectEqual(@as(u8, 'I'), ctx.consumeNoEof());
try testing.expectEqual(@as(?u8, ' '), ctx.peek());
try testing.expectEqual(@as(u8, ' '), try ctx.consume());
try testing.expect(ctx.eat('l'));
try testing.expectEqual(@as(?u8, 'i'), ctx.peek());
try testing.expectEqual(false, ctx.eat('a'));
try testing.expectEqual(@as(?u8, 'i'), ctx.peek());
try ctx.expect('i');
try testing.expectEqual(@as(?u8, 'k'), ctx.peek());
try testing.expectError(error.UnexpectedCharacter, ctx.expect('a'));
try testing.expectEqual(@as(?u8, 'k'), ctx.peek());
try testing.expect(ctx.eatStr("ke"));
try testing.expectEqual(@as(?u8, ' '), ctx.peek());
try testing.expect(ctx.eatWs());
try testing.expectEqual(@as(?u8, 'p'), ctx.peek());
try testing.expectEqual(false, ctx.eatWs());
try testing.expectEqual(@as(?u8, 'p'), ctx.peek());
try testing.expectEqual(false, ctx.eatStr("aaaaaaaaa"));
try testing.expectEqual(@as(?u8, 'p'), ctx.peek());
try testing.expectError(error.UnexpectedEof, ctx.expectStr("aaaaaaaaa"));
try testing.expectEqual(@as(?u8, 'p'), ctx.peek());
try testing.expectError(error.UnexpectedCharacter, ctx.expectStr("pytn"));
try testing.expectEqual(@as(?u8, 'p'), ctx.peek());
try ctx.expectStr("python");
try testing.expectEqual(@as(?u8, 's'), ctx.peek());
}
{
var ctx = ParseContext.init("");
try testing.expectEqual(ctx.peek(), null);
try testing.expectError(error.UnexpectedEof, ctx.consume());
try testing.expectEqual(ctx.eat('p'), false);
try testing.expectError(error.UnexpectedEof, ctx.expect('p'));
}
}
pub const ParseError = error{
IllegalCharacter,
UnexpectedEof,
UnexpectedCharacter,
UnclosedValue,
UnclosedComment,
InvalidName,
InvalidEntity,
InvalidStandaloneValue,
NonMatchingClosingTag,
InvalidDocument,
OutOfMemory,
};
pub fn parse(backing_allocator: Allocator, source: []const u8) !Document {
var ctx = ParseContext.init(source);
return try parseDocument(&ctx, backing_allocator);
}
fn parseDocument(ctx: *ParseContext, backing_allocator: Allocator) !Document {
var doc = Document{
.arena = ArenaAllocator.init(backing_allocator),
.xml_decl = null,
.root = undefined,
};
errdefer doc.deinit();
const allocator = doc.arena.allocator();
try trySkipComments(ctx, allocator);
doc.xml_decl = try tryParseProlog(ctx, allocator);
_ = ctx.eatWs();
try trySkipComments(ctx, allocator);
doc.root = (try tryParseElement(ctx, allocator)) orelse return error.InvalidDocument;
_ = ctx.eatWs();
try trySkipComments(ctx, allocator);
if (ctx.peek() != null) return error.InvalidDocument;
return doc;
}
fn parseAttrValue(ctx: *ParseContext, alloc: Allocator) ![]const u8 {
const quote = try ctx.consume();
if (quote != '"' and quote != '\'') return error.UnexpectedCharacter;
const begin = ctx.offset;
while (true) {
const c = ctx.consume() catch return error.UnclosedValue;
if (c == quote) break;
}
const end = ctx.offset - 1;
return try dupeAndUnescape(alloc, ctx.source[begin..end]);
}
fn parseEqAttrValue(ctx: *ParseContext, alloc: Allocator) ![]const u8 {
_ = ctx.eatWs();
try ctx.expect('=');
_ = ctx.eatWs();
return try parseAttrValue(ctx, alloc);
}
fn parseNameNoDupe(ctx: *ParseContext) ![]const u8 {
// XML's spec on names is very long, so to make this easier
// we just take any character that is not special and not whitespace
const begin = ctx.offset;
while (ctx.peek()) |ch| {
switch (ch) {
' ', '\t', '\n', '\r' => break,
'&', '"', '\'', '<', '>', '?', '=', '/' => break,
else => _ = ctx.consumeNoEof(),
}
}
const end = ctx.offset;
if (begin == end) return error.InvalidName;
return ctx.source[begin..end];
}
fn tryParseCharData(ctx: *ParseContext, alloc: Allocator) !?[]const u8 {
const begin = ctx.offset;
while (ctx.peek()) |ch| {
switch (ch) {
'<' => break,
else => _ = ctx.consumeNoEof(),
}
}
const end = ctx.offset;
if (begin == end) return null;
return try dupeAndUnescape(alloc, ctx.source[begin..end]);
}
fn parseContent(ctx: *ParseContext, alloc: Allocator) ParseError!Content {
if (try tryParseCharData(ctx, alloc)) |cd| {
return Content{ .CharData = cd };
} else if (try tryParseComment(ctx, alloc)) |comment| {
return Content{ .Comment = comment };
} else if (try tryParseElement(ctx, alloc)) |elem| {
return Content{ .Element = elem };
} else {
return error.UnexpectedCharacter;
}
}
fn tryParseAttr(ctx: *ParseContext, alloc: Allocator) !?*Attribute {
const name = parseNameNoDupe(ctx) catch return null;
_ = ctx.eatWs();
try ctx.expect('=');
_ = ctx.eatWs();
const value = try parseAttrValue(ctx, alloc);
const attr = try alloc.create(Attribute);
attr.name = try alloc.dupe(u8, name);
attr.value = value;
return attr;
}
fn tryParseElement(ctx: *ParseContext, alloc: Allocator) !?*Element {
const start = ctx.offset;
if (!ctx.eat('<')) return null;
const tag = parseNameNoDupe(ctx) catch {
ctx.offset = start;
return null;
};
const element = try alloc.create(Element);
element.* = Element.init(try alloc.dupe(u8, tag), alloc);
while (ctx.eatWs()) {
const attr = (try tryParseAttr(ctx, alloc)) orelse break;
try element.attributes.append(attr);
}
if (ctx.eatStr("/>")) {
return element;
}
try ctx.expect('>');
while (true) {
if (ctx.peek() == null) {
return error.UnexpectedEof;
} else if (ctx.eatStr("</")) {
break;
}
const content = try parseContent(ctx, alloc);
try element.children.append(content);
}
const closing_tag = try parseNameNoDupe(ctx);
if (!std.mem.eql(u8, tag, closing_tag)) {
return error.NonMatchingClosingTag;
}
_ = ctx.eatWs();
try ctx.expect('>');
return element;
}
test "tryParseElement" {
var arena = std.heap.ArenaAllocator.init(testing.allocator);
defer arena.deinit();
const alloc = arena.allocator();
{
var ctx = ParseContext.init("<= a='b'/>");
try testing.expectEqual(@as(?*Element, null), try tryParseElement(&ctx, alloc));
try testing.expectEqual(@as(?u8, '<'), ctx.peek());
}
{
var ctx = ParseContext.init("<python size='15' color = \"green\"/>");
const elem = try tryParseElement(&ctx, alloc);
try testing.expectEqualSlices(u8, elem.?.tag, "python");
const size_attr = elem.?.attributes.items[0];
try testing.expectEqualSlices(u8, size_attr.name, "size");
try testing.expectEqualSlices(u8, size_attr.value, "15");
const color_attr = elem.?.attributes.items[1];
try testing.expectEqualSlices(u8, color_attr.name, "color");
try testing.expectEqualSlices(u8, color_attr.value, "green");
}
{
var ctx = ParseContext.init("<python>test</python>");
const elem = try tryParseElement(&ctx, alloc);
try testing.expectEqualSlices(u8, elem.?.tag, "python");
try testing.expectEqualSlices(u8, elem.?.children.items[0].CharData, "test");
}
{
var ctx = ParseContext.init("<a>b<c/>d<e/>f<!--g--></a>");
const elem = try tryParseElement(&ctx, alloc);
try testing.expectEqualSlices(u8, elem.?.tag, "a");
try testing.expectEqualSlices(u8, elem.?.children.items[0].CharData, "b");
try testing.expectEqualSlices(u8, elem.?.children.items[1].Element.tag, "c");
try testing.expectEqualSlices(u8, elem.?.children.items[2].CharData, "d");
try testing.expectEqualSlices(u8, elem.?.children.items[3].Element.tag, "e");
try testing.expectEqualSlices(u8, elem.?.children.items[4].CharData, "f");
try testing.expectEqualSlices(u8, elem.?.children.items[5].Comment, "g");
}
}
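// The predicate field on FindChildrenByTagIterator, added in this change,
// lets callers replace strictEqual. A small sketch using a case-insensitive
// match; the eqlIgnoreCase wrapper is illustrative and not part of the
// upstream file.
fn eqlIgnoreCase(a: []const u8, b: []const u8, _: PredicateOptions) anyerror!bool {
    return std.ascii.eqlIgnoreCase(a, b);
}
test "findChildrenByTag with a custom predicate" {
    const doc = try parse(testing.allocator, "<a><B/><b/></a>");
    defer doc.deinit();
    var it = doc.root.findChildrenByTag("b");
    it.predicate = eqlIgnoreCase;
    try testing.expectEqualSlices(u8, "B", (try it.next()).?.tag);
    try testing.expectEqualSlices(u8, "b", (try it.next()).?.tag);
    try testing.expectEqual(@as(?*Element, null), try it.next());
}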
fn tryParseProlog(ctx: *ParseContext, alloc: Allocator) !?*XmlDecl {
const start = ctx.offset;
if (!ctx.eatStr("<?") or !mem.eql(u8, try parseNameNoDupe(ctx), "xml")) {
ctx.offset = start;
return null;
}
const decl = try alloc.create(XmlDecl);
decl.encoding = null;
decl.standalone = null;
// Version info is mandatory
try ctx.expectWs();
try ctx.expectStr("version");
decl.version = try parseEqAttrValue(ctx, alloc);
if (ctx.eatWs()) {
// Optional encoding and standalone info
var require_ws = false;
if (ctx.eatStr("encoding")) {
decl.encoding = try parseEqAttrValue(ctx, alloc);
require_ws = true;
}
if (require_ws == ctx.eatWs() and ctx.eatStr("standalone")) {
const standalone = try parseEqAttrValue(ctx, alloc);
if (std.mem.eql(u8, standalone, "yes")) {
decl.standalone = true;
} else if (std.mem.eql(u8, standalone, "no")) {
decl.standalone = false;
} else {
return error.InvalidStandaloneValue;
}
}
_ = ctx.eatWs();
}
try ctx.expectStr("?>");
return decl;
}
test "tryParseProlog" {
var arena = std.heap.ArenaAllocator.init(testing.allocator);
defer arena.deinit();
const alloc = arena.allocator();
{
var ctx = ParseContext.init("<?xmla version='aa'?>");
try testing.expectEqual(@as(?*XmlDecl, null), try tryParseProlog(&ctx, alloc));
try testing.expectEqual(@as(?u8, '<'), ctx.peek());
}
{
var ctx = ParseContext.init("<?xml version='aa'?>");
const decl = try tryParseProlog(&ctx, alloc);
try testing.expectEqualSlices(u8, "aa", decl.?.version);
try testing.expectEqual(@as(?[]const u8, null), decl.?.encoding);
try testing.expectEqual(@as(?bool, null), decl.?.standalone);
}
{
var ctx = ParseContext.init("<?xml version=\"aa\" encoding = 'bbb' standalone \t = 'yes'?>");
const decl = try tryParseProlog(&ctx, alloc);
try testing.expectEqualSlices(u8, "aa", decl.?.version);
try testing.expectEqualSlices(u8, "bbb", decl.?.encoding.?);
try testing.expectEqual(@as(?bool, true), decl.?.standalone.?);
}
}
fn trySkipComments(ctx: *ParseContext, alloc: Allocator) !void {
while (try tryParseComment(ctx, alloc)) |_| {
_ = ctx.eatWs();
}
}
fn tryParseComment(ctx: *ParseContext, alloc: Allocator) !?[]const u8 {
if (!ctx.eatStr("<!--")) return null;
const begin = ctx.offset;
while (!ctx.eatStr("-->")) {
_ = ctx.consume() catch return error.UnclosedComment;
}
const end = ctx.offset - "-->".len;
return try alloc.dupe(u8, ctx.source[begin..end]);
}
fn unescapeEntity(text: []const u8) !u8 {
const EntitySubstitution = struct { text: []const u8, replacement: u8 };
const entities = [_]EntitySubstitution{
.{ .text = "&lt;", .replacement = '<' },
.{ .text = "&gt;", .replacement = '>' },
.{ .text = "&amp;", .replacement = '&' },
.{ .text = "&apos;", .replacement = '\'' },
.{ .text = "&quot;", .replacement = '"' },
};
for (entities) |entity| {
if (std.mem.eql(u8, text, entity.text)) return entity.replacement;
}
return error.InvalidEntity;
}
fn dupeAndUnescape(alloc: Allocator, text: []const u8) ![]const u8 {
const str = try alloc.alloc(u8, text.len);
var j: usize = 0;
var i: usize = 0;
while (i < text.len) : (j += 1) {
if (text[i] == '&') {
const entity_end = 1 + (mem.indexOfScalarPos(u8, text, i, ';') orelse return error.InvalidEntity);
str[j] = try unescapeEntity(text[i..entity_end]);
i = entity_end;
} else {
str[j] = text[i];
i += 1;
}
}
return alloc.shrink(str, j);
}
test "dupeAndUnescape" {
var arena = std.heap.ArenaAllocator.init(testing.allocator);
defer arena.deinit();
const alloc = arena.allocator();
try testing.expectEqualSlices(u8, "test", try dupeAndUnescape(alloc, "test"));
try testing.expectEqualSlices(u8, "a<b&c>d\"e'f<", try dupeAndUnescape(alloc, "a&lt;b&amp;c&gt;d&quot;e&apos;f&lt;"));
try testing.expectError(error.InvalidEntity, dupeAndUnescape(alloc, "python&"));
try testing.expectError(error.InvalidEntity, dupeAndUnescape(alloc, "python&&"));
try testing.expectError(error.InvalidEntity, dupeAndUnescape(alloc, "python&test;"));
try testing.expectError(error.InvalidEntity, dupeAndUnescape(alloc, "python&boa"));
}
test "Top level comments" {
var arena = std.heap.ArenaAllocator.init(testing.allocator);
defer arena.deinit();
const alloc = arena.allocator();
const doc = try parse(alloc, "<?xml version='aa'?><!--comment--><python color='green'/><!--another comment-->");
try testing.expectEqualSlices(u8, "python", doc.root.tag);
}
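// A small end-to-end sketch of getCharData; this test is an addition for
// illustration, not part of the upstream file.
test "getCharData" {
    const doc = try parse(testing.allocator, "<python><size>15</size></python>");
    defer doc.deinit();
    try testing.expectEqualSlices(u8, "15", doc.root.getCharData("size").?);
}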