From 0727fb5585e08d63c13740ba2e5c3e25269fe2eb Mon Sep 17 00:00:00 2001 From: Emil Lerch Date: Thu, 13 May 2021 15:55:06 -0700 Subject: [PATCH] remove xml.zig. This may be back later, but currently is unused --- src/xml.zig | 649 ---------------------------------------------------- 1 file changed, 649 deletions(-) delete mode 100644 src/xml.zig diff --git a/src/xml.zig b/src/xml.zig deleted file mode 100644 index 3fa9d16..0000000 --- a/src/xml.zig +++ /dev/null @@ -1,649 +0,0 @@ -const std = @import("std"); -const mem = std.mem; -const testing = std.testing; -const Allocator = mem.Allocator; -const ArenaAllocator = std.heap.ArenaAllocator; -const ArrayList = std.ArrayList; - -pub const Attribute = struct { - name: []const u8, value: []const u8 -}; - -pub const Content = union(enum) { - CharData: []const u8, Comment: []const u8, Element: *Element -}; - -pub const Element = struct { - pub const AttributeList = ArrayList(*Attribute); - pub const ContentList = ArrayList(Content); - - tag: []const u8, - attributes: AttributeList, - children: ContentList, - - fn init(tag: []const u8, alloc: *Allocator) Element { - return .{ - .tag = tag, - .attributes = AttributeList.init(alloc), - .children = ContentList.init(alloc), - }; - } - - pub fn getAttribute(self: *Element, attrib_name: []const u8) ?[]const u8 { - for (self.attributes.items) |child| { - if (mem.eql(u8, child.name, attrib_name)) { - return child.value; - } - } - - return null; - } - - pub fn getCharData(self: *Element, child_tag: []const u8) ?[]const u8 { - const child = self.findChildByTag(child_tag) orelse return null; - if (child.children.items.len != 1) { - return null; - } - - return switch (child.children.items[0]) { - .CharData => |char_data| char_data, - else => null, - }; - } - - pub fn iterator(self: *Element) ChildIterator { - return .{ - .items = self.children.items, - .i = 0, - }; - } - - pub fn elements(self: *Element) ChildElementIterator { - return .{ - .inner = self.iterator(), - }; - } - - pub fn findChildByTag(self: *Element, tag: []const u8) ?*Element { - return self.findChildrenByTag(tag).next(); - } - - pub fn findChildrenByTag(self: *Element, tag: []const u8) FindChildrenByTagIterator { - return .{ - .inner = self.elements(), - .tag = tag, - }; - } - - pub const ChildIterator = struct { - items: []Content, - i: usize, - - pub fn next(self: *ChildIterator) ?*Content { - if (self.i < self.items.len) { - self.i += 1; - return &self.items[self.i - 1]; - } - - return null; - } - }; - - pub const ChildElementIterator = struct { - inner: ChildIterator, - - pub fn next(self: *ChildElementIterator) ?*Element { - while (self.inner.next()) |child| { - if (child.* != .Element) { - continue; - } - - return child.*.Element; - } - - return null; - } - }; - - pub const FindChildrenByTagIterator = struct { - inner: ChildElementIterator, - tag: []const u8, - - pub fn next(self: *FindChildrenByTagIterator) ?*Element { - while (self.inner.next()) |child| { - if (!mem.eql(u8, child.tag, self.tag)) { - continue; - } - - return child; - } - - return null; - } - }; -}; - -pub const XmlDecl = struct { - version: []const u8, encoding: ?[]const u8, standalone: ?bool -}; - -pub const Document = struct { - arena: ArenaAllocator, - xml_decl: ?*XmlDecl, - root: *Element, - - pub fn deinit(self: Document) void { - var arena = self.arena; // Copy to stack so self can be taken by value. - arena.deinit(); - } -}; - -const ParseContext = struct { - source: []const u8, - offset: usize, - line: usize, - column: usize, - - fn init(source: []const u8) ParseContext { - return .{ - .source = source, - .offset = 0, - .line = 0, - .column = 0, - }; - } - - fn peek(self: *ParseContext) ?u8 { - return if (self.offset < self.source.len) self.source[self.offset] else null; - } - - fn consume(self: *ParseContext) !u8 { - if (self.offset < self.source.len) { - return self.consumeNoEof(); - } - - return error.UnexpectedEof; - } - - fn consumeNoEof(self: *ParseContext) u8 { - std.debug.assert(self.offset < self.source.len); - const c = self.source[self.offset]; - self.offset += 1; - - if (c == '\n') { - self.line += 1; - self.column = 0; - } else { - self.column += 1; - } - - return c; - } - - fn eat(self: *ParseContext, char: u8) bool { - self.expect(char) catch return false; - return true; - } - - fn expect(self: *ParseContext, expected: u8) !void { - if (self.peek()) |actual| { - if (expected != actual) { - return error.UnexpectedCharacter; - } - - _ = self.consumeNoEof(); - return; - } - - return error.UnexpectedEof; - } - - fn eatStr(self: *ParseContext, text: []const u8) bool { - self.expectStr(text) catch return false; - return true; - } - - fn expectStr(self: *ParseContext, text: []const u8) !void { - if (self.source.len < self.offset + text.len) { - return error.UnexpectedEof; - } else if (std.mem.startsWith(u8, self.source[self.offset..], text)) { - var i: usize = 0; - while (i < text.len) : (i += 1) { - _ = self.consumeNoEof(); - } - - return; - } - - return error.UnexpectedCharacter; - } - - fn eatWs(self: *ParseContext) bool { - var ws = false; - - while (self.peek()) |ch| { - switch (ch) { - ' ', '\t', '\n', '\r' => { - ws = true; - _ = self.consumeNoEof(); - }, - else => break, - } - } - - return ws; - } - - fn expectWs(self: *ParseContext) !void { - if (!self.eatWs()) return error.UnexpectedCharacter; - } - - fn currentLine(self: ParseContext) []const u8 { - var begin: usize = 0; - if (mem.lastIndexOfScalar(u8, self.source[0..self.offset], '\n')) |prev_nl| { - begin = prev_nl + 1; - } - - var end = mem.indexOfScalarPos(u8, self.source, self.offset, '\n') orelse self.source.len; - return self.source[begin..end]; - } -}; - -test "ParseContext" { - { - var ctx = ParseContext.init("I like pythons"); - testing.expectEqual(@as(?u8, 'I'), ctx.peek()); - testing.expectEqual(@as(u8, 'I'), ctx.consumeNoEof()); - testing.expectEqual(@as(?u8, ' '), ctx.peek()); - testing.expectEqual(@as(u8, ' '), try ctx.consume()); - - testing.expect(ctx.eat('l')); - testing.expectEqual(@as(?u8, 'i'), ctx.peek()); - testing.expectEqual(false, ctx.eat('a')); - testing.expectEqual(@as(?u8, 'i'), ctx.peek()); - - try ctx.expect('i'); - testing.expectEqual(@as(?u8, 'k'), ctx.peek()); - testing.expectError(error.UnexpectedCharacter, ctx.expect('a')); - testing.expectEqual(@as(?u8, 'k'), ctx.peek()); - - testing.expect(ctx.eatStr("ke")); - testing.expectEqual(@as(?u8, ' '), ctx.peek()); - - testing.expect(ctx.eatWs()); - testing.expectEqual(@as(?u8, 'p'), ctx.peek()); - testing.expectEqual(false, ctx.eatWs()); - testing.expectEqual(@as(?u8, 'p'), ctx.peek()); - - testing.expectEqual(false, ctx.eatStr("aaaaaaaaa")); - testing.expectEqual(@as(?u8, 'p'), ctx.peek()); - - testing.expectError(error.UnexpectedEof, ctx.expectStr("aaaaaaaaa")); - testing.expectEqual(@as(?u8, 'p'), ctx.peek()); - testing.expectError(error.UnexpectedCharacter, ctx.expectStr("pytn")); - testing.expectEqual(@as(?u8, 'p'), ctx.peek()); - try ctx.expectStr("python"); - testing.expectEqual(@as(?u8, 's'), ctx.peek()); - } - - { - var ctx = ParseContext.init(""); - testing.expectEqual(ctx.peek(), null); - testing.expectError(error.UnexpectedEof, ctx.consume()); - testing.expectEqual(ctx.eat('p'), false); - testing.expectError(error.UnexpectedEof, ctx.expect('p')); - } -} - -pub const ParseError = error{ IllegalCharacter, UnexpectedEof, UnexpectedCharacter, UnclosedValue, UnclosedComment, InvalidName, InvalidEntity, InvalidStandaloneValue, NonMatchingClosingTag, InvalidDocument, OutOfMemory }; - -pub fn parse(backing_allocator: *Allocator, source: []const u8) !Document { - var ctx = ParseContext.init(source); - return try parseDocument(&ctx, backing_allocator); -} - -fn parseDocument(ctx: *ParseContext, backing_allocator: *Allocator) !Document { - var doc = Document{ - .arena = ArenaAllocator.init(backing_allocator), - .xml_decl = null, - .root = undefined, - }; - - errdefer doc.deinit(); - - try trySkipComments(ctx, &doc.arena.allocator); - - doc.xml_decl = try tryParseProlog(ctx, &doc.arena.allocator); - _ = ctx.eatWs(); - try trySkipComments(ctx, &doc.arena.allocator); - - doc.root = (try tryParseElement(ctx, &doc.arena.allocator)) orelse return error.InvalidDocument; - _ = ctx.eatWs(); - try trySkipComments(ctx, &doc.arena.allocator); - - if (ctx.peek() != null) return error.InvalidDocument; - - return doc; -} - -fn parseAttrValue(ctx: *ParseContext, alloc: *Allocator) ![]const u8 { - const quote = try ctx.consume(); - if (quote != '"' and quote != '\'') return error.UnexpectedCharacter; - - const begin = ctx.offset; - - while (true) { - const c = ctx.consume() catch return error.UnclosedValue; - if (c == quote) break; - } - - const end = ctx.offset - 1; - - return try dupeAndUnescape(alloc, ctx.source[begin..end]); -} - -fn parseEqAttrValue(ctx: *ParseContext, alloc: *Allocator) ![]const u8 { - _ = ctx.eatWs(); - try ctx.expect('='); - _ = ctx.eatWs(); - - return try parseAttrValue(ctx, alloc); -} - -fn parseNameNoDupe(ctx: *ParseContext) ![]const u8 { - // XML's spec on names is very long, so to make this easier - // we just take any character that is not special and not whitespace - const begin = ctx.offset; - - while (ctx.peek()) |ch| { - switch (ch) { - ' ', '\t', '\n', '\r' => break, - '&', '"', '\'', '<', '>', '?', '=', '/' => break, - else => _ = ctx.consumeNoEof(), - } - } - - const end = ctx.offset; - if (begin == end) return error.InvalidName; - - return ctx.source[begin..end]; -} - -fn tryParseCharData(ctx: *ParseContext, alloc: *Allocator) !?[]const u8 { - const begin = ctx.offset; - - while (ctx.peek()) |ch| { - switch (ch) { - '<', '>' => break, - else => _ = ctx.consumeNoEof(), - } - } - - const end = ctx.offset; - if (begin == end) return null; - - return try dupeAndUnescape(alloc, ctx.source[begin..end]); -} - -fn parseContent(ctx: *ParseContext, alloc: *Allocator) ParseError!Content { - if (try tryParseCharData(ctx, alloc)) |cd| { - return Content{ .CharData = cd }; - } else if (try tryParseComment(ctx, alloc)) |comment| { - return Content{ .Comment = comment }; - } else if (try tryParseElement(ctx, alloc)) |elem| { - return Content{ .Element = elem }; - } else { - return error.UnexpectedCharacter; - } -} - -fn tryParseAttr(ctx: *ParseContext, alloc: *Allocator) !?*Attribute { - const name = parseNameNoDupe(ctx) catch return null; - _ = ctx.eatWs(); - try ctx.expect('='); - _ = ctx.eatWs(); - const value = try parseAttrValue(ctx, alloc); - - const attr = try alloc.create(Attribute); - attr.name = try mem.dupe(alloc, u8, name); - attr.value = value; - return attr; -} - -fn tryParseElement(ctx: *ParseContext, alloc: *Allocator) !?*Element { - const start = ctx.offset; - if (!ctx.eat('<')) return null; - const tag = parseNameNoDupe(ctx) catch { - ctx.offset = start; - return null; - }; - - const element = try alloc.create(Element); - element.* = Element.init(try std.mem.dupe(alloc, u8, tag), alloc); - - while (ctx.eatWs()) { - const attr = (try tryParseAttr(ctx, alloc)) orelse break; - try element.attributes.append(attr); - } - - if (ctx.eatStr("/>")) { - return element; - } - - try ctx.expect('>'); - - while (true) { - if (ctx.peek() == null) { - return error.UnexpectedEof; - } else if (ctx.eatStr("'); - return element; -} - -test "tryParseElement" { - var arena = std.heap.ArenaAllocator.init(testing.allocator); - defer arena.deinit(); - var alloc = &arena.allocator; - - { - var ctx = ParseContext.init("<= a='b'/>"); - testing.expectEqual(@as(?*Element, null), try tryParseElement(&ctx, alloc)); - testing.expectEqual(@as(?u8, '<'), ctx.peek()); - } - - { - var ctx = ParseContext.init(""); - const elem = try tryParseElement(&ctx, alloc); - testing.expectEqualSlices(u8, elem.?.tag, "python"); - - const size_attr = elem.?.attributes.items[0]; - testing.expectEqualSlices(u8, size_attr.name, "size"); - testing.expectEqualSlices(u8, size_attr.value, "15"); - - const color_attr = elem.?.attributes.items[1]; - testing.expectEqualSlices(u8, color_attr.name, "color"); - testing.expectEqualSlices(u8, color_attr.value, "green"); - } - - { - var ctx = ParseContext.init("test"); - const elem = try tryParseElement(&ctx, alloc); - testing.expectEqualSlices(u8, elem.?.tag, "python"); - testing.expectEqualSlices(u8, elem.?.children.items[0].CharData, "test"); - } - - { - var ctx = ParseContext.init("bdf"); - const elem = try tryParseElement(&ctx, alloc); - testing.expectEqualSlices(u8, elem.?.tag, "a"); - testing.expectEqualSlices(u8, elem.?.children.items[0].CharData, "b"); - testing.expectEqualSlices(u8, elem.?.children.items[1].Element.tag, "c"); - testing.expectEqualSlices(u8, elem.?.children.items[2].CharData, "d"); - testing.expectEqualSlices(u8, elem.?.children.items[3].Element.tag, "e"); - testing.expectEqualSlices(u8, elem.?.children.items[4].CharData, "f"); - testing.expectEqualSlices(u8, elem.?.children.items[5].Comment, "g"); - } -} - -fn tryParseProlog(ctx: *ParseContext, alloc: *Allocator) !?*XmlDecl { - const start = ctx.offset; - if (!ctx.eatStr(""); - return decl; -} - -test "tryParseProlog" { - var arena = std.heap.ArenaAllocator.init(testing.allocator); - defer arena.deinit(); - var alloc = &arena.allocator; - - { - var ctx = ParseContext.init(""); - testing.expectEqual(@as(?*XmlDecl, null), try tryParseProlog(&ctx, alloc)); - testing.expectEqual(@as(?u8, '<'), ctx.peek()); - } - - { - var ctx = ParseContext.init(""); - const decl = try tryParseProlog(&ctx, alloc); - testing.expectEqualSlices(u8, "aa", decl.?.version); - testing.expectEqual(@as(?[]const u8, null), decl.?.encoding); - testing.expectEqual(@as(?bool, null), decl.?.standalone); - } - - { - var ctx = ParseContext.init(""); - const decl = try tryParseProlog(&ctx, alloc); - testing.expectEqualSlices(u8, "aa", decl.?.version); - testing.expectEqualSlices(u8, "bbb", decl.?.encoding.?); - testing.expectEqual(@as(?bool, true), decl.?.standalone.?); - } -} - -fn trySkipComments(ctx: *ParseContext, alloc: *Allocator) !void { - while (try tryParseComment(ctx, alloc)) |_| { - _ = ctx.eatWs(); - } -} - -fn tryParseComment(ctx: *ParseContext, alloc: *Allocator) !?[]const u8 { - if (!ctx.eatStr("")) { - _ = ctx.consume() catch return error.UnclosedComment; - } - - const end = ctx.offset - "-->".len; - return try mem.dupe(alloc, u8, ctx.source[begin..end]); -} - -fn unescapeEntity(text: []const u8) !u8 { - const EntitySubstition = struct { - text: []const u8, replacement: u8 - }; - - const entities = [_]EntitySubstition{ - .{ .text = "<", .replacement = '<' }, - .{ .text = ">", .replacement = '>' }, - .{ .text = "&", .replacement = '&' }, - .{ .text = "'", .replacement = '\'' }, - .{ .text = """, .replacement = '"' }, - }; - - for (entities) |entity| { - if (std.mem.eql(u8, text, entity.text)) return entity.replacement; - } - - return error.InvalidEntity; -} - -fn dupeAndUnescape(alloc: *Allocator, text: []const u8) ![]const u8 { - const str = try alloc.alloc(u8, text.len); - - var j: usize = 0; - var i: usize = 0; - while (i < text.len) : (j += 1) { - if (text[i] == '&') { - const entity_end = 1 + (mem.indexOfScalarPos(u8, text, i, ';') orelse return error.InvalidEntity); - str[j] = try unescapeEntity(text[i..entity_end]); - i = entity_end; - } else { - str[j] = text[i]; - i += 1; - } - } - - return alloc.shrink(str, j); -} - -test "dupeAndUnescape" { - var arena = std.heap.ArenaAllocator.init(testing.allocator); - defer arena.deinit(); - var alloc = &arena.allocator; - - testing.expectEqualSlices(u8, "test", try dupeAndUnescape(alloc, "test")); - testing.expectEqualSlices(u8, "ad\"e'f<", try dupeAndUnescape(alloc, "a<b&c>d"e'f<")); - testing.expectError(error.InvalidEntity, dupeAndUnescape(alloc, "python&")); - testing.expectError(error.InvalidEntity, dupeAndUnescape(alloc, "python&&")); - testing.expectError(error.InvalidEntity, dupeAndUnescape(alloc, "python&test;")); - testing.expectError(error.InvalidEntity, dupeAndUnescape(alloc, "python&boa")); -} - -test "Top level comments" { - var arena = std.heap.ArenaAllocator.init(testing.allocator); - defer arena.deinit(); - var alloc = &arena.allocator; - - const doc = try parse(alloc, ""); - testing.expectEqualSlices(u8, "python", doc.root.tag); -}