//! By convention, root.zig is the root source file when making a library.
const std = @import("std");

const log = std.log.scoped(.srf);

/// A single problem found while parsing, with the position it was found at.
/// `message` is owned by the error and released by `deinit`.
pub const ParseLineError = struct {
    message: []const u8,
    level: std.log.Level,
    line: usize,
    column: usize,

    /// Frees `message` with `allocator`.
    pub fn deinit(self: ParseLineError, allocator: std.mem.Allocator) void {
        allocator.free(self.message);
    }
};

/// Collects parse errors so that parsing can continue past the first problem.
pub const Diagnostics = struct {
    /// Caller-provided list that receives the errors.
    errors: *std.ArrayList(ParseLineError),
    /// Once this many errors are stored, the next error aborts parsing.
    stop_after: usize = 10,

    /// Takes ownership of `err`. Returns `ParseFailed` (and frees `err`) when
    /// `stop_after` errors are already stored; on allocation failure `err` is
    /// freed as well so that the message never leaks.
    pub fn addError(self: Diagnostics, allocator: std.mem.Allocator, err: ParseLineError) ParseError!void {
        if (self.errors.items.len >= self.stop_after) {
            err.deinit(allocator);
            return ParseError.ParseFailed;
        }
        self.errors.append(allocator, err) catch |append_err| {
            err.deinit(allocator);
            return append_err;
        };
    }

    /// Frees every stored error and then the backing list itself.
    pub fn deinit(self: Diagnostics, allocator: std.mem.Allocator) void {
        for (self.errors.items) |e| e.deinit(allocator);
        self.errors.deinit(allocator);
    }
};

pub const ParseError = error{
    ParseFailed,
    ReadFailed,
    StreamTooLong,
    OutOfMemory,
    EndOfStream,
};

/// Result of parsing one value, plus the bookkeeping the caller needs to
/// continue with the rest of the line.
const ItemValueWithMetaData = struct {
    item_value: ?ItemValue,
    /// True when a (non-fatal) parse error was reported for this value
    error_parsing: bool = false,
    /// Number of bytes of the input string consumed by type data and value
    ending_byte: usize,
    /// True when the parser has already updated `ParseState.line`/`column`
    column_set: bool = false,
};

pub const ItemValue = union(enum) {
    number: f128,

    /// Bytes are converted to/from base64, string is not
    bytes: []const u8,

    /// String is not touched in any way
    string: []const u8,

    boolean: bool,

    pub fn format(self: ItemValue, writer: *std.Io.Writer) std.Io.Writer.Error!void {
        switch (self) {
            .number => |n| try writer.print("num: {d}", .{n}),
            .bytes => |b| try writer.print("bytes: {x}", .{b}),
            .string => |s| try writer.print("string: {s}", .{s}),
            .boolean => |b| try writer.print("boolean: {}", .{b}),
        }
    }

    /// Frees the payload of `.bytes` and `.string`; these are always owned
    /// allocations when produced by `parse`.
    pub fn deinit(self: ItemValue, allocator: std.mem.Allocator) void {
        switch (self) {
            .number, .boolean => {},
            .bytes => |b| allocator.free(b),
            .string => |s| allocator.free(s),
        }
    }

    /// Returns `data` up to (not including) the first `delimiter`, or all of
    /// `data` when the delimiter does not occur.
    fn delimited(data: []const u8, delimiter: u8) []const u8 {
        const end = std.mem.indexOfScalar(u8, data, delimiter) orelse data.len;
        return data[0..end];
    }

    /// Parses the `type:value` portion of an item (everything after the key
    /// and its colon).
    ///
    /// `str` is the text following the key separator. The type data may be
    /// empty or `string`, `binary`, `num`, `bool`, `null`, or an integer byte
    /// count for a string that may contain delimiters/newlines. For a byte
    /// count larger than what remains in `str`, additional bytes are read from
    /// `state.reader` and `state.line`/`state.column` are updated.
    ///
    /// Returned `.string`/`.bytes` payloads are allocated with `allocator` and
    /// owned by the caller. Recoverable problems are reported via `parseError`
    /// and flagged with `error_parsing`; a missing type separator, allocation
    /// failures and reader failures are returned as errors.
    pub fn parse(allocator: std.mem.Allocator, str: []const u8, state: *ParseState, delimiter: u8, options: ParseOptions) ParseError!ItemValueWithMetaData {
        const type_val_sep = std.mem.indexOfScalar(u8, str, ':') orelse {
            try parseError(allocator, options, "no type data or value after key", state.*);
            return ParseError.ParseFailed;
        };
        const metadata = str[0..type_val_sep];
        const rest_of_data = str[type_val_sep + 1 ..];
        const trimmed_meta = std.mem.trim(u8, metadata, &std.ascii.whitespace);

        if (trimmed_meta.len == 0 or std.mem.eql(u8, "string", trimmed_meta)) {
            // delimiter ended string
            const val = delimited(rest_of_data, delimiter);
            return .{
                .item_value = .{ .string = try allocator.dupe(u8, val) },
                .ending_byte = metadata.len + 1 + val.len,
            };
        }

        if (std.mem.eql(u8, "binary", trimmed_meta)) {
            // binary is base64 encoded, so we need to decode it, but we don't
            // risk delimiter collision, so we don't need a length for this
            const val = delimited(rest_of_data, delimiter);
            const Decoder = std.base64.standard.Decoder;
            const size = Decoder.calcSizeForSlice(val) catch {
                try parseError(allocator, options, "error parsing base64 value", state.*);
                return .{
                    .item_value = null,
                    .ending_byte = metadata.len + 1 + val.len,
                    .error_parsing = true,
                };
            };
            const data = try allocator.alloc(u8, size);
            errdefer allocator.free(data);
            Decoder.decode(data, val) catch {
                // Order matters: if parseError returns an error the errdefer
                // above frees `data`; otherwise we free it ourselves.
                try parseError(allocator, options, "error parsing base64 value", state.*);
                allocator.free(data);
                return .{
                    .item_value = null,
                    .ending_byte = metadata.len + 1 + val.len,
                    .error_parsing = true,
                };
            };
            return .{
                .item_value = .{ .bytes = data },
                .ending_byte = metadata.len + 1 + val.len,
            };
        }

        if (std.mem.eql(u8, "num", trimmed_meta)) {
            const val = delimited(rest_of_data, delimiter);
            const val_trimmed = std.mem.trim(u8, val, &std.ascii.whitespace);
            const number = std.fmt.parseFloat(@FieldType(ItemValue, "number"), val_trimmed) catch {
                // TODO: in compact format we really need a column number here
                try parseError(allocator, options, "error parsing numeric value", state.*);
                return .{
                    .item_value = null,
                    .ending_byte = metadata.len + 1 + val.len,
                    .error_parsing = true,
                };
            };
            return .{
                .item_value = .{ .number = number },
                .ending_byte = metadata.len + 1 + val.len,
            };
        }

        if (std.mem.eql(u8, "bool", trimmed_meta)) {
            const val = delimited(rest_of_data, delimiter);
            const val_trimmed = std.mem.trim(u8, val, &std.ascii.whitespace);
            const boolean = blk: {
                if (std.mem.eql(u8, "false", val_trimmed)) break :blk false;
                if (std.mem.eql(u8, "true", val_trimmed)) break :blk true;

                // TODO: in compact format we really need a column number here
                try parseError(allocator, options, "error parsing boolean value", state.*);
                return .{
                    .item_value = null,
                    .ending_byte = metadata.len + 1 + val.len,
                    .error_parsing = true,
                };
            };
            return .{
                .item_value = .{ .boolean = boolean },
                .ending_byte = metadata.len + 1 + val.len,
            };
        }

        if (std.mem.eql(u8, "null", trimmed_meta)) {
            return .{
                .item_value = null,
                .ending_byte = metadata.len + 2,
            };
        }

        // Last chance...the thing between these colons is a usize indicating
        // the number of bytes to grab for a string
        const size = std.fmt.parseInt(usize, trimmed_meta, 0) catch {
            log.debug("parseInt fail, trimmed_data: '{s}'", .{trimmed_meta});
            try parseError(allocator, options, "unrecognized metadata for key", state.*);
            return .{
                .item_value = null,
                .ending_byte = metadata.len + 1,
                .error_parsing = true,
            };
        };

        // If we are being asked specifically for bytes, we no longer care about
        // delimiters. We just want raw bytes. This might adjust our line/column
        // in the parse state
        if (rest_of_data.len >= size) {
            // We fit on this line, everything is "normal". `>=` (not `>`) so
            // that a value ending exactly at the end of the line is handled
            // here instead of underflowing the byte count below. The value is
            // duplicated because ItemValue.deinit frees string payloads.
            const val = rest_of_data[0..size];
            return .{
                .item_value = .{ .string = try allocator.dupe(u8, val) },
                .ending_byte = metadata.len + 1 + val.len,
            };
        }

        // This is not enough, we need more data from the reader
        log.debug("item value includes newlines {f}", .{state.*});

        // We need to advance the reader, so we need a copy of what we have so far
        const start = try allocator.dupe(u8, rest_of_data);
        defer allocator.free(start);

        // We won't do a parseError here. If we have an allocation error, read
        // error, or end of stream, all of these are fatal. Our reader is currently
        // past the newline, so we have to remove a character from size to account.
        // size > rest_of_data.len is guaranteed here, so this cannot underflow.
        const end = try state.reader.readAlloc(allocator, size - rest_of_data.len - 1);
        defer allocator.free(end);

        // However, we want to be past the end of the *next* newline too (in long
        // format mode). The value may be the very last thing in the stream, in
        // which case there is nothing left to skip and that is fine.
        if (delimiter == '\n') {
            if (state.reader.takeByte()) |_| {} else |err| switch (err) {
                error.EndOfStream => {},
                error.ReadFailed => return ParseError.ReadFailed,
            }
        }

        // This \n is because the reader state will have advanced beyond the next newline, so end
        // really should start with the newline. This only applies to long mode, because otherwise the
        // entire record is a single line
        // NOTE(review): compact mode may eventually need to concatenate without
        // the newline; that variant was previously left here commented out.
        const final = try std.mem.concat(allocator, u8, &.{ start, "\n", end });
        errdefer allocator.free(final);
        log.debug("full val: {s}", .{final});
        std.debug.assert(final.len == size);

        // Now we need to get the parse state correct
        state.line += std.mem.count(u8, final, "\n");
        state.column = final.len - std.mem.lastIndexOf(u8, final, "\n").?;
        return .{
            .item_value = .{ .string = final },
            .ending_byte = metadata.len + 1 + final.len, // This is useless here
            .column_set = true,
        };
    }
};

/// One key/value pair. Both `key` and any slice payload in `value` are owned.
pub const Item = struct {
    key: []const u8,
    value: ?ItemValue,

    pub fn deinit(self: Item, allocator: std.mem.Allocator) void {
        allocator.free(self.key);
        if (self.value) |v| v.deinit(allocator);
    }
};

/// An ordered group of items.
pub const Record = struct {
    items: []Item,

    pub fn deinit(self: Record, allocator: std.mem.Allocator) void {
        for (self.items) |i| i.deinit(allocator);
        allocator.free(self.items);
    }
};

/// The result of `parse`. Call `deinit` with the allocator given to `parse`.
pub const RecordList = struct {
    items: []Record,

    pub fn deinit(self: RecordList, allocator: std.mem.Allocator) void {
        for (self.items) |r| r.deinit(allocator);
        allocator.free(self.items);
    }

    /// Not implemented yet: currently writes nothing.
    pub fn format(self: RecordList, writer: *std.Io.Writer) std.Io.Writer.Error!void {
        _ = self;
        _ = writer;
    }
};

pub const ParseOptions = struct {
    /// When set, recoverable errors are collected here instead of aborting
    /// the parse at the first problem.
    diagnostics: ?*Diagnostics = null,
};

/// Parser instructions: lines that start with `#!`.
const Directive = union(enum) {
    magic,
    long_format,
    compact_format,
    require_eof,
    eof,

    /// Returns the directive on `str`, or null when `str` is not a directive
    /// line or the directive is not recognized. The common misspelling
    /// `#!requireof` is reported through `parseError`.
    pub fn parse(allocator: std.mem.Allocator, str: []const u8, state: ParseState, options: ParseOptions) ParseError!?Directive {
        if (!std.mem.startsWith(u8, str, "#!")) return null;
        // strip any comments off
        var it = std.mem.splitScalar(u8, str[2..], '#');
        const line = std.mem.trimEnd(u8, it.first(), &std.ascii.whitespace);
        if (std.mem.eql(u8, "srfv1", line)) return .magic;
        if (std.mem.eql(u8, "requireeof", line)) return .require_eof;
        if (std.mem.eql(u8, "requireof", line)) {
            try parseError(allocator, options, "#!requireof found. Did you mean #!requireeof?", state);
            return null;
        }
        if (std.mem.eql(u8, "eof", line)) return .eof;
        if (std.mem.eql(u8, "compact", line)) return .compact_format;
        if (std.mem.eql(u8, "long", line)) return .long_format;
        return null;
    }
};

/// Reader plus the current position, used for error reporting.
pub const ParseState = struct {
    reader: *std.Io.Reader,
    line: usize,
    column: usize,

    pub fn format(self: ParseState, writer: *std.Io.Writer) std.Io.Writer.Error!void {
        try writer.print("line: {}, col: {}", .{ self.line, self.column });
    }
};

/// Moves the accumulated `items` into a new record at the end of
/// `record_list`. Capacity is reserved first so that the owned item slice
/// cannot leak if growing the record list fails.
fn closeRecord(allocator: std.mem.Allocator, record_list: *std.ArrayList(Record), items: *std.ArrayList(Item)) error{OutOfMemory}!void {
    try record_list.ensureUnusedCapacity(allocator, 1);
    record_list.appendAssumeCapacity(.{ .items = try items.toOwnedSlice(allocator) });
}

/// Parses SRF data from `reader`.
///
/// The first line must be the `#!srfv1` magic directive. Further directives
/// (`#!long`, `#!compact`, `#!requireeof`, `#!eof`) may follow before the
/// data. The returned list and everything in it is allocated with `allocator`;
/// release it with `RecordList.deinit`.
///
/// Without `options.diagnostics` the first problem returns `ParseFailed`.
/// With diagnostics, recoverable problems are collected and `ParseFailed` is
/// returned at the end if any were recorded. `ParseFailed` is also returned
/// when `#!requireeof` was given but `#!eof` was not found.
pub fn parse(reader: *std.Io.Reader, allocator: std.mem.Allocator, options: ParseOptions) ParseError!RecordList {
    var long_format = false; // Default to compact format
    var require_eof = false; // Default to no eof required
    var eof_found: bool = false;
    var state = ParseState{ .line = 0, .column = 0, .reader = reader };
    const first_line = nextLine(reader, &state, '\n') orelse return ParseError.ParseFailed;

    if (try Directive.parse(allocator, first_line, state, options)) |d| {
        if (d != .magic) try parseError(allocator, options, "Magic header not found on first line", state);
    } else try parseError(allocator, options, "Magic header not found on first line", state);

    // Loop through the header material and configure our main parsing
    var record_list: std.ArrayList(Record) = .empty;
    errdefer {
        for (record_list.items) |i| i.deinit(allocator);
        record_list.deinit(allocator);
    }
    const first_data = blk: {
        while (nextLine(reader, &state, '\n')) |line| {
            if (try Directive.parse(allocator, line, state, options)) |d| {
                switch (d) {
                    .magic => try parseError(allocator, options, "Found a duplicate magic header", state),
                    .long_format => long_format = true,
                    .compact_format => long_format = false, // what if we have both?
                    .require_eof => require_eof = true,
                    .eof => {
                        // there needs to be an eof then
                        if (nextLine(reader, &state, '\n')) |_| {
                            try parseError(allocator, options, "Data found after #!eof", state);
                            return ParseError.ParseFailed; // this is terminal
                        } else return .{ .items = try record_list.toOwnedSlice(allocator) };
                    },
                }
            } else break :blk line;
        }
        return .{ .items = try record_list.toOwnedSlice(allocator) };
    };

    // Main parsing. We already have the first line of data, which could
    // be a record (compact format) or a key/value pair (long format)
    var line: ?[]const u8 = first_data;
    var items: std.ArrayList(Item) = .empty;
    errdefer {
        for (items.items) |i| i.deinit(allocator);
        items.deinit(allocator);
    }

    // Because in long format we don't have newline delimiter, that should really be a noop
    // but we need this for compact format
    const delimiter: u8 = if (long_format) '\n' else ',';
    while (line) |l| {
        if (std.mem.trim(u8, l, &std.ascii.whitespace).len == 0) {
            // empty lines can be significant (to indicate a new record, but only once
            // a record is processed, which requires data first. That record processing
            // is at the bottom of the loop, so if an empty line is detected here, we can
            // safely ignore it
            line = nextLine(reader, &state, '\n');
            continue;
        }
        if (try Directive.parse(allocator, l, state, options)) |d| {
            switch (d) {
                .eof => {
                    // there needs to be an eof then
                    if (nextLine(reader, &state, '\n')) |_| {
                        try parseError(allocator, options, "Data found after #!eof", state);
                        return ParseError.ParseFailed; // this is terminal
                    } else {
                        eof_found = true;
                        break;
                    }
                },
                else => try parseError(allocator, options, "Directive found after data started", state),
            }
            // Move past the offending directive; without this the same line
            // would be reported again and again on every iteration.
            line = nextLine(reader, &state, '\n');
            continue;
        }

        // Real data: lfg
        // Whatever the format, the beginning will always be the key data
        // key:stuff:value
        var it = std.mem.splitScalar(u8, l, ':');
        const raw_key = it.first(); // first one we get for free
        const key_len = raw_key.len;

        // Copy the key before parsing the value: value parsing may read more
        // data from the reader, after which `l` (a view into the reader's
        // buffer) should no longer be relied upon.
        const key = try allocator.dupe(u8, raw_key);
        const value = ItemValue.parse(
            allocator,
            it.rest(),
            &state,
            delimiter,
            options,
        ) catch |err| {
            allocator.free(key);
            return err;
        };

        if (value.error_parsing) {
            allocator.free(key);
        } else {
            items.append(allocator, .{ .key = key, .value = value.item_value }) catch |err| {
                // Not yet owned by `items`, so release both here.
                allocator.free(key);
                if (value.item_value) |v| v.deinit(allocator);
                return err;
            };
        }

        if (!value.column_set)
            state.column = key_len + value.ending_byte;

        // The difference between compact and line here is that compact we will instead of
        // line = try nextLine, we will do something like line = line[42..]
        if (long_format) {
            const next = nextLine(reader, &state, '\n') orelse {
                // close out record, return
                try closeRecord(allocator, &record_list, &items);
                break;
            };
            line = next;
            if (next.len == 0) {
                // End of record
                try closeRecord(allocator, &record_list, &items);
                line = nextLine(reader, &state, '\n');
            }
        } else {
            // NOTE(review): compact format looks unfinished (its test is
            // skipped); the column arithmetic here appears not to account for
            // the key separator and item delimiter - confirm before enabling.
            const remainder = l[state.column..];
            line = remainder;
            if (remainder.len == 0) {
                // close out record
                try closeRecord(allocator, &record_list, &items);
                line = nextLine(reader, &state, '\n');
            }
        }
    }

    // Parsing complete. Add final record to list. Then, if there are any parse errors, throw
    log.debug(
        "Parse complete. Records parsed so far: {d}, Items in array (>0 means final record): {d}",
        .{ record_list.items.len, items.items.len },
    );
    if (items.items.len > 0)
        try closeRecord(allocator, &record_list, &items);

    if (options.diagnostics) |d|
        if (d.errors.items.len > 0) return ParseError.ParseFailed;
    if (require_eof and !eof_found) return ParseError.ParseFailed;
    return .{ .items = try record_list.toOwnedSlice(allocator) };
}

/// Takes the next line, trimming leading whitespace and ignoring comments
/// Directives (comments starting with #!) are preserved
///
/// Returns null at end of stream or on any reader error. `state.line` is
/// incremented and `state.column` reset for every line consumed, including
/// skipped comment lines.
fn nextLine(reader: *std.Io.Reader, state: *ParseState, delimiter: u8) ?[]const u8 {
    while (true) {
        state.line += 1;
        state.column = 0;
        const raw_line = (reader.takeDelimiter(delimiter) catch return null) orelse return null;
        // we don't want to trim the end, as there might be a key/value field
        // with a string including important trailing whitespace
        const trimmed_line = std.mem.trimStart(u8, raw_line, &std.ascii.whitespace);
        if (std.mem.startsWith(u8, trimmed_line, "#") and !std.mem.startsWith(u8, trimmed_line, "#!")) continue;
        return trimmed_line;
    }
}

/// Reports a problem at the current parse position. With diagnostics the
/// message is copied and recorded (returning `ParseFailed` only once the
/// diagnostics limit is reached); without diagnostics this always returns
/// `ParseFailed`.
inline fn parseError(allocator: std.mem.Allocator, options: ParseOptions, message: []const u8, state: ParseState) ParseError!void {
    log.debug("Parse error. Parse state {f}, message: {s}", .{ state, message });
    if (options.diagnostics) |d| {
        try d.addError(allocator, .{
            .message = try allocator.dupe(u8, message),
            .level = .err,
            .line = state.line,
            .column = state.column,
        });
    } else {
        return ParseError.ParseFailed;
    }
}

test "long format single record, no eof" {
    const data =
        \\#!srfv1 # mandatory comment with format and version. Parser instructions start with #!
        \\#!long # Mandatory to use multiline records, compact format is optional #!compact
        \\# A comment
        \\# empty lines ignored
        \\
        \\key::string value, with any data except a \n. an optional string length between the colons
    ;

    const allocator = std.testing.allocator;
    var reader = std.Io.Reader.fixed(data);
    const records = try parse(&reader, allocator, .{});
    defer records.deinit(allocator);
    try std.testing.expectEqual(@as(usize, 1), records.items.len);
    try std.testing.expectEqual(@as(usize, 1), records.items[0].items.len);
    const kvps = records.items[0].items;
    try std.testing.expectEqualStrings("key", kvps[0].key);
    try std.testing.expectEqualStrings("string value, with any data except a \\n. an optional string length between the colons", kvps[0].value.?.string);
}

test "long format from README - generic data structures, first record only" {
    const data =
        \\#!srfv1 # mandatory comment with format and version. Parser instructions start with #!
        \\#!requireeof # Set this if you want parsing to fail when #!eof not present on last line
        \\#!long # Mandatory to use multiline records, compact format is optional #!compact
        \\# A comment
        \\# empty lines ignored
        \\
        \\this is a number:num: 5
        \\#!eof
    ;

    const allocator = std.testing.allocator;
    var reader = std.Io.Reader.fixed(data);
    const records = try parse(&reader, allocator, .{});
    defer records.deinit(allocator);
    try std.testing.expectEqual(@as(usize, 1), records.items.len);
}

test "long format from README - generic data structures" {
    const data =
        \\#!srfv1 # mandatory comment with format and version. Parser instructions start with #!
        \\#!requireeof # Set this if you want parsing to fail when #!eof not present on last line
        \\#!long # Mandatory to use multiline records, compact format is optional #!compact
        \\# A comment
        \\# empty lines ignored
        \\
        \\key::string value, with any data except a \n. an optional string length between the colons
        \\this is a number:num: 5
        \\null value:null:
        \\array::array's don't exist. Use json or toml or something
        \\data with newlines must have a length:7:foo
        \\bar
        \\boolean value:bool:false
        \\ # Empty line separates records
        \\
        \\key::this is the second record
        \\this is a number:num:42
        \\null value:null:
        \\array::array's still don't exist
        \\data with newlines must have a length::single line
        \\#!eof # eof marker, useful to make sure your file wasn't cut in half. Only considered if requireeof set at top
    ;

    const allocator = std.testing.allocator;
    var reader = std.Io.Reader.fixed(data);
    const records = try parse(&reader, allocator, .{});
    defer records.deinit(allocator);
    try std.testing.expectEqual(@as(usize, 2), records.items.len);
    const first = records.items[0];
    try std.testing.expectEqual(@as(usize, 6), first.items.len);
    try std.testing.expectEqualStrings("key", first.items[0].key);
    try std.testing.expectEqualStrings("string value, with any data except a \\n. an optional string length between the colons", first.items[0].value.?.string);
    try std.testing.expectEqualStrings("this is a number", first.items[1].key);
    try std.testing.expectEqual(@as(f128, 5), first.items[1].value.?.number);
    try std.testing.expectEqualStrings("null value", first.items[2].key);
    try std.testing.expect(first.items[2].value == null);
    try std.testing.expectEqualStrings("array", first.items[3].key);
    try std.testing.expectEqualStrings("array's don't exist. Use json or toml or something", first.items[3].value.?.string);
    try std.testing.expectEqualStrings("data with newlines must have a length", first.items[4].key);
    try std.testing.expectEqualStrings("foo\nbar", first.items[4].value.?.string);
    try std.testing.expectEqualStrings("boolean value", first.items[5].key);
    try std.testing.expect(!first.items[5].value.?.boolean);

    const second = records.items[1];
    try std.testing.expectEqual(@as(usize, 5), second.items.len);
    try std.testing.expectEqualStrings("key", second.items[0].key);
    try std.testing.expectEqualStrings("this is the second record", second.items[0].value.?.string);
    try std.testing.expectEqualStrings("this is a number", second.items[1].key);
    try std.testing.expectEqual(@as(f128, 42), second.items[1].value.?.number);
    try std.testing.expectEqualStrings("null value", second.items[2].key);
    try std.testing.expect(second.items[2].value == null);
    try std.testing.expectEqualStrings("array", second.items[3].key);
    try std.testing.expectEqualStrings("array's still don't exist", second.items[3].value.?.string);
    try std.testing.expectEqualStrings("data with newlines must have a length", second.items[4].key);
    try std.testing.expectEqualStrings("single line", second.items[4].value.?.string);
}

test "compact format from README - generic data structures" {
    const lvl = std.testing.log_level;
    defer std.testing.log_level = lvl;
    std.testing.log_level = .debug;
    if (true) return error.SkipZigTest;
    const data =
        \\#!srfv1 # mandatory comment with format and version. Parser instructions start with #!
        \\key::string value must have a length between colons or end with a comma,this is a number:num:5 ,null value:null:,array::array's don't exist. Use json or toml or something,data with newlines must have a length:7:foo
        \\bar,boolean value:bool:false
        \\key::this is the second record
    ;

    const allocator = std.testing.allocator;
    var reader = std.Io.Reader.fixed(data);
    // We want "parse" and "parseLeaky" probably. Second parameter is a diagnostics
    const records = try parse(&reader, allocator, .{});
    defer records.deinit(allocator);
    try std.testing.expectEqual(@as(usize, 2), records.items.len);
}

test "long format - sized string that exactly fills its line" {
    const allocator = std.testing.allocator;
    var reader = std.Io.Reader.fixed("#!srfv1\n#!long\nname:5:hello\n");
    const records = try parse(&reader, allocator, .{});
    defer records.deinit(allocator);
    try std.testing.expectEqual(@as(usize, 1), records.items.len);
    try std.testing.expectEqual(@as(usize, 1), records.items[0].items.len);
    try std.testing.expectEqualStrings("name", records.items[0].items[0].key);
    try std.testing.expectEqualStrings("hello", records.items[0].items[0].value.?.string);
}

test "directive after data is reported exactly once" {
    const allocator = std.testing.allocator;
    var errors: std.ArrayList(ParseLineError) = .empty;
    const diagnostics: Diagnostics = .{ .errors = &errors };
    defer diagnostics.deinit(allocator);
    var diag = diagnostics;
    var reader = std.Io.Reader.fixed("#!srfv1\n#!long\nkey::value\n#!long\nother::x\n");
    try std.testing.expectError(ParseError.ParseFailed, parse(&reader, allocator, .{ .diagnostics = &diag }));
    try std.testing.expectEqual(@as(usize, 1), errors.items.len);
}