diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3389c86 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.zig-cache/ +zig-out/ diff --git a/README.md b/README.md index c75ae6d..05371a9 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,8 @@ array::array's don't exist. Use json or toml or something data with newlines must have a length:7:foo bar boolean value:bool:false - # Empty line separates records + + # Empty line separates records, but comments don't count as empty key::this is the second record this is a number:num:42 null value:null: diff --git a/build.zig b/build.zig index 24ed72b..72d5034 100644 --- a/build.zig +++ b/build.zig @@ -35,7 +35,7 @@ pub fn build(b: *std.Build) void { // intend to expose to consumers that were defined in other files part // of this module, you will have to make sure to re-export them from // the root file. - .root_source_file = b.path("src/root.zig"), + .root_source_file = b.path("src/srf.zig"), // Later on we'll use this module as the root module of a test executable // which requires us to specify a target. .target = target, diff --git a/src/main.zig b/src/main.zig index 3802762..13ab026 100644 --- a/src/main.zig +++ b/src/main.zig @@ -1,27 +1,3 @@ const std = @import("std"); -const srf = @import("srf"); -pub fn main() !void { - // Prints to stderr, ignoring potential errors. - std.debug.print("All your {s} are belong to us.\n", .{"codebase"}); - try srf.bufferedPrint(); -} - -test "simple test" { - const gpa = std.testing.allocator; - var list: std.ArrayList(i32) = .empty; - defer list.deinit(gpa); // Try commenting this out and see if zig detects the memory leak! - try list.append(gpa, 42); - try std.testing.expectEqual(@as(i32, 42), list.pop()); -} - -test "fuzz example" { - const Context = struct { - fn testOne(context: @This(), input: []const u8) anyerror!void { - _ = context; - // Try passing `--fuzz` to `zig build test` and see if it manages to fail this test case! 
- try std.testing.expect(!std.mem.eql(u8, "canyoufindme", input)); - } - }; - try std.testing.fuzz(Context{}, Context.testOne, .{}); -} +pub fn main() !void {} diff --git a/src/root.zig b/src/root.zig deleted file mode 100644 index 94c7cd0..0000000 --- a/src/root.zig +++ /dev/null @@ -1,23 +0,0 @@ -//! By convention, root.zig is the root source file when making a library. -const std = @import("std"); - -pub fn bufferedPrint() !void { - // Stdout is for the actual output of your application, for example if you - // are implementing gzip, then only the compressed bytes should be sent to - // stdout, not any debugging messages. - var stdout_buffer: [1024]u8 = undefined; - var stdout_writer = std.fs.File.stdout().writer(&stdout_buffer); - const stdout = &stdout_writer.interface; - - try stdout.print("Run `zig build test` to run the tests.\n", .{}); - - try stdout.flush(); // Don't forget to flush! -} - -pub fn add(a: i32, b: i32) i32 { - return a + b; -} - -test "basic add functionality" { - try std.testing.expect(add(3, 7) == 10); -} diff --git a/src/srf.zig b/src/srf.zig new file mode 100644 index 0000000..45b37ea --- /dev/null +++ b/src/srf.zig @@ -0,0 +1,584 @@ +//! By convention, root.zig is the root source file when making a library. 
const std = @import("std");

const log = std.log.scoped(.srf);

/// A single problem found while parsing, together with the parse position
/// at which it was reported.
///
/// `message` is released with the allocator passed to `deinit`, so it must
/// have been allocated with that same allocator.
pub const ParseLineError = struct {
    message: []const u8,
    level: std.log.Level,
    line: usize,
    column: usize,

    /// Frees `message`.
    pub fn deinit(self: ParseLineError, allocator: std.mem.Allocator) void {
        allocator.free(self.message);
    }
};

/// Collects parse errors so that parsing can continue past the first problem.
///
/// `errors` points at a caller-provided list. Once the list holds
/// `stop_after` entries, the next `addError` call fails with
/// `ParseError.ParseFailed`.
pub const Diagnostics = struct {
    errors: *std.ArrayList(ParseLineError),
    stop_after: usize = 10,

    /// Records `err`, taking ownership of `err.message`.
    ///
    /// Returns `ParseError.ParseFailed` when `stop_after` errors have already
    /// been collected, and `error.OutOfMemory` when the list cannot grow. In
    /// both failure cases `err` is released here, so the caller must not free
    /// it again.
    pub fn addError(self: Diagnostics, allocator: std.mem.Allocator, err: ParseLineError) ParseError!void {
        if (self.errors.items.len >= self.stop_after) {
            err.deinit(allocator);
            return ParseError.ParseFailed;
        }
        self.errors.append(allocator, err) catch |append_err| {
            // We own `err` at this point; do not leak its message.
            err.deinit(allocator);
            return append_err;
        };
    }

    /// Frees every collected error and then the list storage itself.
    pub fn deinit(self: Diagnostics, allocator: std.mem.Allocator) void {
        for (self.errors.items) |e| e.deinit(allocator);
        self.errors.deinit(allocator);
    }
};

/// Every error `parse` can return.
pub const ParseError = error{
    ParseFailed,
    ReadFailed,
    StreamTooLong,
    OutOfMemory,
    EndOfStream,
};

/// Result of `ItemValue.parse`: the value plus bookkeeping for the caller.
const ItemValueWithMetaData = struct {
    /// The parsed value. `null` for the `null` type and whenever
    /// `error_parsing` is set.
    item_value: ?ItemValue,
    /// True when a problem was reported through `parseError` and the item
    /// should be dropped.
    error_parsing: bool = false,
    /// Offset into the string handed to `ItemValue.parse` at which this
    /// value ends.
    ending_byte: usize,
    /// True when `ItemValue.parse` already updated the line/column in the
    /// parse state (multi-line values), so the caller must not overwrite it.
    column_set: bool = false,
};

/// A typed value. `bytes` and `string` payloads produced by
/// `ItemValue.parse` are heap-allocated and released by `deinit`.
pub const ItemValue = union(enum) {
    number: f128,

    /// Bytes are converted to/from base64, string is not
    bytes: []const u8,

    /// String is not touched in any way
    string: []const u8,

    boolean: bool,

    /// Writes a short tagged rendering of the value (for `{f}` formatting).
    pub fn format(self: ItemValue, writer: *std.Io.Writer) std.Io.Writer.Error!void {
        switch (self) {
            .number => |n| try writer.print("num: {d}", .{n}),
            .bytes => |b| try writer.print("bytes: {x}", .{b}),
            .string => |s| try writer.print("string: {s}", .{s}),
            .boolean => |b| try writer.print("boolean: {}", .{b}),
        }
    }

    /// Frees the payload of `bytes` and `string` values; a no-op otherwise.
    pub fn deinit(self: ItemValue, allocator: std.mem.Allocator) void {
        switch (self) {
            .number, .boolean => {},
            .bytes => |b| allocator.free(b),
            .string => |s| allocator.free(s),
        }
    }

    /// Parses the `type:value` portion that follows a key.
    ///
    /// `str` is everything after the key's ':'. The text before the next ':'
    /// selects the type: empty or `string`, `binary` (base64), `num`, `bool`,
    /// `null`, or an integer byte count for a string that may contain the
    /// delimiter (and newlines). For every type except the byte count, the
    /// value ends at `delimiter` or at the end of `str`.
    ///
    /// Recoverable problems are reported through `parseError` and flagged in
    /// the result (`error_parsing`). A byte-counted value that extends past
    /// `str` is read directly from `state.reader`; in that case `state.line`
    /// and `state.column` are updated here and `column_set` is set.
    ///
    /// Any `bytes`/`string` payload in the result is owned by the caller.
    pub fn parse(allocator: std.mem.Allocator, str: []const u8, state: *ParseState, delimiter: u8, options: ParseOptions) ParseError!ItemValueWithMetaData {
        const type_val_sep = std.mem.indexOfScalar(u8, str, ':') orelse {
            try parseError(allocator, options, "no type data or value after key", state.*);
            return ParseError.ParseFailed;
        };

        const metadata = str[0..type_val_sep];
        const trimmed_meta = std.mem.trim(u8, metadata, &std.ascii.whitespace);
        const rest_of_data = str[type_val_sep + 1 ..];

        // Value as terminated by the delimiter. Used by every type except the
        // explicit byte count at the bottom of this function.
        const delimited = blk: {
            var it = std.mem.splitScalar(u8, rest_of_data, delimiter);
            break :blk it.first();
        };
        const delimited_end = metadata.len + 1 + delimited.len;

        if (trimmed_meta.len == 0 or std.mem.eql(u8, "string", trimmed_meta)) {
            // delimiter ended string
            return .{
                .item_value = .{ .string = try allocator.dupe(u8, delimited) },
                .ending_byte = delimited_end,
            };
        }
        if (std.mem.eql(u8, "binary", trimmed_meta)) {
            // binary is base64 encoded, so we need to decode it, but we don't
            // risk delimiter collision, so we don't need a length for this
            const Decoder = std.base64.standard.Decoder;
            const size = Decoder.calcSizeForSlice(delimited) catch {
                try parseError(allocator, options, "error parsing base64 value", state.*);
                return .{
                    .item_value = null,
                    .ending_byte = delimited_end,
                    .error_parsing = true,
                };
            };
            const data = try allocator.alloc(u8, size);
            errdefer allocator.free(data);
            Decoder.decode(data, delimited) catch {
                // If reporting fails, the errdefer above releases `data`;
                // otherwise we release it ourselves before returning.
                try parseError(allocator, options, "error parsing base64 value", state.*);
                allocator.free(data);
                return .{
                    .item_value = null,
                    .ending_byte = delimited_end,
                    .error_parsing = true,
                };
            };
            return .{
                .item_value = .{ .bytes = data },
                .ending_byte = delimited_end,
            };
        }
        if (std.mem.eql(u8, "num", trimmed_meta)) {
            const val_trimmed = std.mem.trim(u8, delimited, &std.ascii.whitespace);
            const number = std.fmt.parseFloat(@FieldType(ItemValue, "number"), val_trimmed) catch {
                // TODO: in compact format we really need a column number here
                try parseError(allocator, options, "error parsing numeric value", state.*);
                return .{
                    .item_value = null,
                    .ending_byte = delimited_end,
                    .error_parsing = true,
                };
            };
            return .{
                .item_value = .{ .number = number },
                .ending_byte = delimited_end,
            };
        }
        if (std.mem.eql(u8, "bool", trimmed_meta)) {
            const val_trimmed = std.mem.trim(u8, delimited, &std.ascii.whitespace);
            const boolean = blk: {
                if (std.mem.eql(u8, "false", val_trimmed)) break :blk false;
                if (std.mem.eql(u8, "true", val_trimmed)) break :blk true;

                // TODO: in compact format we really need a column number here
                try parseError(allocator, options, "error parsing boolean value", state.*);
                return .{
                    .item_value = null,
                    .ending_byte = delimited_end,
                    .error_parsing = true,
                };
            };
            return .{
                .item_value = .{ .boolean = boolean },
                .ending_byte = delimited_end,
            };
        }
        if (std.mem.eql(u8, "null", trimmed_meta)) {
            return .{
                .item_value = null,
                .ending_byte = metadata.len + 2,
            };
        }

        // Last chance...the thing between these colons is a usize indicating
        // the number of bytes to grab for a string
        const size = std.fmt.parseInt(usize, trimmed_meta, 0) catch {
            log.debug("parseInt fail, trimmed_data: '{s}'", .{trimmed_meta});
            try parseError(allocator, options, "unrecognized metadata for key", state.*);
            return .{
                .item_value = null,
                .ending_byte = metadata.len + 1,
                .error_parsing = true,
            };
        };

        // If we are being asked specifically for bytes, we no longer care about
        // delimiters. We just want raw bytes. This might adjust our line/column
        // in the parse state
        if (rest_of_data.len >= size) {
            // We fit on this line, everything is "normal". `>=` matters: a
            // value that exactly fills the rest of the line must not fall
            // through to the multi-line path, where the remaining-byte
            // computation would underflow. The value is copied so that it is
            // heap-owned like every other string and can be freed by `deinit`.
            const val = try allocator.dupe(u8, rest_of_data[0..size]);
            return .{
                .item_value = .{ .string = val },
                .ending_byte = metadata.len + 1 + val.len,
            };
        }

        // This is not enough, we need more data from the reader
        log.debug("item value includes newlines {f}", .{state.*});
        // We need to advance the reader, so we need a copy of what we have so far
        const start = try allocator.dupe(u8, rest_of_data);
        defer allocator.free(start);
        // We won't do a parseError here. If we have an allocation error, read
        // error, or end of stream, all of these are fatal. Our reader is currently
        // past the newline, so we have to remove a character from size to account.
        // (rest_of_data.len < size here, so this subtraction cannot underflow.)
        const end = try state.reader.readAlloc(allocator, size - rest_of_data.len - 1);
        defer allocator.free(end);
        // However, we want to be past the end of the *next* newline too (in long
        // format mode)
        // NOTE(review): this assumes a byte is buffered after the value; a
        // multi-line value that ends the stream with no trailing newline
        // looks like it would trip the reader's assertion - confirm.
        if (delimiter == '\n') state.reader.toss(1);
        // The line we were handed was cut at a '\n' that the reader already
        // consumed, so put that newline back between the two halves.
        const final = try std.mem.concat(allocator, u8, &.{ start, "\n", end });
        log.debug("full val: {s}", .{final});
        std.debug.assert(final.len == size);
        // Now we need to get the parse state correct
        state.line += std.mem.count(u8, final, "\n");
        state.column = final.len - std.mem.lastIndexOf(u8, final, "\n").?;
        return .{
            .item_value = .{ .string = final },
            .ending_byte = metadata.len + 1 + final.len, // This is useless here
            .column_set = true,
        };
    }
};

/// A key with an optional value. `deinit` frees the key and the value.
pub const Item = struct {
    key: []const u8,
    value: ?ItemValue,

    pub fn deinit(self: Item, allocator: std.mem.Allocator) void {
        allocator.free(self.key);
        if (self.value) |v| v.deinit(allocator);
    }
};

/// An ordered group of items. `deinit` frees each item and the slice.
pub const Record = struct {
    items: []Item,

    pub fn deinit(self: Record, allocator: std.mem.Allocator) void {
        for (self.items) |i| i.deinit(allocator);
        allocator.free(self.items);
    }
};

/// The result of `parse`. `deinit` frees each record and the slice.
pub const RecordList = struct {
    items: []Record,

    pub fn deinit(self: RecordList, allocator: std.mem.Allocator) void {
        for (self.items) |r| r.deinit(allocator);
        allocator.free(self.items);
    }

    /// Placeholder: currently writes nothing.
    pub fn format(self: RecordList, writer: *std.Io.Writer) std.Io.Writer.Error!void {
        _ = self;
        _ = writer;
    }
};

/// Options for `parse`.
pub const ParseOptions = struct {
    /// When set, recoverable problems are collected here instead of failing
    /// at the first one.
    diagnostics: ?*Diagnostics = null,
};

/// Parser instructions: lines that start with `#!`.
const Directive = union(enum) {
    magic,
    long_format,
    compact_format,
    require_eof,
    eof,

    /// Returns the directive found on `str`, or `null` when `str` does not
    /// start with `#!` or the directive is not recognized. Anything from the
    /// next '#' onward is treated as a comment and ignored. The misspelling
    /// `#!requireof` is reported through `parseError`.
    pub fn parse(allocator: std.mem.Allocator, str: []const u8, state: ParseState, options: ParseOptions) ParseError!?Directive {
        if (!std.mem.startsWith(u8, str, "#!")) return null;
        // strip any comments off
        var it = std.mem.splitScalar(u8, str[2..], '#');
        const line = std.mem.trimEnd(u8, it.first(), &std.ascii.whitespace);
        if (std.mem.eql(u8, "srfv1", line)) return .magic;
        if (std.mem.eql(u8, "requireeof", line)) return .require_eof;
        if (std.mem.eql(u8, "requireof", line)) {
            try parseError(allocator, options, "#!requireof found. Did you mean #!requireeof?", state);
            return null;
        }
        if (std.mem.eql(u8, "eof", line)) return .eof;
        if (std.mem.eql(u8, "compact", line)) return .compact_format;
        if (std.mem.eql(u8, "long", line)) return .long_format;
        return null;
    }
};

/// The reader being parsed plus the current position, used in diagnostics.
pub const ParseState = struct {
    reader: *std.Io.Reader,
    line: usize,
    column: usize,

    pub fn format(self: ParseState, writer: *std.Io.Writer) std.Io.Writer.Error!void {
        try writer.print("line: {}, col: {}", .{ self.line, self.column });
    }
};

/// Parses srf data from `reader` into a list of records.
///
/// The first line must be the `#!srfv1` magic header. Further directives
/// (`#!long`, `#!compact`, `#!requireeof`, `#!eof`) may follow before the
/// data starts. In long format each line is one `key:type:value` item and an
/// empty line ends a record; otherwise items are separated by ',' and a
/// record ends at the end of its line.
///
/// Without `options.diagnostics` the first problem fails with
/// `ParseError.ParseFailed`. With diagnostics, problems are collected and
/// `ParseFailed` is returned at the end if any were recorded. `ParseFailed`
/// is also returned when `#!requireeof` was given but `#!eof` never seen.
///
/// On success the caller owns the result and must call `RecordList.deinit`
/// with the same allocator.
pub fn parse(reader: *std.Io.Reader, allocator: std.mem.Allocator, options: ParseOptions) ParseError!RecordList {
    var long_format = false; // Default to compact format
    var require_eof = false; // Default to no eof required
    var eof_found: bool = false;
    var state = ParseState{ .line = 0, .column = 0, .reader = reader };
    const first_line = nextLine(reader, &state, '\n') orelse return ParseError.ParseFailed;

    if (try Directive.parse(allocator, first_line, state, options)) |d| {
        if (d != .magic) try parseError(allocator, options, "Magic header not found on first line", state);
    } else try parseError(allocator, options, "Magic header not found on first line", state);

    // Loop through the header material and configure our main parsing
    var record_list: std.ArrayList(Record) = .empty;
    errdefer {
        for (record_list.items) |i| i.deinit(allocator);
        record_list.deinit(allocator);
    }
    const first_data = blk: {
        while (nextLine(reader, &state, '\n')) |header_line| {
            if (try Directive.parse(allocator, header_line, state, options)) |d| {
                switch (d) {
                    .magic => try parseError(allocator, options, "Found a duplicate magic header", state),
                    .long_format => long_format = true,
                    .compact_format => long_format = false, // what if we have both?
                    .require_eof => require_eof = true,
                    .eof => {
                        // there needs to be an eof then
                        if (nextLine(reader, &state, '\n')) |_| {
                            try parseError(allocator, options, "Data found after #!eof", state);
                            return ParseError.ParseFailed; // this is terminal
                        } else return .{ .items = try record_list.toOwnedSlice(allocator) };
                    },
                }
            } else break :blk header_line;
        }
        // Header only, no data at all
        return .{ .items = try record_list.toOwnedSlice(allocator) };
    };

    // Main parsing. We already have the first line of data, which could
    // be a record (compact format) or a key/value pair (long format)
    var line: ?[]const u8 = first_data;
    var items: std.ArrayList(Item) = .empty;
    errdefer {
        for (items.items) |i| i.deinit(allocator);
        items.deinit(allocator);
    }

    // In long format every item sits on its own line, so the value simply
    // runs to the end of the line. In compact format items are comma separated.
    const delimiter: u8 = if (long_format) '\n' else ',';
    while (line) |l| {
        if (std.mem.trim(u8, l, &std.ascii.whitespace).len == 0) {
            // Empty lines can be significant (to indicate a new record), but
            // only once a record is processed, which requires data first.
            // That record processing is at the bottom of the loop, so if an
            // empty line is detected here, we can safely ignore it
            line = nextLine(reader, &state, '\n');
            continue;
        }
        if (try Directive.parse(allocator, l, state, options)) |d| {
            switch (d) {
                .eof => {
                    // there needs to be an eof then
                    if (nextLine(reader, &state, '\n')) |_| {
                        try parseError(allocator, options, "Data found after #!eof", state);
                        return ParseError.ParseFailed; // this is terminal
                    } else {
                        eof_found = true;
                        break;
                    }
                },
                else => try parseError(allocator, options, "Directive found after data started", state),
            }
            // We only get here when diagnostics swallowed the error. Move on
            // to the next line, otherwise the same directive would be
            // reported over and over until the diagnostics limit is hit.
            line = nextLine(reader, &state, '\n');
            continue;
        }

        // Real data: lfg
        // Whatever the format, the beginning will always be the key data
        // key:stuff:value
        var it = std.mem.splitScalar(u8, l, ':');
        const key = it.next().?; // first one we get for free

        // `key` points into the reader's buffer, and parsing the value may
        // read more data (multi-line values), so take our copy up front.
        const owned_key = try allocator.dupe(u8, key);
        var key_stored = false;
        defer if (!key_stored) allocator.free(owned_key);

        const value = try ItemValue.parse(
            allocator,
            it.rest(),
            &state,
            delimiter,
            options,
        );
        if (!value.error_parsing) {
            // Until the append succeeds, the parsed value is still ours to free
            errdefer if (value.item_value) |v| v.deinit(allocator);
            try items.append(allocator, .{ .key = owned_key, .value = value.item_value });
            key_stored = true;
        }

        if (!value.column_set)
            state.column = key.len + value.ending_byte;

        if (long_format) {
            // Long format: the next item (or the record separator) is on the
            // next line
            const next_line = nextLine(reader, &state, '\n') orelse {
                // close out record, return
                try record_list.append(allocator, .{
                    .items = try items.toOwnedSlice(allocator),
                });
                break;
            };
            line = next_line;
            if (next_line.len == 0) {
                // End of record
                try record_list.append(allocator, .{
                    .items = try items.toOwnedSlice(allocator),
                });
                line = nextLine(reader, &state, '\n');
            }
        } else {
            // Compact format: keep going on the remainder of the same line
            // NOTE(review): compact format looks unfinished (its test below is
            // skipped). `state.column` does not appear to count the ':' after
            // the key or the trailing delimiter - confirm before relying on it.
            line = l[state.column..];
            if (line.?.len == 0) {
                // close out record
                try record_list.append(allocator, .{
                    .items = try items.toOwnedSlice(allocator),
                });
                line = nextLine(reader, &state, '\n');
            }
        }
    }
    // Parsing complete. Add final record to list. Then, if there are any parse errors, throw
    log.debug(
        "Parse complete. Records parsed so far: {d}, Items in array (>0 means final record): {d}",
        .{ record_list.items.len, items.items.len },
    );
    if (items.items.len > 0)
        try record_list.append(allocator, .{
            .items = try items.toOwnedSlice(allocator),
        });
    if (options.diagnostics) |d|
        if (d.errors.items.len > 0) return ParseError.ParseFailed;
    if (require_eof and !eof_found) return ParseError.ParseFailed;
    return .{ .items = try record_list.toOwnedSlice(allocator) };
}

/// Takes the next line, trimming leading whitespace and ignoring comments
/// Directives (comments starting with #!) are preserved
///
/// Returns `null` when the reader has no more lines or fails. The returned
/// slice comes from the reader, so it should be treated as valid only until
/// the next read. `state.line` is advanced for every line consumed, skipped
/// comments included, and `state.column` is reset to 0.
fn nextLine(reader: *std.Io.Reader, state: *ParseState, delimiter: u8) ?[]const u8 {
    while (true) {
        state.line += 1;
        state.column = 0;
        const raw_line = (reader.takeDelimiter(delimiter) catch return null) orelse return null;
        // we don't want to trim the end, as there might be a key/value field
        // with a string including important trailing whitespace
        const trimmed_line = std.mem.trimStart(u8, raw_line, &std.ascii.whitespace);
        if (std.mem.startsWith(u8, trimmed_line, "#") and !std.mem.startsWith(u8, trimmed_line, "#!")) continue;
        return trimmed_line;
    }
}

/// Reports a parse problem at the position held in `state`.
///
/// With diagnostics configured, a copy of `message` is added to them and
/// parsing may continue, unless the diagnostics limit is reached, in which
/// case `ParseFailed` is returned. Without diagnostics this always returns
/// `ParseError.ParseFailed`.
inline fn parseError(allocator: std.mem.Allocator, options: ParseOptions, message: []const u8, state: ParseState) ParseError!void {
    log.debug("Parse error. Parse state {f}, message: {s}", .{ state, message });
    if (options.diagnostics) |d| {
        try d.addError(allocator, .{
            .message = try allocator.dupe(u8, message),
            .level = .err,
            .line = state.line,
            .column = state.column,
        });
    } else {
        return ParseError.ParseFailed;
    }
}

test "long format single record, no eof" {
    const data =
        \\#!srfv1 # mandatory comment with format and version. Parser instructions start with #!
        \\#!long # Mandatory to use multiline records, compact format is optional #!compact
        \\# A comment
        \\# empty lines ignored
        \\
        \\key::string value, with any data except a \n. an optional string length between the colons
    ;

    const allocator = std.testing.allocator;
    var reader = std.Io.Reader.fixed(data);
    const records = try parse(&reader, allocator, .{});
    defer records.deinit(allocator);
    try std.testing.expectEqual(@as(usize, 1), records.items.len);
    try std.testing.expectEqual(@as(usize, 1), records.items[0].items.len);
    const kvps = records.items[0].items;
    try std.testing.expectEqualStrings("key", kvps[0].key);
    try std.testing.expectEqualStrings("string value, with any data except a \\n. an optional string length between the colons", kvps[0].value.?.string);
}

test "long format from README - generic data structures, first record only" {
    const data =
        \\#!srfv1 # mandatory comment with format and version. Parser instructions start with #!
        \\#!requireeof # Set this if you want parsing to fail when #!eof not present on last line
        \\#!long # Mandatory to use multiline records, compact format is optional #!compact
        \\# A comment
        \\# empty lines ignored
        \\
        \\this is a number:num: 5
        \\#!eof
    ;

    const allocator = std.testing.allocator;
    var reader = std.Io.Reader.fixed(data);
    const records = try parse(&reader, allocator, .{});
    defer records.deinit(allocator);
    try std.testing.expectEqual(@as(usize, 1), records.items.len);
}

test "long format from README - generic data structures" {
    const data =
        \\#!srfv1 # mandatory comment with format and version. Parser instructions start with #!
        \\#!requireeof # Set this if you want parsing to fail when #!eof not present on last line
        \\#!long # Mandatory to use multiline records, compact format is optional #!compact
        \\# A comment
        \\# empty lines ignored
        \\
        \\key::string value, with any data except a \n. an optional string length between the colons
        \\this is a number:num: 5
        \\null value:null:
        \\array::array's don't exist. Use json or toml or something
        \\data with newlines must have a length:7:foo
        \\bar
        \\boolean value:bool:false
        \\ # Empty line separates records
        \\
        \\key::this is the second record
        \\this is a number:num:42
        \\null value:null:
        \\array::array's still don't exist
        \\data with newlines must have a length::single line
        \\#!eof # eof marker, useful to make sure your file wasn't cut in half. Only considered if requireeof set at top
    ;

    const allocator = std.testing.allocator;
    var reader = std.Io.Reader.fixed(data);
    const records = try parse(&reader, allocator, .{});
    defer records.deinit(allocator);
    try std.testing.expectEqual(@as(usize, 2), records.items.len);
    const first = records.items[0];
    try std.testing.expectEqual(@as(usize, 6), first.items.len);
    try std.testing.expectEqualStrings("key", first.items[0].key);
    try std.testing.expectEqualStrings("string value, with any data except a \\n. an optional string length between the colons", first.items[0].value.?.string);
    try std.testing.expectEqualStrings("this is a number", first.items[1].key);
    try std.testing.expectEqual(@as(f128, 5), first.items[1].value.?.number);
    try std.testing.expectEqualStrings("null value", first.items[2].key);
    try std.testing.expect(first.items[2].value == null);
    try std.testing.expectEqualStrings("array", first.items[3].key);
    try std.testing.expectEqualStrings("array's don't exist. Use json or toml or something", first.items[3].value.?.string);
    try std.testing.expectEqualStrings("data with newlines must have a length", first.items[4].key);
    try std.testing.expectEqualStrings("foo\nbar", first.items[4].value.?.string);
    try std.testing.expectEqualStrings("boolean value", first.items[5].key);
    try std.testing.expect(!first.items[5].value.?.boolean);

    const second = records.items[1];
    try std.testing.expectEqual(@as(usize, 5), second.items.len);
    try std.testing.expectEqualStrings("key", second.items[0].key);
    try std.testing.expectEqualStrings("this is the second record", second.items[0].value.?.string);
    try std.testing.expectEqualStrings("this is a number", second.items[1].key);
    try std.testing.expectEqual(@as(f128, 42), second.items[1].value.?.number);
    try std.testing.expectEqualStrings("null value", second.items[2].key);
    try std.testing.expect(second.items[2].value == null);
    try std.testing.expectEqualStrings("array", second.items[3].key);
    try std.testing.expectEqualStrings("array's still don't exist", second.items[3].value.?.string);
    try std.testing.expectEqualStrings("data with newlines must have a length", second.items[4].key);
    try std.testing.expectEqualStrings("single line", second.items[4].value.?.string);
}

test "length-prefixed value that exactly fills its line" {
    const data =
        \\#!srfv1
        \\#!long
        \\name:3:foo
        \\other::bar
    ;

    const allocator = std.testing.allocator;
    var reader = std.Io.Reader.fixed(data);
    const records = try parse(&reader, allocator, .{});
    // deinit frees every string value, so this also checks that the
    // length-prefixed value is heap-owned
    defer records.deinit(allocator);
    try std.testing.expectEqual(@as(usize, 1), records.items.len);
    const kvps = records.items[0].items;
    try std.testing.expectEqual(@as(usize, 2), kvps.len);
    try std.testing.expectEqualStrings("name", kvps[0].key);
    try std.testing.expectEqualStrings("foo", kvps[0].value.?.string);
    try std.testing.expectEqualStrings("other", kvps[1].key);
    try std.testing.expectEqualStrings("bar", kvps[1].value.?.string);
}

test "directive after data is reported once when diagnostics are enabled" {
    const data =
        \\#!srfv1
        \\#!long
        \\key::value
        \\#!long
        \\other::x
    ;

    const allocator = std.testing.allocator;
    var errors: std.ArrayList(ParseLineError) = .empty;
    var diagnostics: Diagnostics = .{ .errors = &errors };
    defer diagnostics.deinit(allocator);
    var reader = std.Io.Reader.fixed(data);
    try std.testing.expectError(
        ParseError.ParseFailed,
        parse(&reader, allocator, .{ .diagnostics = &diagnostics }),
    );
    try std.testing.expectEqual(@as(usize, 1), errors.items.len);
    try std.testing.expectEqualStrings("Directive found after data started", errors.items[0].message);
}

test "compact format from README - generic data structures" {
    const lvl = std.testing.log_level;
    defer std.testing.log_level = lvl;
    std.testing.log_level = .debug;
    if (true) return error.SkipZigTest;
    const data =
        \\#!srfv1 # mandatory comment with format and version. Parser instructions start with #!
        \\key::string value must have a length between colons or end with a comma,this is a number:num:5 ,null value:null:,array::array's don't exist. Use json or toml or something,data with newlines must have a length:7:foo
        \\bar,boolean value:bool:false
        \\key::this is the second record
    ;

    const allocator = std.testing.allocator;
    var reader = std.Io.Reader.fixed(data);
    // We want "parse" and "parseLeaky" probably. Second parameter is a diagnostics
    const records = try parse(&reader, allocator, .{});
    defer records.deinit(allocator);
    try std.testing.expectEqual(@as(usize, 2), records.items.len);
}