add alloc_strings parse option/update docs

2026-01-24 13:32:25 -08:00 · 2026-01-24 13:32:25 -08:00 · 4fbc08230e
commit 4fbc08230e
parent 0e1d8bd424
3 changed files with 45 additions and 41 deletions
--- a/README.md
+++ b/README.md
@ -1,6 +1,6 @@
 # SRF (Simple Record Format)
-SRF is a minimal data format designed for L2 caches and simple structured storage suitable for simple configuration as well. It provides human-readable key-value records with basic type hints, while avoiding the parsing complexity and escaping requirements of JSON.
+SRF is a minimal data format designed for L2 caches and simple structured storage suitable for simple configuration as well. It provides human-readable key-value records with basic type hints, while avoiding the parsing complexity and escaping requirements of JSON. Current benchmarking with hyperfine demonstrate approximately twice the performance of JSON parsing, though for L2 caches, JSON may be a poor choice. Compared to jsonl, it is approximately 40x faster. Performance also improves by 8% if you instruct the library not to copy strings around (ParseOptions alloc_strings = false).
 **Features:**
 - No escaping required - use length-prefixed strings for complex data
@ -54,23 +54,6 @@ bar,boolean value:bool:false
 key::this is the second record
 ```
 Second record problem...these
 ## Implementation
 **Note:** Long format may be tabled for later development. Initial implementation will focus on compact format only.
 Unrecognized `#!<keyword>` should be an error:
  requireof -> requireeof will probably be a common problem.
 #! anywhere other than the beginning or end is an error
 newline separates records in compact format. An empty line is required in long format
 comma separates fields in compact format
 newline separates fields in long format
 Should we have a #!hash directive to include all data not starting with `#!` ?
 ## Implementation Concerns
 **Parser robustness:**
--- a/src/main.zig
+++ b/src/main.zig
@ -77,7 +77,7 @@ pub fn main() !void {
    if (std.mem.eql(u8, format, "srf")) {
        var reader = std.Io.Reader.fixed(data.items);
-        const records = try srf.parse(&reader, allocator, .{});
+        const records = try srf.parse(&reader, allocator, .{ .alloc_strings = false });
        defer records.deinit();
    } else if (std.mem.eql(u8, format, "jsonl")) {
        var lines = std.mem.splitScalar(u8, data.items, '\n');
--- a/src/srf.zig
+++ b/src/srf.zig
@ -68,13 +68,6 @@ pub const ItemValue = union(enum) {
            .boolean => try writer.print("boolean: {}", .{self.boolean}),
        }
    }
    pub fn deinit(self: ItemValue, allocator: std.mem.Allocator) void {
        switch (self) {
            .number, .boolean => {},
            .bytes => |b| allocator.free(b),
            .string => |s| allocator.free(s),
        }
    }
    pub fn parse(allocator: std.mem.Allocator, str: []const u8, state: *ParseState, delimiter: u8, options: ParseOptions) ParseError!ItemValueWithMetaData {
        const type_val_sep_raw = std.mem.indexOfScalar(u8, str, ':');
        if (type_val_sep_raw == null) {
@ -94,7 +87,7 @@ pub const ItemValue = union(enum) {
            state.column += total_chars;
            state.partial_line_column += total_chars;
            return .{
-                .item_value = .{ .string = try allocator.dupe(u8, val) },
+                .item_value = .{ .string = try dupe(allocator, options, val) },
            };
        }
        if (std.mem.eql(u8, "binary", trimmed_meta)) {
@ -212,7 +205,7 @@ pub const ItemValue = union(enum) {
        // This is not enough, we need more data from the reader
        log.debug("item value includes newlines {f}", .{state});
        // We need to advance the reader, so we need a copy of what we have so fa
-        const start = try allocator.dupe(u8, rest_of_data);
+        const start = try dupe(allocator, options, rest_of_data);
        defer allocator.free(start);
        // We won't do a parseError here. If we have an allocation error, read
        // error, or end of stream, all of these are fatal. Our reader is currently
@ -244,27 +237,44 @@ pub const ItemValue = union(enum) {
    }
 };
 // An item has a key and a value, but the value may be null
 pub const Item = struct {
    key: []const u8,
    value: ?ItemValue,
    pub fn deinit(self: Item, allocator: std.mem.Allocator) void {
        // std.debug.print("item deinit, key {s}, val: {?f}\n", .{ self.key, self.value });
        allocator.free(self.key);
        if (self.value) |v|
            v.deinit(allocator);
    }
 };
 // A record has a list of items, with no assumptions regarding duplication,
 // etc. This is for parsing speed, but also for more flexibility in terms of
 // use cases. One can make a defacto array out of this structure by having
 // something like:
 //
 // arr:string:foo
 // arr:string:bar
 //
 // and when you coerce to zig struct have an array .arr that gets populated
 // with strings "foo" and "bar".
 pub const Record = struct {
    items: []Item,
    pub fn deinit(self: Record, allocator: std.mem.Allocator) void {
        for (self.items) |i| i.deinit(allocator);
        allocator.free(self.items);
    }
 };
 /// The RecordList is equivalent to Parsed(T) in std.json. Since most are
 /// familiar with std.json, it differs in the following ways:
 ///
 /// There is a list field instead of a value field. In json, one type of
 /// value is an array. SRF does not have an array data type, but the set of
 /// records is an array. json as a format is structred as a single object at
 /// the outermost
 ///
 /// This is not generic. In SRF, it is a separate function to bind the list
 /// of records to a specific data type. This will add some (hopefully minimal)
 /// overhead, but also avoid conflating parsing from the coercion from general
 /// type to specifics, and avoids answering questions like "what if I have
 /// 15 values for the same key" until you're actually dealing with that problem
 /// (see std.json.ParseOptions duplicate_field_behavior and ignore_unknown_fields)
 ///
 /// When implemented, there will include a pub fn bind(self: RecordList, comptime T: type, options, BindOptions) BindError![]T
 /// function. The options will include things related to duplicate handling and
 /// missing fields
 pub const RecordList = struct {
    list: std.ArrayList(Record),
    arena: *std.heap.ArenaAllocator,
@ -282,6 +292,12 @@ pub const RecordList = struct {
 pub const ParseOptions = struct {
    diagnostics: ?*Diagnostics = null,
    /// By default, the parser will copy data so it is safe to free the original
    /// This will impose about 8% overhead, but be safer. If you do not require
    /// this safety, set alloc_strings to false. Setting this to false is the
    /// equivalent of the "Leaky" parsing functions of std.json
    alloc_strings: bool = true,
 };
 const Directive = union(enum) {
@ -498,11 +514,16 @@ fn nextLine(reader: *std.Io.Reader, state: *ParseState) ?[]const u8 {
    }
 }
 inline fn dupe(allocator: std.mem.Allocator, options: ParseOptions, data: []const u8) ParseError![]const u8 {
    if (options.alloc_strings)
        return try allocator.dupe(u8, data);
    return data;
 }
 inline fn parseError(allocator: std.mem.Allocator, options: ParseOptions, message: []const u8, state: ParseState) ParseError!void {
    log.debug("Parse error. Parse state {f}, message: {s}", .{ state, message });
    if (options.diagnostics) |d| {
        try d.addError(allocator, .{
-            .message = try allocator.dupe(u8, message),
+            .message = try dupe(allocator, options, message),
            .level = .err,
            .line = state.line,
            .column = state.column,