use arena in parse method - no perf change

This commit is contained in:
Emil Lerch 2026-01-23 15:57:28 -08:00
parent b37fb7fb1a
commit 415aa30f75
Signed by: lobo
GPG key ID: A7B62D657EF764F8
2 changed files with 64 additions and 55 deletions

View file

@ -85,10 +85,7 @@ pub fn main() !void {
var reader = std.Io.Reader.fixed(data.items); var reader = std.Io.Reader.fixed(data.items);
const records = try srf.parse(&reader, srf_allocator, .{}); const records = try srf.parse(&reader, srf_allocator, .{});
defer { defer records.deinit();
for (records.items) |r| r.deinit(srf_allocator);
srf_allocator.free(records.items);
}
} else if (std.mem.eql(u8, format, "jsonl")) { } else if (std.mem.eql(u8, format, "jsonl")) {
var lines = std.mem.splitScalar(u8, data.items, '\n'); var lines = std.mem.splitScalar(u8, data.items, '\n');
while (lines.next()) |line| { while (lines.next()) |line| {

View file

@ -16,6 +16,7 @@ pub const ParseLineError = struct {
pub const Diagnostics = struct { pub const Diagnostics = struct {
errors: *std.ArrayList(ParseLineError), errors: *std.ArrayList(ParseLineError),
stop_after: usize = 10, stop_after: usize = 10,
arena: std.heap.ArenaAllocator,
pub fn addError(self: Diagnostics, allocator: std.mem.Allocator, err: ParseLineError) ParseError!void { pub fn addError(self: Diagnostics, allocator: std.mem.Allocator, err: ParseLineError) ParseError!void {
if (self.errors.items.len >= self.stop_after) { if (self.errors.items.len >= self.stop_after) {
@ -24,9 +25,14 @@ pub const Diagnostics = struct {
} }
try self.errors.append(allocator, err); try self.errors.append(allocator, err);
} }
pub fn deinit(self: Diagnostics, allocator: std.mem.Allocator) void { pub fn deinit(self: RecordList) void {
for (self.errors) |e| e.deinit(allocator); // From parse, three things can happen:
self.errors.deinit(allocator); // 1. Happy path - record comes back, deallocation happens on that deinit
// 2. Errors are returned, no diagnostics provided. Deallocation happens in parse on errdefer
// 3. Errors are returned, diagnostics provided. Deallocation happens here
const child_allocator = self.arena.child_allocator;
self.arena.deinit();
child_allocator.destroy(self.arena);
} }
}; };
@ -260,12 +266,13 @@ pub const Record = struct {
}; };
pub const RecordList = struct { pub const RecordList = struct {
items: []Record, list: std.ArrayList(Record),
arena: *std.heap.ArenaAllocator,
pub fn deinit(self: RecordList, allocator: std.mem.Allocator) void { pub fn deinit(self: RecordList) void {
for (self.items) |r| const child_allocator = self.arena.child_allocator;
r.deinit(allocator); self.arena.deinit();
allocator.free(self.items); child_allocator.destroy(self.arena);
} }
pub fn format(self: RecordList, writer: *std.Io.Writer) std.Io.Writer.Error!void { pub fn format(self: RecordList, writer: *std.Io.Writer) std.Io.Writer.Error!void {
_ = self; _ = self;
@ -312,41 +319,46 @@ pub const ParseState = struct {
} }
}; };
pub fn parse(reader: *std.Io.Reader, allocator: std.mem.Allocator, options: ParseOptions) ParseError!RecordList { pub fn parse(reader: *std.Io.Reader, allocator: std.mem.Allocator, options: ParseOptions) ParseError!RecordList {
// create an arena allocator for everything related to parsing
const arena: *std.heap.ArenaAllocator = try allocator.create(std.heap.ArenaAllocator);
errdefer if (options.diagnostics == null) allocator.destroy(arena);
arena.* = .init(allocator);
errdefer if (options.diagnostics == null) arena.deinit();
const aa = arena.allocator();
var long_format = false; // Default to compact format var long_format = false; // Default to compact format
var require_eof = false; // Default to no eof required var require_eof = false; // Default to no eof required
var eof_found: bool = false; var eof_found: bool = false;
var state = ParseState{ .line = 0, .column = 0, .partial_line_column = 0, .reader = reader }; var state = ParseState{ .line = 0, .column = 0, .partial_line_column = 0, .reader = reader };
const first_line = nextLine(reader, &state) orelse return ParseError.ParseFailed; const first_line = nextLine(reader, &state) orelse return ParseError.ParseFailed;
if (try Directive.parse(allocator, first_line, state, options)) |d| { if (try Directive.parse(aa, first_line, state, options)) |d| {
if (d != .magic) try parseError(allocator, options, "Magic header not found on first line", state); if (d != .magic) try parseError(aa, options, "Magic header not found on first line", state);
} else try parseError(allocator, options, "Magic header not found on first line", state); } else try parseError(aa, options, "Magic header not found on first line", state);
// Loop through the header material and configure our main parsing // Loop through the header material and configure our main parsing
var record_list: std.ArrayList(Record) = .empty; var parsed: RecordList = .{
errdefer { .list = .empty,
for (record_list.items) |i| i.deinit(allocator); .arena = arena,
record_list.deinit(allocator); };
}
const first_data = blk: { const first_data = blk: {
while (nextLine(reader, &state)) |line| { while (nextLine(reader, &state)) |line| {
if (try Directive.parse(allocator, line, state, options)) |d| { if (try Directive.parse(aa, line, state, options)) |d| {
switch (d) { switch (d) {
.magic => try parseError(allocator, options, "Found a duplicate magic header", state), .magic => try parseError(aa, options, "Found a duplicate magic header", state),
.long_format => long_format = true, .long_format => long_format = true,
.compact_format => long_format = false, // what if we have both? .compact_format => long_format = false, // what if we have both?
.require_eof => require_eof = true, .require_eof => require_eof = true,
.eof => { .eof => {
// there needs to be an eof then // there needs to be an eof then
if (nextLine(reader, &state)) |_| { if (nextLine(reader, &state)) |_| {
try parseError(allocator, options, "Data found after #!eof", state); try parseError(aa, options, "Data found after #!eof", state);
return ParseError.ParseFailed; // this is terminal return ParseError.ParseFailed; // this is terminal
} else return .{ .items = try record_list.toOwnedSlice(allocator) }; } else return parsed;
}, },
} }
} else break :blk line; } else break :blk line;
} }
return .{ .items = try record_list.toOwnedSlice(allocator) }; return parsed;
}; };
// Main parsing. We already have the first line of data, which could // Main parsing. We already have the first line of data, which could
@ -354,8 +366,8 @@ pub fn parse(reader: *std.Io.Reader, allocator: std.mem.Allocator, options: Pars
var line: ?[]const u8 = first_data; var line: ?[]const u8 = first_data;
var items: std.ArrayList(Item) = .empty; var items: std.ArrayList(Item) = .empty;
errdefer { errdefer {
for (items.items) |i| i.deinit(allocator); for (items.items) |i| i.deinit(aa);
items.deinit(allocator); items.deinit(aa);
} }
// Because in long format we don't have newline delimiter, that should really be a noop // Because in long format we don't have newline delimiter, that should really be a noop
@ -372,19 +384,19 @@ pub fn parse(reader: *std.Io.Reader, allocator: std.mem.Allocator, options: Pars
line = nextLine(reader, &state); line = nextLine(reader, &state);
continue; continue;
} }
if (try Directive.parse(allocator, l, state, options)) |d| { if (try Directive.parse(aa, l, state, options)) |d| {
switch (d) { switch (d) {
.eof => { .eof => {
// there needs to be an eof then // there needs to be an eof then
if (nextLine(reader, &state)) |_| { if (nextLine(reader, &state)) |_| {
try parseError(allocator, options, "Data found after #!eof", state); try parseError(aa, options, "Data found after #!eof", state);
return ParseError.ParseFailed; // this is terminal return ParseError.ParseFailed; // this is terminal
} else { } else {
eof_found = true; eof_found = true;
break; break;
} }
}, },
else => try parseError(allocator, options, "Directive found after data started", state), else => try parseError(aa, options, "Directive found after data started", state),
} }
continue; continue;
} }
@ -398,7 +410,7 @@ pub fn parse(reader: *std.Io.Reader, allocator: std.mem.Allocator, options: Pars
state.column += key.len + 1; state.column += key.len + 1;
state.partial_line_column += key.len + 1; state.partial_line_column += key.len + 1;
const value = try ItemValue.parse( const value = try ItemValue.parse(
allocator, aa,
it.rest(), it.rest(),
&state, &state,
delimiter, delimiter,
@ -407,7 +419,7 @@ pub fn parse(reader: *std.Io.Reader, allocator: std.mem.Allocator, options: Pars
if (!value.error_parsing) { if (!value.error_parsing) {
// std.debug.print("alloc on key: {s}, val: {?f}\n", .{ key, value.item_value }); // std.debug.print("alloc on key: {s}, val: {?f}\n", .{ key, value.item_value });
try items.append(allocator, .{ .key = try allocator.dupe(u8, key), .value = value.item_value }); try items.append(aa, .{ .key = try aa.dupe(u8, key), .value = value.item_value });
} }
if (value.reader_advanced and !long_format) { if (value.reader_advanced and !long_format) {
@ -426,16 +438,16 @@ pub fn parse(reader: *std.Io.Reader, allocator: std.mem.Allocator, options: Pars
const maybe_line = nextLine(reader, &state); const maybe_line = nextLine(reader, &state);
if (maybe_line == null) { if (maybe_line == null) {
// close out record, return // close out record, return
try record_list.append(allocator, .{ try parsed.list.append(aa, .{
.items = try items.toOwnedSlice(allocator), .items = try items.toOwnedSlice(aa),
}); });
break; break;
} }
line = maybe_line.?; line = maybe_line.?;
if (line.?.len == 0) { if (line.?.len == 0) {
// End of record // End of record
try record_list.append(allocator, .{ try parsed.list.append(aa, .{
.items = try items.toOwnedSlice(allocator), .items = try items.toOwnedSlice(aa),
}); });
line = nextLine(reader, &state); line = nextLine(reader, &state);
} }
@ -445,8 +457,8 @@ pub fn parse(reader: *std.Io.Reader, allocator: std.mem.Allocator, options: Pars
state.partial_line_column = 0; state.partial_line_column = 0;
if (line.?.len == 0) { if (line.?.len == 0) {
// close out record // close out record
try record_list.append(allocator, .{ try parsed.list.append(aa, .{
.items = try items.toOwnedSlice(allocator), .items = try items.toOwnedSlice(aa),
}); });
line = nextLine(reader, &state); line = nextLine(reader, &state);
state.partial_line_column = 0; state.partial_line_column = 0;
@ -461,13 +473,13 @@ pub fn parse(reader: *std.Io.Reader, allocator: std.mem.Allocator, options: Pars
} }
// Parsing complete. Add final record to list. Then, if there are any parse errors, throw // Parsing complete. Add final record to list. Then, if there are any parse errors, throw
if (items.items.len > 0) if (items.items.len > 0)
try record_list.append(allocator, .{ try parsed.list.append(aa, .{
.items = try items.toOwnedSlice(allocator), .items = try items.toOwnedSlice(aa),
}); });
if (options.diagnostics) |d| if (options.diagnostics) |d|
if (d.errors.items.len > 0) return ParseError.ParseFailed; if (d.errors.items.len > 0) return ParseError.ParseFailed;
if (require_eof and !eof_found) return ParseError.ParseFailed; if (require_eof and !eof_found) return ParseError.ParseFailed;
return .{ .items = try record_list.toOwnedSlice(allocator) }; return parsed;
} }
/// Takes the next line, trimming leading whitespace and ignoring comments /// Takes the next line, trimming leading whitespace and ignoring comments
@ -513,10 +525,10 @@ test "long format single record, no eof" {
const allocator = std.testing.allocator; const allocator = std.testing.allocator;
var reader = std.Io.Reader.fixed(data); var reader = std.Io.Reader.fixed(data);
const records = try parse(&reader, allocator, .{}); const records = try parse(&reader, allocator, .{});
defer records.deinit(allocator); defer records.deinit();
try std.testing.expectEqual(@as(usize, 1), records.items.len); try std.testing.expectEqual(@as(usize, 1), records.list.items.len);
try std.testing.expectEqual(@as(usize, 1), records.items[0].items.len); try std.testing.expectEqual(@as(usize, 1), records.list.items[0].items.len);
const kvps = records.items[0].items; const kvps = records.list.items[0].items;
try std.testing.expectEqualStrings("key", kvps[0].key); try std.testing.expectEqualStrings("key", kvps[0].key);
try std.testing.expectEqualStrings("string value, with any data except a \\n. an optional string length between the colons", kvps[0].value.?.string); try std.testing.expectEqualStrings("string value, with any data except a \\n. an optional string length between the colons", kvps[0].value.?.string);
} }
@ -535,8 +547,8 @@ test "long format from README - generic data structures, first record only" {
const allocator = std.testing.allocator; const allocator = std.testing.allocator;
var reader = std.Io.Reader.fixed(data); var reader = std.Io.Reader.fixed(data);
const records = try parse(&reader, allocator, .{}); const records = try parse(&reader, allocator, .{});
defer records.deinit(allocator); defer records.deinit();
try std.testing.expectEqual(@as(usize, 1), records.items.len); try std.testing.expectEqual(@as(usize, 1), records.list.items.len);
} }
test "long format from README - generic data structures" { test "long format from README - generic data structures" {
@ -567,9 +579,9 @@ test "long format from README - generic data structures" {
const allocator = std.testing.allocator; const allocator = std.testing.allocator;
var reader = std.Io.Reader.fixed(data); var reader = std.Io.Reader.fixed(data);
const records = try parse(&reader, allocator, .{}); const records = try parse(&reader, allocator, .{});
defer records.deinit(allocator); defer records.deinit();
try std.testing.expectEqual(@as(usize, 2), records.items.len); try std.testing.expectEqual(@as(usize, 2), records.list.items.len);
const first = records.items[0]; const first = records.list.items[0];
try std.testing.expectEqual(@as(usize, 6), first.items.len); try std.testing.expectEqual(@as(usize, 6), first.items.len);
try std.testing.expectEqualStrings("key", first.items[0].key); try std.testing.expectEqualStrings("key", first.items[0].key);
try std.testing.expectEqualStrings("string value, with any data except a \\n. an optional string length between the colons", first.items[0].value.?.string); try std.testing.expectEqualStrings("string value, with any data except a \\n. an optional string length between the colons", first.items[0].value.?.string);
@ -584,7 +596,7 @@ test "long format from README - generic data structures" {
try std.testing.expectEqualStrings("boolean value", first.items[5].key); try std.testing.expectEqualStrings("boolean value", first.items[5].key);
try std.testing.expect(!first.items[5].value.?.boolean); try std.testing.expect(!first.items[5].value.?.boolean);
const second = records.items[1]; const second = records.list.items[1];
try std.testing.expectEqual(@as(usize, 5), second.items.len); try std.testing.expectEqual(@as(usize, 5), second.items.len);
try std.testing.expectEqualStrings("key", second.items[0].key); try std.testing.expectEqualStrings("key", second.items[0].key);
try std.testing.expectEqualStrings("this is the second record", second.items[0].value.?.string); try std.testing.expectEqualStrings("this is the second record", second.items[0].value.?.string);
@ -610,9 +622,9 @@ test "compact format from README - generic data structures" {
var reader = std.Io.Reader.fixed(data); var reader = std.Io.Reader.fixed(data);
// We want "parse" and "parseLeaky" probably. Second parameter is a diagnostics // We want "parse" and "parseLeaky" probably. Second parameter is a diagnostics
const records = try parse(&reader, allocator, .{}); const records = try parse(&reader, allocator, .{});
defer records.deinit(allocator); defer records.deinit();
try std.testing.expectEqual(@as(usize, 2), records.items.len); try std.testing.expectEqual(@as(usize, 2), records.list.items.len);
const first = records.items[0]; const first = records.list.items[0];
try std.testing.expectEqual(@as(usize, 6), first.items.len); try std.testing.expectEqual(@as(usize, 6), first.items.len);
try std.testing.expectEqualStrings("key", first.items[0].key); try std.testing.expectEqualStrings("key", first.items[0].key);
try std.testing.expectEqualStrings("string value must have a length between colons or end with a comma", first.items[0].value.?.string); try std.testing.expectEqualStrings("string value must have a length between colons or end with a comma", first.items[0].value.?.string);
@ -627,7 +639,7 @@ test "compact format from README - generic data structures" {
try std.testing.expectEqualStrings("boolean value", first.items[5].key); try std.testing.expectEqualStrings("boolean value", first.items[5].key);
try std.testing.expect(!first.items[5].value.?.boolean); try std.testing.expect(!first.items[5].value.?.boolean);
const second = records.items[1]; const second = records.list.items[1];
try std.testing.expectEqual(@as(usize, 1), second.items.len); try std.testing.expectEqual(@as(usize, 1), second.items.len);
try std.testing.expectEqualStrings("key", second.items[0].key); try std.testing.expectEqualStrings("key", second.items[0].key);
try std.testing.expectEqualStrings("this is the second record", second.items[0].value.?.string); try std.testing.expectEqualStrings("this is the second record", second.items[0].value.?.string);