iterator - not optimized
All checks were successful
Generic zig build / build (push) Successful in 25s
All checks were successful
Generic zig build / build (push) Successful in 25s
This commit is contained in:
parent
95036e83e2
commit
1a47ad0ad2
1 changed files with 313 additions and 188 deletions
501
src/srf.zig
501
src/srf.zig
|
|
@ -43,7 +43,7 @@ pub const Diagnostics = struct {
|
||||||
}
|
}
|
||||||
try self.errors.append(allocator, err);
|
try self.errors.append(allocator, err);
|
||||||
}
|
}
|
||||||
pub fn deinit(self: Parsed) void {
|
pub fn deinit(self: Diagnostics) void {
|
||||||
// From parse, three things can happen:
|
// From parse, three things can happen:
|
||||||
// 1. Happy path - record comes back, deallocation happens on that deinit
|
// 1. Happy path - record comes back, deallocation happens on that deinit
|
||||||
// 2. An error is returned, no diagnostics provided. Deallocation happens in parse on errdefer
|
// 2. An error is returned, no diagnostics provided. Deallocation happens in parse on errdefer
|
||||||
|
|
@ -86,12 +86,12 @@ pub const Value = union(enum) {
|
||||||
// .boolean => try writer.print("boolean: {}", .{self.boolean}),
|
// .boolean => try writer.print("boolean: {}", .{self.boolean}),
|
||||||
// }
|
// }
|
||||||
// }
|
// }
|
||||||
pub fn parse(allocator: std.mem.Allocator, str: []const u8, state: *ParseState, delimiter: u8, options: ParseOptions) ParseError!ValueWithMetaData {
|
pub fn parse(allocator: std.mem.Allocator, str: []const u8, state: *RecordIterator.State, delimiter: u8) ParseError!ValueWithMetaData {
|
||||||
const debug = str.len > 2 and str[0] == '1' and str[1] == '1';
|
const debug = str.len > 2 and str[0] == '1' and str[1] == '1';
|
||||||
if (debug) log.debug("parsing {s}", .{str});
|
if (debug) log.debug("parsing {s}", .{str});
|
||||||
const type_val_sep_raw = std.mem.indexOfScalar(u8, str, ':');
|
const type_val_sep_raw = std.mem.indexOfScalar(u8, str, ':');
|
||||||
if (type_val_sep_raw == null) {
|
if (type_val_sep_raw == null) {
|
||||||
try parseError(allocator, options, "no type data or value after key", state.*);
|
try parseError(allocator, "no type data or value after key", state.*);
|
||||||
return ParseError.ParseFailed;
|
return ParseError.ParseFailed;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -107,7 +107,7 @@ pub const Value = union(enum) {
|
||||||
state.column += total_chars;
|
state.column += total_chars;
|
||||||
state.partial_line_column += total_chars;
|
state.partial_line_column += total_chars;
|
||||||
return .{
|
return .{
|
||||||
.item_value = .{ .string = try dupe(allocator, options, val) },
|
.item_value = .{ .string = try dupe(allocator, state.options, val) },
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
if (std.mem.eql(u8, "binary", trimmed_meta)) {
|
if (std.mem.eql(u8, "binary", trimmed_meta)) {
|
||||||
|
|
@ -121,7 +121,7 @@ pub const Value = union(enum) {
|
||||||
state.partial_line_column += total_chars;
|
state.partial_line_column += total_chars;
|
||||||
const Decoder = std.base64.standard.Decoder;
|
const Decoder = std.base64.standard.Decoder;
|
||||||
const size = Decoder.calcSizeForSlice(val) catch {
|
const size = Decoder.calcSizeForSlice(val) catch {
|
||||||
try parseError(allocator, options, "error parsing base64 value", state.*);
|
try parseError(allocator, "error parsing base64 value", state.*);
|
||||||
return .{
|
return .{
|
||||||
.item_value = null,
|
.item_value = null,
|
||||||
.error_parsing = true,
|
.error_parsing = true,
|
||||||
|
|
@ -130,7 +130,7 @@ pub const Value = union(enum) {
|
||||||
const data = try allocator.alloc(u8, size);
|
const data = try allocator.alloc(u8, size);
|
||||||
errdefer allocator.free(data);
|
errdefer allocator.free(data);
|
||||||
Decoder.decode(data, val) catch {
|
Decoder.decode(data, val) catch {
|
||||||
try parseError(allocator, options, "error parsing base64 value", state.*);
|
try parseError(allocator, "error parsing base64 value", state.*);
|
||||||
allocator.free(data);
|
allocator.free(data);
|
||||||
return .{
|
return .{
|
||||||
.item_value = null,
|
.item_value = null,
|
||||||
|
|
@ -151,7 +151,7 @@ pub const Value = union(enum) {
|
||||||
state.partial_line_column += total_chars;
|
state.partial_line_column += total_chars;
|
||||||
const val_trimmed = std.mem.trim(u8, val, &std.ascii.whitespace);
|
const val_trimmed = std.mem.trim(u8, val, &std.ascii.whitespace);
|
||||||
const number = std.fmt.parseFloat(@FieldType(Value, "number"), val_trimmed) catch {
|
const number = std.fmt.parseFloat(@FieldType(Value, "number"), val_trimmed) catch {
|
||||||
try parseError(allocator, options, "error parsing numeric value", state.*);
|
try parseError(allocator, "error parsing numeric value", state.*);
|
||||||
return .{
|
return .{
|
||||||
.item_value = null,
|
.item_value = null,
|
||||||
.error_parsing = true,
|
.error_parsing = true,
|
||||||
|
|
@ -173,7 +173,7 @@ pub const Value = union(enum) {
|
||||||
if (std.mem.eql(u8, "false", val_trimmed)) break :blk false;
|
if (std.mem.eql(u8, "false", val_trimmed)) break :blk false;
|
||||||
if (std.mem.eql(u8, "true", val_trimmed)) break :blk true;
|
if (std.mem.eql(u8, "true", val_trimmed)) break :blk true;
|
||||||
|
|
||||||
try parseError(allocator, options, "error parsing boolean value", state.*);
|
try parseError(allocator, "error parsing boolean value", state.*);
|
||||||
return .{
|
return .{
|
||||||
.item_value = null,
|
.item_value = null,
|
||||||
.error_parsing = true,
|
.error_parsing = true,
|
||||||
|
|
@ -200,7 +200,7 @@ pub const Value = union(enum) {
|
||||||
state.partial_line_column += total_metadata_chars;
|
state.partial_line_column += total_metadata_chars;
|
||||||
const size = std.fmt.parseInt(usize, trimmed_meta, 0) catch {
|
const size = std.fmt.parseInt(usize, trimmed_meta, 0) catch {
|
||||||
log.debug("parseInt fail, trimmed_data: '{s}'", .{trimmed_meta});
|
log.debug("parseInt fail, trimmed_data: '{s}'", .{trimmed_meta});
|
||||||
try parseError(allocator, options, "unrecognized metadata for key", state.*);
|
try parseError(allocator, "unrecognized metadata for key", state.*);
|
||||||
return .{
|
return .{
|
||||||
.item_value = null,
|
.item_value = null,
|
||||||
.error_parsing = true,
|
.error_parsing = true,
|
||||||
|
|
@ -228,7 +228,7 @@ pub const Value = union(enum) {
|
||||||
// This is not enough, we need more data from the reader
|
// This is not enough, we need more data from the reader
|
||||||
log.debug("item value includes newlines {f}", .{state});
|
log.debug("item value includes newlines {f}", .{state});
|
||||||
// We need to advance the reader, so we need a copy of what we have so far
|
// We need to advance the reader, so we need a copy of what we have so far
|
||||||
const start = try dupe(allocator, options, rest_of_data);
|
const start = try dupe(allocator, state.options, rest_of_data);
|
||||||
defer allocator.free(start);
|
defer allocator.free(start);
|
||||||
// We won't do a parseError here. If we have an allocation error, read
|
// We won't do a parseError here. If we have an allocation error, read
|
||||||
// error, or end of stream, all of these are fatal. Our reader is currently
|
// error, or end of stream, all of these are fatal. Our reader is currently
|
||||||
|
|
@ -583,25 +583,206 @@ pub const Record = struct {
|
||||||
/// When implemented, this will include a pub fn bind(self: Parsed, comptime T: type, options: BindOptions) BindError![]T
|
/// When implemented, this will include a pub fn bind(self: Parsed, comptime T: type, options: BindOptions) BindError![]T
|
||||||
/// function. The options will include things related to duplicate handling and
|
/// function. The options will include things related to duplicate handling and
|
||||||
/// missing fields
|
/// missing fields
|
||||||
pub const Parsed = struct {
|
pub const RecordIterator = struct {
|
||||||
records: std.ArrayList(Record),
|
|
||||||
arena: *std.heap.ArenaAllocator,
|
arena: *std.heap.ArenaAllocator,
|
||||||
/// optional expiry time for the data. Useful for caching
|
/// optional expiry time for the data. Useful for caching
|
||||||
/// Note that on a parse, data will always be returned and it will be up
|
/// Note that on a parse, data will always be returned and it will be up
|
||||||
/// to the caller to check is_fresh and determine the right thing to do
|
/// to the caller to check is_fresh and determine the right thing to do
|
||||||
expires: ?i64,
|
expires: ?i64,
|
||||||
|
|
||||||
pub fn deinit(self: Parsed) void {
|
state: *State,
|
||||||
|
|
||||||
|
pub const State = struct {
|
||||||
|
line: usize = 0,
|
||||||
|
column: usize = 0,
|
||||||
|
partial_line_column: usize = 0,
|
||||||
|
reader: *std.Io.Reader,
|
||||||
|
options: ParseOptions,
|
||||||
|
|
||||||
|
require_eof: bool = false,
|
||||||
|
eof_found: bool = false,
|
||||||
|
current_line: ?[]const u8,
|
||||||
|
|
||||||
|
field_delimiter: u8 = ',',
|
||||||
|
end_of_record_reached: bool = false,
|
||||||
|
|
||||||
|
/// Takes the next line, trimming leading whitespace and ignoring comments
|
||||||
|
/// Directives (comments starting with #!) are preserved
|
||||||
|
pub fn nextLine(state: *State) ?[]const u8 {
|
||||||
|
while (true) {
|
||||||
|
state.line += 1;
|
||||||
|
state.column = 1; // column is human indexed (one-based)
|
||||||
|
state.partial_line_column = 0; // partial_line_column is zero indexed for computers
|
||||||
|
const raw_line = (state.reader.takeDelimiter('\n') catch return null) orelse return null;
|
||||||
|
// we don't want to trim the end, as there might be a key/value field
|
||||||
|
// with a string including important trailing whitespace
|
||||||
|
const trimmed_line = std.mem.trimStart(u8, raw_line, &std.ascii.whitespace);
|
||||||
|
if (std.mem.startsWith(u8, trimmed_line, "#") and !std.mem.startsWith(u8, trimmed_line, "#!")) continue;
|
||||||
|
return trimmed_line;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pub fn format(self: State, writer: *std.Io.Writer) std.Io.Writer.Error!void {
|
||||||
|
try writer.print("line: {}, col: {}", .{ self.line, self.column });
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
pub fn next(self: RecordIterator) !?FieldIterator {
|
||||||
|
// TODO: we need to capture the fieldIterator here and make sure it's run
|
||||||
|
// to the ground to keep our state intact
|
||||||
|
const state = self.state;
|
||||||
|
if (state.current_line == null) {
|
||||||
|
if (state.options.diagnostics) |d|
|
||||||
|
if (d.errors.items.len > 0) return ParseError.ParseFailed;
|
||||||
|
if (state.require_eof and !state.eof_found) return ParseError.ParseFailed;
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
while (std.mem.trim(u8, state.current_line.?, &std.ascii.whitespace).len == 0) {
|
||||||
|
// empty lines can be significant (to indicate a new record), but only once
|
||||||
|
// a record is processed, which requires data first. That record processing
|
||||||
|
// is at the bottom of the loop, so if an empty line is detected here, we can
|
||||||
|
// safely ignore it
|
||||||
|
state.current_line = state.nextLine();
|
||||||
|
// by calling recursively we get the error handling above
|
||||||
|
if (state.current_line == null) return self.next();
|
||||||
|
}
|
||||||
|
// non-blank line, but we could have an eof marker
|
||||||
|
if (try Directive.parse(self.arena.allocator(), state.current_line.?, state.*)) |d| {
|
||||||
|
switch (d) {
|
||||||
|
.eof => {
|
||||||
|
// there needs to be an eof then
|
||||||
|
if (state.nextLine()) |_| {
|
||||||
|
try parseError(self.arena.allocator(), "Data found after #!eof", state.*);
|
||||||
|
return ParseError.ParseFailed; // this is terminal
|
||||||
|
} else {
|
||||||
|
state.eof_found = true;
|
||||||
|
state.current_line = null;
|
||||||
|
return null; // all is good, we're done
|
||||||
|
}
|
||||||
|
},
|
||||||
|
else => {
|
||||||
|
try parseError(self.arena.allocator(), "Directive found after data started", state.*);
|
||||||
|
state.current_line = state.nextLine();
|
||||||
|
// TODO: This runs the risk of a malicious file creating
|
||||||
|
// a stackoverflow by using many non-eof directives
|
||||||
|
return self.next();
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
state.end_of_record_reached = false;
|
||||||
|
return .{ .ri = self };
|
||||||
|
}
|
||||||
|
|
||||||
|
pub const FieldIterator = struct {
|
||||||
|
ri: RecordIterator,
|
||||||
|
|
||||||
|
pub fn next(self: FieldIterator) !?Field {
|
||||||
|
const state = self.ri.state;
|
||||||
|
// Main parsing. We already have the first line of data, which could
|
||||||
|
// be a record (compact format) or a key/value pair (long format)
|
||||||
|
|
||||||
|
// log.debug("", .{});
|
||||||
|
log.debug("current line:{?s}", .{state.current_line});
|
||||||
|
if (state.current_line == null) return null;
|
||||||
|
if (state.end_of_record_reached) return null;
|
||||||
|
// non-blank line, but we could have an eof marker
|
||||||
|
// TODO: deduplicate this code
|
||||||
|
if (try Directive.parse(self.ri.arena.allocator(), state.current_line.?, state.*)) |d| {
|
||||||
|
switch (d) {
|
||||||
|
.eof => {
|
||||||
|
// there needs to be an eof then
|
||||||
|
if (state.nextLine()) |_| {
|
||||||
|
try parseError(self.ri.arena.allocator(), "Data found after #!eof", state.*);
|
||||||
|
return ParseError.ParseFailed; // this is terminal
|
||||||
|
} else {
|
||||||
|
state.eof_found = true;
|
||||||
|
state.current_line = null;
|
||||||
|
return null; // all is good, we're done
|
||||||
|
}
|
||||||
|
},
|
||||||
|
else => {
|
||||||
|
try parseError(self.ri.arena.allocator(), "Directive found after data started", state.*);
|
||||||
|
state.current_line = state.nextLine();
|
||||||
|
// TODO: This runs the risk of a malicious file creating
|
||||||
|
// a stackoverflow by using many non-eof directives
|
||||||
|
return self.next();
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Whatever the format, the beginning will always be the key data
|
||||||
|
// key:stuff:value
|
||||||
|
var it = std.mem.splitScalar(u8, state.current_line.?, ':');
|
||||||
|
const key = it.next().?; // first one we get for free
|
||||||
|
if (key.len > 0) std.debug.assert(key[0] != state.field_delimiter);
|
||||||
|
state.column += key.len + 1;
|
||||||
|
state.partial_line_column += key.len + 1;
|
||||||
|
const value = try Value.parse(
|
||||||
|
self.ri.arena.allocator(),
|
||||||
|
it.rest(),
|
||||||
|
state,
|
||||||
|
state.field_delimiter,
|
||||||
|
);
|
||||||
|
|
||||||
|
var field: ?Field = null;
|
||||||
|
if (!value.error_parsing) {
|
||||||
|
field = .{ .key = try dupe(self.ri.arena.allocator(), state.options, key), .value = value.item_value };
|
||||||
|
}
|
||||||
|
|
||||||
|
if (value.reader_advanced and state.field_delimiter == ',') {
|
||||||
|
log.debug("advanced", .{});
|
||||||
|
// In compact format we'll stay on the same line
|
||||||
|
const real_column = state.column;
|
||||||
|
state.current_line = state.nextLine();
|
||||||
|
// Reset line and column position, because we're actually staying on the same line now
|
||||||
|
state.line -= 1;
|
||||||
|
state.column = real_column + 1;
|
||||||
|
state.partial_line_column = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// The difference between compact and line here is that compact we will instead of
|
||||||
|
// line = try nextLine, we will do something like line = line[42..]
|
||||||
|
if (state.field_delimiter == '\n') {
|
||||||
|
state.current_line = state.nextLine();
|
||||||
|
if (state.current_line == null) {
|
||||||
|
state.end_of_record_reached = true;
|
||||||
|
return field;
|
||||||
|
}
|
||||||
|
// close out record, return
|
||||||
|
if (state.current_line.?.len == 0) {
|
||||||
|
// End of record
|
||||||
|
state.end_of_record_reached = true;
|
||||||
|
state.current_line = state.nextLine();
|
||||||
|
return field;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// We should be on a delimiter, otherwise, we should be at the end
|
||||||
|
state.current_line = state.current_line.?[state.partial_line_column..]; // can't use l here because line may have been reassigned
|
||||||
|
state.partial_line_column = 0;
|
||||||
|
if (state.current_line.?.len == 0) {
|
||||||
|
// close out record
|
||||||
|
state.current_line = state.nextLine();
|
||||||
|
state.partial_line_column = 0;
|
||||||
|
state.end_of_record_reached = true;
|
||||||
|
return field;
|
||||||
|
} else {
|
||||||
|
if (state.current_line.?[0] != state.field_delimiter) {
|
||||||
|
log.err("reset line for next item, first char not '{c}':{?s}", .{ state.field_delimiter, state.current_line });
|
||||||
|
return error.ParseFailed;
|
||||||
|
}
|
||||||
|
state.current_line = state.current_line.?[1..];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return field;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
pub fn deinit(self: RecordIterator) void {
|
||||||
const child_allocator = self.arena.child_allocator;
|
const child_allocator = self.arena.child_allocator;
|
||||||
self.arena.deinit();
|
self.arena.deinit();
|
||||||
child_allocator.destroy(self.arena);
|
child_allocator.destroy(self.arena);
|
||||||
}
|
}
|
||||||
pub fn format(self: Parsed, writer: *std.Io.Writer) std.Io.Writer.Error!void {
|
|
||||||
_ = self;
|
|
||||||
_ = writer;
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn isFresh(self: Parsed) bool {
|
pub fn isFresh(self: RecordIterator) bool {
|
||||||
if (self.expires) |exp|
|
if (self.expires) |exp|
|
||||||
return std.time.timestamp() < exp;
|
return std.time.timestamp() < exp;
|
||||||
|
|
||||||
|
|
@ -628,7 +809,7 @@ const Directive = union(enum) {
|
||||||
eof,
|
eof,
|
||||||
expires: i64,
|
expires: i64,
|
||||||
|
|
||||||
pub fn parse(allocator: std.mem.Allocator, str: []const u8, state: ParseState, options: ParseOptions) ParseError!?Directive {
|
pub fn parse(allocator: std.mem.Allocator, str: []const u8, state: RecordIterator.State) ParseError!?Directive {
|
||||||
if (!std.mem.startsWith(u8, str, "#!")) return null;
|
if (!std.mem.startsWith(u8, str, "#!")) return null;
|
||||||
// strip any comments off
|
// strip any comments off
|
||||||
var it = std.mem.splitScalar(u8, str[2..], '#');
|
var it = std.mem.splitScalar(u8, str[2..], '#');
|
||||||
|
|
@ -636,7 +817,7 @@ const Directive = union(enum) {
|
||||||
if (std.mem.eql(u8, "srfv1", line)) return .magic;
|
if (std.mem.eql(u8, "srfv1", line)) return .magic;
|
||||||
if (std.mem.eql(u8, "requireeof", line)) return .require_eof;
|
if (std.mem.eql(u8, "requireeof", line)) return .require_eof;
|
||||||
if (std.mem.eql(u8, "requireof", line)) {
|
if (std.mem.eql(u8, "requireof", line)) {
|
||||||
try parseError(allocator, options, "#!requireof found. Did you mean #!requireeof?", state);
|
try parseError(allocator, "#!requireof found. Did you mean #!requireeof?", state);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
if (std.mem.eql(u8, "eof", line)) return .eof;
|
if (std.mem.eql(u8, "eof", line)) return .eof;
|
||||||
|
|
@ -779,192 +960,105 @@ pub const RecordFormatter = struct {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
pub const ParseState = struct {
|
|
||||||
reader: *std.Io.Reader,
|
|
||||||
line: usize,
|
|
||||||
column: usize,
|
|
||||||
partial_line_column: usize,
|
|
||||||
|
|
||||||
pub fn format(self: ParseState, writer: *std.Io.Writer) std.Io.Writer.Error!void {
|
pub const Parsed = struct {
|
||||||
try writer.print("line: {}, col: {}", .{ self.line, self.column });
|
// TODO: rip this down and return an array from parse
|
||||||
|
records: std.ArrayList(Record),
|
||||||
|
arena: *std.heap.ArenaAllocator,
|
||||||
|
expires: ?i64,
|
||||||
|
|
||||||
|
pub fn deinit(self: Parsed) void {
|
||||||
|
const ca = self.arena.child_allocator;
|
||||||
|
self.arena.deinit();
|
||||||
|
ca.destroy(self.arena);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// parse function. Prefer iterator over this function. Note that this function will
|
||||||
|
/// change soon
|
||||||
pub fn parse(reader: *std.Io.Reader, allocator: std.mem.Allocator, options: ParseOptions) ParseError!Parsed {
|
pub fn parse(reader: *std.Io.Reader, allocator: std.mem.Allocator, options: ParseOptions) ParseError!Parsed {
|
||||||
|
var records = std.ArrayList(Record).empty;
|
||||||
|
var it = try iterator(reader, allocator, options);
|
||||||
|
errdefer it.deinit();
|
||||||
|
const aa = it.arena.allocator();
|
||||||
|
while (try it.next()) |fi| {
|
||||||
|
var al = std.ArrayList(Field).empty;
|
||||||
|
while (try fi.next()) |f| {
|
||||||
|
const val = if (f.value != null)
|
||||||
|
switch (f.value.?) {
|
||||||
|
.string => Value{ .string = try aa.dupe(u8, f.value.?.string) },
|
||||||
|
.bytes => Value{ .bytes = try aa.dupe(u8, f.value.?.bytes) },
|
||||||
|
else => f.value,
|
||||||
|
}
|
||||||
|
else
|
||||||
|
f.value;
|
||||||
|
try al.append(aa, .{
|
||||||
|
.key = try aa.dupe(u8, f.key),
|
||||||
|
.value = val,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
try records.append(aa, .{
|
||||||
|
.fields = try al.toOwnedSlice(aa),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
return .{
|
||||||
|
.records = records,
|
||||||
|
.arena = it.arena,
|
||||||
|
.expires = it.expires,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets an iterator to stream through the data
|
||||||
|
pub fn iterator(reader: *std.Io.Reader, allocator: std.mem.Allocator, options: ParseOptions) ParseError!RecordIterator {
|
||||||
|
|
||||||
|
// TODO: What can we do about allocations here?
|
||||||
|
|
||||||
// create an arena allocator for everything related to parsing
|
// create an arena allocator for everything related to parsing
|
||||||
const arena: *std.heap.ArenaAllocator = try allocator.create(std.heap.ArenaAllocator);
|
const arena: *std.heap.ArenaAllocator = try allocator.create(std.heap.ArenaAllocator);
|
||||||
errdefer if (options.diagnostics == null) allocator.destroy(arena);
|
errdefer if (options.diagnostics == null) allocator.destroy(arena);
|
||||||
arena.* = .init(allocator);
|
arena.* = .init(allocator);
|
||||||
errdefer if (options.diagnostics == null) arena.deinit();
|
errdefer if (options.diagnostics == null) arena.deinit();
|
||||||
const aa = arena.allocator();
|
const aa = arena.allocator();
|
||||||
var long_format = false; // Default to compact format
|
const state = try aa.create(RecordIterator.State);
|
||||||
var require_eof = false; // Default to no eof required
|
state.* = .{
|
||||||
var eof_found: bool = false;
|
.reader = reader,
|
||||||
var state = ParseState{ .line = 0, .column = 0, .partial_line_column = 0, .reader = reader };
|
.current_line = null,
|
||||||
const first_line = nextLine(reader, &state) orelse return ParseError.ParseFailed;
|
.options = options,
|
||||||
|
};
|
||||||
if (try Directive.parse(aa, first_line, state, options)) |d| {
|
var it: RecordIterator = .{
|
||||||
if (d != .magic) try parseError(aa, options, "Magic header not found on first line", state);
|
|
||||||
} else try parseError(aa, options, "Magic header not found on first line", state);
|
|
||||||
|
|
||||||
// Loop through the header material and configure our main parsing
|
|
||||||
var parsed: Parsed = .{
|
|
||||||
.records = .empty,
|
|
||||||
.arena = arena,
|
.arena = arena,
|
||||||
.expires = null,
|
.expires = null,
|
||||||
|
.state = state,
|
||||||
};
|
};
|
||||||
const first_data = blk: {
|
const first_line = it.state.nextLine() orelse return ParseError.ParseFailed;
|
||||||
while (nextLine(reader, &state)) |line| {
|
|
||||||
if (try Directive.parse(aa, line, state, options)) |d| {
|
if (try Directive.parse(aa, first_line, it.state.*)) |d| {
|
||||||
|
if (d != .magic) try parseError(aa, "Magic header not found on first line", it.state.*);
|
||||||
|
} else try parseError(aa, "Magic header not found on first line", it.state.*);
|
||||||
|
|
||||||
|
// Loop through the header material and configure our main parsing
|
||||||
|
it.state.current_line = blk: {
|
||||||
|
while (it.state.nextLine()) |line| {
|
||||||
|
if (try Directive.parse(aa, line, it.state.*)) |d| {
|
||||||
switch (d) {
|
switch (d) {
|
||||||
.magic => try parseError(aa, options, "Found a duplicate magic header", state),
|
.magic => try parseError(aa, "Found a duplicate magic header", it.state.*),
|
||||||
.long_format => long_format = true,
|
.long_format => it.state.field_delimiter = '\n',
|
||||||
.compact_format => long_format = false, // what if we have both?
|
.compact_format => it.state.field_delimiter = ',', // what if we have both?
|
||||||
.require_eof => require_eof = true,
|
.require_eof => it.state.require_eof = true,
|
||||||
.expires => |exp| parsed.expires = exp,
|
.expires => |exp| it.expires = exp,
|
||||||
.eof => {
|
.eof => {
|
||||||
// there needs to be an eof then
|
// there needs to be an eof then
|
||||||
if (nextLine(reader, &state)) |_| {
|
if (it.state.nextLine()) |_| {
|
||||||
try parseError(aa, options, "Data found after #!eof", state);
|
try parseError(aa, "Data found after #!eof", it.state.*);
|
||||||
return ParseError.ParseFailed; // this is terminal
|
return ParseError.ParseFailed; // this is terminal
|
||||||
} else return parsed;
|
} else return it;
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
} else break :blk line;
|
} else break :blk line;
|
||||||
}
|
}
|
||||||
return parsed;
|
return it; //without current_line - we're at the end of file
|
||||||
};
|
};
|
||||||
|
return it; // with current_line
|
||||||
// Main parsing. We already have the first line of data, which could
|
|
||||||
// be a record (compact format) or a key/value pair (long format)
|
|
||||||
var line: ?[]const u8 = first_data;
|
|
||||||
var items: std.ArrayList(Field) = .empty;
|
|
||||||
|
|
||||||
// Because in long format we don't have newline delimiter, that should really be a noop
|
|
||||||
// but we need this for compact format
|
|
||||||
const delimiter: u8 = if (long_format) '\n' else ',';
|
|
||||||
// log.debug("", .{});
|
|
||||||
// log.debug("first line:{?s}", .{line});
|
|
||||||
while (line) |l| {
|
|
||||||
if (std.mem.trim(u8, l, &std.ascii.whitespace).len == 0) {
|
|
||||||
// empty lines can be significant (to indicate a new record), but only once
|
|
||||||
// a record is processed, which requires data first. That record processing
|
|
||||||
// is at the bottom of the loop, so if an empty line is detected here, we can
|
|
||||||
// safely ignore it
|
|
||||||
line = nextLine(reader, &state);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (try Directive.parse(aa, l, state, options)) |d| {
|
|
||||||
switch (d) {
|
|
||||||
.eof => {
|
|
||||||
// there needs to be an eof then
|
|
||||||
if (nextLine(reader, &state)) |_| {
|
|
||||||
try parseError(aa, options, "Data found after #!eof", state);
|
|
||||||
return ParseError.ParseFailed; // this is terminal
|
|
||||||
} else {
|
|
||||||
eof_found = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
},
|
|
||||||
else => try parseError(aa, options, "Directive found after data started", state),
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Real data: lfg
|
|
||||||
// Whatever the format, the beginning will always be the key data
|
|
||||||
// key:stuff:value
|
|
||||||
var it = std.mem.splitScalar(u8, l, ':');
|
|
||||||
const key = it.next().?; // first one we get for free
|
|
||||||
if (key.len > 0) std.debug.assert(key[0] != delimiter);
|
|
||||||
state.column += key.len + 1;
|
|
||||||
state.partial_line_column += key.len + 1;
|
|
||||||
const value = try Value.parse(
|
|
||||||
aa,
|
|
||||||
it.rest(),
|
|
||||||
&state,
|
|
||||||
delimiter,
|
|
||||||
options,
|
|
||||||
);
|
|
||||||
|
|
||||||
if (!value.error_parsing) {
|
|
||||||
// std.debug.print("alloc on key: {s}, val: {?f}\n", .{ key, value.item_value });
|
|
||||||
try items.append(aa, .{ .key = try aa.dupe(u8, key), .value = value.item_value });
|
|
||||||
}
|
|
||||||
|
|
||||||
if (value.reader_advanced and !long_format) {
|
|
||||||
// In compact format we'll stay on the same line
|
|
||||||
const real_column = state.column;
|
|
||||||
line = nextLine(reader, &state);
|
|
||||||
// Reset line and column position, because we're actually staying on the same line now
|
|
||||||
state.line -= 1;
|
|
||||||
state.column = real_column + 1;
|
|
||||||
state.partial_line_column = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// The difference between compact and line here is that compact we will instead of
|
|
||||||
// line = try nextLine, we will do something like line = line[42..]
|
|
||||||
if (long_format) {
|
|
||||||
const maybe_line = nextLine(reader, &state);
|
|
||||||
if (maybe_line == null) {
|
|
||||||
// close out record, return
|
|
||||||
try parsed.records.append(aa, .{
|
|
||||||
.fields = try items.toOwnedSlice(aa),
|
|
||||||
});
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
line = maybe_line.?;
|
|
||||||
if (line.?.len == 0) {
|
|
||||||
// End of record
|
|
||||||
try parsed.records.append(aa, .{
|
|
||||||
.fields = try items.toOwnedSlice(aa),
|
|
||||||
});
|
|
||||||
line = nextLine(reader, &state);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// We should be on a delimiter, otherwise, we should be at the end
|
|
||||||
line = line.?[state.partial_line_column..]; // can't use l here because line may have been reassigned
|
|
||||||
state.partial_line_column = 0;
|
|
||||||
if (line.?.len == 0) {
|
|
||||||
// close out record
|
|
||||||
try parsed.records.append(aa, .{
|
|
||||||
.fields = try items.toOwnedSlice(aa),
|
|
||||||
});
|
|
||||||
line = nextLine(reader, &state);
|
|
||||||
state.partial_line_column = 0;
|
|
||||||
} else {
|
|
||||||
if (line.?[0] != delimiter) {
|
|
||||||
log.err("reset line for next item, first char not '{c}':{?s}", .{ delimiter, line });
|
|
||||||
return error.ParseFailed;
|
|
||||||
}
|
|
||||||
line = line.?[1..];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Parsing complete. Add final record to list. Then, if there are any parse errors, throw
|
|
||||||
if (items.items.len > 0)
|
|
||||||
try parsed.records.append(aa, .{
|
|
||||||
.fields = try items.toOwnedSlice(aa),
|
|
||||||
});
|
|
||||||
if (options.diagnostics) |d|
|
|
||||||
if (d.errors.items.len > 0) return ParseError.ParseFailed;
|
|
||||||
if (require_eof and !eof_found) return ParseError.ParseFailed;
|
|
||||||
return parsed;
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Takes the next line, trimming leading whitespace and ignoring comments
|
|
||||||
/// Directives (comments starting with #!) are preserved
|
|
||||||
fn nextLine(reader: *std.Io.Reader, state: *ParseState) ?[]const u8 {
|
|
||||||
while (true) {
|
|
||||||
state.line += 1;
|
|
||||||
state.column = 1; // column is human indexed (one-based)
|
|
||||||
state.partial_line_column = 0; // partial_line_column is zero indexed for computers
|
|
||||||
const raw_line = (reader.takeDelimiter('\n') catch return null) orelse return null;
|
|
||||||
// we don't want to trim the end, as there might be a key/value field
|
|
||||||
// with a string including important trailing whitespace
|
|
||||||
const trimmed_line = std.mem.trimStart(u8, raw_line, &std.ascii.whitespace);
|
|
||||||
if (std.mem.startsWith(u8, trimmed_line, "#") and !std.mem.startsWith(u8, trimmed_line, "#!")) continue;
|
|
||||||
return trimmed_line;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inline fn dupe(allocator: std.mem.Allocator, options: ParseOptions, data: []const u8) ParseError![]const u8 {
|
inline fn dupe(allocator: std.mem.Allocator, options: ParseOptions, data: []const u8) ParseError![]const u8 {
|
||||||
|
|
@ -972,11 +1066,11 @@ inline fn dupe(allocator: std.mem.Allocator, options: ParseOptions, data: []cons
|
||||||
return try allocator.dupe(u8, data);
|
return try allocator.dupe(u8, data);
|
||||||
return data;
|
return data;
|
||||||
}
|
}
|
||||||
inline fn parseError(allocator: std.mem.Allocator, options: ParseOptions, message: []const u8, state: ParseState) ParseError!void {
|
inline fn parseError(allocator: std.mem.Allocator, message: []const u8, state: RecordIterator.State) ParseError!void {
|
||||||
log.debug("Parse error. Parse state {f}, message: {s}", .{ state, message });
|
log.debug("Parse error. Parse state {f}, message: {s}", .{ state, message });
|
||||||
if (options.diagnostics) |d| {
|
if (state.options.diagnostics) |d| {
|
||||||
try d.addError(allocator, .{
|
try d.addError(allocator, .{
|
||||||
.message = try dupe(allocator, options, message),
|
.message = try dupe(allocator, state.options, message),
|
||||||
.level = .err,
|
.level = .err,
|
||||||
.line = state.line,
|
.line = state.line,
|
||||||
.column = state.column,
|
.column = state.column,
|
||||||
|
|
@ -985,7 +1079,6 @@ inline fn parseError(allocator: std.mem.Allocator, options: ParseOptions, messag
|
||||||
return ParseError.ParseFailed;
|
return ParseError.ParseFailed;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
test "long format single record, no eof" {
|
test "long format single record, no eof" {
|
||||||
const data =
|
const data =
|
||||||
\\#!srfv1 # mandatory comment with format and version. Parser instructions start with #!
|
\\#!srfv1 # mandatory comment with format and version. Parser instructions start with #!
|
||||||
|
|
@ -1435,3 +1528,35 @@ test "compact format length-prefixed string as last field" {
|
||||||
try std.testing.expectEqualStrings("desc", rec.fields[1].key);
|
try std.testing.expectEqualStrings("desc", rec.fields[1].key);
|
||||||
try std.testing.expectEqualStrings("world", rec.fields[1].value.?.string);
|
try std.testing.expectEqualStrings("world", rec.fields[1].value.?.string);
|
||||||
}
|
}
|
||||||
|
test "iterator" {
|
||||||
|
// When a length-prefixed value is the last field on the line,
|
||||||
|
// rest_of_data.len == size exactly. The check on line 216 uses
|
||||||
|
// strict > instead of >=, falling through to the multi-line path
|
||||||
|
// where size - rest_of_data.len - 1 underflows.
|
||||||
|
const data =
|
||||||
|
\\#!srfv1
|
||||||
|
\\name::alice,desc:5:world
|
||||||
|
;
|
||||||
|
const allocator = std.testing.allocator;
|
||||||
|
var reader = std.Io.Reader.fixed(data);
|
||||||
|
var ri = try iterator(&reader, allocator, .{});
|
||||||
|
defer ri.deinit();
|
||||||
|
|
||||||
|
const nfi = try ri.next();
|
||||||
|
try std.testing.expect(nfi != null);
|
||||||
|
const fi = nfi.?;
|
||||||
|
// defer fi.deinit();
|
||||||
|
const field1 = try fi.next();
|
||||||
|
try std.testing.expect(field1 != null);
|
||||||
|
try std.testing.expectEqualStrings("name", field1.?.key);
|
||||||
|
try std.testing.expectEqualStrings("alice", field1.?.value.?.string);
|
||||||
|
const field2 = try fi.next();
|
||||||
|
try std.testing.expect(field2 != null);
|
||||||
|
try std.testing.expectEqualStrings("desc", field2.?.key);
|
||||||
|
try std.testing.expectEqualStrings("world", field2.?.value.?.string);
|
||||||
|
const field3 = try fi.next();
|
||||||
|
try std.testing.expect(field3 == null);
|
||||||
|
|
||||||
|
const next = try ri.next();
|
||||||
|
try std.testing.expect(next == null);
|
||||||
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue