support for lenient number parsing (minus locale issues)
This commit is contained in:
parent
e21a7308c3
commit
d89a3b2a93
1 changed files with 180 additions and 1 deletions
181
src/srf.zig
181
src/srf.zig
|
|
@ -209,7 +209,10 @@ pub const Value = union(enum) {
|
|||
state.column += total_chars;
|
||||
state.partial_line_column += total_chars;
|
||||
const val_trimmed = std.mem.trim(u8, val, &std.ascii.whitespace);
|
||||
const number = std.fmt.parseFloat(@FieldType(Value, "number"), val_trimmed) catch {
|
||||
const number = (if (state.options.strict_number_parsing)
|
||||
std.fmt.parseFloat(@FieldType(Value, "number"), val_trimmed)
|
||||
else
|
||||
parseFloat(@FieldType(Value, "number"), val_trimmed)) catch {
|
||||
try parseError("error parsing numeric value", state);
|
||||
return .{
|
||||
.item_value = null,
|
||||
|
|
@ -316,6 +319,117 @@ pub const Value = union(enum) {
|
|||
state.fallback_arena.?.* = .init(state.allocator);
|
||||
return state.fallback_arena.?.allocator();
|
||||
}
|
||||
|
||||
/// Leniently parses `value` as a floating point number of type `T`.
///
/// A plain `std.fmt.parseFloat` is attempted first and its result is returned
/// when it succeeds. Otherwise the input is scrubbed and parsed again:
///
/// * one leading currency symbol (see `leadingCurrency`), optionally followed
///   by spaces, is skipped
/// * commas inside the number are dropped (no comma/period locale handling)
/// * spaces and a single currency symbol are tolerated after the number, as
///   long as the symbol is the very last thing in the input
///
/// Returns `error.InvalidCharacter` when the input contains anything else,
/// when the scrubbed text is not a valid float, or when the scrubbed text does
/// not fit in the fixed scratch buffer.
fn parseFloat(comptime T: type, value: []const u8) !T {
    if (@typeInfo(T) != .float) {
        @compileError("Cannot parse a float into a non-floating point type.");
    }

    if (std.fmt.parseFloat(T, value)) |f| {
        // clean parse
        return f;
    } else |_| {} // fall through to the lenient path

    // Scratch space for the scrubbed digits. The scrubbed text is whatever the
    // user typed minus commas and currency, so it can be longer than the
    // shortest round-trip representation of T (leading zeros, sign, decimal
    // point, excess fraction digits). Use a generous fixed cap and reject
    // anything longer instead of writing past the end.
    const max_clean_len = 128;
    var buffer: [max_clean_len]u8 = undefined;
    var val_inx: usize = 0;
    var buf_inx: usize = 0;
    var state: enum { start, middle, end } = .start;

    while (val_inx < value.len) {
        const c = value[val_inx];
        switch (state) {
            .start => {
                if (isNumberIsh(c, false)) {
                    // Do not consume: .middle copies this byte to the buffer.
                    state = .middle;
                    continue;
                }
                // A lone symbol is not a number; something must follow it.
                if (val_inx + 1 >= value.len) return error.InvalidCharacter;

                if (leadingCurrency(value[val_inx..])) |curr| {
                    val_inx += curr.len;
                    while (val_inx < value.len and value[val_inx] == ' ') val_inx += 1;
                    state = .middle;
                    continue;
                }
                return error.InvalidCharacter;
            },
            .middle => {
                if (!isNumberIsh(c, true)) {
                    // Do not consume: .end must see this byte so it can decide
                    // whether it is a space, a currency symbol, or garbage.
                    state = .end;
                    continue;
                }
                // Commas are treated purely as grouping separators.
                if (c == ',') {
                    val_inx += 1;
                    continue;
                }
                if (buf_inx >= buffer.len) return error.InvalidCharacter;
                buffer[buf_inx] = c;
                buf_inx += 1;
                val_inx += 1;
            },
            .end => {
                // Spaces between the number and a trailing symbol are skipped.
                if (c == ' ') {
                    val_inx += 1;
                    continue;
                }
                if (leadingCurrency(value[val_inx..])) |curr| {
                    // we are ok to end with a currency, but nothing else
                    if (val_inx + curr.len == value.len) break;
                }
                return error.InvalidCharacter;
            },
        }
    }
    return std.fmt.parseFloat(T, buffer[0..buf_inx]);
}

/// Reports whether `ch` may be part of a number. Digits and signs always
/// qualify; `.` and `,` qualify only when `in_middle` is true. Ordering and
/// count of these characters are left to the caller.
fn isNumberIsh(ch: u8, in_middle: bool) bool {
    return switch (ch) {
        '0'...'9', '-', '+' => true,
        '.', ',' => in_middle,
        else => false,
    };
}

/// Returns the currency symbol that `s` starts with, as a sub-slice of `s`,
/// or null when `s` is empty, too short to hold the symbol, or does not start
/// with a known symbol. Symbols are matched on their UTF-8 bytes.
fn leadingCurrency(s: []const u8) ?[]const u8 {
    if (s.len == 0) return null;

    // Check known single character currency symbols
    const single_byte_currencies = "$KLPQR";
    for (single_byte_currencies) |curr|
        if (s[0] == curr)
            return s[0..1];

    // Each symbol below occupies exactly two bytes in UTF-8. startsWith also
    // rejects inputs shorter than the symbol, so nothing is read past `s`.
    const two_byte_currencies = "£¤¥֏";
    var i: usize = 0;
    while (i + 2 <= two_byte_currencies.len) : (i += 2) {
        if (std.mem.startsWith(u8, s, two_byte_currencies[i .. i + 2]))
            return s[0..2];
    }

    // Each symbol below occupies exactly three bytes in UTF-8.
    const three_byte_currencies = "৳฿៛₡₦₧₩₪₫€₭₮₱₲₴₸₹₺₼₽₾⃀";
    i = 0;
    while (i + 3 <= three_byte_currencies.len) : (i += 3) {
        if (std.mem.startsWith(u8, s, three_byte_currencies[i .. i + 3]))
            return s[0..3];
    }
    return null;
}
|
||||
};
|
||||
|
||||
/// A single key-value pair within a record. The key is always a string.
|
||||
|
|
@ -1088,6 +1202,14 @@ pub const ParseOptions = struct {
|
|||
/// strings. More complex use cases can use their own allocator for control
|
||||
/// over string lifetime
|
||||
parse_allocator: ParseAllocator = .parse_arena,
|
||||
|
||||
/// Strict number parsing. In strict number parsing, numbers will fail
|
||||
/// to parse if `std.fmt.parseFloat` fails. Turn this off to be more lenient.
|
||||
///
|
||||
/// Turning it off will have a slight performance impact, but helps support
|
||||
/// scenarios where srf data is more user facing (think config, not cache).
|
||||
/// For example, commas will be tolerated as will leading currency symbols
|
||||
strict_number_parsing: bool = true,
|
||||
};
|
||||
|
||||
/// Allocator to use for parsing data
|
||||
|
|
@ -2279,6 +2401,63 @@ test parse {
|
|||
try std.testing.expectEqualStrings("alice", parsed.records[0].fields[0].value.?.string);
|
||||
try std.testing.expectEqualStrings("bob", parsed.records[1].fields[0].value.?.string);
|
||||
}
|
||||
test "parse tolerates commas and currency in numbers" {
    // With `strict_number_parsing` turned off, `num` fields may carry
    // thousands separators and a currency symbol before or after the number.
    const data =
        \\#!srfv1
        \\#!long
        \\name::bananas
        \\cost:num:$30.00
        \\
        \\name::spaceship
        \\cost:num:$1,000,000,000.42
        \\
        \\name::Omikase in Tokyo
        \\cost:num:¥15,000
        \\
        \\name::Airbus A380
        \\cost:num:€410,000,000
        \\
        \\name::Bread in London
        \\cost:num:5 €
        \\
        \\name::The other way
        \\cost:num:€ 5
        \\#!eof
    ;

    // One entry per record above, in order: field 0 is the name, field 1 the
    // leniently parsed cost.
    const Expectation = struct { name: []const u8, cost: f64 };
    const expectations = [_]Expectation{
        // Dollars are single byte currency
        .{ .name = "bananas", .cost = 30 },
        // Add commas to the mix
        .{ .name = "spaceship", .cost = 1_000_000_000.42 },
        // Yen symbol is two bytes long
        .{ .name = "Omikase in Tokyo", .cost = 15_000 },
        // Euro symbol is three bytes long
        .{ .name = "Airbus A380", .cost = 410_000_000 },
        // Symbol after the number, separated by a space
        .{ .name = "Bread in London", .cost = 5 },
        // Symbol before the number, separated by a space
        .{ .name = "The other way", .cost = 5 },
    };

    var reader = std.Io.Reader.fixed(data);
    var diags: BoundedDiagnostics(10) = .empty;
    var diag: Diagnostics = diags.diagnostics();
    const parsed = try parse(&reader, std.testing.allocator, .{
        .diagnostics = &diag,
        .strict_number_parsing = false,
    });
    defer parsed.deinit();

    try std.testing.expectEqual(@as(usize, expectations.len), parsed.records.len);
    for (expectations, parsed.records) |want, record| {
        try std.testing.expectEqualStrings(want.name, record.fields[0].value.?.string);
        try std.testing.expectEqual(want.cost, record.fields[1].value.?.number);
    }
}
|
||||
test fmtFrom {
|
||||
// Example: serialize typed Zig values directly to SRF format.
|
||||
const Data = struct {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue