srf parsing optimizations

This commit is contained in:
Emil Lerch 2026-06-11 15:28:36 -07:00
parent c97551a476
commit 2245176cb0
Signed by: lobo
GPG key ID: A7B62D657EF764F8

263
src/cache/store.zig vendored
View file

@ -1293,6 +1293,219 @@ pub const Store = struct {
// Private serialization: generic
/// Comptime predicate: is `T` a struct whose every field is known to be
/// string-free? Drives the `parse_allocator` choice in `readSlice`:
/// types that don't need to retain string values past
/// `fields.to(T, .{})` can use `.none` and save the allocator hit per
/// parsed value.
///
/// Returns `true` only when `T` is a struct and each field is one of:
///   - an int, float, bool, or enum;
///   - an optional of one of those;
///   - the project's `Date` struct (recognised by type name, see below).
///
/// Conservative by design: any slice-of-u8 field (with or without
/// optional, with or without const) makes the result `false`, and so
/// does every other field type (pointers, arrays, unions, other
/// structs, optionals of structs, ...). Composite types are NOT
/// inspected recursively: a wrapper might allocate strings in a custom
/// parse hook, so it is treated as string-bearing. A future audit can
/// opt specific composites in.
///
/// Non-struct `T` always yields `false`.
fn hasNoStringFields(comptime T: type) bool {
    const info = @typeInfo(T);
    if (info != .@"struct") return false;
    inline for (info.@"struct".fields) |f| {
        const FT = f.type;
        // Explicit string checks first. The `else` arms below would also
        // reject these; spelling them out documents the intent.
        if (FT == []const u8 or FT == []u8 or
            FT == ?[]const u8 or FT == ?[]u8) return false;
        switch (@typeInfo(FT)) {
            .int, .float, .bool, .@"enum" => {},
            .optional => |opt| switch (@typeInfo(opt.child)) {
                .int, .float, .bool, .@"enum" => {},
                else => return false,
            },
            .@"struct" => {
                // Allow only `Date` (described in the original note as a
                // pure i32 wrapper). `@typeName` may yield just "Date"
                // or a longer dotted path depending on how the type was
                // reached, so accept exactly "Date" or a ".Date" suffix.
                // A bare "ends with Date" test would also accept
                // unrelated structs such as `TradeDate`, which could
                // carry strings.
                const name = @typeName(FT);
                const is_date = std.mem.eql(u8, name, "Date") or
                    std.mem.endsWith(u8, name, ".Date");
                if (!is_date) return false;
            },
            else => return false,
        }
    }
    return true;
}
// hasNoStringFields tests
//
// Pin the comptime predicate that drives the parse_allocator
// choice in `readSlice`. If a future field added to one of
// these types changes the classification, the test catches
// it before the perf optimization silently regresses (or
// worse: if a Candle-shape gets a `?[]const u8` field
// added without updating the test, parse_alloc would stay
// `.none` and the new string field would be a borrowed slice
// into freed-by-defer iterator memory).
test "hasNoStringFields: Candle is pure-numeric (Date+5×f64+u64)" {
    // Candle must classify as string-free so that the parse path can
    // skip per-value allocation.
    const candle_is_string_free = hasNoStringFields(Candle);
    try std.testing.expectEqual(true, candle_is_string_free);
}
test "hasNoStringFields: Split is pure-numeric (Date+2×f64)" {
    // Split is expected to classify as string-free, just like Candle.
    const split_is_string_free = hasNoStringFields(Split);
    try std.testing.expectEqual(true, split_is_string_free);
}
test "hasNoStringFields: Dividend has currency string -> false" {
    // Per the original note, Dividend.currency is `?[]const u8` and the
    // caller holds on to it after the iterator is gone, so its values
    // MUST be duped: the predicate has to report "string-bearing".
    const dividend_is_string_free = hasNoStringFields(Dividend);
    try std.testing.expectEqual(false, dividend_is_string_free);
}
test "hasNoStringFields: EarningsEvent has string fields -> false" {
    // EarningsEvent must be classified as string-bearing.
    const earnings_is_string_free = hasNoStringFields(EarningsEvent);
    try std.testing.expectEqual(false, earnings_is_string_free);
}
test "hasNoStringFields: EtfProfile has string fields -> false" {
    // EtfProfile must be classified as string-bearing.
    const profile_is_string_free = hasNoStringFields(EtfProfile);
    try std.testing.expectEqual(false, profile_is_string_free);
}
test "hasNoStringFields: synthetic shapes" {
    // Pure ints/floats/bools/enums + Date + optional scalars should pass.
    const Pure = struct {
        a: i32,
        b: f64,
        c: bool,
        d: enum { x, y },
        e: Date,
        f: ?u32,
    };
    try std.testing.expect(hasNoStringFields(Pure));

    // Every slice-of-u8 flavour the predicate names explicitly must be
    // rejected: bare, optional, mutable, and optional-mutable. We don't
    // ship any mutable-slice fields today, but the predicate guards
    // against future drift, so the test pins all four.
    const HasString = struct {
        a: i32,
        b: []const u8,
    };
    const HasOptString = struct {
        a: i32,
        b: ?[]const u8,
    };
    const HasMutString = struct {
        a: i32,
        b: []u8,
    };
    const HasOptMutString = struct {
        a: i32,
        b: ?[]u8,
    };
    inline for (.{ HasString, HasOptString, HasMutString, HasOptMutString }) |StringBearing| {
        try std.testing.expect(!hasNoStringFields(StringBearing));
    }
}
test "hasNoStringFields: composite struct field that's not Date is treated as string-bearing" {
    // Conservative default: if a field's type is a struct we
    // don't recognize as Date, we don't try to inspect it
    // recursively; we assume it might allocate during its
    // custom parse hook.
    const InnerWithString = struct {
        s: []const u8,
    };
    const Outer = struct {
        x: i32,
        y: InnerWithString,
    };
    try std.testing.expect(!hasNoStringFields(Outer));

    // The rejection must not depend on what the inner struct contains:
    // a numeric-only composite is still unrecognized, so it is still
    // treated as string-bearing. This case would fail if the predicate
    // ever started recursing into composites.
    const InnerNumeric = struct {
        n: i32,
    };
    const OuterNumeric = struct {
        x: i32,
        y: InnerNumeric,
    };
    try std.testing.expect(!hasNoStringFields(OuterNumeric));
}
test "hasNoStringFields: non-struct types return false" {
    // The predicate only has meaning for struct record types; per the
    // original note, SRF records in zfin are always structs. Any other
    // kind of type is defensively reported as string-bearing.
    inline for (.{ u32, []const u8 }) |NotAStruct| {
        try std.testing.expect(!hasNoStringFields(NotAStruct));
    }
}
/// Hand-rolled specialized coercer for Candle records.
/// Bypasses SRF's generalized `fields.to(T, ...)` on the hot Candle
/// parse path. The original commit notes report this as ~25x faster in
/// ReleaseFast on well-formed cache files (not re-measured here).
///
/// Behavior, as implemented below:
///   - Starts from a placeholder Candle (date 1970-01-01, all numeric
///     fields 0) and overwrites fields as matching keys are seen. A
///     field absent from the record keeps its placeholder value; there
///     is no missing-field detection.
///   - Dispatch is on the FIRST BYTE of the key only
///     (d/o/h/l/c/a/v). Any key starting with one of those bytes is
///     treated as the corresponding Candle field. Keys with another
///     first byte, empty keys, and fields with a null value are
///     skipped.
///   - A value whose tag doesn't match the expected kind (`.string`
///     for date, `.number` for the rest) is ignored.
///
/// Errors:
///   - whatever `fields.next()` or `Date.parse` return;
///   - `error.InvalidVolume` when the volume number cannot be
///     converted to `u64` (NaN, infinite, <= -1, or >= 2^64).
///     `@intFromFloat` on such a value is a safety panic / undefined
///     behavior, so it is rejected up front.
///
/// Trade-off vs `fields.to`: adequate for the cache-write invariant
/// stated by the original author (every candle file written contains
/// exactly these 7 fields); inadequate for parsing arbitrary
/// user-supplied SRF data.
fn coerceCandleSpecialized(fields: srf.RecordIterator.FieldIterator) !Candle {
    var c: Candle = .{
        .date = Date.fromYmd(1970, 1, 1),
        .open = 0,
        .high = 0,
        .low = 0,
        .close = 0,
        .adj_close = 0,
        .volume = 0,
    };
    while (try fields.next()) |f| {
        const key = f.key;
        const val = f.value orelse continue;
        // Switch on the first byte. All 7 Candle field names
        // are first-byte-unique:
        //   d -> date    o -> open    h -> high
        //   l -> low     c -> close   a -> adj_close
        //   v -> volume
        if (key.len == 0) continue;
        switch (key[0]) {
            'd' => if (val == .string) {
                c.date = try Date.parse(val.string);
            },
            'o' => if (val == .number) {
                c.open = val.number;
            },
            'h' => if (val == .number) {
                c.high = val.number;
            },
            'l' => if (val == .number) {
                c.low = val.number;
            },
            'c' => if (val == .number) {
                c.close = val.number;
            },
            'a' => if (val == .number) {
                c.adj_close = val.number;
            },
            'v' => if (val == .number) {
                const raw = val.number;
                // `@intFromFloat` truncates toward zero, so exactly the
                // open interval (-1, 2^64) converts to a valid u64.
                // NaN fails both comparisons and is rejected as well.
                if (!(raw > -1.0 and raw < 18446744073709551616.0)) {
                    return error.InvalidVolume;
                }
                c.volume = @as(u64, @intFromFloat(raw));
            },
            else => {},
        }
    }
    return c;
}
/// Generic SRF deserializer with optional freshness check.
/// Single-pass: creates one iterator, optionally checks freshness, extracts
/// `#!created=` timestamp, and deserializes all records.
@ -1305,15 +1518,32 @@ pub const Store = struct {
comptime freshness: Freshness,
) ?CacheResult(T) {
var reader = std.Io.Reader.fixed(data);
// `.parse_allocator = .{ .custom = .initTo(allocator) }` tells SRF
// to dupe field values (the data we keep) into the caller's
// allocator while letting field keys borrow from `data` (we only
// need them long enough for `fields.to(T, .{})` to match against
// compile-time field names). Records returned from `it.next()`
// then own their value strings via the caller's allocator,
// ready to outlive the iterator without any further duping.
// Choose `parse_allocator` based on whether T has string
// fields the caller needs to keep past the iterator.
//
// - **Pure-numeric types** (`Candle`: Date+5×f64+u64) have
// zero `[]const u8` fields. The only string seen during
// parse is the `date` value, which Date's custom-parse
// hook converts to `i32` immediately. Nothing needs to
// outlive the iterator. Use `.none`: borrowed slices
// into the input bytes; no allocator hits per record.
// - **String-bearing types** (Dividend, EarningsEvent,
// OptionsChain) have currency / frequency / source /
// option_type fields the caller keeps. Use the custom
// allocator so values are duped into the caller's
// storage and survive `it.deinit()`.
//
// Why a comptime branch and not a static setting per
// call site: keeps `readSlice` generic over T and routes
// the optimization through type information that's
// already comptime-known. A new pure-numeric type
// (e.g. Split) is picked up automatically by
// `hasNoStringFields`; only a new string-free composite
// field type needs an edit to that comptime check.
const parse_alloc: srf.ParseAllocator = if (comptime hasNoStringFields(T))
.none
else
.{ .custom = .initTo(allocator) };
var it = srf.iterator(&reader, allocator, .{
.parse_allocator = .{ .custom = .initTo(allocator) },
.parse_allocator = parse_alloc,
}) catch return null;
defer it.deinit();
@ -1338,8 +1568,23 @@ pub const Store = struct {
}
}
// Per-record coercion. Most types use SRF's generalized
// `fields.to(T, .{})`, which is correct for any struct shape
// but pays a per-field abstraction cost (coerce() boundary,
// found-bitmap bookkeeping, inline-for dispatch chain).
//
// Candle takes the specialized fast path: every cached
// candle file is millions of records of the same fixed
// 7-field shape, and the cold-load wall time was almost
// entirely `fields.to`. The hand-rolled coercer is ~25x
// faster in ReleaseFast for the same correctness on
// well-formed cache files. See SRF's `fields.to` doc
// comment for the trade-off discussion.
while (it.next() catch return null) |fields| {
var item = fields.to(T, .{}) catch continue;
var item: T = if (comptime T == Candle)
coerceCandleSpecialized(fields) catch continue
else
fields.to(T, .{}) catch continue;
if (comptime postProcess) |pp| {
pp(&item, allocator) catch {
if (comptime @hasDecl(T, "deinit")) item.deinit(allocator);