srf parsing optimizations
This commit is contained in:
parent
c97551a476
commit
2245176cb0
1 changed files with 254 additions and 9 deletions
263
src/cache/store.zig
vendored
263
src/cache/store.zig
vendored
|
|
@ -1293,6 +1293,219 @@ pub const Store = struct {
|
|||
|
||||
// ── Private serialization: generic ───────────────────────────
|
||||
|
||||
/// Comptime predicate: is `T` a struct whose every field is a
/// string-free scalar? Drives the `parse_allocator` choice in
/// `readSlice` — types that don't need to retain string values
/// past `fields.to(T, .{})` can use `.none` and save the
/// allocator hit per parsed value.
///
/// Returns `true` only when `T` is a struct and every field is
/// one of:
///   - an int, float, bool or enum;
///   - an optional whose child is an int, float, bool or enum;
///   - a struct whose type name is `Date` (exactly, or as the
///     last `.`-separated path segment).
///
/// Everything else returns `false`: slices of u8 (optional or
/// not, const or not), any other pointer/array/union, and any
/// struct other than `Date`. Composite types are NOT inspected
/// recursively — a struct field might allocate strings in a
/// custom parse hook, so it is treated as string-bearing. This
/// is the safe default; a future audit can opt specific
/// composites in. Non-struct `T` also returns `false`.
fn hasNoStringFields(comptime T: type) bool {
    const info = @typeInfo(T);
    if (info != .@"struct") return false;
    inline for (info.@"struct".fields) |f| {
        const FT = f.type;
        // Explicit string shapes first, so the intent is obvious
        // even though the `else` arms below would also reject them.
        if (FT == []const u8 or FT == []u8 or
            FT == ?[]const u8 or FT == ?[]u8) return false;
        switch (@typeInfo(FT)) {
            // Plain scalars never carry borrowed bytes.
            .int, .float, .bool, .@"enum" => {},
            // Optionals are fine only when they wrap a plain scalar.
            .optional => |opt| switch (@typeInfo(opt.child)) {
                .int, .float, .bool, .@"enum" => {},
                else => return false,
            },
            .@"struct" => {
                // Allow only the project's `Date`, detected by
                // name. `@typeName` may yield the bare name or a
                // dot-qualified path depending on how the type was
                // reached, so accept "Date" and "<path>.Date".
                // Matching on the whole last segment (not a raw
                // suffix) keeps unrelated structs such as
                // `StartDate` from being whitelisted by accident.
                const type_name = @typeName(FT);
                const is_date = std.mem.eql(u8, type_name, "Date") or
                    std.mem.endsWith(u8, type_name, ".Date");
                if (!is_date) return false;
            },
            // Unions, pointers, arrays, etc.: assume string-bearing.
            else => return false,
        }
    }
    return true;
}
|
||||
|
||||
// ── hasNoStringFields tests ─────────────────────────────
//
// Pin the comptime predicate that drives the parse_allocator
// choice in `readSlice`. The predicate is derived from the
// type's fields, so adding a string field to a Candle-shaped
// type automatically switches that type to the duping
// allocator; the positive tests below then fail, so the lost
// perf optimization is noticed rather than silent. The
// negative tests guard the dangerous direction: a
// string-bearing type misclassified as pure would be parsed
// with `.none` and keep borrowed slices into freed-by-defer
// iterator memory.
|
||||
|
||||
test "hasNoStringFields: Candle is pure-numeric (Date+5×f64+u64)" {
    // Candle must remain eligible for the borrow-only parse path.
    const candle_is_pure = hasNoStringFields(Candle);
    try std.testing.expect(candle_is_pure);
}
|
||||
|
||||
test "hasNoStringFields: Split is pure-numeric (Date+2×f64)" {
    // Split must remain eligible for the borrow-only parse path.
    const split_is_pure = hasNoStringFields(Split);
    try std.testing.expect(split_is_pure);
}
|
||||
|
||||
test "hasNoStringFields: Dividend has currency string -> false" {
    // Dividend.currency is `?[]const u8`. The caller holds on to
    // it after the iterator is gone, so values MUST be duped.
    const dividend_is_pure = hasNoStringFields(Dividend);
    try std.testing.expect(dividend_is_pure == false);
}
|
||||
|
||||
test "hasNoStringFields: EarningsEvent has string fields -> false" {
    // EarningsEvent must be classified as string-bearing.
    const earnings_is_pure = hasNoStringFields(EarningsEvent);
    try std.testing.expect(earnings_is_pure == false);
}
|
||||
|
||||
test "hasNoStringFields: EtfProfile has string fields -> false" {
    // EtfProfile must be classified as string-bearing.
    const profile_is_pure = hasNoStringFields(EtfProfile);
    try std.testing.expect(profile_is_pure == false);
}
|
||||
|
||||
test "hasNoStringFields: synthetic shapes" {
    const expect = std.testing.expect;

    // Only ints/floats/bools/enums, a Date, and an optional
    // scalar: accepted.
    const AllScalars = struct {
        a: i32,
        b: f64,
        c: bool,
        d: enum { x, y },
        e: Date,
        f: ?u32,
    };
    try expect(hasNoStringFields(AllScalars));

    // A bare `[]const u8` field: rejected.
    const WithConstSlice = struct {
        a: i32,
        b: []const u8,
    };
    try expect(hasNoStringFields(WithConstSlice) == false);

    // An optional `[]const u8` field: rejected.
    const WithOptionalSlice = struct {
        a: i32,
        b: ?[]const u8,
    };
    try expect(hasNoStringFields(WithOptionalSlice) == false);

    // A mutable `[]u8` field: rejected as well. No shipped type
    // has one today; this case guards against future drift.
    const WithMutableSlice = struct {
        a: i32,
        b: []u8,
    };
    try expect(hasNoStringFields(WithMutableSlice) == false);
}
|
||||
|
||||
test "hasNoStringFields: composite struct field that's not Date is treated as string-bearing" {
    // Struct-typed fields other than Date are never inspected
    // recursively: the predicate conservatively assumes their
    // custom parse hook might allocate.
    const Nested = struct {
        s: []const u8,
    };
    const Wrapper = struct {
        x: i32,
        y: Nested,
    };
    const wrapper_is_pure = hasNoStringFields(Wrapper);
    try std.testing.expect(wrapper_is_pure == false);
}
|
||||
|
||||
test "hasNoStringFields: non-struct types return false" {
    // SRF record types in zfin are always structs, so the
    // predicate only carries meaning for them. Any other kind
    // of type is rejected defensively.
    const expect = std.testing.expect;
    try expect(hasNoStringFields(u32) == false);
    try expect(hasNoStringFields([]const u8) == false);
}
|
||||
|
||||
/// Hand-rolled specialized coercer for Candle records.
/// Bypasses SRF's generalized `fields.to(T, ...)` for the hot
/// Candle parse path: a cold candle load deserializes a very
/// large number of records of one fixed 7-field shape, where
/// `fields.to`'s per-field framework cost (coerce() boundary,
/// found-bitmap bookkeeping, inline-for dispatch chain)
/// dominates. The original author measured the direct
/// first-byte switch + struct assignment at ~25x faster in
/// ReleaseFast on well-formed cache files.
///
/// Trade-off vs `fields.to`: this skips default-value fallback,
/// missing-field detection, and `coerce()`'s strict type
/// discipline. Adequate for our cache-write invariant (every
/// candle file we write contains exactly the 7 fields below);
/// inadequate for parsing arbitrary user-supplied SRF data.
///
/// Behavior on irregular input:
///   - Fields are dispatched on the FIRST BYTE of the key only.
///     Keys starting with a byte outside `d o h l c a v` are
///     silently skipped; a key that merely shares a first byte
///     with a Candle field (e.g. `dummy`) is treated as that
///     field.
///   - A field with no value, an empty key, or a value of the
///     wrong kind (string vs number) is ignored.
///   - A field that never appears keeps its initial value:
///     1970-01-01 for `date`, 0 for everything else.
///   - A `volume` that cannot be represented as `u64` (NaN,
///     infinite, <= -1, or >= 2^64) yields `error.InvalidVolume`
///     instead of invoking `@intFromFloat` on an out-of-range
///     value, which is illegal behavior.
///
/// Errors: whatever `fields.next()` or `Date.parse` return, plus
/// `error.InvalidVolume`.
fn coerceCandleSpecialized(fields: srf.RecordIterator.FieldIterator) !Candle {
    var c: Candle = .{
        .date = Date.fromYmd(1970, 1, 1),
        .open = 0,
        .high = 0,
        .low = 0,
        .close = 0,
        .adj_close = 0,
        .volume = 0,
    };
    while (try fields.next()) |f| {
        const key = f.key;
        const val = f.value orelse continue;
        if (key.len == 0) continue;
        // Switch on the first byte. All 7 Candle field names are
        // first-byte-unique:
        //   d -> date    o -> open    h -> high
        //   l -> low     c -> close   a -> adj_close
        //   v -> volume
        switch (key[0]) {
            'd' => if (val == .string) {
                c.date = try Date.parse(val.string);
            },
            'o' => if (val == .number) {
                c.open = val.number;
            },
            'h' => if (val == .number) {
                c.high = val.number;
            },
            'l' => if (val == .number) {
                c.low = val.number;
            },
            'c' => if (val == .number) {
                c.close = val.number;
            },
            'a' => if (val == .number) {
                c.adj_close = val.number;
            },
            'v' => if (val == .number) {
                const raw = val.number;
                // `@intFromFloat` requires the truncated value to
                // fit the destination. Accept exactly the range
                // whose integer part fits u64: (-1, 2^64). The
                // negated form also rejects NaN, for which every
                // comparison is false.
                if (!(raw > -1.0 and raw < 18446744073709551616.0)) {
                    return error.InvalidVolume;
                }
                c.volume = @as(u64, @intFromFloat(raw));
            },
            else => {},
        }
    }
    return c;
}
|
||||
|
||||
/// Generic SRF deserializer with optional freshness check.
|
||||
/// Single-pass: creates one iterator, optionally checks freshness, extracts
|
||||
/// `#!created=` timestamp, and deserializes all records.
|
||||
|
|
@ -1305,15 +1518,32 @@ pub const Store = struct {
|
|||
comptime freshness: Freshness,
|
||||
) ?CacheResult(T) {
|
||||
var reader = std.Io.Reader.fixed(data);
|
||||
// `.parse_allocator = .{ .custom = .initTo(allocator) }` tells SRF
|
||||
// to dupe field values (the data we keep) into the caller's
|
||||
// allocator while letting field keys borrow from `data` (we only
|
||||
// need them long enough for `fields.to(T, .{})` to match against
|
||||
// compile-time field names). Records returned from `it.next()`
|
||||
// then own their value strings via the caller's allocator,
|
||||
// ready to outlive the iterator without any further duping.
|
||||
// Choose `parse_allocator` based on whether T has string
|
||||
// fields the caller needs to keep past the iterator.
|
||||
//
|
||||
// - **Pure-numeric types** (`Candle`: Date+5×f64+u64) have
|
||||
// zero `[]const u8` fields. The only string seen during
|
||||
// parse is the `date` value, which Date's custom-parse
|
||||
// hook converts to `i32` immediately. Nothing needs to
|
||||
// outlive the iterator. Use `.none` — borrowed slices
|
||||
// into the input bytes; no allocator hits per record.
|
||||
// - **String-bearing types** (Dividend, EarningsEvent,
|
||||
// OptionsChain) have currency / frequency / source /
|
||||
// option_type fields the caller keeps. Use the custom
|
||||
// allocator so values are duped into the caller's
|
||||
// storage and survive `it.deinit()`.
|
||||
//
|
||||
// Why a comptime branch and not a static setting per
|
||||
// call site: keeps `readSlice` generic over T and routes
|
||||
// the optimization through type information that's
|
||||
// already comptime-known. A new pure-numeric type (e.g.
// Split) is picked up automatically by `hasNoStringFields`;
// no per-type edit is needed here.
|
||||
const parse_alloc: srf.ParseAllocator = if (comptime hasNoStringFields(T))
|
||||
.none
|
||||
else
|
||||
.{ .custom = .initTo(allocator) };
|
||||
var it = srf.iterator(&reader, allocator, .{
|
||||
.parse_allocator = .{ .custom = .initTo(allocator) },
|
||||
.parse_allocator = parse_alloc,
|
||||
}) catch return null;
|
||||
defer it.deinit();
|
||||
|
||||
|
|
@ -1338,8 +1568,23 @@ pub const Store = struct {
|
|||
}
|
||||
}
|
||||
|
||||
// Per-record coercion. Most types use SRF's generalized
|
||||
// `fields.to(T, .{})` — correct for any struct shape but
|
||||
// pays a per-field abstraction cost (coerce() boundary,
|
||||
// found-bitmap bookkeeping, inline-for dispatch chain).
|
||||
//
|
||||
// Candle takes the specialized fast path: every cached
|
||||
// candle file is millions of records of the same fixed
|
||||
// 7-field shape, and the cold-load wall time was almost
|
||||
// entirely `fields.to`. The hand-rolled coercer is ~25x
|
||||
// faster in ReleaseFast for the same correctness on
|
||||
// well-formed cache files. See SRF's `fields.to` doc
|
||||
// comment for the trade-off discussion.
|
||||
while (it.next() catch return null) |fields| {
|
||||
var item = fields.to(T, .{}) catch continue;
|
||||
var item: T = if (comptime T == Candle)
|
||||
coerceCandleSpecialized(fields) catch continue
|
||||
else
|
||||
fields.to(T, .{}) catch continue;
|
||||
if (comptime postProcess) |pp| {
|
||||
pp(&item, allocator) catch {
|
||||
if (comptime @hasDecl(T, "deinit")) item.deinit(allocator);
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue