dedup cusip cache and actually use the cache

This commit is contained in:
Emil Lerch 2026-06-16 16:00:53 -07:00
parent d9fd5d5b97
commit 415071b955
Signed by: lobo
GPG key ID: A7B62D657EF764F8
2 changed files with 211 additions and 0 deletions

View file

@ -53,6 +53,23 @@ pub fn run(ctx: *framework.RunCtx, parsed: ParsedArgs) !void {
try cli.printFg(out, color, cli.CLR_MUTED, "Note: '{s}' doesn't look like a CUSIP (expected 9 alphanumeric chars with digits)\n", .{parsed.cusip});
}
// L1: check the local cache before any network call. Hits are
// permanent (CUSIP->ticker mappings don't change), so a cached
// answer is authoritative and skips OpenFIGI entirely.
var cache_map = svc.loadCusipTickerMap(allocator);
defer cache_map.deinit();
if (cache_map.get(parsed.cusip)) |ticker| {
const cached: zfin.CusipResult = .{
.ticker = ticker,
.name = null,
.security_type = null,
.found = true,
};
try display(cached, parsed.cusip, color, out);
try cli.printFg(out, color, cli.CLR_MUTED, " (from local cache)\n", .{});
return;
}
cli.stderrPrint(ctx.io, "Looking up via OpenFIGI...\n");
// Try full batch lookup for richer output

View file

@ -2398,13 +2398,104 @@ pub const DataService = struct {
ticker: []const u8 = "",
};
/// In-memory CUSIP->ticker table built from `cusip_tickers.srf`.
///
/// Zero-copy by design: every key and value stored in `map` is a
/// slice into `backing` (the raw file bytes, parsed with
/// `parse_allocator = .none`). Nothing is duplicated per entry — the
/// whole-file buffer IS the storage. It stays alive for as long as
/// the table does and is released together with the hash table in
/// `deinit`.
///
/// This is the L1 tier of CUSIP resolution: callers consult it
/// before reaching for the server or OpenFIGI.
pub const CusipTickerMap = struct {
    /// CUSIP -> ticker. Keys and values borrow from `backing`.
    map: std.StringHashMap([]const u8),
    /// Raw bytes of `cusip_tickers.srf` that the map entries point
    /// into. `&.{}` when the file was missing or unreadable (freeing
    /// a zero-length slice is a no-op).
    backing: []const u8,

    /// Return the ticker cached for `cusip`, or `null` when the
    /// CUSIP is not in the table. The returned slice borrows from
    /// `backing` and is only valid until `deinit`.
    pub fn get(self: CusipTickerMap, cusip: []const u8) ?[]const u8 {
        const value_ptr = self.map.getPtr(cusip) orelse return null;
        return value_ptr.*;
    }

    /// Report whether `cusip` has a cached ticker.
    pub fn contains(self: CusipTickerMap, cusip: []const u8) bool {
        return self.map.get(cusip) != null;
    }

    /// Number of distinct CUSIPs in the table.
    pub fn count(self: CusipTickerMap) u32 {
        return self.map.count();
    }

    /// Release the backing buffer and the hash table.
    ///
    /// Both were allocated with the map's allocator at load time, so
    /// that allocator is reused here — the two lifetimes are bound
    /// together by construction, which is the whole point of the
    /// wrapper. The buffer is freed first, while `self.map.allocator`
    /// is still readable; tearing down the table does not
    /// dereference the (now dangling) key/value slices.
    pub fn deinit(self: *CusipTickerMap) void {
        self.map.allocator.free(self.backing);
        self.map.deinit();
    }
};
/// Read `cusip_tickers.srf` from the configured cache directory into
/// a `CusipTickerMap`. The returned table owns the file bytes; the
/// caller releases everything with `CusipTickerMap.deinit`.
///
/// Best-effort, never fails:
///   - Missing or unreadable file -> empty table (the common
///     first-run case).
///   - Parser setup failure -> empty map (the file bytes are still
///     owned by the table and freed by `deinit`).
///   - A parse error mid-file -> whatever rows were read so far.
///   - Rows that do not convert to `CusipEntry`, or that have an
///     empty CUSIP or ticker, are skipped.
///
/// First occurrence wins on duplicate CUSIPs, which tolerates the
/// historical double-append bug in cache files written before
/// `cacheCusipTicker` learned to dedup.
///
/// The on-disk format is CUSIP-keyed (`cusip::X,ticker::Y`); the
/// returned map is keyed the same way for O(1) forward lookup.
pub fn loadCusipTickerMap(self: *DataService, allocator: std.mem.Allocator) CusipTickerMap {
    // Start from an empty table so every early exit can hand back
    // the same value; `backing` is filled in once the file is read.
    var cusip_table: CusipTickerMap = .{
        .map = std.StringHashMap([]const u8).init(allocator),
        .backing = &.{},
    };

    const file_path = std.fs.path.join(allocator, &.{ self.config.cache_dir, "cusip_tickers.srf" }) catch
        return cusip_table;
    defer allocator.free(file_path);

    // From here the file bytes are the table's backing store: keys
    // and values are slices into them (parse_allocator = .none, so
    // the parser borrows rather than copies). They are freed by
    // `CusipTickerMap.deinit`, never here — that is the lifetime
    // contract that lets us skip per-entry dupes entirely.
    cusip_table.backing = std.Io.Dir.cwd().readFileAlloc(self.io, file_path, allocator, .limited(4 * 1024 * 1024)) catch
        return cusip_table;

    var reader = std.Io.Reader.fixed(cusip_table.backing);
    var record_iter = srf.iterator(&reader, allocator, .{ .parse_allocator = .none }) catch return cusip_table;
    defer record_iter.deinit();

    // A parse error ends the loop exactly like end-of-input does;
    // rows collected before the error are kept.
    while (record_iter.next() catch null) |fields| {
        const entry = fields.to(CusipEntry, .{}) catch continue;
        if (entry.cusip.len == 0 or entry.ticker.len == 0) continue;

        // First occurrence wins. The borrowed slices are stored
        // directly — they live in `backing`, so no dupe is needed.
        const slot = cusip_table.map.getOrPut(entry.cusip) catch continue;
        if (!slot.found_existing) slot.value_ptr.* = entry.ticker;
    }
    return cusip_table;
}
/// Append a CUSIP->ticker mapping to the cache file.
///
/// Implemented as read-append-atomic-write (rather than a direct
/// open-for-append) so a concurrent reader never sees a file with a
/// valid header plus partial trailing record. See `cache/store.zig
/// appendRaw` for the same pattern and rationale.
///
/// Dedups: if the CUSIP is already cached, this is a no-op. That
/// keeps the file from accumulating duplicate rows when the same
/// CUSIP is looked up repeatedly (the historical bug — the writer
/// never checked the file before appending).
pub fn cacheCusipTicker(self: *DataService, cusip: []const u8, ticker: []const u8) void {
// Dedup against what's already cached.
var existing_map = self.loadCusipTickerMap(self.allocator);
defer existing_map.deinit();
if (existing_map.contains(cusip)) return;
const path = std.fs.path.join(self.allocator, &.{ self.config.cache_dir, "cusip_tickers.srf" }) catch return;
defer self.allocator.free(path);
@ -3778,3 +3869,106 @@ test "freeEdgarLookup: handles all three union variants without leak" {
// testing.allocator panics on leak — passing this test means
// the title was freed.
}
// CUSIP->ticker cache (loadCusipTickerMap / cacheCusipTicker)
test "loadCusipTickerMap: missing file returns empty map" {
    const test_alloc = std.testing.allocator;
    const io = std.testing.io;

    // Fresh temp directory: no `cusip_tickers.srf` has been written
    // into it, so the loader takes the "file missing" path.
    var scratch = std.testing.tmpDir(.{});
    defer scratch.cleanup();
    const cache_root = try scratch.dir.realPathFileAlloc(io, ".", test_alloc);
    defer test_alloc.free(cache_root);

    var data_service = DataService.init(io, test_alloc, Config{ .cache_dir = cache_root });
    defer data_service.deinit();

    // The loader must hand back a usable, empty table; `deinit` on
    // it must be safe (std.testing.allocator would flag a bad free).
    var cusip_table = data_service.loadCusipTickerMap(test_alloc);
    defer cusip_table.deinit();

    try std.testing.expectEqual(@as(usize, 0), cusip_table.count());
}
test "cacheCusipTicker + loadCusipTickerMap: write/read round-trip" {
    const test_alloc = std.testing.allocator;
    const io = std.testing.io;

    var scratch = std.testing.tmpDir(.{});
    defer scratch.cleanup();
    const cache_root = try scratch.dir.realPathFileAlloc(io, ".", test_alloc);
    defer test_alloc.free(cache_root);

    var data_service = DataService.init(io, test_alloc, Config{ .cache_dir = cache_root });
    defer data_service.deinit();

    // Placeholder CUSIPs/tickers — never real PII.
    const Mapping = struct { cusip: []const u8, ticker: []const u8 };
    const mappings = [_]Mapping{
        .{ .cusip = "111111111", .ticker = "AAA" },
        .{ .cusip = "222222222", .ticker = "BBB" },
    };

    // Write each mapping through the cache writer, in order.
    for (mappings) |mapping| data_service.cacheCusipTicker(mapping.cusip, mapping.ticker);

    // Read everything back through the loader and check that each
    // mapping survived the trip unchanged.
    var cusip_table = data_service.loadCusipTickerMap(test_alloc);
    defer cusip_table.deinit();

    try std.testing.expectEqual(@as(usize, 2), cusip_table.count());
    for (mappings) |mapping| {
        try std.testing.expectEqualStrings(mapping.ticker, cusip_table.get(mapping.cusip).?);
    }
}
test "cacheCusipTicker: dedups repeated CUSIP (the historical bug)" {
    const test_alloc = std.testing.allocator;
    const io = std.testing.io;

    var scratch = std.testing.tmpDir(.{});
    defer scratch.cleanup();
    const cache_root = try scratch.dir.realPathFileAlloc(io, ".", test_alloc);
    defer test_alloc.free(cache_root);

    var data_service = DataService.init(io, test_alloc, Config{ .cache_dir = cache_root });
    defer data_service.deinit();

    // Write the same CUSIP three times — must collapse to one row.
    for (0..3) |_| data_service.cacheCusipTicker("111111111", "AAA");

    // Logical view: the loaded table holds exactly one mapping.
    {
        var cusip_table = data_service.loadCusipTickerMap(test_alloc);
        defer cusip_table.deinit();
        try std.testing.expectEqual(@as(usize, 1), cusip_table.count());
        try std.testing.expectEqualStrings("AAA", cusip_table.get("111111111").?);
    }

    // Physical view: the on-disk file should contain exactly one
    // data row (plus the directive header), proving the dedup
    // happens at the writer and not just in the reader.
    const srf_path = try std.fs.path.join(test_alloc, &.{ cache_root, "cusip_tickers.srf" });
    defer test_alloc.free(srf_path);
    const file_bytes = try std.Io.Dir.cwd().readFileAlloc(io, srf_path, test_alloc, .limited(64 * 1024));
    defer test_alloc.free(file_bytes);

    var data_rows: usize = 0;
    var line_iter = std.mem.splitScalar(u8, file_bytes, '\n');
    while (line_iter.next()) |line| {
        // Only lines carrying a `cusip::` field count as data rows.
        if (std.mem.indexOf(u8, line, "cusip::") == null) continue;
        data_rows += 1;
    }
    try std.testing.expectEqual(@as(usize, 1), data_rows);
}
test "loadCusipTickerMap: first occurrence wins on duplicate rows" {
    // A cache file produced by the old, buggy appender can contain
    // the same row more than once. The reader must cope with that:
    // no crash, and the first mapping is the one that is kept.
    const test_alloc = std.testing.allocator;
    const io = std.testing.io;

    var scratch = std.testing.tmpDir(.{});
    defer scratch.cleanup();
    const cache_root = try scratch.dir.realPathFileAlloc(io, ".", test_alloc);
    defer test_alloc.free(cache_root);

    // Hand-write a file with a duplicate row (as the old bug did):
    // the directive header followed by the same record twice.
    const duplicated_row = "cusip::111111111,ticker::AAA\n";
    const srf_path = try std.fs.path.join(test_alloc, &.{ cache_root, "cusip_tickers.srf" });
    defer test_alloc.free(srf_path);
    try std.Io.Dir.cwd().writeFile(io, .{
        .sub_path = srf_path,
        .data = "#!srfv1\n" ++ duplicated_row ++ duplicated_row,
    });

    var data_service = DataService.init(io, test_alloc, Config{ .cache_dir = cache_root });
    defer data_service.deinit();

    var cusip_table = data_service.loadCusipTickerMap(test_alloc);
    defer cusip_table.deinit();

    try std.testing.expectEqual(@as(usize, 1), cusip_table.count());
    try std.testing.expectEqualStrings("AAA", cusip_table.get("111111111").?);
}