From 16048489ddfc12b27c5f5fb53e8189d68b7fbfe4 Mon Sep 17 00:00:00 2001 From: Emil Lerch Date: Thu, 21 May 2026 08:26:58 -0700 Subject: [PATCH] allow for fuzzy matching on dividends to account for provider reporting differences --- src/cache/store.zig | 250 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 247 insertions(+), 3 deletions(-) diff --git a/src/cache/store.zig b/src/cache/store.zig index e7764f0..44782fe 100644 --- a/src/cache/store.zig +++ b/src/cache/store.zig @@ -340,11 +340,28 @@ pub const Store = struct { for (incoming) |item| { const key = mergeKey(T, item); if (findKeyIndex(T, merged.items, key)) |idx| { - // Existing entry — try to upgrade its optional fields - // from the incoming entry's non-null values. + // Exact key match — try to upgrade existing record's + // optional fields from the incoming entry's non-null + // values. upgraded += upgradeRecord(T, &merged.items[idx], item, symbol, source_hint); + } else if (findNearMatch(T, merged.items, item)) |_| { + // Same dividend, different ex_date convention — skip. + // + // Some providers report mutual fund dividends using the + // calendar last-day-of-month even when that falls on a + // weekend (e.g. FDRXX 2025-08-31 was a Sunday); others + // use the actual trading date (Tiingo: 2025-08-29). + // Same payment, two different ex_date strings. The + // amount-and-date-window matcher (±3 days; amount within + // max(0.0001 abs, 1% rel)) catches these without affecting + // legitimate same-amount dividends paid months apart. + // + // Existing entry wins (preserves whichever source + // wrote first; in practice the Polygon-rich record). + // No log line — this is a non-event from the user's + // perspective. } else { - // New entry — append. + // Genuinely new entry — append. 
merged.append(self.allocator, item) catch return; added += 1; logSupplied(T, symbol, item, source_hint); @@ -408,6 +425,47 @@ pub const Store = struct { return null; } + /// Look for an existing entry that's almost certainly the same + /// event as `incoming`, just recorded with a different ex_date + /// convention. Only meaningful for `Dividend` — splits don't + /// have an amount field, so this is a no-op (returns null) for + /// `Split`. + /// + /// The match rule for `Dividend`: existing entry within ±3 + /// calendar days of incoming, with amount matching to the + /// LOOSER of: + /// - 0.0001 absolute (1/100 cent), OR + /// - 1% relative + /// + /// The relative-tolerance arm catches provider rounding: Tiingo + /// sometimes truncates dividend amounts to 2-3 decimals while + /// Polygon keeps full precision (e.g. Polygon 0.040101457 vs + /// Tiingo 0.04 — same payment, different precision). The + /// absolute arm catches near-zero-amount cases where 1% is + /// stricter than 1/100 cent. + /// + /// Canonical case: mutual funds where Polygon reports the + /// calendar month-end as ex_date even when it's a weekend, and + /// Tiingo reports the actual trading date. + fn findNearMatch(comptime T: type, items: []const T, incoming: T) ?usize { + if (T != Dividend) return null; + + const incoming_days = incoming.ex_date.days; + const abs_tolerance: f64 = 0.0001; + const rel_tolerance: f64 = 0.01; // 1% + + for (items, 0..) 
|it, i| { + const day_delta = @abs(it.ex_date.days - incoming_days); + if (day_delta > 3) continue; + if (day_delta == 0) continue; // exact match — handled by findKeyIndex + const amount_delta = @abs(it.amount - incoming.amount); + const ref = @max(@abs(it.amount), @abs(incoming.amount)); + const effective_tolerance = @max(abs_tolerance, ref * rel_tolerance); + if (amount_delta <= effective_tolerance) return i; + } + return null; + } + fn lessByDateDesc(comptime T: type) fn (void, T, T) bool { return struct { fn lt(_: void, a: T, b: T) bool { @@ -1767,6 +1825,192 @@ test "writeMerged Dividend: upgrade is no-op when both have same fields" { try std.testing.expectEqual(stat_before.mtime, stat_after.mtime); } +test "writeMerged Dividend: near-match dedup catches last-biz-day vs calendar-end" { + // FDRXX/FAGIX-style case: Polygon reports mutual fund dividends + // with calendar last-day-of-month as ex_date even when it's a + // weekend. Tiingo reports the actual last business day. Same + // payment, different ex_date string, identical amount. Without + // the near-match dedup, both end up in cache and total-return + // math double-counts the dividend. + const allocator = std.testing.allocator; + const io = std.testing.io; + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + const dir_path = try tmp.dir.realPathFileAlloc(io, ".", allocator); + defer allocator.free(dir_path); + + var s = Store.init(io, allocator, dir_path); + + // Polygon's view: 2025-08-31 (Sunday — calendar end-of-month). + var polygon_view = [_]Dividend{ + .{ + .ex_date = Date.fromYmd(2025, 8, 31), + .pay_date = Date.fromYmd(2025, 9, 1), + .amount = 0.003422654, + .type = .regular, + }, + }; + s.writeWithSource(Dividend, "FDRXX", polygon_view[0..], Ttl.dividends, "polygon"); + + // Tiingo's view: 2025-08-29 (Friday — last business day), + // identical amount. 
+ var tiingo_view = [_]Dividend{ + .{ + .ex_date = Date.fromYmd(2025, 8, 29), + .amount = 0.003422654, + }, + }; + s.writeWithSource(Dividend, "FDRXX", tiingo_view[0..], Ttl.dividends, "tiingo"); + + const result = s.read(Dividend, "FDRXX", null, .any) orelse return error.NoCache; + defer allocator.free(result.data); + defer for (result.data) |d| d.deinit(allocator); + + // Only one entry should survive — the Polygon-rich one. + try std.testing.expectEqual(@as(usize, 1), result.data.len); + try std.testing.expect(result.data[0].ex_date.eql(Date.fromYmd(2025, 8, 31))); + try std.testing.expect(result.data[0].pay_date != null); + try std.testing.expectEqual(DividendType.regular, result.data[0].type); +} + +test "writeMerged Dividend: near-match dedup respects 3-day window upper bound" { + // 4-day gap should NOT trigger near-match dedup. This guards + // against over-aggressive collapsing of legitimate distinct + // dividends (rare, but the rule is conservative). + const allocator = std.testing.allocator; + const io = std.testing.io; + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + const dir_path = try tmp.dir.realPathFileAlloc(io, ".", allocator); + defer allocator.free(dir_path); + + var s = Store.init(io, allocator, dir_path); + + var first = [_]Dividend{ + .{ .ex_date = Date.fromYmd(2025, 8, 31), .amount = 0.003422654, .type = .regular }, + }; + s.writeWithSource(Dividend, "TEST", first[0..], Ttl.dividends, "polygon"); + + // 4 days earlier — outside the ±3 day window. + var second = [_]Dividend{ + .{ .ex_date = Date.fromYmd(2025, 8, 27), .amount = 0.003422654 }, + }; + s.writeWithSource(Dividend, "TEST", second[0..], Ttl.dividends, "tiingo"); + + const result = s.read(Dividend, "TEST", null, .any) orelse return error.NoCache; + defer allocator.free(result.data); + defer for (result.data) |d| d.deinit(allocator); + + // Both entries kept — gap is too wide for near-match. 
+ try std.testing.expectEqual(@as(usize, 2), result.data.len); +} + +test "writeMerged Dividend: near-match dedup respects amount tolerance" { + // Different amounts within the date window should NOT collapse. + // This guards against false-positive collapsing of distinct + // dividends paid close together (rare in real life but possible + // with special dividends adjacent to a regular). + const allocator = std.testing.allocator; + const io = std.testing.io; + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + const dir_path = try tmp.dir.realPathFileAlloc(io, ".", allocator); + defer allocator.free(dir_path); + + var s = Store.init(io, allocator, dir_path); + + var first = [_]Dividend{ + .{ .ex_date = Date.fromYmd(2025, 8, 31), .amount = 0.003422654, .type = .regular }, + }; + s.writeWithSource(Dividend, "TEST", first[0..], Ttl.dividends, "polygon"); + + // 2 days earlier, very different amount (>1% relative). + var second = [_]Dividend{ + .{ .ex_date = Date.fromYmd(2025, 8, 29), .amount = 0.005 }, + }; + s.writeWithSource(Dividend, "TEST", second[0..], Ttl.dividends, "tiingo"); + + const result = s.read(Dividend, "TEST", null, .any) orelse return error.NoCache; + defer allocator.free(result.data); + defer for (result.data) |d| d.deinit(allocator); + + try std.testing.expectEqual(@as(usize, 2), result.data.len); +} + +test "writeMerged Dividend: near-match dedup tolerates Tiingo amount rounding" { + // FAGIX-style case: Polygon reports amount with full precision + // (0.040101457), Tiingo truncates to 2 decimals (0.04). The + // absolute difference (0.000101) is just over the 0.0001 abs + // tolerance, but well within the 1% relative tolerance. The + // near-match dedup should treat them as the same event. 
+ const allocator = std.testing.allocator; + const io = std.testing.io; + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + const dir_path = try tmp.dir.realPathFileAlloc(io, ".", allocator); + defer allocator.free(dir_path); + + var s = Store.init(io, allocator, dir_path); + + var polygon_view = [_]Dividend{ + .{ + .ex_date = Date.fromYmd(2024, 8, 31), + .pay_date = Date.fromYmd(2024, 9, 1), + .amount = 0.040101457, + .type = .regular, + }, + }; + s.writeWithSource(Dividend, "FAGIX", polygon_view[0..], Ttl.dividends, "polygon"); + + // Tiingo's view: 1 day earlier, amount rounded to 2 decimals. + var tiingo_view = [_]Dividend{ + .{ .ex_date = Date.fromYmd(2024, 8, 30), .amount = 0.04 }, + }; + s.writeWithSource(Dividend, "FAGIX", tiingo_view[0..], Ttl.dividends, "tiingo"); + + const result = s.read(Dividend, "FAGIX", null, .any) orelse return error.NoCache; + defer allocator.free(result.data); + defer for (result.data) |d| d.deinit(allocator); + + try std.testing.expectEqual(@as(usize, 1), result.data.len); + try std.testing.expect(result.data[0].ex_date.eql(Date.fromYmd(2024, 8, 31))); + try std.testing.expectApproxEqAbs(@as(f64, 0.040101457), result.data[0].amount, 0.000001); +} + +test "writeMerged Split: near-match dedup is a no-op (no amount field)" { + // Splits don't have an amount field, so findNearMatch returns + // null for Split. Two splits within 3 days with the same ratio + // would both be kept. This is the intended behavior — splits + // are rare events, and any close-together splits (e.g. a + // forward-then-reverse) are real distinct events. 
+ const allocator = std.testing.allocator; + const io = std.testing.io; + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + const dir_path = try tmp.dir.realPathFileAlloc(io, ".", allocator); + defer allocator.free(dir_path); + + var s = Store.init(io, allocator, dir_path); + + var first = [_]Split{ + .{ .date = Date.fromYmd(2024, 6, 10), .numerator = 10, .denominator = 1 }, + }; + s.writeWithSource(Split, "TEST", first[0..], Ttl.splits, "polygon"); + + // 1 day apart (would be in near-match window if it applied to + // splits). Still kept as distinct because the dedup logic + // only applies to Dividend. + var second = [_]Split{ + .{ .date = Date.fromYmd(2024, 6, 11), .numerator = 10, .denominator = 1 }, + }; + s.writeWithSource(Split, "TEST", second[0..], Ttl.splits, "tiingo"); + + const result = s.read(Split, "TEST", null, .any) orelse return error.NoCache; + defer allocator.free(result.data); + + try std.testing.expectEqual(@as(usize, 2), result.data.len); +} + test "writeMerged Split: SPYM-style supplementary entry added" { const allocator = std.testing.allocator; const io = std.testing.io;