allow for fuzzy matching on dividends to account for provider reporting differences
All checks were successful
Generic zig build / build (push) Successful in 2m20s
Generic zig build / deploy (push) Successful in 21s

This commit is contained in:
Emil Lerch 2026-05-21 08:26:58 -07:00
parent fe28949757
commit 16048489dd
Signed by: lobo
GPG key ID: A7B62D657EF764F8

250
src/cache/store.zig vendored
View file

@ -340,11 +340,28 @@ pub const Store = struct {
for (incoming) |item| {
const key = mergeKey(T, item);
if (findKeyIndex(T, merged.items, key)) |idx| {
// Existing entry try to upgrade its optional fields
// from the incoming entry's non-null values.
// Exact key match — try to upgrade the existing record's
// optional fields from the incoming entry's non-null
// values.
upgraded += upgradeRecord(T, &merged.items[idx], item, symbol, source_hint);
} else if (findNearMatch(T, merged.items, item)) |_| {
// Same dividend, different ex_date convention — skip.
//
// Some providers report mutual fund dividends using the
// calendar last-day-of-month even when that falls on a
// weekend (e.g. FDRXX 2025-08-31 was a Sunday); others
// use the actual trading date (Tiingo: 2025-08-29).
// Same payment, two different ex_date strings. The
// amount-and-date-window matcher (±3 days; amount within
// the looser of 0.0001 absolute or 1% relative) catches
// these without affecting legitimate same-amount dividends
// paid months apart.
//
// Existing entry wins (preserves whichever source
// wrote first; in practice the Polygon-rich record).
// No log line — this is a non-event from the user's
// perspective.
} else {
// New entry append.
// Genuinely new entry — append.
merged.append(self.allocator, item) catch return;
added += 1;
logSupplied(T, symbol, item, source_hint);
@ -408,6 +425,47 @@ pub const Store = struct {
return null;
}
/// Find an entry in `items` that is almost certainly the same
/// event as `incoming`, merely recorded under a different ex_date
/// convention. Returns the index of the first such entry, or null
/// when no entry qualifies.
///
/// Only meaningful for `Dividend` — splits don't have an amount
/// field, so for any other `T` this is a no-op that returns null.
///
/// Match rule for `Dividend`: the existing entry lies 1 to 3
/// calendar days away from `incoming` (in either direction) and
/// the two amounts differ by no more than the LOOSER of:
///   - 0.0001 absolute (1/100 cent), OR
///   - 1% of the larger of the two amounts.
///
/// The relative arm absorbs provider rounding: Tiingo sometimes
/// truncates dividend amounts to 2-3 decimals while Polygon keeps
/// full precision (e.g. Polygon 0.040101457 vs Tiingo 0.04 — same
/// payment, different precision). The absolute arm covers
/// near-zero amounts, where 1% would be stricter than 1/100 cent.
///
/// Canonical case: mutual funds where Polygon reports the
/// calendar month-end as ex_date even when it falls on a weekend,
/// and Tiingo reports the actual trading date.
fn findNearMatch(comptime T: type, items: []const T, incoming: T) ?usize {
    if (T != Dividend) return null;

    const max_day_gap = 3;
    const abs_tolerance: f64 = 0.0001;
    const rel_tolerance: f64 = 0.01; // 1%

    for (items, 0..) |candidate, idx| {
        const day_gap = @abs(candidate.ex_date.days - incoming.ex_date.days);
        // A zero gap is a same-day entry, which is left to the
        // exact-match path (findKeyIndex); anything beyond the
        // window is treated as a different event.
        if (day_gap == 0 or day_gap > max_day_gap) continue;

        const larger_amount = @max(@abs(candidate.amount), @abs(incoming.amount));
        const tolerance = @max(abs_tolerance, larger_amount * rel_tolerance);
        if (@abs(candidate.amount - incoming.amount) <= tolerance) return idx;
    }
    return null;
}
fn lessByDateDesc(comptime T: type) fn (void, T, T) bool {
return struct {
fn lt(_: void, a: T, b: T) bool {
@ -1767,6 +1825,192 @@ test "writeMerged Dividend: upgrade is no-op when both have same fields" {
try std.testing.expectEqual(stat_before.mtime, stat_after.mtime);
}
test "writeMerged Dividend: near-match dedup catches last-biz-day vs calendar-end" {
    // FDRXX/FAGIX-style scenario: Polygon stamps a mutual fund
    // dividend with the calendar last-day-of-month as ex_date even
    // when that day is a weekend, while Tiingo uses the actual last
    // business day. It is one payment with two ex_date strings and
    // an identical amount. Without near-match dedup both records
    // land in the cache and total-return math counts the dividend
    // twice.
    const allocator = std.testing.allocator;
    const io = std.testing.io;

    var scratch_dir = std.testing.tmpDir(.{});
    defer scratch_dir.cleanup();
    const cache_root = try scratch_dir.dir.realPathFileAlloc(io, ".", allocator);
    defer allocator.free(cache_root);

    var cache_store = Store.init(io, allocator, cache_root);

    // Written first — Polygon: 2025-08-31 (a Sunday, calendar
    // end-of-month), with pay_date and type populated.
    var from_polygon = [_]Dividend{.{
        .ex_date = Date.fromYmd(2025, 8, 31),
        .pay_date = Date.fromYmd(2025, 9, 1),
        .amount = 0.003422654,
        .type = .regular,
    }};
    cache_store.writeWithSource(Dividend, "FDRXX", &from_polygon, Ttl.dividends, "polygon");

    // Written second — Tiingo: 2025-08-29 (the Friday, last
    // business day), same amount, no optional fields.
    var from_tiingo = [_]Dividend{.{
        .ex_date = Date.fromYmd(2025, 8, 29),
        .amount = 0.003422654,
    }};
    cache_store.writeWithSource(Dividend, "FDRXX", &from_tiingo, Ttl.dividends, "tiingo");

    const loaded = cache_store.read(Dividend, "FDRXX", null, .any) orelse return error.NoCache;
    defer allocator.free(loaded.data);
    defer for (loaded.data) |dividend| dividend.deinit(allocator);

    // Exactly one record survives, and it is the Polygon-rich one.
    try std.testing.expectEqual(@as(usize, 1), loaded.data.len);
    const survivor = loaded.data[0];
    try std.testing.expect(survivor.ex_date.eql(Date.fromYmd(2025, 8, 31)));
    try std.testing.expect(survivor.pay_date != null);
    try std.testing.expectEqual(DividendType.regular, survivor.type);
}
test "writeMerged Dividend: near-match dedup respects 3-day window upper bound" {
    // A 4-day gap must NOT be treated as a near match. This keeps
    // the rule conservative so that legitimately distinct dividends
    // (rare at this spacing, but possible) are never collapsed.
    const allocator = std.testing.allocator;
    const io = std.testing.io;

    var scratch_dir = std.testing.tmpDir(.{});
    defer scratch_dir.cleanup();
    const cache_root = try scratch_dir.dir.realPathFileAlloc(io, ".", allocator);
    defer allocator.free(cache_root);

    var cache_store = Store.init(io, allocator, cache_root);

    var earlier_write = [_]Dividend{.{
        .ex_date = Date.fromYmd(2025, 8, 31),
        .amount = 0.003422654,
        .type = .regular,
    }};
    cache_store.writeWithSource(Dividend, "TEST", &earlier_write, Ttl.dividends, "polygon");

    // Same amount, ex_date 4 days earlier — just outside the
    // ±3 day window.
    var later_write = [_]Dividend{.{
        .ex_date = Date.fromYmd(2025, 8, 27),
        .amount = 0.003422654,
    }};
    cache_store.writeWithSource(Dividend, "TEST", &later_write, Ttl.dividends, "tiingo");

    const loaded = cache_store.read(Dividend, "TEST", null, .any) orelse return error.NoCache;
    defer allocator.free(loaded.data);
    defer for (loaded.data) |dividend| dividend.deinit(allocator);

    // The gap is too wide for near-match, so both records remain.
    try std.testing.expectEqual(@as(usize, 2), loaded.data.len);
}
test "writeMerged Dividend: near-match dedup respects amount tolerance" {
    // Two dividends inside the date window but with clearly
    // different amounts must NOT collapse. This protects distinct
    // payments that sit close together (uncommon, but it happens
    // when a special dividend lands next to a regular one).
    const allocator = std.testing.allocator;
    const io = std.testing.io;

    var scratch_dir = std.testing.tmpDir(.{});
    defer scratch_dir.cleanup();
    const cache_root = try scratch_dir.dir.realPathFileAlloc(io, ".", allocator);
    defer allocator.free(cache_root);

    var cache_store = Store.init(io, allocator, cache_root);

    var earlier_write = [_]Dividend{.{
        .ex_date = Date.fromYmd(2025, 8, 31),
        .amount = 0.003422654,
        .type = .regular,
    }};
    cache_store.writeWithSource(Dividend, "TEST", &earlier_write, Ttl.dividends, "polygon");

    // Two days earlier, with an amount that differs by far more
    // than 1% relative (and more than 0.0001 absolute).
    var later_write = [_]Dividend{.{
        .ex_date = Date.fromYmd(2025, 8, 29),
        .amount = 0.005,
    }};
    cache_store.writeWithSource(Dividend, "TEST", &later_write, Ttl.dividends, "tiingo");

    const loaded = cache_store.read(Dividend, "TEST", null, .any) orelse return error.NoCache;
    defer allocator.free(loaded.data);
    defer for (loaded.data) |dividend| dividend.deinit(allocator);

    // Amounts disagree, so both records are kept.
    try std.testing.expectEqual(@as(usize, 2), loaded.data.len);
}
test "writeMerged Dividend: near-match dedup tolerates Tiingo amount rounding" {
    // FAGIX-style scenario: Polygon carries the amount at full
    // precision (0.040101457) while Tiingo truncates it to two
    // decimals (0.04). The absolute difference (~0.000101) is
    // slightly above the 0.0001 absolute tolerance but comfortably
    // inside the 1% relative tolerance, so the two records must be
    // recognised as the same event.
    const allocator = std.testing.allocator;
    const io = std.testing.io;

    var scratch_dir = std.testing.tmpDir(.{});
    defer scratch_dir.cleanup();
    const cache_root = try scratch_dir.dir.realPathFileAlloc(io, ".", allocator);
    defer allocator.free(cache_root);

    var cache_store = Store.init(io, allocator, cache_root);

    // Written first — Polygon, full-precision amount.
    var from_polygon = [_]Dividend{.{
        .ex_date = Date.fromYmd(2024, 8, 31),
        .pay_date = Date.fromYmd(2024, 9, 1),
        .amount = 0.040101457,
        .type = .regular,
    }};
    cache_store.writeWithSource(Dividend, "FAGIX", &from_polygon, Ttl.dividends, "polygon");

    // Written second — Tiingo: one day earlier, amount rounded to
    // two decimals.
    var from_tiingo = [_]Dividend{.{
        .ex_date = Date.fromYmd(2024, 8, 30),
        .amount = 0.04,
    }};
    cache_store.writeWithSource(Dividend, "FAGIX", &from_tiingo, Ttl.dividends, "tiingo");

    const loaded = cache_store.read(Dividend, "FAGIX", null, .any) orelse return error.NoCache;
    defer allocator.free(loaded.data);
    defer for (loaded.data) |dividend| dividend.deinit(allocator);

    // A single record remains: the first-written one, keeping its
    // own ex_date and full-precision amount.
    try std.testing.expectEqual(@as(usize, 1), loaded.data.len);
    const survivor = loaded.data[0];
    try std.testing.expect(survivor.ex_date.eql(Date.fromYmd(2024, 8, 31)));
    try std.testing.expectApproxEqAbs(@as(f64, 0.040101457), survivor.amount, 0.000001);
}
test "writeMerged Split: near-match dedup is a no-op (no amount field)" {
    // Splits have no amount field, so findNearMatch returns null
    // for Split and two same-ratio splits within 3 days of each
    // other are both kept. That is intentional — splits are rare,
    // and close-together splits (e.g. a forward followed by a
    // reverse) are real, distinct events.
    const allocator = std.testing.allocator;
    const io = std.testing.io;

    var scratch_dir = std.testing.tmpDir(.{});
    defer scratch_dir.cleanup();
    const cache_root = try scratch_dir.dir.realPathFileAlloc(io, ".", allocator);
    defer allocator.free(cache_root);

    var cache_store = Store.init(io, allocator, cache_root);

    var earlier_write = [_]Split{.{
        .date = Date.fromYmd(2024, 6, 10),
        .numerator = 10,
        .denominator = 1,
    }};
    cache_store.writeWithSource(Split, "TEST", &earlier_write, Ttl.splits, "polygon");

    // One day later with the same ratio. This would fall inside the
    // near-match window if the rule applied to splits, but the
    // dedup logic only applies to Dividend, so it stays distinct.
    var later_write = [_]Split{.{
        .date = Date.fromYmd(2024, 6, 11),
        .numerator = 10,
        .denominator = 1,
    }};
    cache_store.writeWithSource(Split, "TEST", &later_write, Ttl.splits, "tiingo");

    const loaded = cache_store.read(Split, "TEST", null, .any) orelse return error.NoCache;
    defer allocator.free(loaded.data);

    try std.testing.expectEqual(@as(usize, 2), loaded.data.len);
}
test "writeMerged Split: SPYM-style supplementary entry added" {
const allocator = std.testing.allocator;
const io = std.testing.io;