allow for fuzzy matching on dividends to account for provider reporting differences
This commit is contained in:
parent
fe28949757
commit
16048489dd
1 changed files with 247 additions and 3 deletions
250
src/cache/store.zig
vendored
250
src/cache/store.zig
vendored
|
|
@ -340,11 +340,28 @@ pub const Store = struct {
|
|||
for (incoming) |item| {
|
||||
const key = mergeKey(T, item);
|
||||
if (findKeyIndex(T, merged.items, key)) |idx| {
|
||||
// Existing entry — try to upgrade its optional fields
|
||||
// from the incoming entry's non-null values.
|
||||
// Exact key match — try to upgrade existing record's
|
||||
// optional fields from the incoming entry's non-null
|
||||
// values.
|
||||
upgraded += upgradeRecord(T, &merged.items[idx], item, symbol, source_hint);
|
||||
} else if (findNearMatch(T, merged.items, item)) |_| {
|
||||
// Same dividend, different ex_date convention — skip.
|
||||
//
|
||||
// Some providers report mutual fund dividends using the
|
||||
// calendar last-day-of-month even when that falls on a
|
||||
// weekend (e.g. FDRXX 2025-08-31 was a Sunday); others
|
||||
// use the actual trading date (Tiingo: 2025-08-29).
|
||||
// Same payment, two different ex_date strings. The
|
||||
// amount-and-date-window matcher (±3 days, amount
|
||||
// tolerance = looser of 0.0001 abs / 1% rel) catches these without affecting
|
||||
// legitimate same-amount dividends paid months apart.
|
||||
//
|
||||
// Existing entry wins (preserves whichever source
|
||||
// wrote first; in practice the Polygon-rich record).
|
||||
// No log line — this is a non-event from the user's
|
||||
// perspective.
|
||||
} else {
|
||||
// New entry — append.
|
||||
// Genuinely new entry — append.
|
||||
merged.append(self.allocator, item) catch return;
|
||||
added += 1;
|
||||
logSupplied(T, symbol, item, source_hint);
|
||||
|
|
@ -408,6 +425,47 @@ pub const Store = struct {
|
|||
return null;
|
||||
}
|
||||
|
||||
/// Search `items` for an entry that very likely records the same
/// payment as `incoming` but under a different ex_date convention.
///
/// Applies to `Dividend` only. For any other `T` (e.g. `Split`, which
/// carries no amount to compare) the function returns null without
/// inspecting `items`.
///
/// A `Dividend` in `items` is a near match when both hold:
///   - its ex_date is 1 to 3 calendar days away from `incoming`'s
///     (a 0-day gap is an exact key match and is left to
///     `findKeyIndex`), and
///   - the amounts differ by no more than the LOOSER of
///       * 0.0001 absolute (1/100 cent), or
///       * 1% of the larger of the two amounts.
///
/// Why two tolerance arms: the relative arm absorbs provider rounding
/// (e.g. Polygon 0.040101457 vs Tiingo 0.04 for the same payment),
/// while the absolute arm covers near-zero amounts where 1% would be
/// tighter than 1/100 cent.
///
/// Motivating case: mutual funds where Polygon stamps the calendar
/// month-end as ex_date even when it falls on a weekend, and Tiingo
/// stamps the actual trading date.
///
/// Returns the index of the first qualifying entry, or null if none.
fn findNearMatch(comptime T: type, items: []const T, incoming: T) ?usize {
    if (T != Dividend) return null;

    const max_day_gap = 3;
    const abs_tolerance: f64 = 0.0001;
    const rel_tolerance: f64 = 0.01; // 1%

    const target_days = incoming.ex_date.days;
    const target_amount = incoming.amount;

    for (items, 0..) |candidate, idx| {
        const gap_days = @abs(candidate.ex_date.days - target_days);
        // Same-day entries belong to the exact-key path (findKeyIndex);
        // anything beyond the window is treated as a separate event.
        if (gap_days == 0 or gap_days > max_day_gap) continue;

        // Scale the relative tolerance by the larger magnitude so the
        // comparison is symmetric in the two amounts.
        const larger_amount = @max(@abs(candidate.amount), @abs(target_amount));
        const allowed_delta = @max(abs_tolerance, larger_amount * rel_tolerance);
        if (@abs(candidate.amount - target_amount) <= allowed_delta) return idx;
    }
    return null;
}
|
||||
|
||||
fn lessByDateDesc(comptime T: type) fn (void, T, T) bool {
|
||||
return struct {
|
||||
fn lt(_: void, a: T, b: T) bool {
|
||||
|
|
@ -1767,6 +1825,192 @@ test "writeMerged Dividend: upgrade is no-op when both have same fields" {
|
|||
try std.testing.expectEqual(stat_before.mtime, stat_after.mtime);
|
||||
}
|
||||
|
||||
test "writeMerged Dividend: near-match dedup catches last-biz-day vs calendar-end" {
    // Scenario (FDRXX/FAGIX-style): one provider stamps a mutual fund
    // dividend with the calendar month-end as ex_date even though it is
    // a weekend, the other stamps the last business day. The amount is
    // identical. If both were cached, total-return math would count the
    // payment twice — so exactly one record must survive the merge.
    const allocator = std.testing.allocator;
    const io = std.testing.io;

    var scratch_dir = std.testing.tmpDir(.{});
    defer scratch_dir.cleanup();
    const root_path = try scratch_dir.dir.realPathFileAlloc(io, ".", allocator);
    defer allocator.free(root_path);

    var sut = Store.init(io, allocator, root_path);

    // Written first — Polygon: 2025-08-31 (Sunday, calendar month-end),
    // with pay_date and type populated.
    var polygon_rows = [_]Dividend{.{
        .ex_date = Date.fromYmd(2025, 8, 31),
        .pay_date = Date.fromYmd(2025, 9, 1),
        .amount = 0.003422654,
        .type = .regular,
    }};
    sut.writeWithSource(Dividend, "FDRXX", polygon_rows[0..], Ttl.dividends, "polygon");

    // Written second — Tiingo: 2025-08-29 (Friday, last business day),
    // same amount, no optional fields.
    var tiingo_rows = [_]Dividend{.{
        .ex_date = Date.fromYmd(2025, 8, 29),
        .amount = 0.003422654,
    }};
    sut.writeWithSource(Dividend, "FDRXX", tiingo_rows[0..], Ttl.dividends, "tiingo");

    const loaded = sut.read(Dividend, "FDRXX", null, .any) orelse return error.NoCache;
    defer {
        // Release each entry before the slice that holds them.
        for (loaded.data) |entry| entry.deinit(allocator);
        allocator.free(loaded.data);
    }

    // A single record remains, and it is the richer first-written one.
    try std.testing.expectEqual(@as(usize, 1), loaded.data.len);
    const kept = loaded.data[0];
    try std.testing.expect(kept.ex_date.eql(Date.fromYmd(2025, 8, 31)));
    try std.testing.expect(kept.pay_date != null);
    try std.testing.expectEqual(DividendType.regular, kept.type);
}
|
||||
|
||||
test "writeMerged Dividend: near-match dedup respects 3-day window upper bound" {
    // Two equal-amount dividends whose ex_dates are 4 days apart fall
    // outside the ±3 day window, so they must be kept as two separate
    // records. Protects against collapsing genuinely distinct payments.
    const allocator = std.testing.allocator;
    const io = std.testing.io;

    var scratch_dir = std.testing.tmpDir(.{});
    defer scratch_dir.cleanup();
    const root_path = try scratch_dir.dir.realPathFileAlloc(io, ".", allocator);
    defer allocator.free(root_path);

    var sut = Store.init(io, allocator, root_path);

    var earlier_write = [_]Dividend{.{
        .ex_date = Date.fromYmd(2025, 8, 31),
        .amount = 0.003422654,
        .type = .regular,
    }};
    sut.writeWithSource(Dividend, "TEST", earlier_write[0..], Ttl.dividends, "polygon");

    // ex_date 4 days before the first record — just past the window.
    var later_write = [_]Dividend{.{
        .ex_date = Date.fromYmd(2025, 8, 27),
        .amount = 0.003422654,
    }};
    sut.writeWithSource(Dividend, "TEST", later_write[0..], Ttl.dividends, "tiingo");

    const loaded = sut.read(Dividend, "TEST", null, .any) orelse return error.NoCache;
    defer {
        // Release each entry before the slice that holds them.
        for (loaded.data) |entry| entry.deinit(allocator);
        allocator.free(loaded.data);
    }

    // Gap exceeds the near-match window, so nothing was deduplicated.
    try std.testing.expectEqual(@as(usize, 2), loaded.data.len);
}
|
||||
|
||||
test "writeMerged Dividend: near-match dedup respects amount tolerance" {
    // Two dividends inside the date window but with clearly different
    // amounts must both be kept. Covers the uncommon but real case of a
    // special dividend landing right next to a regular one.
    const allocator = std.testing.allocator;
    const io = std.testing.io;

    var scratch_dir = std.testing.tmpDir(.{});
    defer scratch_dir.cleanup();
    const root_path = try scratch_dir.dir.realPathFileAlloc(io, ".", allocator);
    defer allocator.free(root_path);

    var sut = Store.init(io, allocator, root_path);

    var earlier_write = [_]Dividend{.{
        .ex_date = Date.fromYmd(2025, 8, 31),
        .amount = 0.003422654,
        .type = .regular,
    }};
    sut.writeWithSource(Dividend, "TEST", earlier_write[0..], Ttl.dividends, "polygon");

    // Only 2 days apart, but the amount is off by far more than 1%.
    var later_write = [_]Dividend{.{
        .ex_date = Date.fromYmd(2025, 8, 29),
        .amount = 0.005,
    }};
    sut.writeWithSource(Dividend, "TEST", later_write[0..], Ttl.dividends, "tiingo");

    const loaded = sut.read(Dividend, "TEST", null, .any) orelse return error.NoCache;
    defer {
        // Release each entry before the slice that holds them.
        for (loaded.data) |entry| entry.deinit(allocator);
        allocator.free(loaded.data);
    }

    // Amount mismatch blocks the near-match, so both records remain.
    try std.testing.expectEqual(@as(usize, 2), loaded.data.len);
}
|
||||
|
||||
test "writeMerged Dividend: near-match dedup tolerates Tiingo amount rounding" {
    // Scenario (FAGIX-style): Polygon carries the full-precision amount
    // (0.040101457) while Tiingo reports it cut to 2 decimals (0.04).
    // The gap (~0.000101) slightly exceeds the 0.0001 absolute
    // tolerance yet sits comfortably inside the 1% relative tolerance,
    // so the two records must be recognised as one payment.
    const allocator = std.testing.allocator;
    const io = std.testing.io;

    var scratch_dir = std.testing.tmpDir(.{});
    defer scratch_dir.cleanup();
    const root_path = try scratch_dir.dir.realPathFileAlloc(io, ".", allocator);
    defer allocator.free(root_path);

    var sut = Store.init(io, allocator, root_path);

    // Written first — Polygon: full precision, optional fields set.
    var polygon_rows = [_]Dividend{.{
        .ex_date = Date.fromYmd(2024, 8, 31),
        .pay_date = Date.fromYmd(2024, 9, 1),
        .amount = 0.040101457,
        .type = .regular,
    }};
    sut.writeWithSource(Dividend, "FAGIX", polygon_rows[0..], Ttl.dividends, "polygon");

    // Written second — Tiingo: ex_date one day earlier, 2-decimal amount.
    var tiingo_rows = [_]Dividend{.{
        .ex_date = Date.fromYmd(2024, 8, 30),
        .amount = 0.04,
    }};
    sut.writeWithSource(Dividend, "FAGIX", tiingo_rows[0..], Ttl.dividends, "tiingo");

    const loaded = sut.read(Dividend, "FAGIX", null, .any) orelse return error.NoCache;
    defer {
        // Release each entry before the slice that holds them.
        for (loaded.data) |entry| entry.deinit(allocator);
        allocator.free(loaded.data);
    }

    // One record survives and it keeps the first-written date and the
    // full-precision amount.
    try std.testing.expectEqual(@as(usize, 1), loaded.data.len);
    const kept = loaded.data[0];
    try std.testing.expect(kept.ex_date.eql(Date.fromYmd(2024, 8, 31)));
    try std.testing.expectApproxEqAbs(@as(f64, 0.040101457), kept.amount, 0.000001);
}
|
||||
|
||||
test "writeMerged Split: near-match dedup is a no-op (no amount field)" {
    // findNearMatch returns null for Split (there is no amount to
    // compare), so two same-ratio splits dated a day apart are both
    // retained. That is intentional: splits are rare, and splits that
    // sit close together (e.g. a forward followed by a reverse) are
    // distinct real events.
    const allocator = std.testing.allocator;
    const io = std.testing.io;

    var scratch_dir = std.testing.tmpDir(.{});
    defer scratch_dir.cleanup();
    const root_path = try scratch_dir.dir.realPathFileAlloc(io, ".", allocator);
    defer allocator.free(root_path);

    var sut = Store.init(io, allocator, root_path);

    var earlier_write = [_]Split{.{
        .date = Date.fromYmd(2024, 6, 10),
        .numerator = 10,
        .denominator = 1,
    }};
    sut.writeWithSource(Split, "TEST", earlier_write[0..], Ttl.splits, "polygon");

    // One day later with the same ratio. This would sit inside the
    // near-match window if that logic covered splits; it does not, so
    // the entry is stored as a separate record.
    var later_write = [_]Split{.{
        .date = Date.fromYmd(2024, 6, 11),
        .numerator = 10,
        .denominator = 1,
    }};
    sut.writeWithSource(Split, "TEST", later_write[0..], Ttl.splits, "tiingo");

    const loaded = sut.read(Split, "TEST", null, .any) orelse return error.NoCache;
    defer allocator.free(loaded.data);

    try std.testing.expectEqual(@as(usize, 2), loaded.data.len);
}
|
||||
|
||||
test "writeMerged Split: SPYM-style supplementary entry added" {
|
||||
const allocator = std.testing.allocator;
|
||||
const io = std.testing.io;
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue