diff --git a/AGENTS.md b/AGENTS.md index fd7d334..28754cd 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -439,6 +439,93 @@ look nice in prose but they create real problems: When in doubt, **ask**. A one-line "I'm about to use `—` here for X, OK?" is much cheaper than reverting after the user notices. +### NEVER put PII in tests, fixtures, comments, or docs + +This codebase is a **personal finance tool**. Real account names, +account numbers (full or partial trailing digits), real holders' +names, and any production-data identifiers are PII. Test +fixtures, doc comments, error-message examples, and sample +account-map entries must use **placeholder data only**. + +The user has had to ask multiple times to scrub PII out of +tests after I introduced it via "this is real data so let me +write a test that uses it." That's wrong every time. Real data +should NEVER end up in source files. + +**Approved placeholder vocabulary** (use these consistently +across the codebase so search-and-replace stays trivial): + +- Account names: + - `Sample IRA`, `Sample Roth IRA`, `Sample Roth`, `Sample + Brokerage`, `Sample Trust`, `Sample HSA`, `Sample Source`, + `Sample Account`, `Sample Fid`, `Sample Fidelity Brokerage` + - With trailing digits: `Sample IRA *1234`, `Sample Brokerage + *5678` + - Filenames: `Sample_IRA_1234.txt`, `Sample_IRA_5678.txt`, + `smpl_1234`, `smpl-ira-1234` +- Account numbers: `1234`, `5678`, `9012`, `3456`, `7890`, or + alphanumeric like `Z123`, `Z111`, `Z222`. Do not use real + trailing-digit values from `~/finance/`. +- Portfolio file names: `portfolio_other.srf`, never + `portfolio_.srf`. +- Schwab/Fidelity-style: `Schwab Trust`, `Inherited IRA`, `Roth + IRA`, `Tax Loss` are generic enough to keep, but any + uniqueness suffix must be a placeholder. + +**Things that are NEVER OK in source:** + +- Real first names of family members. (No `Emil`, `Elizabeth`, + `Kelly`, `Mom`, `Dad`, etc.) 
+- Real account-number trailing digits used in + `~/finance/accounts.srf` (e.g. `6135`, `3522`, `7891`, `716`, + `901`, `503`, `311`, `152`, `118`, `Z30619248`, + `229948882`, etc.). Any number that came from a real + brokerage entry is PII. +- Real portfolio filenames like `portfolio_mom.srf`, + `portfolio_.srf`. +- Composite identifiers that combine the above (`Mom Roth IRA`, + `Joint trust ...716`, etc.). + +**Workflow rule when adding a test based on a real-world +scenario:** + +1. Reproduce the bug locally with real data (in `~/finance/`, + never staged). +2. Write the test using **placeholder names and numbers** that + preserve the structural shape of the bug (same string + lengths, same pattern of digits-vs-letters, same separator + characters, etc.) but contain no real-world identifiers. +3. Verify the test still reproduces the bug. If it doesn't, the + bug was tied to specific real-world content — investigate + whether that's a real signal (e.g. a Unicode-handling issue) + and either fix the underlying bug or find a placeholder that + exhibits the same shape. + +**Workflow rule when finding existing PII:** + +If you grep for the placeholder vocabulary while working in any +file and find a real name or number that snuck in, fix it in +the same change. Don't add to TODO; PII removal is never +optional, and it never lands in a separate commit unless the +user explicitly asks. + +**One-line grep that should ALWAYS return zero non-`ie_data.csv` +hits before committing:** + +``` +grep -rn "\bMom\b\|Elizabeth\|Joint trust\|portfolio_mom\|\bEmil\b\|Fidelity Emil\|6135\|Z30619248" src/ \ + | grep -v ie_data.csv +``` + +(Update the alternation as new real-world identifiers come up. +The `ie_data.csv` exclusion is because the Shiller dataset +contains coincidental numeric matches in historical-year fields +that aren't PII.) 
+ +If you're uncertain whether something is PII, **ask before +committing.** PII can be surgically removed from a working +tree, but once it's in `git log` it's effectively permanent. + --- ## Commands diff --git a/src/brokerage/wells_fargo.zig b/src/brokerage/wells_fargo.zig index d0a2a7b..2916fcd 100644 --- a/src/brokerage/wells_fargo.zig +++ b/src/brokerage/wells_fargo.zig @@ -405,7 +405,7 @@ pub fn resolveAccount( for (account_map.entries) |e| { const inst = e.institution orelse continue; if (!std.mem.eql(u8, inst, institution)) continue; - if (filenameMatchesAccount(base, e.account)) { + if (filenameMatchesAccount(base, e.account, e.account_number)) { if (match != null) { // More than one WF entry matched the // filename — punt to the user. @@ -499,11 +499,21 @@ fn resolutionFor(io: std.Io, entry: analysis.AccountTaxEntry) !Resolved { /// tail of the account name (after `*` or end-of-string), with /// underscores and spaces treated as equivalent. /// +/// `account_number` (when non-null) is also tried as an +/// alternate anchor: a filename containing `accounts.srf`'s +/// `account_number::` value matches even when the account name +/// itself has no trailing digit run (e.g. user named the file +/// `1234.txt` and recorded `account::Sample Roth IRA, +/// account_number::1234` without putting `*1234` in the name). +/// This is the more user-friendly path; without it, the user +/// would have to keep the digit suffix in two places. 
+/// /// Examples: -/// filenameMatchesAccount("Sample_IRA_1234", "Sample IRA *1234") → true -/// filenameMatchesAccount("smpl-ira-1234", "Sample IRA *1234") → true (digits match) -/// filenameMatchesAccount("portfolio_other", "Sample IRA *1234") → false -fn filenameMatchesAccount(filename: []const u8, account_name: []const u8) bool { +/// filenameMatchesAccount("Sample_IRA_1234", "Sample IRA *1234", null) → true +/// filenameMatchesAccount("smpl-ira-1234", "Sample IRA *1234", null) → true (digits match) +/// filenameMatchesAccount("portfolio_other", "Sample IRA *1234", null) → false +/// filenameMatchesAccount("1234.txt", "Sample Roth IRA", "1234") → true (account_number anchor) +fn filenameMatchesAccount(filename: []const u8, account_name: []const u8, account_number: ?[]const u8) bool { // Extract the trailing digit run from the account name. // "Sample IRA *1234" → "1234". var digits_start: usize = account_name.len; @@ -520,6 +530,21 @@ fn filenameMatchesAccount(filename: []const u8, account_name: []const u8) bool { // a household. if (digits.len > 0 and std.mem.indexOf(u8, filename, digits) != null) return true; + // Try the `account_number::` field as an alternate anchor. + // Useful when the user didn't bother to put the digits in + // the human-readable account name. Any non-empty value is + // accepted, whether all digits (e.g. "1234") or + // alphanumeric like Schwab's "Z123" prefixed format. The + // latter is unlikely to be a useful filename hint for a + // WF import, but tolerating it is harmless. The check is + // the same in both cases: an exact, case-sensitive + // substring search of the filename (`std.mem.indexOf`). + // The case-insensitive fuzzy comparison is only the + // name-based fallback below. + if (account_number) |num| { + if (num.len > 0 and std.mem.indexOf(u8, filename, num) != null) return true; + } + // No digit suffix to compare; fall back to a fuzzy // letters-only overlap. Lowercase both sides; compare // alphanumeric runs only. 
If every alphanumeric run of the @@ -767,7 +792,7 @@ test "parsePaste: parses across intermediate totals (Stocks Total + ETFs Total)" // sections (Stocks, ETFs, Bonds, …), each terminated by its // own `
Total` footer. The parser must keep going // past intermediate totals to capture records in subsequent - // sections. (Real-world example: 6135.txt-style export with + // sections. (Real-world example: a multi-section export with // 43 stocks then 13 ETFs separated by `Stocks Total`.) const data = "GSLC , popup\n" ++ @@ -1023,30 +1048,53 @@ fn testAccountMap(allocator: std.mem.Allocator, entries: []const analysis.Accoun test "filenameMatchesAccount: trailing-digit anchor wins" { // Strongest signal — WF account suffixes are unique within // a household, so a digit-run match is unambiguous. - try testing.expect(filenameMatchesAccount("Sample_IRA_1234", "Sample IRA *1234")); - try testing.expect(filenameMatchesAccount("1234.txt", "Sample IRA *1234")); - try testing.expect(filenameMatchesAccount("smpl-ira-1234", "Sample IRA *1234")); + try testing.expect(filenameMatchesAccount("Sample_IRA_1234", "Sample IRA *1234", null)); + try testing.expect(filenameMatchesAccount("1234.txt", "Sample IRA *1234", null)); + try testing.expect(filenameMatchesAccount("smpl-ira-1234", "Sample IRA *1234", null)); // Different digit suffix → no match. - try testing.expect(!filenameMatchesAccount("Sample_IRA_5678", "Sample IRA *1234")); - try testing.expect(!filenameMatchesAccount("portfolio_other", "Sample IRA *1234")); + try testing.expect(!filenameMatchesAccount("Sample_IRA_5678", "Sample IRA *1234", null)); + try testing.expect(!filenameMatchesAccount("portfolio_other", "Sample IRA *1234", null)); +} + +test "filenameMatchesAccount: account_number anchor when name lacks digits" { + // User stored the digits in `account_number::` but didn't + // bother to put them in the human-readable account name. + // The number itself can anchor the filename match. + try testing.expect(filenameMatchesAccount("1234.txt", "Sample Roth IRA", "1234")); + try testing.expect(filenameMatchesAccount("smpl_1234", "Sample Roth IRA", "1234")); + // Wrong digits → no match. 
+ try testing.expect(!filenameMatchesAccount("9999.txt", "Sample Roth IRA", "1234")); + // No account_number and no digits in name → no match + // (alphaRunsContained doesn't help against a digit-only file). + try testing.expect(!filenameMatchesAccount("1234.txt", "Sample Roth IRA", null)); +} + +test "filenameMatchesAccount: name digits take precedence over account_number" { + // Both signals available; either one matching is enough. + // (Tests the OR semantics — name digits win first because + // they're checked first; we also verify account_number-only + // matches when name digits don't appear.) + try testing.expect(filenameMatchesAccount("Sample_1234", "Sample *1234", "9999")); + try testing.expect(filenameMatchesAccount("Sample_9999", "Sample *1234", "9999")); + try testing.expect(!filenameMatchesAccount("Sample_5555", "Sample *1234", "9999")); } test "filenameMatchesAccount: alpha-only fallback when account has no digit suffix" { // No trailing digits to anchor on — falls through to the // alpha-runs-contained check. - try testing.expect(filenameMatchesAccount("emils_brokerage", "Emils Brokerage")); + try testing.expect(filenameMatchesAccount("emils_brokerage", "Emils Brokerage", null)); // Out-of-order tokens don't match: alphaRunsContained // requires every account-name run to appear in order in // the filename. - try testing.expect(!filenameMatchesAccount("Brokerage_Emils", "Emils Brokerage")); + try testing.expect(!filenameMatchesAccount("Brokerage_Emils", "Emils Brokerage", null)); // Partial overlap also doesn't match — every run must be // present. 
- try testing.expect(!filenameMatchesAccount("emils_only", "Emils Brokerage")); + try testing.expect(!filenameMatchesAccount("emils_only", "Emils Brokerage", null)); } test "filenameMatchesAccount: case-insensitive fallback" { - try testing.expect(filenameMatchesAccount("EMILS_brokerage", "Emils Brokerage")); - try testing.expect(filenameMatchesAccount("emils_BROKERAGE", "Emils Brokerage")); + try testing.expect(filenameMatchesAccount("EMILS_brokerage", "Emils Brokerage", null)); + try testing.expect(filenameMatchesAccount("emils_BROKERAGE", "Emils Brokerage", null)); } test "alphaRunsContained: every alphanumeric run from account appears in order" { diff --git a/src/commands/audit.zig b/src/commands/audit.zig index 22cd027..f3b6511 100644 --- a/src/commands/audit.zig +++ b/src/commands/audit.zig @@ -2642,21 +2642,21 @@ test "compareSchwabSummary: matching account → no discrepancy" { .open_date = Date.fromYmd(2024, 1, 1), .open_price = 1.0, .security_type = .cash, - .account = "Emil Brokerage", + .account = "Sample Brokerage", }, .{ .symbol = "AAPL", .shares = 10, .open_date = Date.fromYmd(2024, 1, 1), .open_price = 150, - .account = "Emil Brokerage", + .account = "Sample Brokerage", }, }; const portfolio = portfolio_mod.Portfolio{ .lots = @constCast(&lots), .allocator = allocator }; const schwab_accounts = [_]SchwabAccountSummary{ .{ - .account_name = "Emil Brokerage", + .account_name = "Sample Brokerage", .account_number = "1234", .cash = 5000.0, .total_value = 7000.0, @@ -2665,7 +2665,7 @@ test "compareSchwabSummary: matching account → no discrepancy" { var entries = [_]analysis.AccountTaxEntry{ .{ - .account = "Emil Brokerage", + .account = "Sample Brokerage", .tax_type = .taxable, .institution = "schwab", .account_number = "1234", @@ -2681,7 +2681,7 @@ test "compareSchwabSummary: matching account → no discrepancy" { defer allocator.free(results); try std.testing.expectEqual(@as(usize, 1), results.len); - try std.testing.expectEqualStrings("Emil Brokerage", 
results[0].account_name); + try std.testing.expectEqualStrings("Sample Brokerage", results[0].account_name); try std.testing.expectApproxEqAbs(@as(f64, 5000), results[0].portfolio_cash, 0.01); try std.testing.expectApproxEqAbs(@as(f64, 7000), results[0].portfolio_total, 0.01); try std.testing.expectApproxEqAbs(@as(f64, 0), results[0].cash_delta.?, 0.01); diff --git a/src/views/projections.zig b/src/views/projections.zig index f13e905..c3b93e3 100644 --- a/src/views/projections.zig +++ b/src/views/projections.zig @@ -1109,7 +1109,7 @@ pub const EventLine = struct { }; /// Format a single event line for display. -/// Output: " Social Security (Emil) +$38,400/yr age 67 (in 17yr)" +/// Output: " Social Security (Owner) +$38,400/yr age 67 (in 17yr)" pub fn fmtEventLine(arena: std.mem.Allocator, ev: *const projections.LifeEvent, current_ages: []const u16) !EventLine { const name = ev.getName(); const amount = ev.annual_amount;