From 85c9a489699d3d31eb5c8854a8510ead5d84ffda Mon Sep 17 00:00:00 2001 From: Emil Lerch Date: Sat, 30 May 2026 11:21:40 -0700 Subject: [PATCH] sector/geo inference for enrich command --- .pre-commit-config.yaml | 2 +- TODO.md | 300 +++++++++++++++++ src/commands/enrich.zig | 595 +++++++++++++++++++++++++++++++++- src/models/classification.zig | 24 ++ 4 files changed, 910 insertions(+), 11 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index cc5fc74..fe4408f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,7 +32,7 @@ repos: - id: test name: Run zig build test entry: zig - args: ["build", "coverage", "-Dcoverage-threshold=72"] + args: ["build", "coverage", "-Dcoverage-threshold=74"] language: system types: [file] pass_filenames: false diff --git a/TODO.md b/TODO.md index b64307b..08d0f5f 100644 --- a/TODO.md +++ b/TODO.md @@ -358,6 +358,42 @@ gain. Possible fixes are discussed in the "Contributions diff" TODO below — option C there (per-account `cash_is_contribution`) would make manually-entered ESPP-style cash additions count correctly. +## Audit: stale manual prices section is incorrect — priority HIGH + +The `Stale manual prices` section in `zfin audit` (in +`src/commands/audit.zig` around line 1333) isn't computing the +right thing. The current logic walks `portfolio.lots`, filters to +lots with both `price` and `price_date` set, and flags any whose +`as_of.days - price_date.days > stale_days`. In practice this +either over-flags (counting lots that aren't really +manually-priced), under-flags (missing lots that ARE manually +priced but lack `price_date`), or both — needs investigation +against a real portfolio to determine which. + +Things to check: + +- Are we using the right field to identify "manually priced"? The + `Lot.price` field is set for any non-API price (manual override, + illiquid valuation, CD face, etc.); some of those shouldn't be + in a "stale prices" check (e.g. 
CDs with a fixed face value + aren't stale by age). +- Should the staleness comparison use `Allocation.is_manual_price` + (computed at the position level after the price-resolution + cascade) instead of the per-lot field? That captures "the price + this position is currently displaying came from a manual + source," which is what the user actually cares about. +- `price_date` falsely-null lots: if a lot has `price` set but no + `price_date`, we silently skip it instead of flagging it. That's + almost certainly wrong — a manually-priced lot with no recorded + date is the *most* stale case, not the least. +- Per-symbol vs per-lot: if the same symbol appears in multiple + lots with the same manual price, we currently emit one line per + lot. Probably wants to be one line per symbol with a count, or + at least dedup by `(symbol, price, price_date)`. + +Fix should land with regression tests against a fixture portfolio +that exhibits each of the above shapes. + ## Investigate: detailed 401(k) contributions data source Found a more detailed contributions screen on at least one @@ -589,6 +625,40 @@ Verification: open the TUI, press `d` on projections, try to type a date with the keypad. Then try the keyboard row. Both should commit identical input. +## TUI: memory leaks somewhere — priority MEDIUM + +User reported leaks while doing a detailed TUI walkthrough; no +specific tab or interaction yet identified. The TUI uses a mix +of arena allocators (frame-scoped) and persistent tab state, so +likely culprits: + +- Per-tab `State` structs that hold `[]const u8` slices duped + from a long-lived allocator but not freed when the tab + reloads or the symbol changes. +- Cached service-fetch results stored in tab state that aren't + `result.deinit()`-ed before being replaced. +- ArrayList accumulators that get appended to across multiple + draw cycles without an intervening clear. 
+- Vaxis event/dialog closures that capture strings into + arena-allocated lambdas but escape the arena's lifetime. + +### Investigation plan + +1. Run the TUI under `std.testing.allocator` (a debug + allocator that panics on leak). The current binary uses a + gpa, which silently tolerates leaks. A test-mode TUI run + with a leak-detecting allocator would surface the + offending alloc sites with file/line info. +2. Walk each tab's `State.deinit` (and `tab.deactivate` / + `tab.reload` hooks) against the `State` field list — every + owned field needs a free path on every state-change boundary. +3. Pay specific attention to `classification_map` and any + per-symbol caches (option chains, candle snapshots) — those + are the biggest fixed-size strings. + +No reproducer yet. When the user has a more specific lead +(which tab, which interaction), this entry should narrow. + ## CLI dispatch / arg-parsing bugs (found May 2026) Found during a post-framework-refactor sanity check of all 20 @@ -627,6 +697,236 @@ Investigate by replacing the `print` with a manual `print` + `flush` to see if it's a buffer-not-flushed issue, or by serializing a known-good fixture in isolation. +## Analysis: dividend equity / income-shaped equity — think about it + +Dividend-equity ETFs (SCHD, VYM, DGRO, NOBL, SDY, VIG, etc.) +bucket as Equity in `analysis.bucketSector`. That's correct for +risk-exposure analysis — they drop with the market in a +2008-style crash, regardless of the dividend stream — but it +loses the income-vs-growth distinction that retirement-planning +tools care about. + +Open question: is there a useful second dimension to add? +Possibilities: + +- **Yield-weighted breakdown.** Aggregate `current_yield` per + position, weight by market value, report a portfolio-level + yield. Doesn't change the asset-class taxonomy; adds a new + metric. +- **Income coverage of expenses.** "My dividends + bond coupons + cover X% of projected retirement spending." 
Closer to what the + income-side framing actually wants — answers the question + rather than redefining the buckets. +- **Income-equity sub-bucket within Equity.** A sub-row in the + Asset Category breakdown, not a 5th top-level bucket. Would + need a way to mark funds as "income-shaped" — probably a + per-symbol opt-in in `metadata.srf`. + +Not a bug. Not blocking anything. Could end up being a feature. +This is a note to revisit after using the 4-bucket view for a +while and seeing whether the missing dimension actually matters +in practice. + +Resist the temptation to: + +- **Add a 5th top-level bucket** ("Income Equity" / "Dividend + Equity"). The 4-bucket view is already the right answer for + "how much equity exposure do I have?". A 5th bucket + fragments the headline number. +- **Override SCHD to Fixed Income.** Wrong on risk grounds. + SCHD will lose 35-45% in an equity crash; treating it as FI + makes the user think they have downside protection they don't. +- **Add per-symbol "intent" metadata** (`held_for_income::true`). + Smell of putting framing into data. Intent is a property of + the holder's strategy, not the security. + +If a fix lands, it's probably a separate analysis section (yield +breakdown, income coverage) — not a change to the asset-class +taxonomy. + +## Enrich: title-keyword classification inference for ETFs — priority MEDIUM + +When Wikidata returns no entry for a fund symbol and we fall +through to the EDGAR ticker-map fallback, the auto-emitted +metadata line carries a generic `sector::Equity / Corporate, +geo::US,asset_class::Fund` triple. That's mechanically correct +(NPORT-P really does say "this fund holds equity in corporate +issuers, US-domiciled fund") but loses information the user +actually cares about: sector-themed ETFs (XLV → Healthcare), +geo-themed ETFs (FRDM → Emerging Markets, IDMO/HFXI/IVLU → +International Developed). + +The fund's *title* often carries the answer unambiguously. 
We +already plumb `series_name` (NPORT-P `seriesName`, falling +back to the company-tickers `title`) through to +`emitMissingClassification`. Add a keyword-inference pass that +overrides the default sector and geo when the title contains +unambiguous keywords. + +### Sector inference + +Trigger: when the lookup is `.managed_fund` or `.company_or_uit` +AND the fund has a single dominant `Equity / Corporate` sector +(>95% of holdings), AND the title carries one of the keywords +below. Emit a single GICS-tagged line in place of the NPORT-P +breakdown. + +Conservative keyword set (matches one GICS sector unambiguously): + +- "Health Care" / "Healthcare" → Healthcare +- "Semiconductor" → Technology (not "Technology" alone — too generic) +- "Software" → Technology +- "Financial" → Financial Services (careful: "Financial Select Sector SPDR") +- "Energy" → Energy +- "Oil & Gas" / "Oil and Gas" → Energy +- "Real Estate" / "REIT" → Real Estate +- "Utilities" → Utilities +- "Consumer Discretionary" → Consumer Cyclical +- "Consumer Staples" → Consumer Defensive +- "Industrial" → Industrials (careful: "Industrial Materials" — match + whole phrase) +- "Materials" → Basic Materials +- "Communication" / "Telecom" → Communication Services + +Reuse `Wikidata.canonicalizeSector`'s sector constants so the +two taxonomies don't drift. + +The "single dominant Equity / Corporate" guard prevents the +inference from misclassifying multi-asset funds (FAGIX-shape: +Debt + Equity + Loan), pure-debt funds (VBTLX), or sector-fund +edge cases like a hypothetical "Vanguard Healthcare Income Fund" +(if the breakdown is multi-sleeve, leave the NPORT-P decomposition +alone). + +### Geo inference + +Trigger: when the lookup is `.managed_fund` or `.company_or_uit` +AND the title carries an unambiguous geo keyword. Override the +default `geo::US` to the inferred bucket. 
+ +This one is more important than sector inference because the +default is *factually wrong* for international/emerging funds, +not just imprecise. FRDM holds Taiwanese, South Korean, +Chilean, Polish equities; tagging it `geo::US` overstates US +exposure and understates EM exposure proportionally to the +fund's weight in the portfolio. + +Conservative keyword set: + +- "Emerging Markets" / "Emerging Market" → Emerging Markets +- "Frontier Markets" → Emerging Markets (or own bucket if added) +- "International Developed" → International Developed +- "International" / "Intl" / "Intl." → International Developed + (careful: only when not paired with "+ US" or similar mixing + modifier) +- (Skip country-specific keywords for now — "China" / "Japan" / + "Europe" are unambiguous but we'd be designing per-country + buckets that don't exist in the current taxonomy) + +False-positive risk for "International": fund names like +"Vanguard Total International + US Equity Index" would mis-tag +as International. Audit your portfolio's titles before locking +in the keyword. The conservative version may need to be +"International" only when the title contains no US-related +keyword, or might need explicit phrase matching. + +### Tests + +- `inferSectorFromTitle("State Street(R) Health Care Select Sector SPDR(R) ETF")` → "Healthcare" +- `inferSectorFromTitle("iShares Semiconductor ETF")` → "Technology" +- `inferSectorFromTitle("Schwab U.S. Dividend Equity ETF")` → null (broad-market, no sector word) +- `inferSectorFromTitle("Vanguard Total Bond Market Index Fund")` → null +- `inferGeoFromTitle("Freedom 100 Emerging Markets ETF")` → "Emerging Markets" +- `inferGeoFromTitle("iShares MSCI Intl Value Factor ETF")` → "International Developed" +- `inferGeoFromTitle("Schwab U.S. Dividend Equity ETF")` → null +- Plus integration tests against `emitMissingClassification` confirming the override only fires when the dominant-sector / single-geo guards are satisfied. 
+ +### User's portfolio coverage + +After this work, the funds in the user's metadata.srf that +currently need hand-editing for sector/geo would be auto-tagged: + +- Sector: XLV (Healthcare), SOXX (Technology), QTUM (Technology — "Quantum" is borderline; might require explicit add) +- Geo: FRDM (Emerging Markets), IDMO (International Developed), HFXI (International Developed), IVLU (International Developed) + +## Analysis: collapse fine-grained NPORT-P sector strings at display time — priority MEDIUM + +The Sector (Equities) section in `analysis` output currently +shows raw NPORT-P sector strings. For a portfolio with +multi-asset funds (FAGIX, VBTLX, PTY) this means six different +"Debt / *" rows (Debt / Corporate, Debt / US Treasury, +Debt / Municipal, Debt / Non-US Sovereign, Debt / US Gov Agency, +Debt / US GSE), three "Asset-Backed / *" rows, three +"Derivative / *" rows, etc. — too granular to scan. + +The user's framing: "sometimes I'd be interested in 'roll up +all my debt investments to a single bucket', sometimes I'd want +to see split between federal government, munis and corporate." +That argues for **multiple display granularity levels** with a +TUI hot-key to toggle, not a one-time collapse decision. + +### Design + +Three display granularity tiers: + +1. **Coarse** (4 buckets): Equity / Fixed Income / Cash / Other. + Already implemented as the Asset Category section. Could be + a granularity option for the Sector section too. +2. **Mid** (~12-16 buckets): collapses NPORT-P sub-flavors but + keeps GICS sectors distinct. Roughly: + - "Bonds" (collapses all `Debt / *` + `Loan / *`) + - "Asset-Backed Securities" (collapses all `Asset-Backed / *`) + - "Cash & Equivalents" (collapses STIV variants + Repurchase Agreement) + - "Equity / Corporate" (the dominant equity bucket) + - "Equity / Other" (small equity sleeves) + - "Derivatives & Other" (collapses Derivative / Derivative-FX / Direct Real Property / etc.) 
+ - The 11 GICS sectors (Technology, Healthcare, etc.) for stock-level entries +3. **Fine** (current behavior): raw NPORT-P strings — Debt / + US Treasury vs Debt / Municipal vs Debt / Non-US Sovereign, + etc. + +User toggles between tiers. Default: probably Mid. + +### Implementation + +Build a pure mapping function `collapseSector(sector, granularity) +[]const u8` parallel to `bucketSector`. Display layer chooses +granularity. Aggregation can either: + +- **(a)** Run all three aggregations every time and pick at + display. Memory cost ~3x for the sector breakdown but the + data is small (dozens of rows). +- **(b)** Re-aggregate when granularity changes. Cheaper memory, + costs a single pass over the classifications on toggle. TUI + toggle latency is fine — it's a hashmap rebuild over <50 rows. + +Option (b) is probably right for the TUI. CLI can pick one +granularity at command-line time (default Mid; `--sector-detail +fine|mid|coarse` to override). + +### Dependency + +**Lands AFTER the title-keyword inference work above**, so the +collapse logic is designed against the post-inference content +shape (where XLV is `Healthcare` rather than `Equity / Corporate`, +FRDM is `Equity / Corporate` + `geo::Emerging Markets`, etc.) +rather than today's pre-inference shape. + +### TUI integration + +Hot-key cycles between coarse / mid / fine on the analysis tab. +Status bar shows current granularity. State persists across +re-renders within a session; no need to persist across sessions. + +### Tests + +- `collapseSector("Debt / US Treasury", .mid)` → "Bonds" +- `collapseSector("Debt / US Treasury", .fine)` → "Debt / US Treasury" +- `collapseSector("Debt / US Treasury", .coarse)` → "Fixed Income" (delegates to bucketSector) +- `collapseSector("Technology", .mid)` → "Technology" (GICS sectors stay distinct at mid) +- `collapseSector("Technology", .coarse)` → "Equity" +- TUI hot-key cycles through three granularities and updates display. 
+
 ## Analysis: umbrella-insurance exposure indicator — priority MEDIUM
 
 In the `analysis` command and TUI tab, surface how much of the
diff --git a/src/commands/enrich.zig b/src/commands/enrich.zig
index 2ed5114..90eb7ae 100644
--- a/src/commands/enrich.zig
+++ b/src/commands/enrich.zig
@@ -414,6 +414,187 @@ pub const FundSector = struct {
     pct: f64,
 };
 
+// ── Title-keyword inference ──────────────────────────────────
+//
+// When the EDGAR fallback fires (Wikidata had no entry), the
+// fund's title — pulled from NPORT-P `seriesName` or the
+// company-tickers `title` — often carries enough signal to
+// infer a useful sector or geo override. The default
+// `sector::Equity / Corporate, geo::US, asset_class::Fund`
+// triple is mechanically correct (NPORT-P really says the fund
+// holds equity in corporate issuers, US-domiciled fund) but
+// loses the specific GICS sector or international/emerging-market
+// exposure that actually matters for portfolio analysis.
+//
+// Both inference functions are pure-data and tested directly.
+
+/// Returns true if `haystack` contains any of `needles` as a
+/// substring. Case-sensitive — caller pre-lowercases when
+/// case-insensitive matching is wanted.
+fn titleContainsAny(haystack: []const u8, needles: []const []const u8) bool {
+    for (needles) |needle| {
+        if (std.mem.indexOf(u8, haystack, needle) != null) return true;
+    }
+    return false;
+}
+
+/// Lowercase the title into a stack buffer for case-insensitive
+/// keyword matching. Titles longer than the buffer are rejected
+/// (returns null, never truncated) — real fund names fit in 256 bytes.
+fn lowercaseTitle(buf: []u8, title: []const u8) ?[]const u8 {
+    if (title.len > buf.len) return null;
+    return std.ascii.lowerString(buf[0..title.len], title);
+}
+
+/// Infer a GICS sector from a fund's title. Returns null when
+/// no unambiguous keyword match — caller falls back to whatever
+/// sector data NPORT-P provided (typically `Equity / Corporate`).
+///
+/// Conservative keyword set: matches only words that map
+/// unambiguously to a single GICS sector. "Income" / "Dividend"
+/// / "Value" / "Growth" / "Momentum" / "Total" / "Equal Weight"
+/// / "International" / "Emerging" don't appear here — they
+/// describe the screening methodology or geo, not the sector.
+///
+/// Reuses `zfin.classification.sector` constants so the
+/// inference taxonomy stays in lock-step with the canonicalizer.
+pub fn inferSectorFromTitle(title: ?[]const u8) ?[]const u8 {
+    const t = title orelse return null;
+    if (t.len == 0) return null;
+
+    var buf: [256]u8 = undefined;
+    const lc = lowercaseTitle(&buf, t) orelse return null;
+
+    // Order matters: more-specific keywords come first within
+    // each sector. "Health care" before "care" (irrelevant
+    // example), "semiconductor" before generic "tech" (which we
+    // don't include — too broad).
+
+    // Healthcare. "Health care" with space (XLV title), "healthcare"
+    // (one word), "biotech", "pharmaceutical".
+    if (titleContainsAny(lc, &.{ "health care", "healthcare", "biotech", "pharmaceutical", "medical" })) {
+        return zfin.classification.sector.healthcare;
+    }
+
+    // Technology. Specific terms only — "tech" alone is too
+    // broad (matches "biotech", "fintech", "edtech" — all
+    // sector-mixing).
+    if (titleContainsAny(lc, &.{ "semiconductor", "software", "cloud computing", "internet" })) {
+        return zfin.classification.sector.technology;
+    }
+
+    // Financial Services. "Financial" is fairly specific in
+    // fund-name conventions ("Financial Select Sector SPDR",
+    // "Vanguard Financials ETF").
+    if (titleContainsAny(lc, &.{ "financial", "bank" })) {
+        return zfin.classification.sector.financial_services;
+    }
+
+    // Energy. "Energy" alone is mostly unambiguous in fund
+    // conventions; pair with "oil" / "gas" for redundancy.
+    if (titleContainsAny(lc, &.{ "energy", "oil & gas", "oil and gas", "petroleum" })) {
+        return zfin.classification.sector.energy;
+    }
+
+    // Real Estate / REITs.
+    if (titleContainsAny(lc, &.{ "real estate", "reit" })) {
+        return zfin.classification.sector.real_estate;
+    }
+
+    // Utilities. "Utilities" alone is unambiguous.
+    if (titleContainsAny(lc, &.{"utilities"})) {
+        return zfin.classification.sector.utilities;
+    }
+
+    // Consumer Discretionary / Cyclical. Match the explicit
+    // labels — "consumer" alone is ambiguous (could be
+    // discretionary or staples).
+    if (titleContainsAny(lc, &.{ "consumer discretionary", "consumer cyclical" })) {
+        return zfin.classification.sector.consumer_cyclical;
+    }
+
+    // Consumer Staples / Defensive.
+    if (titleContainsAny(lc, &.{ "consumer staples", "consumer defensive" })) {
+        return zfin.classification.sector.consumer_defensive;
+    }
+
+    // Industrials. Singular "industrial" also covers "industrials".
+    // Skip "Dow Jones Industrial Average" (a broad-market index) and
+    // use "defense " so "defensive" strategy funds don't match.
+    if (!titleContainsAny(lc, &.{"industrial average"}) and titleContainsAny(lc, &.{ "industrial", "aerospace", "defense " })) {
+        return zfin.classification.sector.industrials;
+    }
+
+    // Basic Materials.
+    if (titleContainsAny(lc, &.{ "materials", "mining", "miners", "metals" })) {
+        return zfin.classification.sector.basic_materials;
+    }
+
+    // Communication Services. " media" carries a leading space
+    // so "intermediate" (bond-fund titles) doesn't match.
+    if (titleContainsAny(lc, &.{ "communication", "telecom", " media" })) {
+        return zfin.classification.sector.communication_services;
+    }
+
+    return null;
+}
+
+/// Infer a geo bucket from a fund's title. Returns null when
+/// the title doesn't carry an unambiguous international/emerging
+/// keyword — caller keeps the default `geo::US`.
+///
+/// More important than sector inference: the default `geo::US`
+/// is *factually wrong* for international funds (FRDM holds
+/// emerging-market equities, not US), so this fix tightens
+/// portfolio-level geographic-exposure reporting.
+pub fn inferGeoFromTitle(title: ?[]const u8) ?[]const u8 {
+    const t = title orelse return null;
+    if (t.len == 0) return null;
+
+    var buf: [256]u8 = undefined;
+    const lc = lowercaseTitle(&buf, t) orelse return null;
+
+    // Emerging markets first — most specific. "emerging market"
+    // also covers the plural "emerging markets". Bare "frontier"
+    // (which covers "frontier market(s)") is conventionally only
+    // used for frontier markets in fund titles.
+    if (titleContainsAny(lc, &.{ "emerging market", "frontier" })) {
+        return zfin.classification.geo.emerging;
+    }
+
+    // International Developed. "International" / "Intl" /
+    // "ex-US" / "ex U.S." / "World ex US" / "Developed Markets".
+    // " ex us" keeps its leading space so "Index US" can't match.
+    //
+    // False-positive risk: a hypothetical "Vanguard Total
+    // International + US Equity Fund" would mis-tag here. None
+    // of the user's current portfolio holds such a hybrid
+    // fund. If one ever shows up, it'll get flagged in the
+    // diff-against-old-metadata.srf review and can be
+    // hand-corrected.
+    if (titleContainsAny(lc, &.{ "international", " intl", "ex-us", "ex-u.s", " ex us", " ex u.s", "world ex", "developed market" })) {
+        return zfin.classification.geo.developed;
+    }
+
+    return null;
+}
+
+/// Determine whether a fund's NPORT-P breakdown is dominated
+/// by a single Equity / Corporate sector — the precondition
+/// for sector inference firing. A "dominant" sector is one
+/// that's >=95% of the holdings; multi-asset funds (FAGIX-shape:
+/// 48% Debt + 22% Equity + ...) don't meet this guard and
+/// keep their NPORT-P decomposition.
+fn hasDominantEquitySector(fund_sectors: ?[]const FundSector) bool { + const sectors = fund_sectors orelse return false; + for (sectors) |s| { + if (std.mem.eql(u8, s.description, "Equity / Corporate") and s.pct >= 95.0) { + return true; + } + } + return false; +} + /// Wikidata didn't return a classification for `sym` (either the /// fetch errored out softly, or returned an empty result set). /// Emit a metadata line based on the EDGAR-fallback `lookup`: @@ -448,7 +629,11 @@ fn emitMissingClassification( } else { try out.print("# {s} -- EDGAR managed fund (Wikidata had no entry)\n", .{sym}); } - try emitFundLines(sym, "Fund", fund_sectors, out); + // Title-keyword inference: try the series_name (which + // is the only title source on the managed-fund path). + const inferred_sector = inferSectorFromTitle(series_name); + const inferred_geo = inferGeoFromTitle(series_name); + try emitFundLines(sym, "Fund", fund_sectors, inferred_sector, inferred_geo, out); }, .company_or_uit => |c| { const asset_class = if (c.is_etf) "ETF" else "Fund"; @@ -463,7 +648,12 @@ fn emitMissingClassification( } else { try out.print("# {s} -- EDGAR company-map entry (Wikidata had no entry)\n", .{sym}); } - try emitFundLines(sym, asset_class, fund_sectors, out); + // Title-keyword inference: prefer series_name (more + // authoritative), fall back to company-tickers title. + const effective_title: ?[]const u8 = series_name orelse c.title; + const inferred_sector = inferSectorFromTitle(effective_title); + const inferred_geo = inferGeoFromTitle(effective_title); + try emitFundLines(sym, asset_class, fund_sectors, inferred_sector, inferred_geo, out); }, .none => { if (err) |e| { @@ -481,24 +671,58 @@ fn emitMissingClassification( /// `pct:num:N` line per sector; otherwise emits a single /// `sector::TODO` line. The asset_class comes from the caller /// (already determined: "Fund" or "ETF"). 
+/// +/// `inferred_sector` (when non-null AND a single dominant +/// `Equity / Corporate` sleeve exists) replaces that sleeve's +/// row with a GICS-tagged row, preserving the original pct. +/// Other rows (Cash sleeves, dust derivatives) stay as-is. +/// When inference doesn't apply (no dominant sleeve, no +/// inferred sector, or the breakdown is multi-asset like +/// FAGIX), the NPORT-P decomposition emits unchanged. +/// +/// `geo` is applied uniformly to every emitted row. Defaults +/// to `"US"` when null. NPORT-P doesn't tell us the holdings' +/// geo (only the fund's domicile, which is always US for funds +/// in this map), so the caller passes the inferred geo when +/// available. fn emitFundLines( sym: []const u8, asset_class: []const u8, fund_sectors: ?[]const FundSector, + inferred_sector: ?[]const u8, + geo: ?[]const u8, out: *std.Io.Writer, ) !void { + const geo_str = geo orelse "US"; if (fund_sectors) |sectors| { if (sectors.len > 0) { + const should_override = + inferred_sector != null and + hasDominantEquitySector(sectors); for (sectors) |s| { + // When inference fires, replace the dominant + // Equity / Corporate row with the inferred GICS + // sector. Other rows stay as the raw NPORT-P + // category — they're informative as-is (Cash + // sleeves, derivatives, etc.). + const sector_str = if (should_override and + std.mem.eql(u8, s.description, "Equity / Corporate")) + inferred_sector.? + else + s.description; try out.print( - "symbol::{s},sector::{s},geo::US,asset_class::{s},pct:num:{d:.2}\n", - .{ sym, s.description, asset_class, s.pct }, + "symbol::{s},sector::{s},geo::{s},asset_class::{s},pct:num:{d:.2}\n", + .{ sym, sector_str, geo_str, asset_class, s.pct }, ); } return; } } - try out.print("symbol::{s},sector::TODO,geo::US,asset_class::{s}\n", .{ sym, asset_class }); + // No sector breakdown at all (NPORT-P fetch failed). Emit + // one TODO line — but if title-keyword inference returned + // a sector, use it instead of "TODO". 
+ const sector_str = inferred_sector orelse "TODO"; + try out.print("symbol::{s},sector::{s},geo::{s},asset_class::{s}\n", .{ sym, sector_str, geo_str, asset_class }); } /// What `getEtfMetrics` provides that `enrich` actually uses: @@ -1565,10 +1789,361 @@ test "emitMissingClassification: .none ignores series_name (no fund name to disp try std.testing.expect(std.mem.indexOf(u8, written, "Spurious Name") == null); } +// ── inferSectorFromTitle ───────────────────────────────────── + +test "inferSectorFromTitle: Health Care / Healthcare → Healthcare" { + try std.testing.expectEqualStrings( + "Healthcare", + inferSectorFromTitle("State Street(R) Health Care Select Sector SPDR(R) ETF").?, + ); + try std.testing.expectEqualStrings("Healthcare", inferSectorFromTitle("iShares U.S. Healthcare ETF").?); + try std.testing.expectEqualStrings("Healthcare", inferSectorFromTitle("Vanguard Health Care ETF").?); + try std.testing.expectEqualStrings("Healthcare", inferSectorFromTitle("SPDR S&P Pharmaceutical ETF").?); + try std.testing.expectEqualStrings("Healthcare", inferSectorFromTitle("iShares Biotech ETF").?); +} + +test "inferSectorFromTitle: Semiconductor / Software → Technology" { + try std.testing.expectEqualStrings("Technology", inferSectorFromTitle("iShares Semiconductor ETF").?); + try std.testing.expectEqualStrings("Technology", inferSectorFromTitle("VanEck Semiconductor ETF").?); + try std.testing.expectEqualStrings("Technology", inferSectorFromTitle("Invesco Software ETF").?); + try std.testing.expectEqualStrings("Technology", inferSectorFromTitle("First Trust Cloud Computing ETF").?); +} + +test "inferSectorFromTitle: Financial → Financial Services" { + try std.testing.expectEqualStrings("Financial Services", inferSectorFromTitle("Financial Select Sector SPDR Fund").?); + try std.testing.expectEqualStrings("Financial Services", inferSectorFromTitle("Vanguard Financials ETF").?); + try std.testing.expectEqualStrings("Financial Services", inferSectorFromTitle("SPDR 
S&P Bank ETF").?);
+}
+
+test "inferSectorFromTitle: Energy → Energy" {
+    try std.testing.expectEqualStrings("Energy", inferSectorFromTitle("Energy Select Sector SPDR Fund").?);
+    try std.testing.expectEqualStrings("Energy", inferSectorFromTitle("SPDR S&P Oil & Gas Exploration ETF").?);
+    try std.testing.expectEqualStrings("Energy", inferSectorFromTitle("VanEck Oil and Gas ETF").?);
+    try std.testing.expectEqualStrings("Energy", inferSectorFromTitle("Invesco Petroleum ETF").?);
+}
+
+test "inferSectorFromTitle: Real Estate / REIT → Real Estate" {
+    try std.testing.expectEqualStrings("Real Estate", inferSectorFromTitle("Vanguard Real Estate ETF").?);
+    try std.testing.expectEqualStrings("Real Estate", inferSectorFromTitle("Schwab U.S. REIT ETF").?);
+}
+
+test "inferSectorFromTitle: Utilities → Utilities" {
+    try std.testing.expectEqualStrings("Utilities", inferSectorFromTitle("Utilities Select Sector SPDR Fund").?);
+    try std.testing.expectEqualStrings("Utilities", inferSectorFromTitle("Vanguard Utilities ETF").?);
+}
+
+test "inferSectorFromTitle: Industrials and Materials" {
+    try std.testing.expectEqualStrings("Industrials", inferSectorFromTitle("Industrial Select Sector SPDR Fund").?);
+    try std.testing.expectEqualStrings("Industrials", inferSectorFromTitle("iShares Aerospace & Defense ETF").?);
+    try std.testing.expectEqualStrings("Basic Materials", inferSectorFromTitle("Materials Select Sector SPDR Fund").?);
+    try std.testing.expectEqualStrings("Basic Materials", inferSectorFromTitle("VanEck Gold Miners ETF").?);
+}
+
+test "inferSectorFromTitle: Communication Services" {
+    try std.testing.expectEqualStrings("Communication Services", inferSectorFromTitle("Communication Services Select Sector SPDR Fund").?);
+    try std.testing.expectEqualStrings("Communication Services", inferSectorFromTitle("iShares U.S. Telecom ETF").?);
+}
+
+test "inferSectorFromTitle: Consumer Discretionary / Cyclical / Staples / Defensive" {
+    try std.testing.expectEqualStrings("Consumer Cyclical", inferSectorFromTitle("Consumer Discretionary Select Sector SPDR Fund").?);
+    try std.testing.expectEqualStrings("Consumer Cyclical", inferSectorFromTitle("iShares U.S. Consumer Cyclical ETF").?);
+    try std.testing.expectEqualStrings("Consumer Defensive", inferSectorFromTitle("Consumer Staples Select Sector SPDR Fund").?);
+    try std.testing.expectEqualStrings("Consumer Defensive", inferSectorFromTitle("Vanguard Consumer Defensive ETF").?);
+}
+
+test "inferSectorFromTitle: broad-market / strategy funds return null" {
+    // No sector keyword in any of these titles, so the result is null.
+    // The caller then keeps the generic NPORT-P "Equity / Corporate" row.
+    try std.testing.expect(inferSectorFromTitle("SPDR S&P 500 ETF Trust") == null);
+    try std.testing.expect(inferSectorFromTitle("Vanguard Total Stock Market ETF") == null);
+    try std.testing.expect(inferSectorFromTitle("Schwab U.S. Dividend Equity ETF") == null);
+    try std.testing.expect(inferSectorFromTitle("Invesco S&P 500 Equal Weight ETF") == null);
+    try std.testing.expect(inferSectorFromTitle("Vanguard Total Bond Market Index Fund") == null);
+    try std.testing.expect(inferSectorFromTitle("Fidelity Capital and Income Fund") == null);
+}
+
+test "inferSectorFromTitle: target-date funds return null (multi-sector)" {
+    // Target-date funds hold a mix of equity and bonds across
+    // multiple sectors. No keyword should match.
+    try std.testing.expect(inferSectorFromTitle("VANGUARD TARGET RETIREMENT 2040 FUND") == null);
+    try std.testing.expect(inferSectorFromTitle("Fidelity Freedom 2050 Fund") == null);
+}
+
+test "inferSectorFromTitle: null and empty input return null" {
+    try std.testing.expect(inferSectorFromTitle(null) == null);
+    try std.testing.expect(inferSectorFromTitle("") == null);
+}
+
+test "inferSectorFromTitle: oversized title returns null safely" {
+    // Buffer-bounded; titles >256 bytes return null rather than crash.
+    // Real fund names are far shorter; this is a defensive check.
+    // NOTE(review): an all-"X" title has no sector keyword, so null is expected even without the length bound.
+    const long_title = "X" ** 300;
+    try std.testing.expect(inferSectorFromTitle(long_title) == null);
+}
+
+test "inferSectorFromTitle: case-insensitive matching" {
+    // "HEALTH CARE" (all caps, e.g. Vanguard's all-caps style)
+    // matches the same as "Health Care".
+    try std.testing.expectEqualStrings("Healthcare", inferSectorFromTitle("VANGUARD HEALTH CARE ETF").?);
+    try std.testing.expectEqualStrings("Technology", inferSectorFromTitle("ISHARES SEMICONDUCTOR ETF").?);
+}
+
+test "inferSectorFromTitle: returns same pointer for same bucket (static literal)" {
+    // The taxonomy constants are static literals; multiple
+    // calls returning the same bucket should hand out the
+    // same byte pointer. Lets callers compare via ptr equality
+    // and use the result as a stable HashMap key.
+    const a = inferSectorFromTitle("Vanguard Health Care ETF").?;
+    const b = inferSectorFromTitle("iShares U.S. Healthcare ETF").?;
+    try std.testing.expectEqual(@intFromPtr(a.ptr), @intFromPtr(b.ptr));
+}
+
+// ── inferGeoFromTitle ────────────────────────────────────────
+
+test "inferGeoFromTitle: Emerging Markets → Emerging Markets" {
+    try std.testing.expectEqualStrings("Emerging Markets", inferGeoFromTitle("Freedom 100 Emerging Markets ETF").?);
+    try std.testing.expectEqualStrings("Emerging Markets", inferGeoFromTitle("Vanguard FTSE Emerging Markets ETF").?);
+    try std.testing.expectEqualStrings("Emerging Markets", inferGeoFromTitle("iShares MSCI Emerging Markets ETF").?);
+}
+
+test "inferGeoFromTitle: Frontier Markets → Emerging Markets bucket" {
+    try std.testing.expectEqualStrings("Emerging Markets", inferGeoFromTitle("iShares MSCI Frontier 100 ETF").?);
+    try std.testing.expectEqualStrings("Emerging Markets", inferGeoFromTitle("Vanguard Frontier Markets ETF").?);
+}
+
+test "inferGeoFromTitle: International → International Developed" {
+    try std.testing.expectEqualStrings("International Developed", inferGeoFromTitle("iShares MSCI Intl Value Factor ETF").?);
+    try std.testing.expectEqualStrings("International Developed", inferGeoFromTitle("Vanguard FTSE Developed Markets ETF").?);
+    try std.testing.expectEqualStrings("International Developed", inferGeoFromTitle("Invesco S&P International Developed Momentum ETF").?);
+    try std.testing.expectEqualStrings("International Developed", inferGeoFromTitle("NYLI FTSE International Equity Currency Neutral ETF").?);
+    try std.testing.expectEqualStrings("International Developed", inferGeoFromTitle("Vanguard FTSE All-World ex-US ETF").?);
+}
+
+test "inferGeoFromTitle: US-only / no geo keyword returns null" {
+    try std.testing.expect(inferGeoFromTitle("SPDR S&P 500 ETF Trust") == null);
+    try std.testing.expect(inferGeoFromTitle("Vanguard Total Stock Market ETF") == null);
+    try std.testing.expect(inferGeoFromTitle("Schwab U.S. Dividend Equity ETF") == null);
+    try std.testing.expect(inferGeoFromTitle("Fidelity Capital and Income Fund") == null);
+}
+
+test "inferGeoFromTitle: Emerging beats International when both present" {
+    // Defensive: "iShares MSCI International Emerging Markets
+    // ETF" (hypothetical) would match both branches. Emerging
+    // Markets is more specific and is checked first; verify
+    // the priority order holds.
+    try std.testing.expectEqualStrings(
+        "Emerging Markets",
+        inferGeoFromTitle("iShares MSCI International Emerging Markets ETF").?,
+    );
+}
+
+test "inferGeoFromTitle: null and empty input return null" {
+    try std.testing.expect(inferGeoFromTitle(null) == null);
+    try std.testing.expect(inferGeoFromTitle("") == null);
+}
+
+test "inferGeoFromTitle: case-insensitive matching" {
+    try std.testing.expectEqualStrings("Emerging Markets", inferGeoFromTitle("FREEDOM 100 EMERGING MARKETS ETF").?);
+    try std.testing.expectEqualStrings("International Developed", inferGeoFromTitle("ISHARES MSCI INTL VALUE FACTOR ETF").?);
+}
+
+test "inferGeoFromTitle: returns static-literal pointers" {
+    const a = inferGeoFromTitle("iShares Emerging Markets ETF").?;
+    const b = inferGeoFromTitle("Vanguard Emerging Markets ETF").?;
+    try std.testing.expectEqual(@intFromPtr(a.ptr), @intFromPtr(b.ptr));
+}
+
+// ── hasDominantEquitySector ──────────────────────────────────
+
+test "hasDominantEquitySector: single 99% Equity / Corporate -> true" {
+    const sectors = [_]FundSector{
+        .{ .description = "Equity / Corporate", .pct = 99.7 },
+        .{ .description = "Short-Term Investment Vehicle / Registered Fund", .pct = 0.19 },
+    };
+    try std.testing.expect(hasDominantEquitySector(sectors[0..]));
+}
+
+test "hasDominantEquitySector: 95% threshold is inclusive" {
+    const sectors = [_]FundSector{
+        .{ .description = "Equity / Corporate", .pct = 95.0 },
+    };
+    try std.testing.expect(hasDominantEquitySector(sectors[0..]));
+}
+
+test "hasDominantEquitySector: 94.99% does NOT trigger" {
+    const sectors = [_]FundSector{
+        .{ .description = "Equity / Corporate", .pct = 94.99 },
+    };
+    try std.testing.expect(!hasDominantEquitySector(sectors[0..]));
+}
+
+test "hasDominantEquitySector: multi-asset fund (FAGIX-shape) -> false" {
+    // FAGIX has 22% Equity / Corporate plus debt and other
+    // sleeves. 22% is way under the 95% threshold.
+    const sectors = [_]FundSector{
+        .{ .description = "Debt / Corporate", .pct = 47.69 },
+        .{ .description = "Equity / Corporate", .pct = 22.49 },
+        .{ .description = "Loan / Corporate", .pct = 9.99 },
+    };
+    try std.testing.expect(!hasDominantEquitySector(sectors[0..]));
+}
+
+test "hasDominantEquitySector: pure-debt fund -> false" {
+    // VBTLX-shape: all Debt / *. No Equity / Corporate row at all.
+    const sectors = [_]FundSector{
+        .{ .description = "Debt / Corporate", .pct = 50.0 },
+        .{ .description = "Debt / US Treasury", .pct = 30.0 },
+    };
+    try std.testing.expect(!hasDominantEquitySector(sectors[0..]));
+}
+
+test "hasDominantEquitySector: null and empty -> false" {
+    try std.testing.expect(!hasDominantEquitySector(null));
+    const empty = [_]FundSector{};
+    try std.testing.expect(!hasDominantEquitySector(empty[0..]));
+}
+
+// ── inference integration with emitMissingClassification ─────
+
+test "emitMissingClassification: XLV-shape applies sector inference (Health Care -> Healthcare)" {
+    // Single dominant Equity / Corporate (99.76%) AND title
+    // contains "Health Care" → the Equity row gets replaced
+    // with Healthcare. The cash sleeve keeps its Short-Term Investment Vehicle row.
+    var out_buf: [1024]u8 = undefined;
+    var out: std.Io.Writer = .fixed(&out_buf);
+    const sectors = [_]FundSector{
+        .{ .description = "Equity / Corporate", .pct = 99.76 },
+        .{ .description = "Short-Term Investment Vehicle / Registered Fund", .pct = 0.45 },
+    };
+    try emitMissingClassification(
+        "XLV",
+        .{ .company_or_uit = .{ .title = "SPDR HEALTH CARE SELECT SECTOR ETF", .is_etf = true } },
+        sectors[0..],
+        "State Street(R) Health Care Select Sector SPDR(R) ETF",
+        null,
+        &out,
+    );
+
+    const written = out.buffered();
+    // The dominant row gets the inferred canonical sector label; pct is preserved.
+    try std.testing.expect(std.mem.indexOf(u8, written, "symbol::XLV,sector::Healthcare,geo::US,asset_class::ETF,pct:num:99.76") != null);
+    // Cash sleeve unchanged.
+    try std.testing.expect(std.mem.indexOf(u8, written, "Short-Term Investment Vehicle / Registered Fund") != null);
+    // The generic Equity / Corporate row should NOT appear.
+    try std.testing.expect(std.mem.indexOf(u8, written, "symbol::XLV,sector::Equity / Corporate") == null);
+}
+
+test "emitMissingClassification: FRDM-shape applies geo inference (Emerging Markets)" {
+    // Title "Emerging Markets" → every row gets geo::Emerging Markets.
+    // No sector inference (no sector keyword in title).
+    var out_buf: [1024]u8 = undefined;
+    var out: std.Io.Writer = .fixed(&out_buf);
+    const sectors = [_]FundSector{
+        .{ .description = "Equity / Corporate", .pct = 99.55 },
+    };
+    try emitMissingClassification(
+        "FRDM",
+        .managed_fund,
+        sectors[0..],
+        "Freedom 100 Emerging Markets ETF",
+        null,
+        &out,
+    );
+
+    const written = out.buffered();
+    try std.testing.expect(std.mem.indexOf(u8, written, "geo::Emerging Markets") != null);
+    // The generic Equity / Corporate row must not carry geo::US.
+    try std.testing.expect(std.mem.indexOf(u8, written, "symbol::FRDM,sector::Equity / Corporate,geo::US") == null);
+}
+
+test "emitMissingClassification: multi-asset fund (FAGIX-shape) does NOT apply sector inference" {
+    // Multi-asset breakdown — no dominant Equity / Corporate
+    // sleeve. Sector inference should NOT fire even if the
+    // title had a sector keyword (FAGIX's title doesn't, but
+    // this guards the multi-asset case generally).
+    var out_buf: [1024]u8 = undefined;
+    var out: std.Io.Writer = .fixed(&out_buf);
+    const sectors = [_]FundSector{
+        .{ .description = "Debt / Corporate", .pct = 47.69 },
+        .{ .description = "Equity / Corporate", .pct = 22.49 },
+        .{ .description = "Loan / Corporate", .pct = 9.99 },
+    };
+    try emitMissingClassification(
+        "FAKE",
+        .managed_fund,
+        sectors[0..],
+        // Hypothetical title with a sector keyword the
+        // inference would normally pick up.
+        "Hypothetical Healthcare Multi-Asset Fund",
+        null,
+        &out,
+    );
+
+    const written = out.buffered();
+    // Inference should NOT have fired — the Equity / Corporate
+    // row stays raw. The fund's *comment line* contains the
+    // word "Healthcare" because the title does, but no
+    // `sector::Healthcare` row should appear.
+    try std.testing.expect(std.mem.indexOf(u8, written, "sector::Healthcare") == null);
+    try std.testing.expect(std.mem.indexOf(u8, written, "symbol::FAKE,sector::Equity / Corporate") != null);
+    // Other sleeves unchanged.
+    try std.testing.expect(std.mem.indexOf(u8, written, "symbol::FAKE,sector::Debt / Corporate") != null);
+    try std.testing.expect(std.mem.indexOf(u8, written, "symbol::FAKE,sector::Loan / Corporate") != null);
+}
+
+test "emitMissingClassification: SCHD-shape (no sector keyword) keeps NPORT-P breakdown" {
+    // SCHD has dominant Equity / Corporate but no sector
+    // keyword in its title — inference returns null and the
+    // raw NPORT-P row stays. (User can hand-edit if they want
+    // a different label.)
+    var out_buf: [1024]u8 = undefined;
+    var out: std.Io.Writer = .fixed(&out_buf);
+    const sectors = [_]FundSector{
+        .{ .description = "Equity / Corporate", .pct = 99.70 },
+        .{ .description = "Short-Term Investment Vehicle / Registered Fund", .pct = 0.19 },
+    };
+    try emitMissingClassification(
+        "SCHD",
+        .managed_fund,
+        sectors[0..],
+        "Schwab U.S. Dividend Equity ETF",
+        null,
+        &out,
+    );
+
+    const written = out.buffered();
+    // No sector inference: keep the raw row.
+    try std.testing.expect(std.mem.indexOf(u8, written, "symbol::SCHD,sector::Equity / Corporate") != null);
+}
+
+test "emitMissingClassification: combined sector + geo inference" {
+    // Hypothetical "iShares MSCI Healthcare Emerging Markets
+    // ETF" — both keywords fire. Healthcare overrides the
+    // Equity / Corporate row; Emerging Markets overrides the
+    // geo on every row.
+    var out_buf: [1024]u8 = undefined;
+    var out: std.Io.Writer = .fixed(&out_buf);
+    const sectors = [_]FundSector{
+        .{ .description = "Equity / Corporate", .pct = 99.0 },
+    };
+    try emitMissingClassification(
+        "FAKE",
+        .managed_fund,
+        sectors[0..],
+        "iShares MSCI Healthcare Emerging Markets ETF",
+        null,
+        &out,
+    );
+
+    const written = out.buffered();
+    try std.testing.expect(std.mem.indexOf(u8, written, "symbol::FAKE,sector::Healthcare,geo::Emerging Markets,asset_class::Fund,pct:num:99.00") != null);
+}
+
 test "emitFundLines: null sectors -> single TODO line" {
     var out_buf: [256]u8 = undefined;
     var out: std.Io.Writer = .fixed(&out_buf);
-    try emitFundLines("VTI", "ETF", null, &out);
+    try emitFundLines("VTI", "ETF", null, null, null, &out);
     try std.testing.expectEqualStrings(
         "symbol::VTI,sector::TODO,geo::US,asset_class::ETF\n",
         out.buffered(),
@@ -1582,7 +2157,7 @@ test "emitFundLines: populated sectors -> one line per sector with pct" {
         .{ .description = "Debt / Corporate", .pct = 47.69 },
         .{ .description = "Equity / Corporate", .pct = 22.49 },
     };
-    try emitFundLines("FAGIX", "Fund", sectors[0..], &out);
+    try emitFundLines("FAGIX", "Fund", sectors[0..], null, null, &out);
 
     const written = out.buffered();
     try std.testing.expect(std.mem.indexOf(u8, written, "symbol::FAGIX,sector::Debt / Corporate,geo::US,asset_class::Fund,pct:num:47.69") != null);
@@ -1594,7 +2169,7 @@ test "emitFundLines: empty slice -> single TODO line (treats empty as null)" {
     var out_buf: [256]u8 = undefined;
     var out: std.Io.Writer = .fixed(&out_buf);
     const empty: [0]FundSector = .{};
-    try emitFundLines("VTI", "ETF", empty[0..], &out);
+    try emitFundLines("VTI", "ETF", empty[0..], null, null, &out);
     try std.testing.expectEqualStrings(
         "symbol::VTI,sector::TODO,geo::US,asset_class::ETF\n",
         out.buffered(),
@@ -1610,7 +2185,7 @@ test "emitFundLines: negative pct values render correctly" {
         .{ .description = "Repurchase Agreement / Other", .pct = -29.72 },
         .{ .description = "Derivative-FX / Other", .pct = -0.84 },
     };
-    try emitFundLines("PTY", "Fund", sectors[0..], &out);
+    try emitFundLines("PTY", "Fund", sectors[0..], null, null, &out);
 
     const written = out.buffered();
     try std.testing.expect(std.mem.indexOf(u8, written, "pct:num:-29.72") != null);
@@ -1623,7 +2198,7 @@ test "emitFundLines: ETF asset_class flows through" {
     const sectors = [_]FundSector{
         .{ .description = "Equity / Corporate", .pct = 99.86 },
     };
-    try emitFundLines("SOXX", "ETF", sectors[0..], &out);
+    try emitFundLines("SOXX", "ETF", sectors[0..], null, null, &out);
     try std.testing.expectEqualStrings(
         "symbol::SOXX,sector::Equity / Corporate,geo::US,asset_class::ETF,pct:num:99.86\n",
         out.buffered(),
diff --git a/src/models/classification.zig b/src/models/classification.zig
index dcba6ff..148c3d7 100644
--- a/src/models/classification.zig
+++ b/src/models/classification.zig
@@ -175,6 +175,30 @@ pub const geo = struct {
     pub const unknown = "Unknown";
 };
 
+// ── Sector taxonomy ──────────────────────────────────────────
+
+/// Canonical sector taxonomy (GICS-aligned 11-sector model; labels are not the official GICS names, e.g. "Consumer Cyclical").
+/// Producers (Wikidata's `canonicalizeSector`, enrich's
+/// `inferSectorFromTitle`) emit one of these strings; consumers
+/// (analysis bucketing, display) compare against them.
+///
+/// Lives here (not in any provider) so multiple producers can
+/// share one taxonomy. Adding a 12th sector or renaming an
+/// existing one is a one-place edit.
+pub const sector = struct {
+    pub const technology = "Technology";
+    pub const communication_services = "Communication Services";
+    pub const consumer_cyclical = "Consumer Cyclical";
+    pub const consumer_defensive = "Consumer Defensive";
+    pub const healthcare = "Healthcare";
+    pub const financial_services = "Financial Services";
+    pub const energy = "Energy";
+    pub const industrials = "Industrials";
+    pub const basic_materials = "Basic Materials";
+    pub const real_estate = "Real Estate";
+    pub const utilities = "Utilities";
+};
+
 /// Country-code-to-geo-bucket lookup. Producers (Wikidata today,
 /// others tomorrow) hand us ISO-3166 alpha-2 codes via the
 /// `ClassificationRecord.country` field; we map them to the geo