diff --git a/src/commands/enrich.zig b/src/commands/enrich.zig index 90eb7ae..10ea8e8 100644 --- a/src/commands/enrich.zig +++ b/src/commands/enrich.zig @@ -4,8 +4,6 @@ const cli = @import("common.zig"); const framework = @import("framework.zig"); const isCusipLike = @import("../models/portfolio.zig").isCusipLike; const ClassificationRecord = zfin.classification.ClassificationRecord; -const EdgarLookup = @import("../service.zig").EdgarLookup; -const freeEdgarLookup = @import("../service.zig").freeEdgarLookup; pub const ParsedArgs = struct { /// Optional symbol (e.g. "AAPL"). Null = portfolio mode (uses @@ -85,27 +83,6 @@ const DerivedMeta = struct { asset_class: []const u8, }; -/// Whether a Wikidata classification record carries enough -/// information to derive a meaningful (sector, geo, asset_class) -/// triple. When it doesn't (e.g. SOXX returns an "index" entity -/// with only `name` populated, no industry / no country / no -/// instance-of), the caller should fall through to the EDGAR -/// ticker-map fallback rather than emit -/// `sector::Unknown,geo::Unknown,asset_class::Unknown` from the -/// half-empty record. -/// -/// The "useful" definition: the record must indicate the symbol -/// is a fund (any of `is_etf`, `asset_class`) OR carry country -/// data (so geo can be derived) OR carry sector data (so the -/// classification has *some* signal beyond just a name). -fn wikidataLooksUseful(c: ClassificationRecord) bool { - if (c.is_etf) return true; - if (c.asset_class != null) return true; - if (c.country != null) return true; - if (c.sector != null) return true; - return false; -} - /// Compose a `DerivedMeta` from the per-symbol Wikidata /// `ClassificationRecord` plus an optional `market_cap` estimate /// (shares-outstanding × latest close, in dollars). 
Pure data @@ -286,11 +263,12 @@ fn estimateMarketCap( /// failure (no data anywhere); a `.none` after a successful /// fetch with empty/sparse data is a manual-fill-in case (the /// symbol exists, just needs human attention). -const SummaryCounter = enum { edgar_fallback, failed, manual_todo }; +const SummaryCounter = enum { wikidata_hit, edgar_fallback, failed, manual_todo }; fn classifyForCounter(kind: FallbackKind, wikidata_errored: bool) SummaryCounter { return switch (kind) { - .managed_fund, .company_or_uit => .edgar_fallback, + .wikidata => .wikidata_hit, + .edgar_fallback => .edgar_fallback, .none => if (wikidata_errored) .failed else .manual_todo, }; } @@ -302,8 +280,8 @@ fn classifyForCounter(kind: FallbackKind, wikidata_errored: bool) SummaryCounter /// is a "should never happen" safety valve, not a normal path). fn formatProvenanceMessage(buf: []u8, sym: []const u8, kind: FallbackKind, err: ?anyerror) ?[]const u8 { return switch (kind) { - .managed_fund => std.fmt.bufPrint(buf, " {s}: classified via EDGAR fund fallback\n", .{sym}), - .company_or_uit => std.fmt.bufPrint(buf, " {s}: classified via EDGAR company/UIT fallback\n", .{sym}), + .wikidata => std.fmt.bufPrint(buf, " {s}: classified via Wikidata\n", .{sym}), + .edgar_fallback => std.fmt.bufPrint(buf, " {s}: classified via EDGAR fallback (Wikidata sparse or empty)\n", .{sym}), .none => if (err) |e| std.fmt.bufPrint(buf, " {s}: no classification (Wikidata errored {t}, EDGAR had no entry); fill in by hand\n", .{ sym, e }) else @@ -338,47 +316,96 @@ fn enrichSymbol(io: std.Io, allocator: std.mem.Allocator, svc: *zfin.DataService const opts: zfin.FetchOptions = .{}; - // EDGAR ticker-map fallback runs lazily inside - // `svc.lookupEdgarFallback` when Wikidata returns nothing - // useful. The service handles the maps internally; - // commands consume the digested `EdgarLookup` union. 
- const classification_result = svc.getClassification(sym, opts) catch |err| { - const action = reportFetchError(io, sym, err); - switch (action) { - .hard_stop => return, - .soft_skip => { - const kind = try emitFallbackForSymbol(svc, allocator, sym, err, opts, out); - stderrSymbolProvenance(io, sym, kind, err); + // `getClassification` runs the full Wikidata -> EDGAR fallback + // chain inside the service. The returned record always carries + // useful data (is_etf, asset_class, country, geo, source, ...); + // sparse-Wikidata symbols get merged with EDGAR ticker-map + + // NPORT-P data before this returns. + const result = svc.getClassification(sym, opts) catch |err| { + switch (err) { + zfin.DataError.NotFound => { + // Neither Wikidata nor EDGAR knows this symbol. + try out.print("# {s} -- no Wikidata or EDGAR entry\n", .{sym}); + try out.print("# symbol::{s},sector::TODO,geo::TODO,asset_class::TODO\n", .{sym}); + stderrSymbolProvenance(io, sym, .none, null); return; }, + else => { + const action = reportFetchError(io, sym, err); + switch (action) { + .hard_stop => return, + .soft_skip => { + try out.print("# {s} -- fetch failed ({t})\n", .{ sym, err }); + try out.print("# symbol::{s},sector::TODO,geo::TODO,asset_class::TODO\n", .{sym}); + stderrSymbolProvenance(io, sym, .none, err); + return; + }, + } + }, } }; - defer classification_result.deinit(); - if (classification_result.data.len == 0 or !wikidataLooksUseful(classification_result.data[0])) { - // Either Wikidata returned no rows, or it returned a - // record too sparse to derive sector/geo/asset_class - // from (SOXX-style: name only, no industry, no - // country, no instance-of). Fall through to the EDGAR - // ticker-map fallback so we at least pick up - // `geo::US,asset_class::Fund/ETF` for symbols listed - // there. 
- const kind = try emitFallbackForSymbol(svc, allocator, sym, null, opts, out); - stderrSymbolProvenance(io, sym, kind, null); - return; + defer result.deinit(); + const c = result.data[0]; + + if (c.is_etf) { + try emitEtfRows(svc, allocator, sym, c, opts, out); + } else { + const market_cap = estimateMarketCap(svc, sym, c.cik, opts); + var sector_buf: [64]u8 = undefined; + const derived = deriveMetadata(c, market_cap, &sector_buf); + if (c.name) |name| { + try out.print("# {s}\n", .{name}); + } + try out.print("symbol::{s},sector::{s},geo::{s},asset_class::{s}\n", .{ + sym, derived.sector, derived.geo, derived.asset_class, + }); } - const classification = classification_result.data[0]; - const market_cap = estimateMarketCap(svc, sym, classification.cik, opts); + stderrSymbolProvenance(io, sym, kindFromSource(c.source), null); +} - var sector_buf: [64]u8 = undefined; - const derived = deriveMetadata(classification, market_cap, &sector_buf); +/// Translate the classification record's `source` provenance +/// into the `FallbackKind` enum used by the existing +/// progress/summary plumbing. +fn kindFromSource(source: []const u8) FallbackKind { + if (std.mem.eql(u8, source, "wikidata")) return .wikidata; + if (std.mem.eql(u8, source, "edgar_fallback")) return .edgar_fallback; + return .none; +} - if (classification.name) |name| { - try out.print("# {s}\n", .{name}); +/// Emit multi-row sleeve breakdown for an ETF/fund. Sleeves come +/// from `getEtfMetrics` (NPORT-P sector decomposition); the +/// classification record supplies the asset_class, geo, and +/// (if title-keyword inference fired) the dominant sector to +/// override "Equity / Corporate" with. 
+fn emitEtfRows( + svc: *zfin.DataService, + allocator: std.mem.Allocator, + sym: []const u8, + c: ClassificationRecord, + opts: zfin.FetchOptions, + out: *std.Io.Writer, +) !void { + const fund_data = loadFundEtfData(svc, allocator, sym, opts); + defer if (fund_data) |d| freeFundEtfData(allocator, d); + const sectors: ?[]const FundSector = if (fund_data) |d| d.sectors else null; + + const asset_class = c.asset_class orelse "Fund"; + const geo = c.geo orelse "US"; + + const from_edgar = std.mem.eql(u8, c.source, "edgar_fallback"); + if (c.name) |name| { + if (from_edgar) { + try out.print("# {s} -- {s} (Wikidata had no entry)\n", .{ sym, name }); + } else { + try out.print("# {s} -- {s}\n", .{ sym, name }); + } + } else if (from_edgar) { + try out.print("# {s} -- (Wikidata had no entry)\n", .{sym}); + } else { + try out.print("# {s}\n", .{sym}); } - try out.print("symbol::{s},sector::{s},geo::{s},asset_class::{s}\n", .{ - sym, derived.sector, derived.geo, derived.asset_class, - }); + try emitFundLines(sym, asset_class, sectors, c.sector, geo, out); } /// Wikidata didn't return a classification for `sym` (either the @@ -414,171 +441,6 @@ pub const FundSector = struct { pct: f64, }; -// ── Title-keyword inference ────────────────────────────────── -// -// When the EDGAR fallback fires (Wikidata had no entry), the -// fund's title — pulled from NPORT-P `<seriesName>` or the -// company-tickers `title` — often carries enough signal to -// infer a useful sector or geo override. The default -// `sector::Equity / Corporate, geo::US, asset_class::Fund` -// triple is mechanically correct (NPORT-P really says the fund -// holds equity in corporate issuers, US-domiciled fund) but -// loses the specific GICS sector or international/emerging-market -// exposure that actually matters for portfolio analysis. -// -// Both inference functions are pure-data and tested directly. - -/// Returns true if `haystack` contains any of `needles` as a -/// substring. 
Case-sensitive — caller pre-lowercases when -/// case-insensitive matching is wanted. -fn titleContainsAny(haystack: []const u8, needles: []const []const u8) bool { - for (needles) |needle| { - if (std.mem.indexOf(u8, haystack, needle) != null) return true; - } - return false; -} - -/// Lowercase the title into a stack buffer for case-insensitive -/// keyword matching. Truncates titles longer than the buffer -/// (returns null) — real fund names easily fit in 256 bytes. -fn lowercaseTitle(buf: []u8, title: []const u8) ?[]const u8 { - if (title.len > buf.len) return null; - return std.ascii.lowerString(buf[0..title.len], title); -} - -/// Infer a GICS sector from a fund's title. Returns null when -/// no unambiguous keyword match — caller falls back to whatever -/// sector data NPORT-P provided (typically `Equity / Corporate`). -/// -/// Conservative keyword set: matches only words that map -/// unambiguously to a single GICS sector. "Income" / "Dividend" -/// / "Value" / "Growth" / "Momentum" / "Total" / "Equal Weight" -/// / "International" / "Emerging" don't appear here — they -/// describe the screening methodology or geo, not the sector. -/// -/// Reuses `zfin.classification.sector` constants so the -/// inference taxonomy stays in lock-step with the canonicalizer. -pub fn inferSectorFromTitle(title: ?[]const u8) ?[]const u8 { - const t = title orelse return null; - if (t.len == 0) return null; - - var buf: [256]u8 = undefined; - const lc = lowercaseTitle(&buf, t) orelse return null; - - // Order matters: more-specific keywords come first within - // each sector. "Health care" before "care" (irrelevant - // example), "semiconductor" before generic "tech" (which we - // don't include — too broad). - - // Healthcare. "Health care" with space (XLV title), "healthcare" - // (one word), "biotech", "pharmaceutical". 
- if (titleContainsAny(lc, &.{ "health care", "healthcare", "biotech", "pharmaceutical", "medical" })) { - return zfin.classification.sector.healthcare; - } - - // Technology. Specific terms only — "tech" alone is too - // broad (matches "biotech", "fintech", "edtech" — all - // sector-mixing). - if (titleContainsAny(lc, &.{ "semiconductor", "software", "cloud computing", "internet" })) { - return zfin.classification.sector.technology; - } - - // Financial Services. "Financial" is fairly specific in - // fund-name conventions ("Financial Select Sector SPDR", - // "Vanguard Financials ETF"). - if (titleContainsAny(lc, &.{ "financial", "bank" })) { - return zfin.classification.sector.financial_services; - } - - // Energy. "Energy" alone is mostly unambiguous in fund - // conventions; pair with "oil" / "gas" for redundancy. - if (titleContainsAny(lc, &.{ "energy", "oil & gas", "oil and gas", "petroleum" })) { - return zfin.classification.sector.energy; - } - - // Real Estate / REITs. - if (titleContainsAny(lc, &.{ "real estate", "reit" })) { - return zfin.classification.sector.real_estate; - } - - // Utilities. "Utilities" alone is unambiguous. - if (titleContainsAny(lc, &.{"utilities"})) { - return zfin.classification.sector.utilities; - } - - // Consumer Discretionary / Cyclical. Match the explicit - // labels — "consumer" alone is ambiguous (could be - // discretionary or staples). - if (titleContainsAny(lc, &.{ "consumer discretionary", "consumer cyclical" })) { - return zfin.classification.sector.consumer_cyclical; - } - - // Consumer Staples / Defensive. - if (titleContainsAny(lc, &.{ "consumer staples", "consumer defensive" })) { - return zfin.classification.sector.consumer_defensive; - } - - // Industrials. "Industrial" is more reliable than - // "industrials" because some fund names use the singular - // ("Industrial Select Sector SPDR"). 
- if (titleContainsAny(lc, &.{ "industrial", "aerospace", "defense" })) { - return zfin.classification.sector.industrials; - } - - // Basic Materials. - if (titleContainsAny(lc, &.{ "materials", "mining", "miners", "metals" })) { - return zfin.classification.sector.basic_materials; - } - - // Communication Services. "Communication" / "Telecom" - // unambiguous. - if (titleContainsAny(lc, &.{ "communication", "telecom", "media" })) { - return zfin.classification.sector.communication_services; - } - - return null; -} - -/// Infer a geo bucket from a fund's title. Returns null when -/// the title doesn't carry an unambiguous international/emerging -/// keyword — caller keeps the default `geo::US`. -/// -/// More important than sector inference: the default `geo::US` -/// is *factually wrong* for international funds (FRDM holds -/// emerging-market equities, not US), so this fix tightens -/// portfolio-level geographic-exposure reporting. -pub fn inferGeoFromTitle(title: ?[]const u8) ?[]const u8 { - const t = title orelse return null; - if (t.len == 0) return null; - - var buf: [256]u8 = undefined; - const lc = lowercaseTitle(&buf, t) orelse return null; - - // Emerging markets first — most specific. "Emerging" alone - // is rare in non-EM contexts in fund-name conventions. - // "Frontier" likewise is conventionally only used for - // frontier markets in fund titles. - if (titleContainsAny(lc, &.{ "emerging market", "emerging markets", "frontier market", "frontier markets", "frontier" })) { - return zfin.classification.geo.emerging; - } - - // International Developed. "International" / "Intl" / - // "ex-US" / "World ex US" / "Developed Markets" / - // specific developed-market regions. - // - // False-positive risk: a hypothetical "Vanguard Total - // International + US Equity Fund" would mis-tag here. None - // of the user's current portfolio holds such a hybrid - // fund. 
If one ever shows up, it'll get flagged in the - // diff-against-old-metadata.srf review and can be - // hand-corrected. - if (titleContainsAny(lc, &.{ "international", " intl", "ex-us", "ex us", "world ex", "developed market", "developed markets" })) { - return zfin.classification.geo.developed; - } - - return null; -} - /// Determine whether a fund's NPORT-P breakdown is dominated /// by a single Equity / Corporate sector — the precondition /// for sector inference firing. A "dominant" sector is one @@ -595,77 +457,6 @@ fn hasDominantEquitySector(fund_sectors: ?[]const FundSector) bool { return false; } -/// Wikidata didn't return a classification for `sym` (either the -/// fetch errored out softly, or returned an empty result set). -/// Emit a metadata line based on the EDGAR-fallback `lookup`: -/// -/// - `.managed_fund` / `.company_or_uit`: emit one SRF line per -/// sector if `fund_sectors` is non-null, else a single -/// `sector::TODO` line. Fund sector breakdowns come from -/// NPORT-P (cached via `getEtfMetrics`). -/// - `.none` → all-TODO commented stub. -/// -/// `err` is non-null when Wikidata's fetch errored (vs returning -/// empty); included in the comment so the user can see why the -/// auto-fill didn't work. -fn emitMissingClassification( - sym: []const u8, - lookup: EdgarLookup, - fund_sectors: ?[]const FundSector, - series_name: ?[]const u8, - err: ?anyerror, - out: *std.Io.Writer, -) !void { - switch (lookup) { - .managed_fund => { - // NPORT-P series_name is more authoritative than the - // generic "EDGAR managed fund" placeholder. The MF - // ticker file (`company_tickers_mf.json`) carries no - // human-readable name, but NPORT-P's <seriesName> - // does — and we already fetched it for the sector - // breakdown. 
- if (series_name) |name| { - try out.print("# {s} -- {s} (Wikidata had no entry)\n", .{ sym, name }); - } else { - try out.print("# {s} -- EDGAR managed fund (Wikidata had no entry)\n", .{sym}); - } - // Title-keyword inference: try the series_name (which - // is the only title source on the managed-fund path). - const inferred_sector = inferSectorFromTitle(series_name); - const inferred_geo = inferGeoFromTitle(series_name); - try emitFundLines(sym, "Fund", fund_sectors, inferred_sector, inferred_geo, out); - }, - .company_or_uit => |c| { - const asset_class = if (c.is_etf) "ETF" else "Fund"; - // Name preference: NPORT-P series_name > company_tickers - // title > generic fallback. NPORT-P's <seriesName> is - // the most authoritative (matches what `parseNportP` - // already prefers internally for the analytics path). - if (series_name) |name| { - try out.print("# {s} -- {s} (Wikidata had no entry)\n", .{ sym, name }); - } else if (c.title) |t| { - try out.print("# {s} -- {s} (Wikidata had no entry)\n", .{ sym, t }); - } else { - try out.print("# {s} -- EDGAR company-map entry (Wikidata had no entry)\n", .{sym}); - } - // Title-keyword inference: prefer series_name (more - // authoritative), fall back to company-tickers title. - const effective_title: ?[]const u8 = series_name orelse c.title; - const inferred_sector = inferSectorFromTitle(effective_title); - const inferred_geo = inferGeoFromTitle(effective_title); - try emitFundLines(sym, asset_class, fund_sectors, inferred_sector, inferred_geo, out); - }, - .none => { - if (err) |e| { - try out.print("# {s} -- fetch failed ({t})\n", .{ sym, e }); - } else { - try out.print("# {s} -- no Wikidata or EDGAR entry\n", .{sym}); - } - try out.print("# symbol::{s},sector::TODO,geo::TODO,asset_class::TODO\n", .{sym}); - }, - } -} - /// Emit the body lines for a fund-classified symbol. 
When /// `fund_sectors` is non-null and non-empty, emits one /// `pct:num:N` line per sector; otherwise emits a single @@ -811,42 +602,10 @@ fn freeFundEtfData(allocator: std.mem.Allocator, data: FundEtfData) void { if (data.sectors) |secs| freeFundSectors(allocator, secs); } -/// Variant tag of an `EdgarLookup`, returned from -/// `emitFallbackForSymbol` so the caller can update counters -/// without holding onto the lookup's owned strings (which the -/// function frees before returning). -const FallbackKind = enum { managed_fund, company_or_uit, none }; - -/// One-shot wrapper around the EDGAR-fallback emit path. Asks -/// the service for a digested lookup result, fetches NPORT-P -/// sector breakdown for fund variants, calls -/// `emitMissingClassification`, frees the lookup's owned strings -/// and the sector slice. Returns the variant tag so the caller -/// can update counters by inspecting which path fired. -fn emitFallbackForSymbol( - svc: *zfin.DataService, - allocator: std.mem.Allocator, - sym: []const u8, - err: ?anyerror, - opts: zfin.FetchOptions, - out: *std.Io.Writer, -) !FallbackKind { - const lookup = svc.lookupEdgarFallback(sym, opts); - defer freeEdgarLookup(allocator, lookup); - const fund_data: ?FundEtfData = switch (lookup) { - .managed_fund, .company_or_uit => loadFundEtfData(svc, allocator, sym, opts), - .none => null, - }; - defer if (fund_data) |d| freeFundEtfData(allocator, d); - const sectors: ?[]const FundSector = if (fund_data) |d| d.sectors else null; - const series_name: ?[]const u8 = if (fund_data) |d| d.series_name else null; - try emitMissingClassification(sym, lookup, sectors, series_name, err, out); - return switch (lookup) { - .managed_fund => .managed_fund, - .company_or_uit => .company_or_uit, - .none => .none, - }; -} +/// Provenance tag derived from a `ClassificationRecord.source` +/// string. Used for per-symbol summary counters and progress +/// messages. 
+const FallbackKind = enum { wikidata, edgar_fallback, none }; /// Sort symbol slice alphabetically in place. Used by /// `enrichPortfolio` to produce stable, diff-friendly output. @@ -934,76 +693,66 @@ fn enrichPortfolio(ctx: *framework.RunCtx, svc: *zfin.DataService) !void { cli.stderrPrint(io, msg); } - const classification_result = svc.getClassification(sym, opts) catch |err| { - const action = reportFetchError(io, sym, err); - const kind = try emitFallbackForSymbol(svc, allocator, sym, err, opts, out); - try out.print("\n", .{}); - // Counters describe what's IN the file, not what - // happened upstream. If EDGAR rescued this symbol - // there's a usable line for it; count it under - // edgar_fallback. The user already knows Wikidata - // errored from `reportFetchError`'s stderr message — - // double-counting it as `failed` would just make the - // summary lie about the file's contents. - switch (classifyForCounter(kind, true)) { - .edgar_fallback => edgar_fallback += 1, - .failed => failed += 1, - .manual_todo => unreachable, // wikidata_errored=true never returns this - } - switch (action) { - .hard_stop => { - // Every remaining symbol will hit the same - // condition (no API key / auth fail / rate - // limit). Stop the batch with a clear note - // about how many symbols were skipped, so the - // user knows to rerun rather than wonder why - // the SRF stops short. - var rem_buf: [256]u8 = undefined; - const remaining = syms.len - i - 1; - const rem_msg = std.fmt.bufPrint( - &rem_buf, - "Stopping enrichment: {d} symbol(s) not yet fetched. Rerun once the issue is resolved.\n", - .{remaining}, - ) catch "Stopping enrichment.\n"; - cli.stderrPrint(io, rem_msg); - break; + const result = svc.getClassification(sym, opts) catch |err| { + switch (err) { + zfin.DataError.NotFound => { + // Neither Wikidata nor EDGAR knows this + // symbol -- fill in by hand. 
+ try out.print("# {s} -- no Wikidata or EDGAR entry\n", .{sym}); + try out.print("# symbol::{s},sector::TODO,geo::TODO,asset_class::TODO\n\n", .{sym}); + manual_todo += 1; + continue; + }, + else => { + const action = reportFetchError(io, sym, err); + try out.print("# {s} -- fetch failed ({t})\n", .{ sym, err }); + try out.print("# symbol::{s},sector::TODO,geo::TODO,asset_class::TODO\n\n", .{sym}); + failed += 1; + switch (action) { + .hard_stop => { + // Every remaining symbol will hit the + // same condition (no API key / auth + // fail / rate limit). Stop the batch + // with a clear note so the user knows + // how many were skipped. + var rem_buf: [256]u8 = undefined; + const remaining = syms.len - i - 1; + const rem_msg = std.fmt.bufPrint( + &rem_buf, + "Stopping enrichment: {d} symbol(s) not yet fetched. Rerun once the issue is resolved.\n", + .{remaining}, + ) catch "Stopping enrichment.\n"; + cli.stderrPrint(io, rem_msg); + break; + }, + .soft_skip => continue, + } }, - .soft_skip => continue, } }; - defer classification_result.deinit(); + defer result.deinit(); + const c = result.data[0]; - if (classification_result.data.len == 0 or !wikidataLooksUseful(classification_result.data[0])) { - // Wikidata returned nothing useful — fall through to - // the EDGAR ticker-map fallback. See the same branch - // in `enrichSymbol` for the rationale. - const kind = try emitFallbackForSymbol(svc, allocator, sym, null, opts, out); + if (c.is_etf) { + try emitEtfRows(svc, allocator, sym, c, opts, out); try out.print("\n", .{}); - // Distinguish "EDGAR rescued this symbol" from - // "neither source had it" so the summary is honest - // about how many entries actually carry useful data. 
- switch (classifyForCounter(kind, false)) { - .edgar_fallback => edgar_fallback += 1, - .manual_todo => manual_todo += 1, - .failed => unreachable, // wikidata_errored=false never returns this + } else { + const market_cap = estimateMarketCap(svc, sym, c.cik, opts); + var sector_buf: [64]u8 = undefined; + const derived = deriveMetadata(c, market_cap, &sector_buf); + if (c.name) |name| { + try out.print("# {s}\n", .{name}); } - continue; + try out.print("symbol::{s},sector::{s},geo::{s},asset_class::{s}\n\n", .{ + sym, derived.sector, derived.geo, derived.asset_class, + }); } - const classification = classification_result.data[0]; - const market_cap = estimateMarketCap(svc, sym, classification.cik, opts); - - var sector_buf: [64]u8 = undefined; - const derived = deriveMetadata(classification, market_cap, &sector_buf); - - // Comment with the name for readability - if (classification.name) |name| { - try out.print("# {s}\n", .{name}); + switch (kindFromSource(c.source)) { + .wikidata => wikidata_hits += 1, + .edgar_fallback => edgar_fallback += 1, + .none => manual_todo += 1, // shouldn't happen for a successful return } - try out.print("symbol::{s},sector::{s},geo::{s},asset_class::{s}\n\n", .{ - sym, derived.sector, derived.geo, derived.asset_class, - }); - wikidata_hits += 1; } // Summary. 
Every symbol contributes to exactly one bucket; @@ -1259,84 +1008,6 @@ test "deriveMetadata: asset_class set but not 'Mutual Fund' -> falls through to try std.testing.expectEqualStrings("US Large Cap", derived.asset_class); } -// ── wikidataLooksUseful ───────────────────────────────────── - -test "wikidataLooksUseful: is_etf true -> useful" { - const c: ClassificationRecord = .{ - .symbol = "VTI", - .is_etf = true, - .as_of = "2026-05-29", - .source = "wikidata", - }; - try std.testing.expect(wikidataLooksUseful(c)); -} - -test "wikidataLooksUseful: country set -> useful" { - const c: ClassificationRecord = .{ - .symbol = "AAPL", - .country = "US", - .as_of = "2026-05-29", - .source = "wikidata", - }; - try std.testing.expect(wikidataLooksUseful(c)); -} - -test "wikidataLooksUseful: sector set -> useful" { - const c: ClassificationRecord = .{ - .symbol = "AAPL", - .sector = "Technology", - .as_of = "2026-05-29", - .source = "wikidata", - }; - try std.testing.expect(wikidataLooksUseful(c)); -} - -test "wikidataLooksUseful: asset_class set -> useful" { - const c: ClassificationRecord = .{ - .symbol = "FOO", - .asset_class = "Mutual Fund", - .as_of = "2026-05-29", - .source = "wikidata", - }; - try std.testing.expect(wikidataLooksUseful(c)); -} - -test "wikidataLooksUseful: only name set -> NOT useful (SOXX shape)" { - // Wikidata returned an entity but only the label came back. - // No country, no industry, no instance-of. We can't derive - // sector/geo/asset_class from this; the caller falls - // through to the EDGAR fallback. 
- const c: ClassificationRecord = .{ - .symbol = "SOXX", - .name = "PHLX Semiconductor Sector", - .as_of = "2026-05-29", - .source = "wikidata", - }; - try std.testing.expect(!wikidataLooksUseful(c)); -} - -test "wikidataLooksUseful: bare record (only required fields) -> NOT useful" { - const c: ClassificationRecord = .{ - .symbol = "BARE", - .as_of = "2026-05-29", - .source = "wikidata", - }; - try std.testing.expect(!wikidataLooksUseful(c)); -} - -test "wikidataLooksUseful: industry alone (no country/sector/etf flag) -> NOT useful" { - // industry is set but not promoted to sector (canonicalize - // returned null). We don't treat industry-without-sector as - // useful because sector is the actual user-facing field. - const c: ClassificationRecord = .{ - .symbol = "WEIRD", - .industry = "weird esoteric industry", - .as_of = "2026-05-29", - .source = "wikidata", - }; - try std.testing.expect(!wikidataLooksUseful(c)); -} - // ── reportFetchError ──────────────────────────────────────── // // `reportFetchError` writes a user-facing diagnostic to stderr @@ -1394,19 +1065,19 @@ test "reportFetchError: long symbol still classifies correctly (bufPrint fallbac // ── formatProvenanceMessage ──────────────────────────────────── -test "formatProvenanceMessage: managed_fund -> 'EDGAR fund fallback' line" { +test "formatProvenanceMessage: wikidata -> 'classified via Wikidata' line" { var buf: [256]u8 = undefined; - const msg = formatProvenanceMessage(&buf, "FAGIX", .managed_fund, null) orelse return error.Format; - try std.testing.expect(std.mem.indexOf(u8, msg, "FAGIX") != null); - try std.testing.expect(std.mem.indexOf(u8, msg, "EDGAR fund fallback") != null); + const msg = formatProvenanceMessage(&buf, "AAPL", .wikidata, null) orelse return error.Format; + try std.testing.expect(std.mem.indexOf(u8, msg, "AAPL") != null); + try std.testing.expect(std.mem.indexOf(u8, msg, "Wikidata") != null); try std.testing.expect(std.mem.endsWith(u8, msg, "\n")); } -test 
"formatProvenanceMessage: company_or_uit -> 'EDGAR company/UIT fallback' line" { +test "formatProvenanceMessage: edgar_fallback -> 'classified via EDGAR fallback' line" { var buf: [256]u8 = undefined; - const msg = formatProvenanceMessage(&buf, "SPY", .company_or_uit, null) orelse return error.Format; - try std.testing.expect(std.mem.indexOf(u8, msg, "SPY") != null); - try std.testing.expect(std.mem.indexOf(u8, msg, "EDGAR company/UIT fallback") != null); + const msg = formatProvenanceMessage(&buf, "SOXX", .edgar_fallback, null) orelse return error.Format; + try std.testing.expect(std.mem.indexOf(u8, msg, "SOXX") != null); + try std.testing.expect(std.mem.indexOf(u8, msg, "EDGAR fallback") != null); } test "formatProvenanceMessage: none with no error -> 'no Wikidata or EDGAR entry'" { @@ -1428,46 +1099,35 @@ test "formatProvenanceMessage: none with error -> includes error name" { try std.testing.expect(std.mem.indexOf(u8, msg, "Wikidata errored") != null); } -test "formatProvenanceMessage: managed_fund ignores error arg (irrelevant on success path)" { - // If Wikidata errored but EDGAR rescued the symbol, we don't - // surface the Wikidata error name in the breadcrumb — the - // file has a usable line, no action needed from the user. - var buf: [256]u8 = undefined; - const msg = formatProvenanceMessage(&buf, "VBTLX", .managed_fund, error.RateLimited) orelse return error.Format; - try std.testing.expect(std.mem.indexOf(u8, msg, "RateLimited") == null); - try std.testing.expect(std.mem.indexOf(u8, msg, "EDGAR fund fallback") != null); -} - test "formatProvenanceMessage: small buffer returns null (safety valve)" { // 16-byte buffer can't hold any of the message variants. // Should return null rather than crash; caller treats null // as "skip the breadcrumb" rather than panicking. 
var buf: [16]u8 = undefined; - try std.testing.expect(formatProvenanceMessage(&buf, "AAPL", .managed_fund, null) == null); + try std.testing.expect(formatProvenanceMessage(&buf, "AAPL", .edgar_fallback, null) == null); } test "formatProvenanceMessage: messages have leading two-space indent" { // Match the rest of enrich's stderr output (progress // messages, fetch breadcrumbs all use " " prefix). var buf: [256]u8 = undefined; - const msg = formatProvenanceMessage(&buf, "X", .managed_fund, null) orelse return error.Format; + const msg = formatProvenanceMessage(&buf, "X", .edgar_fallback, null) orelse return error.Format; try std.testing.expect(std.mem.startsWith(u8, msg, " ")); } // ── classifyForCounter ──────────────────────────────────────── -test "classifyForCounter: managed_fund -> edgar_fallback regardless of wikidata error" { - // EDGAR rescued the symbol; the file has a usable line; it - // counts as edgar_fallback whether or not Wikidata errored - // upstream. This is the load-bearing fix for the - // line-586 counter bug. 
- try std.testing.expectEqual(SummaryCounter.edgar_fallback, classifyForCounter(.managed_fund, true)); - try std.testing.expectEqual(SummaryCounter.edgar_fallback, classifyForCounter(.managed_fund, false)); +test "classifyForCounter: wikidata -> wikidata_hit regardless of error arg" { + try std.testing.expectEqual(SummaryCounter.wikidata_hit, classifyForCounter(.wikidata, false)); + try std.testing.expectEqual(SummaryCounter.wikidata_hit, classifyForCounter(.wikidata, true)); } -test "classifyForCounter: company_or_uit -> edgar_fallback regardless of wikidata error" { - try std.testing.expectEqual(SummaryCounter.edgar_fallback, classifyForCounter(.company_or_uit, true)); - try std.testing.expectEqual(SummaryCounter.edgar_fallback, classifyForCounter(.company_or_uit, false)); +test "classifyForCounter: edgar_fallback -> edgar_fallback regardless of wikidata error" { + // EDGAR rescued the symbol; the file has a usable line; it + // counts as edgar_fallback whether or not Wikidata errored + // upstream. + try std.testing.expectEqual(SummaryCounter.edgar_fallback, classifyForCounter(.edgar_fallback, true)); + try std.testing.expectEqual(SummaryCounter.edgar_fallback, classifyForCounter(.edgar_fallback, false)); } test "classifyForCounter: none + wikidata errored -> failed (no data anywhere)" { @@ -1488,472 +1148,14 @@ test "classifyForCounter: none + wikidata succeeded but empty -> manual_todo" { test "classifyForCounter: covers all (FallbackKind, bool) input combinations" { // Exhaustive combinator test — locks in the truth table so // any future change to the policy has to update this test. 
- try std.testing.expectEqual(SummaryCounter.edgar_fallback, classifyForCounter(.managed_fund, false)); - try std.testing.expectEqual(SummaryCounter.edgar_fallback, classifyForCounter(.managed_fund, true)); - try std.testing.expectEqual(SummaryCounter.edgar_fallback, classifyForCounter(.company_or_uit, false)); - try std.testing.expectEqual(SummaryCounter.edgar_fallback, classifyForCounter(.company_or_uit, true)); + try std.testing.expectEqual(SummaryCounter.wikidata_hit, classifyForCounter(.wikidata, false)); + try std.testing.expectEqual(SummaryCounter.wikidata_hit, classifyForCounter(.wikidata, true)); + try std.testing.expectEqual(SummaryCounter.edgar_fallback, classifyForCounter(.edgar_fallback, false)); + try std.testing.expectEqual(SummaryCounter.edgar_fallback, classifyForCounter(.edgar_fallback, true)); try std.testing.expectEqual(SummaryCounter.manual_todo, classifyForCounter(.none, false)); try std.testing.expectEqual(SummaryCounter.failed, classifyForCounter(.none, true)); } -test "emitMissingClassification: .managed_fund -> Fund line" { - var out_buf: [512]u8 = undefined; - var out: std.Io.Writer = .fixed(&out_buf); - try emitMissingClassification("FAGIX", .managed_fund, null, null, null, &out); - - const written = out.buffered(); - try std.testing.expect(std.mem.indexOf(u8, written, "EDGAR managed fund") != null); - try std.testing.expect(std.mem.indexOf(u8, written, "symbol::FAGIX,sector::TODO,geo::US,asset_class::Fund") != null); - // No leading `# ` on the data line — it should be the - // canonical metadata line, not a commented-out TODO stub. 
- try std.testing.expect(std.mem.indexOf(u8, written, "# symbol::FAGIX") == null); -} - -test "emitMissingClassification: .managed_fund with sector breakdown -> multi-line" { - var out_buf: [1024]u8 = undefined; - var out: std.Io.Writer = .fixed(&out_buf); - const sectors = [_]FundSector{ - .{ .description = "Equity / Corporate", .pct = 60.5 }, - .{ .description = "Debt / Corporate", .pct = 39.5 }, - }; - try emitMissingClassification("FAGIX", .managed_fund, sectors[0..], null, null, &out); - - const written = out.buffered(); - try std.testing.expect(std.mem.indexOf(u8, written, "EDGAR managed fund") != null); - // Two body lines, one per sector, with pct. - try std.testing.expect(std.mem.indexOf(u8, written, "symbol::FAGIX,sector::Equity / Corporate,geo::US,asset_class::Fund,pct:num:60.50") != null); - try std.testing.expect(std.mem.indexOf(u8, written, "symbol::FAGIX,sector::Debt / Corporate,geo::US,asset_class::Fund,pct:num:39.50") != null); - // No TODO line — sectors are present. - try std.testing.expect(std.mem.indexOf(u8, written, "sector::TODO") == null); -} - -test "emitMissingClassification: .company_or_uit with ETF hint and sector breakdown" { - var out_buf: [1024]u8 = undefined; - var out: std.Io.Writer = .fixed(&out_buf); - const sectors = [_]FundSector{ - .{ .description = "Equity / Corporate", .pct = 100.0 }, - }; - try emitMissingClassification( - "SPY", - .{ .company_or_uit = .{ .title = "SPDR S&P 500 ETF TRUST", .is_etf = true } }, - sectors[0..], - null, - null, - &out, - ); - - const written = out.buffered(); - try std.testing.expect(std.mem.indexOf(u8, written, "SPDR S&P 500 ETF TRUST") != null); - try std.testing.expect(std.mem.indexOf(u8, written, "symbol::SPY,sector::Equity / Corporate,geo::US,asset_class::ETF,pct:num:100.00") != null); - try std.testing.expect(std.mem.indexOf(u8, written, "sector::TODO") == null); -} - -test "emitMissingClassification: .managed_fund with empty sectors slice -> still TODO line" { - // Empty slice (vs null) — 
`loadFundSectors` returns null - // when the fund has no sector records, but defensive check - // for [_]FundSector{} too. - var out_buf: [512]u8 = undefined; - var out: std.Io.Writer = .fixed(&out_buf); - const sectors: [0]FundSector = .{}; - try emitMissingClassification("FAGIX", .managed_fund, sectors[0..], null, null, &out); - - const written = out.buffered(); - try std.testing.expect(std.mem.indexOf(u8, written, "symbol::FAGIX,sector::TODO,geo::US,asset_class::Fund") != null); -} - -test "emitMissingClassification: .company_or_uit with ETF hint -> ETF line" { - var out_buf: [512]u8 = undefined; - var out: std.Io.Writer = .fixed(&out_buf); - try emitMissingClassification( - "SPY", - .{ .company_or_uit = .{ .title = "SPDR S&P 500 ETF TRUST", .is_etf = true } }, - null, - null, - null, - &out, - ); - - const written = out.buffered(); - try std.testing.expect(std.mem.indexOf(u8, written, "SPDR S&P 500 ETF TRUST") != null); - try std.testing.expect(std.mem.indexOf(u8, written, "symbol::SPY,sector::TODO,geo::US,asset_class::ETF") != null); -} - -test "emitMissingClassification: .company_or_uit without ETF hint -> Fund line" { - var out_buf: [512]u8 = undefined; - var out: std.Io.Writer = .fixed(&out_buf); - try emitMissingClassification( - "FOO", - .{ .company_or_uit = .{ .title = "Foo Holdings Inc", .is_etf = false } }, - null, - null, - null, - &out, - ); - - const written = out.buffered(); - try std.testing.expect(std.mem.indexOf(u8, written, "symbol::FOO,sector::TODO,geo::US,asset_class::Fund") != null); -} - -test "emitMissingClassification: .company_or_uit with null title -> generic comment" { - // When the EDGAR company-map row has a CIK but no title - // string, we fall back to a generic "EDGAR company-map - // entry" comment instead of trying to render a null name. 
- var out_buf: [512]u8 = undefined; - var out: std.Io.Writer = .fixed(&out_buf); - try emitMissingClassification( - "BAR", - .{ .company_or_uit = .{ .title = null, .is_etf = false } }, - null, - null, - null, - &out, - ); - - const written = out.buffered(); - try std.testing.expect(std.mem.indexOf(u8, written, "BAR -- EDGAR company-map entry (Wikidata had no entry)") != null); - try std.testing.expect(std.mem.indexOf(u8, written, "symbol::BAR,sector::TODO,geo::US,asset_class::Fund") != null); -} - -test "emitMissingClassification: .company_or_uit with null title + ETF hint" { - var out_buf: [512]u8 = undefined; - var out: std.Io.Writer = .fixed(&out_buf); - try emitMissingClassification( - "BAZ", - .{ .company_or_uit = .{ .title = null, .is_etf = true } }, - null, - null, - null, - &out, - ); - - const written = out.buffered(); - try std.testing.expect(std.mem.indexOf(u8, written, "BAZ -- EDGAR company-map entry") != null); - try std.testing.expect(std.mem.indexOf(u8, written, "symbol::BAZ,sector::TODO,geo::US,asset_class::ETF") != null); -} - -test "emitMissingClassification: .none with error -> all-TODO with error name" { - var out_buf: [512]u8 = undefined; - var out: std.Io.Writer = .fixed(&out_buf); - try emitMissingClassification("MISSING", .none, null, null, error.NotFound, &out); - - const written = out.buffered(); - try std.testing.expect(std.mem.indexOf(u8, written, "fetch failed") != null); - try std.testing.expect(std.mem.indexOf(u8, written, "NotFound") != null); - try std.testing.expect(std.mem.indexOf(u8, written, "# symbol::MISSING,sector::TODO,geo::TODO,asset_class::TODO") != null); -} - -test "emitMissingClassification: .none without error -> 'no entry' message" { - var out_buf: [512]u8 = undefined; - var out: std.Io.Writer = .fixed(&out_buf); - try emitMissingClassification("MISSING", .none, null, null, null, &out); - - const written = out.buffered(); - try std.testing.expect(std.mem.indexOf(u8, written, "no Wikidata or EDGAR entry") != null); - 
try std.testing.expect(std.mem.indexOf(u8, written, "fetch failed") == null); - try std.testing.expect(std.mem.indexOf(u8, written, "# symbol::MISSING,sector::TODO,geo::TODO,asset_class::TODO") != null); -} - -test "emitMissingClassification: .none with sectors arg ignored (sectors only meaningful for funds)" { - var out_buf: [512]u8 = undefined; - var out: std.Io.Writer = .fixed(&out_buf); - const sectors = [_]FundSector{ - .{ .description = "Equity / Corporate", .pct = 100.0 }, - }; - try emitMissingClassification("MISSING", .none, sectors[0..], null, null, &out); - - const written = out.buffered(); - // Sector breakdown is ignored for .none (we don't know if it's - // even a fund); the all-TODO path runs as if no sectors were - // provided. - try std.testing.expect(std.mem.indexOf(u8, written, "# symbol::MISSING,sector::TODO,geo::TODO,asset_class::TODO") != null); - try std.testing.expect(std.mem.indexOf(u8, written, "Equity / Corporate") == null); -} - -// ── series_name plumbing ────────────────────────────────────── -// -// NPORT-P's gives us a real human-readable name -// even when the EDGAR mutual-fund ticker file (which has no -// `title` field) is the only thing that matched. Verify the -// name flows through into the comment line. - -test "emitMissingClassification: .managed_fund with series_name -> name in comment" { - var out_buf: [512]u8 = undefined; - var out: std.Io.Writer = .fixed(&out_buf); - try emitMissingClassification( - "FAGIX", - .managed_fund, - null, - "Fidelity Capital and Income Fund", - null, - &out, - ); - - const written = out.buffered(); - try std.testing.expect(std.mem.indexOf(u8, written, "FAGIX -- Fidelity Capital and Income Fund (Wikidata had no entry)") != null); - // Generic placeholder is suppressed when we have a real name. 
- try std.testing.expect(std.mem.indexOf(u8, written, "EDGAR managed fund") == null); -} - -test "emitMissingClassification: .managed_fund with series_name and sectors -> name + breakdown" { - var out_buf: [1024]u8 = undefined; - var out: std.Io.Writer = .fixed(&out_buf); - const sectors = [_]FundSector{ - .{ .description = "Debt / Corporate", .pct = 47.69 }, - .{ .description = "Equity / Corporate", .pct = 22.49 }, - }; - try emitMissingClassification( - "FAGIX", - .managed_fund, - sectors[0..], - "Fidelity Capital and Income Fund", - null, - &out, - ); - - const written = out.buffered(); - try std.testing.expect(std.mem.indexOf(u8, written, "Fidelity Capital and Income Fund") != null); - try std.testing.expect(std.mem.indexOf(u8, written, "Debt / Corporate") != null); - try std.testing.expect(std.mem.indexOf(u8, written, "Equity / Corporate") != null); -} - -test "emitMissingClassification: .company_or_uit prefers series_name over title" { - // NPORT-P series_name is more authoritative than the - // company_tickers.json title. If both are present, the - // series_name wins. - var out_buf: [512]u8 = undefined; - var out: std.Io.Writer = .fixed(&out_buf); - try emitMissingClassification( - "SPY", - .{ .company_or_uit = .{ .title = "SPDR S&P 500 ETF TRUST", .is_etf = true } }, - null, - "SPDR S&P 500 ETF Trust", - null, - &out, - ); - - const written = out.buffered(); - // Mixed-case "Trust" (NPORT-P) should appear, NOT all-caps - // "TRUST" (company_tickers). - try std.testing.expect(std.mem.indexOf(u8, written, "SPDR S&P 500 ETF Trust (Wikidata had no entry)") != null); - // The all-caps version should NOT appear. - try std.testing.expect(std.mem.indexOf(u8, written, "SPY -- SPDR S&P 500 ETF TRUST") == null); -} - -test "emitMissingClassification: .company_or_uit falls back to title when series_name null" { - // No NPORT-P data; title from company_tickers.json is the - // only name we have. Use it. 
- var out_buf: [512]u8 = undefined; - var out: std.Io.Writer = .fixed(&out_buf); - try emitMissingClassification( - "SPY", - .{ .company_or_uit = .{ .title = "SPDR S&P 500 ETF TRUST", .is_etf = true } }, - null, - null, - null, - &out, - ); - - const written = out.buffered(); - try std.testing.expect(std.mem.indexOf(u8, written, "SPY -- SPDR S&P 500 ETF TRUST (Wikidata had no entry)") != null); -} - -test "emitMissingClassification: .company_or_uit with both title and series_name null -> generic" { - var out_buf: [512]u8 = undefined; - var out: std.Io.Writer = .fixed(&out_buf); - try emitMissingClassification( - "BARE", - .{ .company_or_uit = .{ .title = null, .is_etf = true } }, - null, - null, - null, - &out, - ); - - const written = out.buffered(); - try std.testing.expect(std.mem.indexOf(u8, written, "BARE -- EDGAR company-map entry (Wikidata had no entry)") != null); -} - -test "emitMissingClassification: .none ignores series_name (no fund name to display)" { - // .none means the symbol isn't in EITHER ticker map, so a - // series_name shouldn't even arrive. Defensive check: - // even if the caller mistakenly passes one, the output - // should match the original .none format (fetch failed - // / no Wikidata or EDGAR entry). - var out_buf: [512]u8 = undefined; - var out: std.Io.Writer = .fixed(&out_buf); - try emitMissingClassification("UNKNOWN", .none, null, "Spurious Name", null, &out); - - const written = out.buffered(); - try std.testing.expect(std.mem.indexOf(u8, written, "no Wikidata or EDGAR entry") != null); - // The spurious name should NOT appear in the output. 
- try std.testing.expect(std.mem.indexOf(u8, written, "Spurious Name") == null); -} - -// ── inferSectorFromTitle ───────────────────────────────────── - -test "inferSectorFromTitle: Health Care / Healthcare → Healthcare" { - try std.testing.expectEqualStrings( - "Healthcare", - inferSectorFromTitle("State Street(R) Health Care Select Sector SPDR(R) ETF").?, - ); - try std.testing.expectEqualStrings("Healthcare", inferSectorFromTitle("iShares U.S. Healthcare ETF").?); - try std.testing.expectEqualStrings("Healthcare", inferSectorFromTitle("Vanguard Health Care ETF").?); - try std.testing.expectEqualStrings("Healthcare", inferSectorFromTitle("SPDR S&P Pharmaceutical ETF").?); - try std.testing.expectEqualStrings("Healthcare", inferSectorFromTitle("iShares Biotech ETF").?); -} - -test "inferSectorFromTitle: Semiconductor / Software → Technology" { - try std.testing.expectEqualStrings("Technology", inferSectorFromTitle("iShares Semiconductor ETF").?); - try std.testing.expectEqualStrings("Technology", inferSectorFromTitle("VanEck Semiconductor ETF").?); - try std.testing.expectEqualStrings("Technology", inferSectorFromTitle("Invesco Software ETF").?); - try std.testing.expectEqualStrings("Technology", inferSectorFromTitle("First Trust Cloud Computing ETF").?); -} - -test "inferSectorFromTitle: Financial → Financial Services" { - try std.testing.expectEqualStrings("Financial Services", inferSectorFromTitle("Financial Select Sector SPDR Fund").?); - try std.testing.expectEqualStrings("Financial Services", inferSectorFromTitle("Vanguard Financials ETF").?); - try std.testing.expectEqualStrings("Financial Services", inferSectorFromTitle("SPDR S&P Bank ETF").?); -} - -test "inferSectorFromTitle: Energy → Energy" { - try std.testing.expectEqualStrings("Energy", inferSectorFromTitle("Energy Select Sector SPDR Fund").?); - try std.testing.expectEqualStrings("Energy", inferSectorFromTitle("SPDR S&P Oil & Gas Exploration ETF").?); - try std.testing.expectEqualStrings("Energy", 
inferSectorFromTitle("VanEck Oil and Gas ETF").?); - try std.testing.expectEqualStrings("Energy", inferSectorFromTitle("Invesco Petroleum ETF").?); -} - -test "inferSectorFromTitle: Real Estate / REIT → Real Estate" { - try std.testing.expectEqualStrings("Real Estate", inferSectorFromTitle("Vanguard Real Estate ETF").?); - try std.testing.expectEqualStrings("Real Estate", inferSectorFromTitle("Schwab U.S. REIT ETF").?); -} - -test "inferSectorFromTitle: Utilities → Utilities" { - try std.testing.expectEqualStrings("Utilities", inferSectorFromTitle("Utilities Select Sector SPDR Fund").?); - try std.testing.expectEqualStrings("Utilities", inferSectorFromTitle("Vanguard Utilities ETF").?); -} - -test "inferSectorFromTitle: Industrials and Materials" { - try std.testing.expectEqualStrings("Industrials", inferSectorFromTitle("Industrial Select Sector SPDR Fund").?); - try std.testing.expectEqualStrings("Industrials", inferSectorFromTitle("iShares Aerospace & Defense ETF").?); - try std.testing.expectEqualStrings("Basic Materials", inferSectorFromTitle("Materials Select Sector SPDR Fund").?); - try std.testing.expectEqualStrings("Basic Materials", inferSectorFromTitle("VanEck Gold Miners ETF").?); -} - -test "inferSectorFromTitle: Communication Services" { - try std.testing.expectEqualStrings("Communication Services", inferSectorFromTitle("Communication Services Select Sector SPDR Fund").?); - try std.testing.expectEqualStrings("Communication Services", inferSectorFromTitle("iShares U.S. Telecom ETF").?); -} - -test "inferSectorFromTitle: Consumer Discretionary / Cyclical / Staples / Defensive" { - try std.testing.expectEqualStrings("Consumer Cyclical", inferSectorFromTitle("Consumer Discretionary Select Sector SPDR Fund").?); - try std.testing.expectEqualStrings("Consumer Cyclical", inferSectorFromTitle("iShares U.S. 
Consumer Cyclical ETF").?); - try std.testing.expectEqualStrings("Consumer Defensive", inferSectorFromTitle("Consumer Staples Select Sector SPDR Fund").?); - try std.testing.expectEqualStrings("Consumer Defensive", inferSectorFromTitle("Vanguard Consumer Defensive ETF").?); -} - -test "inferSectorFromTitle: broad-market / strategy funds return null" { - // No sector keyword — falls through. Caller keeps the - // NPORT-P generic Equity / Corporate. - try std.testing.expect(inferSectorFromTitle("SPDR S&P 500 ETF Trust") == null); - try std.testing.expect(inferSectorFromTitle("Vanguard Total Stock Market ETF") == null); - try std.testing.expect(inferSectorFromTitle("Schwab U.S. Dividend Equity ETF") == null); - try std.testing.expect(inferSectorFromTitle("Invesco S&P 500 Equal Weight ETF") == null); - try std.testing.expect(inferSectorFromTitle("Vanguard Total Bond Market Index Fund") == null); - try std.testing.expect(inferSectorFromTitle("Fidelity Capital and Income Fund") == null); -} - -test "inferSectorFromTitle: target-date funds return null (multi-sector)" { - // Target-date funds hold a mix of equity and bonds across - // multiple sectors. No keyword should match. - try std.testing.expect(inferSectorFromTitle("VANGUARD TARGET RETIREMENT 2040 FUND") == null); - try std.testing.expect(inferSectorFromTitle("Fidelity Freedom 2050 Fund") == null); -} - -test "inferSectorFromTitle: null and empty input return null" { - try std.testing.expect(inferSectorFromTitle(null) == null); - try std.testing.expect(inferSectorFromTitle("") == null); -} - -test "inferSectorFromTitle: oversized title returns null safely" { - // Buffer-bounded; titles >256 bytes return null rather - // than crash. Real fund names are far shorter; this is a - // defensive check. - const long_title = "X" ** 300; - try std.testing.expect(inferSectorFromTitle(long_title) == null); -} - -test "inferSectorFromTitle: case-insensitive matching" { - // "HEALTH CARE" (all caps, e.g. 
Vanguard's all-caps style) - // matches the same as "Health Care". - try std.testing.expectEqualStrings("Healthcare", inferSectorFromTitle("VANGUARD HEALTH CARE ETF").?); - try std.testing.expectEqualStrings("Technology", inferSectorFromTitle("ISHARES SEMICONDUCTOR ETF").?); -} - -test "inferSectorFromTitle: returns same pointer for same bucket (static literal)" { - // The taxonomy constants are static literals; multiple - // calls returning the same bucket should hand out the - // same byte pointer. Lets callers compare via ptr equality - // and use the result as a stable HashMap key. - const a = inferSectorFromTitle("Vanguard Health Care ETF").?; - const b = inferSectorFromTitle("iShares U.S. Healthcare ETF").?; - try std.testing.expectEqual(@intFromPtr(a.ptr), @intFromPtr(b.ptr)); -} - -// ── inferGeoFromTitle ──────────────────────────────────────── - -test "inferGeoFromTitle: Emerging Markets → Emerging Markets" { - try std.testing.expectEqualStrings("Emerging Markets", inferGeoFromTitle("Freedom 100 Emerging Markets ETF").?); - try std.testing.expectEqualStrings("Emerging Markets", inferGeoFromTitle("Vanguard FTSE Emerging Markets ETF").?); - try std.testing.expectEqualStrings("Emerging Markets", inferGeoFromTitle("iShares MSCI Emerging Markets ETF").?); -} - -test "inferGeoFromTitle: Frontier Markets → Emerging Markets bucket" { - try std.testing.expectEqualStrings("Emerging Markets", inferGeoFromTitle("iShares MSCI Frontier 100 ETF").?); - try std.testing.expectEqualStrings("Emerging Markets", inferGeoFromTitle("Vanguard Frontier Markets ETF").?); -} - -test "inferGeoFromTitle: International → International Developed" { - try std.testing.expectEqualStrings("International Developed", inferGeoFromTitle("iShares MSCI Intl Value Factor ETF").?); - try std.testing.expectEqualStrings("International Developed", inferGeoFromTitle("Vanguard FTSE Developed Markets ETF").?); - try std.testing.expectEqualStrings("International Developed", inferGeoFromTitle("Invesco S&P 
International Developed Momentum ETF").?); - try std.testing.expectEqualStrings("International Developed", inferGeoFromTitle("NYLI FTSE International Equity Currency Neutral ETF").?); - try std.testing.expectEqualStrings("International Developed", inferGeoFromTitle("Vanguard FTSE All-World ex-US ETF").?); -} - -test "inferGeoFromTitle: US-only / no geo keyword returns null" { - try std.testing.expect(inferGeoFromTitle("SPDR S&P 500 ETF Trust") == null); - try std.testing.expect(inferGeoFromTitle("Vanguard Total Stock Market ETF") == null); - try std.testing.expect(inferGeoFromTitle("Schwab U.S. Dividend Equity ETF") == null); - try std.testing.expect(inferGeoFromTitle("Fidelity Capital and Income Fund") == null); -} - -test "inferGeoFromTitle: Emerging beats International when both present" { - // Defensive: "iShares MSCI International Emerging Markets - // ETF" (hypothetical) would match both branches. Emerging - // Markets is more specific and is checked first; verify - // the priority order holds. 
- try std.testing.expectEqualStrings( - "Emerging Markets", - inferGeoFromTitle("iShares MSCI International Emerging Markets ETF").?, - ); -} - -test "inferGeoFromTitle: null and empty input return null" { - try std.testing.expect(inferGeoFromTitle(null) == null); - try std.testing.expect(inferGeoFromTitle("") == null); -} - -test "inferGeoFromTitle: case-insensitive matching" { - try std.testing.expectEqualStrings("Emerging Markets", inferGeoFromTitle("FREEDOM 100 EMERGING MARKETS ETF").?); - try std.testing.expectEqualStrings("International Developed", inferGeoFromTitle("ISHARES MSCI INTL VALUE FACTOR ETF").?); -} - -test "inferGeoFromTitle: returns static-literal pointers" { - const a = inferGeoFromTitle("iShares Emerging Markets ETF").?; - const b = inferGeoFromTitle("Vanguard Emerging Markets ETF").?; - try std.testing.expectEqual(@intFromPtr(a.ptr), @intFromPtr(b.ptr)); -} - // ── hasDominantEquitySector ────────────────────────────────── test "hasDominantEquitySector: single 99% Equity / Corporate -> true" { @@ -2004,142 +1206,6 @@ test "hasDominantEquitySector: null and empty -> false" { try std.testing.expect(!hasDominantEquitySector(empty[0..])); } -// ── inference integration with emitMissingClassification ───── - -test "emitMissingClassification: XLV-shape applies sector inference (Health Care -> Healthcare)" { - // Single dominant Equity / Corporate (99.76%) AND title - // contains "Health Care" → the Equity row gets replaced - // with Healthcare. Cash sleeve stays as STIV. 
- var out_buf: [1024]u8 = undefined; - var out: std.Io.Writer = .fixed(&out_buf); - const sectors = [_]FundSector{ - .{ .description = "Equity / Corporate", .pct = 99.76 }, - .{ .description = "Short-Term Investment Vehicle / Registered Fund", .pct = 0.45 }, - }; - try emitMissingClassification( - "XLV", - .{ .company_or_uit = .{ .title = "SPDR HEALTH CARE SELECT SECTOR ETF", .is_etf = true } }, - sectors[0..], - "State Street(R) Health Care Select Sector SPDR(R) ETF", - null, - &out, - ); - - const written = out.buffered(); - // The dominant row gets the GICS sector. Note pct preserved. - try std.testing.expect(std.mem.indexOf(u8, written, "symbol::XLV,sector::Healthcare,geo::US,asset_class::ETF,pct:num:99.76") != null); - // Cash sleeve unchanged. - try std.testing.expect(std.mem.indexOf(u8, written, "Short-Term Investment Vehicle / Registered Fund") != null); - // The generic Equity / Corporate row should NOT appear. - try std.testing.expect(std.mem.indexOf(u8, written, "symbol::XLV,sector::Equity / Corporate") == null); -} - -test "emitMissingClassification: FRDM-shape applies geo inference (Emerging Markets)" { - // Title "Emerging Markets" → every row gets geo::Emerging Markets. - // No sector inference (no sector keyword in title). - var out_buf: [1024]u8 = undefined; - var out: std.Io.Writer = .fixed(&out_buf); - const sectors = [_]FundSector{ - .{ .description = "Equity / Corporate", .pct = 99.55 }, - }; - try emitMissingClassification( - "FRDM", - .managed_fund, - sectors[0..], - "Freedom 100 Emerging Markets ETF", - null, - &out, - ); - - const written = out.buffered(); - try std.testing.expect(std.mem.indexOf(u8, written, "geo::Emerging Markets") != null); - // No US geo on this row. 
- try std.testing.expect(std.mem.indexOf(u8, written, "symbol::FRDM,sector::Equity / Corporate,geo::US") == null); -} - -test "emitMissingClassification: multi-asset fund (FAGIX-shape) does NOT apply sector inference" { - // Multi-asset breakdown — no dominant Equity / Corporate - // sleeve. Sector inference should NOT fire even if the - // title had a sector keyword (FAGIX's title doesn't, but - // this guards the multi-asset case generally). - var out_buf: [1024]u8 = undefined; - var out: std.Io.Writer = .fixed(&out_buf); - const sectors = [_]FundSector{ - .{ .description = "Debt / Corporate", .pct = 47.69 }, - .{ .description = "Equity / Corporate", .pct = 22.49 }, - .{ .description = "Loan / Corporate", .pct = 9.99 }, - }; - try emitMissingClassification( - "FAKE", - .managed_fund, - sectors[0..], - // Hypothetical title with a sector keyword the - // inference would normally pick up. - "Hypothetical Healthcare Multi-Asset Fund", - null, - &out, - ); - - const written = out.buffered(); - // Inference should NOT have fired — the Equity / Corporate - // row stays raw. The fund's *comment line* contains the - // word "Healthcare" because the title does, but no - // `sector::Healthcare` row should appear. - try std.testing.expect(std.mem.indexOf(u8, written, "sector::Healthcare") == null); - try std.testing.expect(std.mem.indexOf(u8, written, "symbol::FAKE,sector::Equity / Corporate") != null); - // Other sleeves unchanged. - try std.testing.expect(std.mem.indexOf(u8, written, "symbol::FAKE,sector::Debt / Corporate") != null); - try std.testing.expect(std.mem.indexOf(u8, written, "symbol::FAKE,sector::Loan / Corporate") != null); -} - -test "emitMissingClassification: SCHD-shape (no sector keyword) keeps NPORT-P breakdown" { - // SCHD has dominant Equity / Corporate but no sector - // keyword in its title — inference returns null and the - // raw NPORT-P row stays. (User can hand-edit if they want - // a different label.) 
- var out_buf: [1024]u8 = undefined; - var out: std.Io.Writer = .fixed(&out_buf); - const sectors = [_]FundSector{ - .{ .description = "Equity / Corporate", .pct = 99.70 }, - .{ .description = "Short-Term Investment Vehicle / Registered Fund", .pct = 0.19 }, - }; - try emitMissingClassification( - "SCHD", - .managed_fund, - sectors[0..], - "Schwab U.S. Dividend Equity ETF", - null, - &out, - ); - - const written = out.buffered(); - // No sector inference: keep the raw row. - try std.testing.expect(std.mem.indexOf(u8, written, "symbol::SCHD,sector::Equity / Corporate") != null); -} - -test "emitMissingClassification: combined sector + geo inference" { - // Hypothetical "iShares MSCI Healthcare Emerging Markets - // ETF" — both keywords fire. Healthcare overrides the - // Equity / Corporate row; Emerging Markets overrides the - // geo on every row. - var out_buf: [1024]u8 = undefined; - var out: std.Io.Writer = .fixed(&out_buf); - const sectors = [_]FundSector{ - .{ .description = "Equity / Corporate", .pct = 99.0 }, - }; - try emitMissingClassification( - "FAKE", - .managed_fund, - sectors[0..], - "iShares MSCI Healthcare Emerging Markets ETF", - null, - &out, - ); - - const written = out.buffered(); - try std.testing.expect(std.mem.indexOf(u8, written, "symbol::FAKE,sector::Healthcare,geo::Emerging Markets,asset_class::Fund,pct:num:99.00") != null); -} - test "emitFundLines: null sectors -> single TODO line" { var out_buf: [256]u8 = undefined; var out: std.Io.Writer = .fixed(&out_buf); diff --git a/src/models/classification.zig b/src/models/classification.zig index 148c3d7..57f866a 100644 --- a/src/models/classification.zig +++ b/src/models/classification.zig @@ -129,6 +129,13 @@ pub const ClassificationRecord = struct { industry: ?[]const u8 = null, // owned /// ISO-3166 alpha-2 country code (e.g. "US", "GB", "DE"). country: ?[]const u8 = null, // owned + /// Resolved geo bucket (e.g. "US", "International Developed", + /// "Emerging Markets"). 
Populated either from `geoFor(country)` + /// or from title-keyword inference (`inferGeoFromTitle`) for + /// symbols where Wikidata didn't supply a country. Producers + /// must store an allocator-owned copy of one of the `geo.*` + /// labels above (`deinit` frees this field); consumers compare + /// by content against the same labels. + geo: ?[]const u8 = null, // owned asset_class: ?[]const u8 = null, // owned is_etf: bool = false, /// YYYY-MM-DD; trimmed from upstream's ISO-8601 date. @@ -148,6 +155,7 @@ pub const ClassificationRecord = struct { if (self.sector) |s| allocator.free(s); if (self.industry) |s| allocator.free(s); if (self.country) |s| allocator.free(s); + if (self.geo) |s| allocator.free(s); if (self.asset_class) |s| allocator.free(s); if (self.inception_date) |s| allocator.free(s); if (self.cik) |s| allocator.free(s); @@ -303,3 +311,202 @@ test "geo bucket labels are stable strings (not byte copies)" { try std.testing.expectEqual(@intFromPtr(geo.emerging.ptr), @intFromPtr(geoFor("CN").ptr)); try std.testing.expectEqual(@intFromPtr(geo.unknown.ptr), @intFromPtr(geoFor(null).ptr)); } + +// ── Title-keyword inference ────────────────────────────────── +// +// Pure functions over a fund/security title string. Used by +// `service.getClassification` to populate the sector / geo of a +// `ClassificationRecord` when Wikidata didn't carry one and the +// EDGAR ticker-map fallback fired. Lives here (not in any +// provider) because the inference is provider-agnostic and +// shares the canonical sector/geo taxonomy declared above. + +fn titleContainsAny(haystack: []const u8, needles: []const []const u8) bool { + for (needles) |needle| { + if (std.mem.indexOf(u8, haystack, needle) != null) return true; + } + return false; +} + +/// Lowercase the title into a stack buffer for case-insensitive +/// keyword matching. Titles longer than the buffer are rejected +/// (returns null, no truncation) — real fund names easily fit in +/// 256 bytes. 
+fn lowercaseTitle(buf: []u8, title: []const u8) ?[]const u8 { + if (title.len > buf.len) return null; + return std.ascii.lowerString(buf[0..title.len], title); +} + +/// Infer a GICS sector from a fund's title. Returns null when +/// no unambiguous keyword match — caller falls back to whatever +/// sector data the upstream source provided (typically null). +/// +/// Conservative keyword set: matches only words that map +/// unambiguously to a single GICS sector. "Income" / "Dividend" +/// / "Value" / "Growth" / "Momentum" / "Total" / "Equal Weight" +/// / "International" / "Emerging" don't appear here — they +/// describe the screening methodology or geo, not the sector. +/// +/// Reuses the `sector` constants above so the inference taxonomy +/// stays in lock-step with the canonicalizer. +pub fn inferSectorFromTitle(title: ?[]const u8) ?[]const u8 { + const t = title orelse return null; + if (t.len == 0) return null; + + var buf: [256]u8 = undefined; + const lc = lowercaseTitle(&buf, t) orelse return null; + + // Order matters: more-specific keywords come first within + // each sector. "Health care" before "care" (irrelevant + // example), "semiconductor" before generic "tech" (which we + // don't include — too broad). + + // Healthcare. "Health care" with space (XLV title), "healthcare" + // (one word), "biotech", "pharmaceutical". + if (titleContainsAny(lc, &.{ "health care", "healthcare", "biotech", "pharmaceutical", "medical" })) { + return sector.healthcare; + } + + // Technology. Specific terms only — "tech" alone is too + // broad (matches "biotech", "fintech", "edtech" — all + // sector-mixing). + if (titleContainsAny(lc, &.{ "semiconductor", "software", "cloud computing", "internet" })) { + return sector.technology; + } + + // Financial Services. "Financial" is fairly specific in + // fund-name conventions ("Financial Select Sector SPDR", + // "Vanguard Financials ETF"). 
+ if (titleContainsAny(lc, &.{ "financial", "bank" })) { + return sector.financial_services; + } + + // Energy. "Energy" alone is mostly unambiguous in fund + // conventions; pair with "oil" / "gas" for redundancy. + if (titleContainsAny(lc, &.{ "energy", "oil & gas", "oil and gas", "petroleum" })) { + return sector.energy; + } + + // Real Estate / REITs. + if (titleContainsAny(lc, &.{ "real estate", "reit" })) { + return sector.real_estate; + } + + // Utilities. "Utilities" alone is unambiguous. + if (titleContainsAny(lc, &.{"utilities"})) { + return sector.utilities; + } + + // Consumer Discretionary / Cyclical. Match the explicit + // labels — "consumer" alone is ambiguous (could be + // discretionary or staples). + if (titleContainsAny(lc, &.{ "consumer discretionary", "consumer cyclical" })) { + return sector.consumer_cyclical; + } + + // Consumer Staples / Defensive. + if (titleContainsAny(lc, &.{ "consumer staples", "consumer defensive" })) { + return sector.consumer_defensive; + } + + // Industrials. "Industrial" is more reliable than + // "industrials" because some fund names use the singular + // ("Industrial Select Sector SPDR"). + if (titleContainsAny(lc, &.{ "industrial", "aerospace", "defense" })) { + return sector.industrials; + } + + // Basic Materials. + if (titleContainsAny(lc, &.{ "materials", "mining", "miners", "metals" })) { + return sector.basic_materials; + } + + // Communication Services. "Communication" / "Telecom" + // unambiguous. + if (titleContainsAny(lc, &.{ "communication", "telecom", "media" })) { + return sector.communication_services; + } + + return null; +} + +/// Infer a geo bucket from a fund's title. Returns null when +/// the title doesn't carry an unambiguous international/emerging +/// keyword — caller keeps whatever default they have (typically +/// US for SEC-filed funds). 
+/// +/// More important than sector inference: a default `geo::US` is +/// *factually wrong* for international funds (FRDM holds +/// emerging-market equities, not US), so this fix tightens +/// portfolio-level geographic-exposure reporting. +pub fn inferGeoFromTitle(title: ?[]const u8) ?[]const u8 { + const t = title orelse return null; + if (t.len == 0) return null; + + var buf: [256]u8 = undefined; + const lc = lowercaseTitle(&buf, t) orelse return null; + + // Emerging markets first — most specific. "Emerging" alone + // is rare in non-EM contexts in fund-name conventions. + // "Frontier" likewise is conventionally only used for + // frontier markets in fund titles. + if (titleContainsAny(lc, &.{ "emerging market", "emerging markets", "frontier market", "frontier markets", "frontier" })) { + return geo.emerging; + } + + // International Developed. "International" / "Intl" / + // "ex-US" / "World ex US" / "Developed Markets" / + // specific developed-market regions. + // + // False-positive risk: a hypothetical "Vanguard Total + // International + US Equity Fund" would mis-tag here. None + // of the user's current portfolio holds such a hybrid + // fund. If one ever shows up, it'll get flagged in the + // diff-against-old-metadata.srf review and can be + // hand-corrected. 
+ if (titleContainsAny(lc, &.{ "international", " intl", "ex-us", "ex us", "world ex", "developed market", "developed markets" })) { + return geo.developed; + } + + return null; +} + +test "inferSectorFromTitle: null/empty -> null" { + try std.testing.expectEqual(@as(?[]const u8, null), inferSectorFromTitle(null)); + try std.testing.expectEqual(@as(?[]const u8, null), inferSectorFromTitle("")); +} + +test "inferSectorFromTitle: technology keywords" { + try std.testing.expectEqual(@as(?[]const u8, sector.technology), inferSectorFromTitle("iShares Semiconductor ETF")); + try std.testing.expectEqual(@as(?[]const u8, sector.technology), inferSectorFromTitle("Vanguard Software ETF")); +} + +test "inferSectorFromTitle: healthcare keywords" { + try std.testing.expectEqual(@as(?[]const u8, sector.healthcare), inferSectorFromTitle("Health Care Select Sector SPDR")); + try std.testing.expectEqual(@as(?[]const u8, sector.healthcare), inferSectorFromTitle("iShares Biotech ETF")); +} + +test "inferSectorFromTitle: ambiguous title -> null" { + try std.testing.expectEqual(@as(?[]const u8, null), inferSectorFromTitle("Vanguard Total Stock Market")); + try std.testing.expectEqual(@as(?[]const u8, null), inferSectorFromTitle("SPDR S&P 500")); +} + +test "inferGeoFromTitle: null/empty -> null" { + try std.testing.expectEqual(@as(?[]const u8, null), inferGeoFromTitle(null)); + try std.testing.expectEqual(@as(?[]const u8, null), inferGeoFromTitle("")); +} + +test "inferGeoFromTitle: emerging markets" { + try std.testing.expectEqual(@as(?[]const u8, geo.emerging), inferGeoFromTitle("Freedom 100 Emerging Markets ETF")); + try std.testing.expectEqual(@as(?[]const u8, geo.emerging), inferGeoFromTitle("iShares Frontier Markets")); +} + +test "inferGeoFromTitle: international developed" { + try std.testing.expectEqual(@as(?[]const u8, geo.developed), inferGeoFromTitle("Vanguard FTSE Developed Markets")); + try std.testing.expectEqual(@as(?[]const u8, geo.developed), 
inferGeoFromTitle("iShares MSCI EAFE International")); + try std.testing.expectEqual(@as(?[]const u8, geo.developed), inferGeoFromTitle("Vanguard Total World ex-US")); +} + +test "inferGeoFromTitle: no match -> null" { + try std.testing.expectEqual(@as(?[]const u8, null), inferGeoFromTitle("SPDR S&P 500")); + try std.testing.expectEqual(@as(?[]const u8, null), inferGeoFromTitle("iShares Semiconductor ETF")); +} diff --git a/src/service.zig b/src/service.zig index 0d96197..6908af0 100644 --- a/src/service.zig +++ b/src/service.zig @@ -35,6 +35,7 @@ const Yahoo = @import("providers/yahoo.zig").Yahoo; const Tiingo = @import("providers/tiingo.zig").Tiingo; const Wikidata = @import("providers/Wikidata.zig"); const Edgar = @import("providers/Edgar.zig"); +const classification = @import("models/classification.zig"); const fmt = @import("format.zig"); const performance = @import("analytics/performance.zig"); const http = @import("net/http.zig"); @@ -1037,9 +1038,9 @@ pub const DataService = struct { // higher-quality name. Best-effort: if the fetch fails we // still return the EDGAR-only profile. var inception_date: ?Date = null; - if (self.getClassification(symbol, opts)) |classification| { - defer classification.deinit(); - for (classification.data) |c| { + if (self.getClassification(symbol, opts)) |class_result| { + defer class_result.deinit(); + for (class_result.data) |c| { if (c.inception_date) |idate_str| { if (Date.parse(idate_str)) |d| inception_date = d else |_| {} } @@ -1092,11 +1093,6 @@ pub const DataService = struct { /// `opts.skip_network = true` returns cached data even if stale, /// `FetchFailed` on cache miss. `opts.force_refresh = true` /// ignores the cache and re-fetches. - /// - /// Callers fetching classifications for many symbols should use - /// `getClassifications(symbols)` instead — Wikidata's SPARQL API - /// is naturally batched, and one query for N symbols is much - /// cheaper than N queries for 1 symbol each. 
pub fn getClassification(self: *DataService, symbol: []const u8, opts: FetchOptions) DataError!FetchResult(Wikidata.ClassificationRecord) { var s = self.store(); @@ -1131,124 +1127,282 @@ pub const DataService = struct { const fetched = wd.fetch(self.allocator, &symbols) catch |err| { if (err == error.RateLimited) { self.rateLimitBackoff(); - break_blk: { - const retried = wd.fetch(self.allocator, &symbols) catch break :break_blk; - if (retried.len > 0) { - s.write(Wikidata.ClassificationRecord, symbol, retried, cache.DataType.classification.ttl()); - return .{ .data = retried, .source = .fetched, .timestamp = std.Io.Timestamp.now(self.io, .real).toSeconds(), .allocator = self.allocator }; - } - self.allocator.free(retried); - } + if (wd.fetch(self.allocator, &symbols)) |retried| { + return self.finalizeClassification(symbol, retried, opts); + } else |_| {} } log.warn("{s}: wikidata fetch failed: {s}", .{ symbol, @errorName(err) }); return DataError.FetchFailed; }; - if (fetched.len == 0) { - self.allocator.free(fetched); - // Wikidata had no row for this symbol. Negative-cache to - // suppress retries until the user explicitly refreshes. - s.writeNegative(symbol, .classification); - return DataError.NotFound; - } - - s.write(Wikidata.ClassificationRecord, symbol, fetched, cache.DataType.classification.ttl()); - - return .{ .data = fetched, .source = .fetched, .timestamp = std.Io.Timestamp.now(self.io, .real).toSeconds(), .allocator = self.allocator }; + return self.finalizeClassification(symbol, fetched, opts); } - /// Batched classification fetch. Wikidata's SPARQL API takes a - /// `VALUES ?ticker { ... }` set in one query; this method runs - /// that query for the requested set, splits the response into - /// per-symbol cache writes, and returns the slice. Symbols not - /// in Wikidata are silently dropped from the result (the user- - /// facing cache for them gets a negative entry). 
+ /// Common post-Wikidata path: decide if the result is useful as + /// returned, otherwise consult EDGAR to fill in the gaps, + /// otherwise negative-cache. Either way the cache gets written + /// and a `FetchResult` is returned (or `DataError.NotFound`). /// - /// The cache is consulted first per-symbol; only the symbols - /// that miss the cache (or are stale) are passed to the SPARQL - /// query. This minimizes the upstream load when most symbols - /// were already classified in a prior run. - pub fn getClassifications( + /// Takes ownership of `wikidata_records`. The slice is either + /// returned as the result data, freed and replaced by a + /// synthesized slice, or freed and the symbol negative-cached. + fn finalizeClassification( self: *DataService, - result_allocator: std.mem.Allocator, - symbols: []const []const u8, + symbol: []const u8, + wikidata_records: []Wikidata.ClassificationRecord, opts: FetchOptions, - ) DataError![]Wikidata.ClassificationRecord { - if (symbols.len == 0) return &.{}; + ) DataError!FetchResult(Wikidata.ClassificationRecord) { var s = self.store(); + const ttl = cache.DataType.classification.ttl(); - // Identify cache misses. - var to_fetch: std.ArrayList([]const u8) = .empty; - defer to_fetch.deinit(self.allocator); - var cached_records: std.ArrayList(Wikidata.ClassificationRecord) = .empty; - errdefer { - for (cached_records.items) |*r| { - var m = r.*; - m.deinit(self.allocator); + // Wikidata returned a useful row -> populate geo from + // geoFor(country) and cache as-is. + if (wikidata_records.len > 0 and wikidataLooksUseful(wikidata_records[0])) { + try self.populateGeo(&wikidata_records[0]); + s.write(Wikidata.ClassificationRecord, symbol, wikidata_records, ttl); + return .{ .data = wikidata_records, .source = .fetched, .timestamp = std.Io.Timestamp.now(self.io, .real).toSeconds(), .allocator = self.allocator }; + } + + // Sparse or empty: try EDGAR fallback. 
`synthesizeClassification` + // takes ownership of the wikidata slice (frees it, returns a + // new one-element slice with the merged record). Returns + // `error.NotFound` when even EDGAR has nothing. + const merged = self.synthesizeClassification(symbol, wikidata_records, opts) catch |err| { + if (err == error.NotFound) { + s.writeNegative(symbol, .classification); + return DataError.NotFound; } - cached_records.deinit(self.allocator); - } - - for (symbols) |sym| { - if (!opts.force_refresh) { - if (s.read(Wikidata.ClassificationRecord, sym, null, .fresh_only)) |cached| { - // The on-disk shape is a length-1 slice. - if (cached.data.len > 0) { - try cached_records.append(self.allocator, cached.data[0]); - // Free the rest if any (shouldn't happen for - // per-symbol classification, but defensive). - for (cached.data[1..]) |*r| { - var m = r.*; - m.deinit(self.allocator); - } - self.allocator.free(cached.data); - continue; - } - self.allocator.free(cached.data); - } - } - try to_fetch.append(self.allocator, sym); - } - - if (to_fetch.items.len == 0) { - // All cached — assemble result from cached_records. - const out = try result_allocator.alloc(Wikidata.ClassificationRecord, cached_records.items.len); - @memcpy(out, cached_records.items); - cached_records.clearRetainingCapacity(); - return out; - } - - if (opts.skip_network) { - // Offline mode: return what we have from cache. 
- const out = try result_allocator.alloc(Wikidata.ClassificationRecord, cached_records.items.len); - @memcpy(out, cached_records.items); - cached_records.clearRetainingCapacity(); - return out; - } - - log.debug("fetching {d} classifications from Wikidata", .{to_fetch.items.len}); - self.assertNetworkAllowed("getClassifications wikidata.fetch"); - var wd = try self.getProvider(Wikidata); - - const fetched = wd.fetch(self.allocator, to_fetch.items) catch |err| { - log.warn("wikidata batch fetch failed: {s}", .{@errorName(err)}); return DataError.FetchFailed; }; - defer self.allocator.free(fetched); - // Write each fetched record to its per-symbol cache file. - for (fetched) |rec| { - const single = [_]Wikidata.ClassificationRecord{rec}; - s.write(Wikidata.ClassificationRecord, rec.symbol, &single, cache.DataType.classification.ttl()); + s.write(Wikidata.ClassificationRecord, symbol, merged, ttl); + return .{ .data = merged, .source = .fetched, .timestamp = std.Io.Timestamp.now(self.io, .real).toSeconds(), .allocator = self.allocator }; + } + + /// Populate `record.geo` from `geoFor(record.country)` when it + /// isn't already set. Best-effort: if duping the geo string + /// fails, leaves the field null and propagates the error so the + /// caller can decide whether to bail. + fn populateGeo(self: *DataService, record: *Wikidata.ClassificationRecord) !void { + if (record.geo != null) return; + const country = record.country orelse return; + const g = classification.geoFor(country); + if (std.mem.eql(u8, g, classification.geo.unknown)) return; + record.geo = try self.allocator.dupe(u8, g); + } + + /// Whether a Wikidata classification record carries enough + /// downstream-usable data to skip the EDGAR fallback. A record + /// with at least one of `is_etf`, `sector`, `country`, or + /// `asset_class` set is "useful"; sparse records (e.g. 
SOXX + /// getting only a `name` from Wikidata) need the EDGAR + /// ticker-map fallback to fill in `is_etf=true, + /// asset_class=ETF, country=US`. + fn wikidataLooksUseful(c: Wikidata.ClassificationRecord) bool { + if (c.is_etf) return true; + if (c.asset_class != null) return true; + if (c.country != null) return true; + if (c.sector != null) return true; + return false; + } + + /// Synthesize a `ClassificationRecord` for a symbol that + /// Wikidata couldn't classify usefully. Consults the EDGAR + /// ticker maps; if found, also fetches `getEtfMetrics` to + /// recover the NPORT-P series_name (more authoritative than + /// the company_tickers title). Title-keyword inference fills + /// in `sector` and `geo` when the name carries an unambiguous + /// keyword. + /// + /// Takes ownership of `wikidata_records`: frees them at exit. + /// Wikidata's `industry`/`inception_date`/`cik` are kept when + /// present; its `name` is used only if EDGAR supplies none. + /// Returns `error.NotFound` when EDGAR has nothing either. + fn synthesizeClassification( + self: *DataService, + symbol: []const u8, + wikidata_records: []Wikidata.ClassificationRecord, + opts: FetchOptions, + ) !cache.Store.DataFor(Wikidata.ClassificationRecord) { + defer Wikidata.ClassificationRecord.freeSlice(self.allocator, wikidata_records); + + const lookup = self.lookupEdgarFallback(symbol, opts); + defer freeEdgarLookup(self.allocator, lookup); + if (lookup == .none) return error.NotFound; + + // For ETF/fund hits, try to get the richer series_name from + // NPORT-P. Cache hit is cheap; cache miss triggers an EDGAR + // fetch but is bounded by EDGAR's rate limiter. If the call + // fails (e.g. money-market funds with no NPORT-P), we fall + // back to the ticker-map title.
+ var etf_metrics_result: ?FetchResult(Edgar.EtfMetricRecord) = null; + defer if (etf_metrics_result) |*r| r.deinit(); + etf_metrics_result = self.getEtfMetrics(symbol, opts) catch null; + + // Extract series_name and cik from the etf_metrics profile row. + var series_name: ?[]const u8 = null; + var etf_cik: ?[]const u8 = null; + if (etf_metrics_result) |r| { + for (r.data) |rec| switch (rec) { + .profile => |p| { + if (p.series_name) |sn| series_name = sn; + etf_cik = p.cik; + break; + }, + else => {}, + }; } - // Combine cached + fetched into the result. - const total = cached_records.items.len + fetched.len; - const out = try result_allocator.alloc(Wikidata.ClassificationRecord, total); - @memcpy(out[0..cached_records.items.len], cached_records.items); - @memcpy(out[cached_records.items.len..], fetched); - cached_records.clearRetainingCapacity(); - return out; + // Pull whatever Wikidata's sparse record carried so we + // don't lose data on the merge. + const wd: ?Wikidata.ClassificationRecord = if (wikidata_records.len > 0) wikidata_records[0] else null; + + // Pick the best name source: NPORT-P series_name > + // EDGAR ticker-map title > Wikidata name > nothing. + // + // We're on the EDGAR-fallback path because Wikidata's + // record was sparse. For funds, Wikidata's `name` (when + // present) is frequently the underlying INDEX rather than + // the FUND itself -- e.g. SOXX's Wikidata `name` is "PHLX + // Semiconductor Sector" but the fund is "iShares + // Semiconductor ETF" per NPORT-P seriesName. Prefer the + // fund-authoritative source so downstream comments and + // labels show the fund name, not the index name. 
+ const ticker_title: ?[]const u8 = switch (lookup) { + .company_or_uit => |c| c.title, + else => null, + }; + const best_name: ?[]const u8 = blk: { + if (series_name) |n| break :blk n; + if (ticker_title) |n| break :blk n; + if (wd) |w| { + if (w.name) |n| break :blk n; + } + break :blk null; + }; + + // Name source for title-keyword inference: prefer the + // most-authoritative source for fund-style classification + // even when Wikidata supplied a (different) name. Wikidata's + // name for a fund is often less informative than NPORT-P's + // seriesName (e.g. SOXX's Wikidata name is "PHLX + // Semiconductor Sector" which is the index name, not the + // fund name). + const inference_name: ?[]const u8 = series_name orelse ticker_title orelse if (wd) |w| w.name else null; + + const inferred_sector = classification.inferSectorFromTitle(inference_name); + const inferred_geo = classification.inferGeoFromTitle(inference_name); + + // `is_etf` here means "this is fund-shaped, emit multi-row + // breakdown" -- true for ANY EDGAR-found symbol. The + // `tickers_funds.srf` map mixes mutual funds and + // series-of-trust ETFs alike. The `tickers_companies.srf` + // map carries operating companies, closed-end funds, and + // UITs; operating companies usually have Wikidata coverage + // and wouldn't reach this fallback, so anything that + // dropped here is also fund-shaped (e.g. PIMCO closed-end + // funds whose title says "FUND" but not "ETF" or "TRUST"). + // + // The ETF/TRUST keyword in the title still drives the + // asset_class label below ("ETF" vs "Fund"), but the + // fund-shaped routing decision applies regardless. + const is_etf = true; + const asset_class: []const u8 = switch (lookup) { + .managed_fund => "Fund", + .company_or_uit => |c| if (c.is_etf) "ETF" else "Fund", + .none => unreachable, + }; + + // Country: prefer Wikidata's. Default to "US" for + // EDGAR-found symbols (they're SEC filers). 
+ const country_str: []const u8 = if (wd) |w| (w.country orelse "US") else "US"; + + // Sector: prefer Wikidata's existing sector (rare in this + // sparse-fallback path), else fall back to inferred. + const sector_str: ?[]const u8 = blk: { + if (wd) |w| { + if (w.sector) |sec| break :blk sec; + } + break :blk inferred_sector; + }; + + // CIK: prefer Wikidata's, fall back to NPORT-P's. + const cik_str: ?[]const u8 = blk: { + if (wd) |w| { + if (w.cik) |c| break :blk c; + } + if (etf_cik) |c| break :blk c; + break :blk null; + }; + + // Geo: prefer the Wikidata-derived geo (computed from + // `geoFor(country)` against the country code), else use + // title-keyword inference. Default to "US" when neither + // is available -- EDGAR-found symbols are SEC filers. + const geo_str: []const u8 = blk: { + if (wd) |w| { + if (w.country) |c| { + const g = classification.geoFor(c); + if (!std.mem.eql(u8, g, classification.geo.unknown)) break :blk g; + } + } + if (inferred_geo) |g| break :blk g; + break :blk classification.geo.us; + }; + + const today = fmt.todayDate(self.io); + var as_of_buf: [10]u8 = undefined; + const as_of_str = try std.fmt.bufPrint(&as_of_buf, "{f}", .{today}); + + // Allocate each owned field up front with its own errdefer + // so a partial-build on OOM doesn't leak the earlier + // successful dupes. The result-slice alloc is the last + // fallible op; assembling the record after it cannot fail.
+ const symbol_owned = try self.allocator.dupe(u8, symbol); + errdefer self.allocator.free(symbol_owned); + const name_owned: ?[]const u8 = if (best_name) |n| try self.allocator.dupe(u8, n) else null; + errdefer if (name_owned) |s| self.allocator.free(s); + const sector_owned: ?[]const u8 = if (sector_str) |s| try self.allocator.dupe(u8, s) else null; + errdefer if (sector_owned) |s| self.allocator.free(s); + const industry_owned: ?[]const u8 = if (wd) |w| + (if (w.industry) |i| try self.allocator.dupe(u8, i) else null) + else + null; + errdefer if (industry_owned) |s| self.allocator.free(s); + const country_owned = try self.allocator.dupe(u8, country_str); + errdefer self.allocator.free(country_owned); + const geo_owned = try self.allocator.dupe(u8, geo_str); + errdefer self.allocator.free(geo_owned); + const asset_class_owned = try self.allocator.dupe(u8, asset_class); + errdefer self.allocator.free(asset_class_owned); + const inception_owned: ?[]const u8 = if (wd) |w| + (if (w.inception_date) |i| try self.allocator.dupe(u8, i) else null) + else + null; + errdefer if (inception_owned) |s| self.allocator.free(s); + const cik_owned: ?[]const u8 = if (cik_str) |c| try self.allocator.dupe(u8, c) else null; + errdefer if (cik_owned) |s| self.allocator.free(s); + const as_of_owned = try self.allocator.dupe(u8, as_of_str); + errdefer self.allocator.free(as_of_owned); + const source_owned = try self.allocator.dupe(u8, "edgar_fallback"); + errdefer self.allocator.free(source_owned); + + const result = try self.allocator.alloc(Wikidata.ClassificationRecord, 1); + result[0] = .{ + .symbol = symbol_owned, + .name = name_owned, + .sector = sector_owned, + .industry = industry_owned, + .country = country_owned, + .geo = geo_owned, + .asset_class = asset_class_owned, + .is_etf = is_etf, + .inception_date = inception_owned, + .cik = cik_owned, + .as_of = as_of_owned, + .source = source_owned, + }; + return result; } /// Fetch XBRL-derived entity facts for a CIK (currently @@ 
-2955,6 +3109,326 @@ test "getClassification: cache hit returns cached data without network" { try std.testing.expectEqual(Source.cached, result.source); } +test "populateGeo: country US -> geo US" { + const allocator = std.testing.allocator; + const io = std.testing.io; + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + const dir_path = try tmp.dir.realPathFileAlloc(io, ".", allocator); + defer allocator.free(dir_path); + + const config = Config{ .cache_dir = dir_path }; + var svc = DataService.init(io, allocator, config); + defer svc.deinit(); + + var record: Wikidata.ClassificationRecord = .{ + .symbol = try allocator.dupe(u8, "TEST"), + .country = try allocator.dupe(u8, "US"), + .as_of = try allocator.dupe(u8, "2026-06-01"), + .source = try allocator.dupe(u8, "wikidata"), + }; + defer record.deinit(allocator); + + try svc.populateGeo(&record); + try std.testing.expect(record.geo != null); + try std.testing.expectEqualStrings("US", record.geo.?); +} + +test "populateGeo: country GB -> geo International Developed" { + const allocator = std.testing.allocator; + const io = std.testing.io; + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + const dir_path = try tmp.dir.realPathFileAlloc(io, ".", allocator); + defer allocator.free(dir_path); + + const config = Config{ .cache_dir = dir_path }; + var svc = DataService.init(io, allocator, config); + defer svc.deinit(); + + var record: Wikidata.ClassificationRecord = .{ + .symbol = try allocator.dupe(u8, "TEST"), + .country = try allocator.dupe(u8, "GB"), + .as_of = try allocator.dupe(u8, "2026-06-01"), + .source = try allocator.dupe(u8, "wikidata"), + }; + defer record.deinit(allocator); + + try svc.populateGeo(&record); + try std.testing.expect(record.geo != null); + try std.testing.expectEqualStrings("International Developed", record.geo.?); +} + +test "populateGeo: null country -> noop" { + const allocator = std.testing.allocator; + const io = std.testing.io; + var tmp = std.testing.tmpDir(.{}); 
+ defer tmp.cleanup(); + const dir_path = try tmp.dir.realPathFileAlloc(io, ".", allocator); + defer allocator.free(dir_path); + + const config = Config{ .cache_dir = dir_path }; + var svc = DataService.init(io, allocator, config); + defer svc.deinit(); + + var record: Wikidata.ClassificationRecord = .{ + .symbol = try allocator.dupe(u8, "TEST"), + .as_of = try allocator.dupe(u8, "2026-06-01"), + .source = try allocator.dupe(u8, "wikidata"), + }; + defer record.deinit(allocator); + + try svc.populateGeo(&record); + try std.testing.expectEqual(@as(?[]const u8, null), record.geo); +} + +test "populateGeo: existing geo not overwritten" { + const allocator = std.testing.allocator; + const io = std.testing.io; + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + const dir_path = try tmp.dir.realPathFileAlloc(io, ".", allocator); + defer allocator.free(dir_path); + + const config = Config{ .cache_dir = dir_path }; + var svc = DataService.init(io, allocator, config); + defer svc.deinit(); + + var record: Wikidata.ClassificationRecord = .{ + .symbol = try allocator.dupe(u8, "TEST"), + .country = try allocator.dupe(u8, "US"), + .geo = try allocator.dupe(u8, "Already Set"), + .as_of = try allocator.dupe(u8, "2026-06-01"), + .source = try allocator.dupe(u8, "wikidata"), + }; + defer record.deinit(allocator); + + try svc.populateGeo(&record); + try std.testing.expectEqualStrings("Already Set", record.geo.?); +} + +test "getClassification: sparse Wikidata + EDGAR managed_fund hit produces merged record" { + const allocator = std.testing.allocator; + const io = std.testing.io; + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + const dir_path = try tmp.dir.realPathFileAlloc(io, ".", allocator); + defer allocator.free(dir_path); + + const config = Config{ .cache_dir = dir_path }; + var svc = DataService.init(io, allocator, config); + defer svc.deinit(); + + // Seed both EDGAR ticker map caches with at least one entry + // each so the synthesizeClassification 
path doesn't try to + // fetch them (the load helpers treat empty cached slices as + // "miss" and fall through to a network fetch). + var s = svc.store(); + var mf_entries = [_]Edgar.MutualFundTickerEntry{.{ + .symbol = "FAGIX", + .cik = "0000275309", + }}; + s.write(Edgar.MutualFundTickerEntry, "_edgar", mf_entries[0..], cache.DataType.tickers_funds.ttl()); + var co_entries = [_]Edgar.CompanyTickerEntry{.{ + .symbol = "DUMMY", + .cik = "0000000001", + }}; + s.write(Edgar.CompanyTickerEntry, "_edgar", co_entries[0..], cache.DataType.tickers_companies.ttl()); + + // Seed an etf_metrics negative cache so getEtfMetrics doesn't + // try to fetch from the network. + s.writeNegative("FAGIX", .etf_metrics); + + // Sparse Wikidata records (length 1, only name set -- not useful). + var sparse = try allocator.alloc(Wikidata.ClassificationRecord, 1); + sparse[0] = .{ + .symbol = try allocator.dupe(u8, "FAGIX"), + .name = try allocator.dupe(u8, "Test Fund"), + .as_of = try allocator.dupe(u8, "2026-06-01"), + .source = try allocator.dupe(u8, "wikidata"), + }; + + // Drive directly through synthesizeClassification (skip the + // Wikidata fetch). It takes ownership of `sparse`. + svc.panic_on_network_attempt = true; // any provider call -> panic + const merged = try svc.synthesizeClassification("FAGIX", sparse, .{ .skip_network = true }); + defer Wikidata.ClassificationRecord.freeSlice(allocator, merged); + + try std.testing.expectEqual(@as(usize, 1), merged.len); + const c = merged[0]; + try std.testing.expectEqualStrings("FAGIX", c.symbol); + try std.testing.expect(c.is_etf); + try std.testing.expectEqualStrings("Fund", c.asset_class.?); + try std.testing.expectEqualStrings("US", c.country.?); + try std.testing.expectEqualStrings("US", c.geo.?); + try std.testing.expectEqualStrings("edgar_fallback", c.source); + // Wikidata's name preserved on merge. 
+ try std.testing.expectEqualStrings("Test Fund", c.name.?); +} + +test "synthesizeClassification: no EDGAR hit returns NotFound" { + const allocator = std.testing.allocator; + const io = std.testing.io; + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + const dir_path = try tmp.dir.realPathFileAlloc(io, ".", allocator); + defer allocator.free(dir_path); + + const config = Config{ .cache_dir = dir_path }; + var svc = DataService.init(io, allocator, config); + defer svc.deinit(); + + // Seed both ticker maps with throwaway entries so the + // EDGAR lookup returns .none for our test symbol but doesn't + // try to fetch the maps from the network. + var s = svc.store(); + var mf_entries = [_]Edgar.MutualFundTickerEntry{.{ + .symbol = "DUMMY1", + .cik = "0000000001", + }}; + s.write(Edgar.MutualFundTickerEntry, "_edgar", mf_entries[0..], cache.DataType.tickers_funds.ttl()); + var co_entries = [_]Edgar.CompanyTickerEntry{.{ + .symbol = "DUMMY2", + .cik = "0000000002", + }}; + s.write(Edgar.CompanyTickerEntry, "_edgar", co_entries[0..], cache.DataType.tickers_companies.ttl()); + + var sparse = try allocator.alloc(Wikidata.ClassificationRecord, 1); + sparse[0] = .{ + .symbol = try allocator.dupe(u8, "NEVERHEARDOFIT"), + .name = try allocator.dupe(u8, "ghost"), + .as_of = try allocator.dupe(u8, "2026-06-01"), + .source = try allocator.dupe(u8, "wikidata"), + }; + + svc.panic_on_network_attempt = true; + try std.testing.expectError(error.NotFound, svc.synthesizeClassification("NEVERHEARDOFIT", sparse, .{ .skip_network = true })); +} + +test "synthesizeClassification: company_or_uit without ETF/TRUST keyword still routes to multi-row" { + // PTY shape: closed-end fund whose company_tickers title is + // "PIMCO CORPORATE & INCOME OPPORTUNITY FUND" -- no "ETF" or + // "TRUST" in the title, so lookupInTickerMaps returns + // .company_or_uit{is_etf=false}. But it's still fund-shaped + // and should produce multi-row metadata in enrich. 
+ // + // The downstream signal for "fund-like, emit multi-row" is + // ClassificationRecord.is_etf. The service must set it to true for any + // EDGAR-found .company_or_uit hit (even when the title + // doesn't carry the ETF/TRUST keyword), so PTY-shape + // closed-end funds get the same treatment as ETFs. + const allocator = std.testing.allocator; + const io = std.testing.io; + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + const dir_path = try tmp.dir.realPathFileAlloc(io, ".", allocator); + defer allocator.free(dir_path); + + const config = Config{ .cache_dir = dir_path }; + var svc = DataService.init(io, allocator, config); + defer svc.deinit(); + + var s = svc.store(); + // Throwaway MF entry so the MF lookup returns null. + var mf_entries = [_]Edgar.MutualFundTickerEntry{.{ + .symbol = "DUMMY", + .cik = "0000000001", + }}; + s.write(Edgar.MutualFundTickerEntry, "_edgar", mf_entries[0..], cache.DataType.tickers_funds.ttl()); + // PTY in the company map with NO ETF/TRUST in title. + var co_entries = [_]Edgar.CompanyTickerEntry{.{ + .symbol = "PTY", + .cik = "0001202604", + .title = "PIMCO CORPORATE & INCOME OPPORTUNITY FUND", + }}; + s.write(Edgar.CompanyTickerEntry, "_edgar", co_entries[0..], cache.DataType.tickers_companies.ttl()); + s.writeNegative("PTY", .etf_metrics); + + var sparse = try allocator.alloc(Wikidata.ClassificationRecord, 1); + sparse[0] = .{ + .symbol = try allocator.dupe(u8, "PTY"), + .name = try allocator.dupe(u8, "PIMCO Corporate & Income Opportunity Fund"), + .as_of = try allocator.dupe(u8, "2026-06-01"), + .source = try allocator.dupe(u8, "wikidata"), + }; + + svc.panic_on_network_attempt = true; + const merged = try svc.synthesizeClassification("PTY", sparse, .{ .skip_network = true }); + defer Wikidata.ClassificationRecord.freeSlice(allocator, merged); + + try std.testing.expectEqual(@as(usize, 1), merged.len); + const c = merged[0]; + // is_etf MUST be true so enrich routes through emitEtfRows + // (multi-row sleeve breakdown). 
The asset_class stays "Fund" + // because the title has no ETF/TRUST keyword. + try std.testing.expect(c.is_etf); + try std.testing.expectEqualStrings("Fund", c.asset_class.?); +} + +test "synthesizeClassification: NPORT-P series_name beats Wikidata's index name for funds" { + // SOXX shape: Wikidata returns the underlying INDEX name + // ("PHLX Semiconductor Sector") which is technically what the + // ticker symbol is for, but downstream consumers want the + // FUND name ("iShares Semiconductor ETF") that NPORT-P + // carries. Series_name is more authoritative + // for the fund itself. + const allocator = std.testing.allocator; + const io = std.testing.io; + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + const dir_path = try tmp.dir.realPathFileAlloc(io, ".", allocator); + defer allocator.free(dir_path); + + const config = Config{ .cache_dir = dir_path }; + var svc = DataService.init(io, allocator, config); + defer svc.deinit(); + + var s = svc.store(); + var mf_entries = [_]Edgar.MutualFundTickerEntry{.{ + .symbol = "DUMMY", + .cik = "0000000001", + }}; + s.write(Edgar.MutualFundTickerEntry, "_edgar", mf_entries[0..], cache.DataType.tickers_funds.ttl()); + var co_entries = [_]Edgar.CompanyTickerEntry{.{ + .symbol = "SOXX", + .cik = "0001100663", + .title = "iShares Trust", + }}; + s.write(Edgar.CompanyTickerEntry, "_edgar", co_entries[0..], cache.DataType.tickers_companies.ttl()); + + // Pre-seed etf_metrics with a profile row carrying the + // NPORT-P seriesName. 
+ var etf_records = [_]Edgar.EtfMetricRecord{ + .{ .profile = .{ + .symbol = try allocator.dupe(u8, "SOXX"), + .series_name = try allocator.dupe(u8, "iShares Semiconductor ETF"), + .cik = try allocator.dupe(u8, "0001100663"), + .as_of = try allocator.dupe(u8, "2026-06-01"), + .source = try allocator.dupe(u8, "edgar"), + } }, + }; + defer for (etf_records) |r| r.deinit(allocator); + s.write(Edgar.EtfMetricRecord, "SOXX", etf_records[0..], cache.DataType.etf_metrics.ttl()); + + // Wikidata returned only the index name (sparse). + var sparse = try allocator.alloc(Wikidata.ClassificationRecord, 1); + sparse[0] = .{ + .symbol = try allocator.dupe(u8, "SOXX"), + .name = try allocator.dupe(u8, "PHLX Semiconductor Sector"), + .as_of = try allocator.dupe(u8, "2026-06-01"), + .source = try allocator.dupe(u8, "wikidata"), + }; + + svc.panic_on_network_attempt = true; + const merged = try svc.synthesizeClassification("SOXX", sparse, .{ .skip_network = true }); + defer Wikidata.ClassificationRecord.freeSlice(allocator, merged); + + try std.testing.expectEqual(@as(usize, 1), merged.len); + const c = merged[0]; + // Series_name from NPORT-P wins -- not Wikidata's index name. + try std.testing.expectEqualStrings("iShares Semiconductor ETF", c.name.?); +} + test "getEntityFacts: skip_network with no cache returns FetchFailed" { const allocator = std.testing.allocator; const io = std.testing.io;