move Wikidata -> edgar logic into getClassification

2026-06-01 16:11:09 -07:00 · 2026-06-01 16:11:09 -07:00 · b796a46699
commit b796a46699
parent 4d65cc45f4
3 changed files with 953 additions and 1206 deletions
--- a/src/commands/enrich.zig
+++ b/src/commands/enrich.zig
--- a/src/models/classification.zig
+++ b/src/models/classification.zig
@ -129,6 +129,13 @@ pub const ClassificationRecord = struct {
    industry: ?[]const u8 = null, // owned
    /// ISO-3166 alpha-2 country code (e.g. "US", "GB", "DE").
    country: ?[]const u8 = null, // owned
+    /// Resolved geo bucket (e.g. "US", "International Developed",
+    /// "Emerging Markets"). Populated either from `geoFor(country)`
+    /// or from title-keyword inference (`inferGeoFromTitle`) for
+    /// symbols where Wikidata didn't supply a country. Producers
+    /// pick one of the `geo.*` constants above; consumers compare
+    /// against the same constants.
+    geo: ?[]const u8 = null, // owned
    asset_class: ?[]const u8 = null, // owned
    is_etf: bool = false,
    /// YYYY-MM-DD; trimmed from upstream's ISO-8601 date.
@ -148,6 +155,7 @@ pub const ClassificationRecord = struct {
        if (self.sector) |s| allocator.free(s);
        if (self.industry) |s| allocator.free(s);
        if (self.country) |s| allocator.free(s);
+        if (self.geo) |s| allocator.free(s);
        if (self.asset_class) |s| allocator.free(s);
        if (self.inception_date) |s| allocator.free(s);
        if (self.cik) |s| allocator.free(s);
@ -303,3 +311,202 @@ test "geo bucket labels are stable strings (not byte copies)" {
    try std.testing.expectEqual(@intFromPtr(geo.emerging.ptr), @intFromPtr(geoFor("CN").ptr));
    try std.testing.expectEqual(@intFromPtr(geo.unknown.ptr), @intFromPtr(geoFor(null).ptr));
 }
+
+// ── Title-keyword inference ──────────────────────────────────
+//
+// Pure functions over a fund/security title string. Used by
+// `service.getClassification` to populate the sector / geo of a
+// `ClassificationRecord` when Wikidata didn't carry one and the
+// EDGAR ticker-map fallback fired. Lives here (not in any
+// provider) because the inference is provider-agnostic and
+// shares the canonical sector/geo taxonomy declared above.
+
+/// Report whether any of `needles` occurs in `haystack` at the
+/// start of a word.
+///
+/// A needle that begins with a letter or digit only counts when
+/// the match sits at index 0 or the byte before it is not
+/// alphanumeric. Plain substring search mis-fires on ordinary fund
+/// titles: "media" is a substring of "intermediate", and "ex us" is
+/// a substring of "index us". Prefix matches are still accepted, so
+/// "industrial" keeps matching "industrials" and "biotech" keeps
+/// matching "biotechnology"; mid-word hits (e.g. "pharmaceutical"
+/// inside "biopharmaceutical") no longer count.
+///
+/// A needle that begins with any other byte (e.g. " intl") carries
+/// its own boundary and is matched as a plain substring. An empty
+/// needle matches every haystack, as with `std.mem.indexOf`.
+///
+/// The comparison is byte-exact; callers lowercase the title first.
+fn titleContainsAny(haystack: []const u8, needles: []const []const u8) bool {
+    for (needles) |needle| {
+        const needs_boundary = needle.len > 0 and std.ascii.isAlphanumeric(needle[0]);
+        var from: usize = 0;
+        // Keep scanning past rejected mid-word hits: a later
+        // occurrence of the same needle may start a word.
+        while (std.mem.indexOfPos(u8, haystack, from, needle)) |at| {
+            if (!needs_boundary or at == 0 or !std.ascii.isAlphanumeric(haystack[at - 1])) return true;
+            from = at + 1;
+        }
+    }
+    return false;
+}
+
+/// Copy `title` into `buf` with ASCII letters lowercased, for
+/// case-insensitive keyword matching. Returns the filled prefix of
+/// `buf`, or null when `title` is longer than `buf` — an oversized
+/// title is rejected outright, never truncated. The inference
+/// functions below pass a 256-byte stack buffer.
+fn lowercaseTitle(buf: []u8, title: []const u8) ?[]const u8 {
+    if (title.len <= buf.len) {
+        const dest = buf[0..title.len];
+        return std.ascii.lowerString(dest, title);
+    }
+    return null;
+}
+
+/// Infer a GICS sector from a fund's title. Returns null when the
+/// title is null, empty, longer than the 256-byte scratch buffer,
+/// or matches no rule below — the caller then keeps whatever sector
+/// the upstream source provided (typically null).
+///
+/// The keyword set is deliberately conservative: only words that
+/// map to a single GICS sector. "Income" / "Dividend" / "Value" /
+/// "Growth" / "Momentum" / "Total" / "Equal Weight" /
+/// "International" / "Emerging" are absent because they describe
+/// the screening methodology or geo, not the sector.
+///
+/// The returned slice is one of the `sector` constants above (not a
+/// copy), so the inference taxonomy stays in lock-step with the
+/// canonicalizer.
+pub fn inferSectorFromTitle(title: ?[]const u8) ?[]const u8 {
+    const t = title orelse return null;
+    if (t.len == 0) return null;
+
+    var buf: [256]u8 = undefined;
+    const lc = lowercaseTitle(&buf, t) orelse return null;
+
+    const Rule = struct {
+        label: []const u8,
+        keywords: []const []const u8,
+        /// Phrases that cancel the rule even when a keyword hits.
+        unless: []const []const u8 = &.{},
+    };
+
+    // Rules are tried top to bottom and the first one that fires
+    // wins, so the table order is part of the behavior.
+    const rules = [_]Rule{
+        // "Health care" with a space (XLV title) and "healthcare"
+        // as one word.
+        .{ .label = sector.healthcare, .keywords = &.{ "health care", "healthcare", "biotech", "pharmaceutical", "medical" } },
+        // Specific terms only — bare "tech" is too broad (it would
+        // also hit "biotech", "fintech", "edtech").
+        .{ .label = sector.technology, .keywords = &.{ "semiconductor", "software", "cloud computing", "internet" } },
+        // "Financial" is fairly specific in fund-name conventions
+        // ("Financial Select Sector SPDR", "Vanguard Financials ETF").
+        .{ .label = sector.financial_services, .keywords = &.{ "financial", "bank" } },
+        .{ .label = sector.energy, .keywords = &.{ "energy", "oil & gas", "oil and gas", "petroleum" } },
+        .{ .label = sector.real_estate, .keywords = &.{ "real estate", "reit" } },
+        .{ .label = sector.utilities, .keywords = &.{"utilities"} },
+        // Bare "consumer" is ambiguous (discretionary vs. staples),
+        // so only the explicit two-word labels are matched.
+        .{ .label = sector.consumer_cyclical, .keywords = &.{ "consumer discretionary", "consumer cyclical" } },
+        .{ .label = sector.consumer_defensive, .keywords = &.{ "consumer staples", "consumer defensive" } },
+        // Singular "industrial" also covers "industrials"
+        // ("Industrial Select Sector SPDR"). "Dow Jones Industrial
+        // Average" names a broad-market index, not the sector, so
+        // that phrase cancels the rule.
+        .{ .label = sector.industrials, .keywords = &.{ "industrial", "aerospace", "defense" }, .unless = &.{"industrial average"} },
+        .{ .label = sector.basic_materials, .keywords = &.{ "materials", "mining", "miners", "metals" } },
+        .{ .label = sector.communication_services, .keywords = &.{ "communication", "telecom", "media" } },
+    };
+
+    for (rules) |rule| {
+        if (!titleContainsAny(lc, rule.keywords)) continue;
+        if (titleContainsAny(lc, rule.unless)) continue;
+        return rule.label;
+    }
+    return null;
+}
+
+/// Infer a geo bucket from a fund's title. Returns null when the
+/// title is null, empty, longer than the 256-byte scratch buffer,
+/// or carries no international/emerging keyword — the caller keeps
+/// whatever default it has (typically US for SEC-filed funds).
+///
+/// More important than sector inference: a default `geo.us` is
+/// *factually wrong* for international funds (FRDM holds
+/// emerging-market equities, not US), so this tightens
+/// portfolio-level geographic-exposure reporting.
+///
+/// The returned slice is one of the `geo` constants, not a copy.
+pub fn inferGeoFromTitle(title: ?[]const u8) ?[]const u8 {
+    const t = title orelse return null;
+    if (t.len == 0) return null;
+
+    var buf: [256]u8 = undefined;
+    const lc = lowercaseTitle(&buf, t) orelse return null;
+
+    // Emerging / frontier markets are checked first so that a title
+    // such as "International Emerging Markets" lands here rather
+    // than in the developed bucket. "emerging market" also covers
+    // the plural, and "frontier" covers "frontier market(s)".
+    if (titleContainsAny(lc, &.{ "emerging market", "frontier" })) {
+        return geo.emerging;
+    }
+
+    // International Developed: "International" / "Intl" / the
+    // "ex-US" family / "World ex ..." / "Developed Market(s)".
+    // The "ex US" family is matched both without periods ("ex-US",
+    // "ex US") and with them ("ex-U.S.", "ex U.S."); the dotted
+    // needles carry a leading space so they cannot match inside a
+    // longer word such as "index u.s.".
+    //
+    // False-positive risk: a hypothetical "Vanguard Total
+    // International + US Equity Fund" would mis-tag here. None of
+    // the user's current portfolio holds such a hybrid fund. If one
+    // ever shows up, it'll get flagged in the
+    // diff-against-old-metadata.srf review and can be
+    // hand-corrected.
+    if (titleContainsAny(lc, &.{ "international", " intl", "ex-us", "ex us", " ex-u.s", " ex u.s", "world ex", "developed market" })) {
+        return geo.developed;
+    }
+
+    return null;
+}
+
+test "inferSectorFromTitle: null/empty -> null" {
+    // Neither a missing title nor an empty one yields a sector.
+    const no_sector: ?[]const u8 = null;
+    try std.testing.expectEqual(no_sector, inferSectorFromTitle(null));
+    try std.testing.expectEqual(no_sector, inferSectorFromTitle(""));
+}
+
+test "inferSectorFromTitle: technology keywords" {
+    const titles = [_][]const u8{ "iShares Semiconductor ETF", "Vanguard Software ETF" };
+    for (titles) |title| {
+        try std.testing.expectEqual(@as(?[]const u8, sector.technology), inferSectorFromTitle(title));
+    }
+}
+
+test "inferSectorFromTitle: healthcare keywords" {
+    const titles = [_][]const u8{ "Health Care Select Sector SPDR", "iShares Biotech ETF" };
+    for (titles) |title| {
+        try std.testing.expectEqual(@as(?[]const u8, sector.healthcare), inferSectorFromTitle(title));
+    }
+}
+
+test "inferSectorFromTitle: ambiguous title -> null" {
+    // Broad-market titles carry no sector keyword.
+    const titles = [_][]const u8{ "Vanguard Total Stock Market", "SPDR S&P 500" };
+    for (titles) |title| {
+        try std.testing.expectEqual(@as(?[]const u8, null), inferSectorFromTitle(title));
+    }
+}
+
+test "inferGeoFromTitle: null/empty -> null" {
+    // Neither a missing title nor an empty one yields a geo bucket.
+    const no_geo: ?[]const u8 = null;
+    try std.testing.expectEqual(no_geo, inferGeoFromTitle(null));
+    try std.testing.expectEqual(no_geo, inferGeoFromTitle(""));
+}
+
+test "inferGeoFromTitle: emerging markets" {
+    const titles = [_][]const u8{ "Freedom 100 Emerging Markets ETF", "iShares Frontier Markets" };
+    for (titles) |title| {
+        try std.testing.expectEqual(@as(?[]const u8, geo.emerging), inferGeoFromTitle(title));
+    }
+}
+
+test "inferGeoFromTitle: international developed" {
+    const titles = [_][]const u8{
+        "Vanguard FTSE Developed Markets",
+        "iShares MSCI EAFE International",
+        "Vanguard Total World ex-US",
+    };
+    for (titles) |title| {
+        try std.testing.expectEqual(@as(?[]const u8, geo.developed), inferGeoFromTitle(title));
+    }
+}
+
+test "inferGeoFromTitle: no match -> null" {
+    // Domestic and sector-only titles carry no geo keyword.
+    const titles = [_][]const u8{ "SPDR S&P 500", "iShares Semiconductor ETF" };
+    for (titles) |title| {
+        try std.testing.expectEqual(@as(?[]const u8, null), inferGeoFromTitle(title));
+    }
+}
--- a/src/service.zig
+++ b/src/service.zig
@ -35,6 +35,7 @@ const Yahoo = @import("providers/yahoo.zig").Yahoo;
 const Tiingo = @import("providers/tiingo.zig").Tiingo;
 const Wikidata = @import("providers/Wikidata.zig");
 const Edgar = @import("providers/Edgar.zig");
+const classification = @import("models/classification.zig");
 const fmt = @import("format.zig");
 const performance = @import("analytics/performance.zig");
 const http = @import("net/http.zig");
@ -1037,9 +1038,9 @@ pub const DataService = struct {
        // higher-quality name. Best-effort: if the fetch fails we
        // still return the EDGAR-only profile.
        var inception_date: ?Date = null;
-        if (self.getClassification(symbol, opts)) |classification| {
-            defer classification.deinit();
-            for (classification.data) |c| {
+        if (self.getClassification(symbol, opts)) |class_result| {
+            defer class_result.deinit();
+            for (class_result.data) |c| {
                if (c.inception_date) |idate_str| {
                    if (Date.parse(idate_str)) |d| inception_date = d else |_| {}
                }
@ -1092,11 +1093,6 @@ pub const DataService = struct {
    /// `opts.skip_network = true` returns cached data even if stale,
    /// `FetchFailed` on cache miss. `opts.force_refresh = true`
    /// ignores the cache and re-fetches.
-    ///
-    /// Callers fetching classifications for many symbols should use
-    /// `getClassifications(symbols)` instead — Wikidata's SPARQL API
-    /// is naturally batched, and one query for N symbols is much
-    /// cheaper than N queries for 1 symbol each.
    pub fn getClassification(self: *DataService, symbol: []const u8, opts: FetchOptions) DataError!FetchResult(Wikidata.ClassificationRecord) {
        var s = self.store();

@ -1131,124 +1127,282 @@ pub const DataService = struct {
        const fetched = wd.fetch(self.allocator, &symbols) catch |err| {
            if (err == error.RateLimited) {
                self.rateLimitBackoff();
-                break_blk: {
-                    const retried = wd.fetch(self.allocator, &symbols) catch break :break_blk;
-                    if (retried.len > 0) {
-                        s.write(Wikidata.ClassificationRecord, symbol, retried, cache.DataType.classification.ttl());
-                        return .{ .data = retried, .source = .fetched, .timestamp = std.Io.Timestamp.now(self.io, .real).toSeconds(), .allocator = self.allocator };
-                    }
-                    self.allocator.free(retried);
-                }
+                if (wd.fetch(self.allocator, &symbols)) |retried| {
+                    return self.finalizeClassification(symbol, retried, opts);
+                } else |_| {}
            }
            log.warn("{s}: wikidata fetch failed: {s}", .{ symbol, @errorName(err) });
            return DataError.FetchFailed;
        };

-        if (fetched.len == 0) {
-            self.allocator.free(fetched);
-            // Wikidata had no row for this symbol. Negative-cache to
-            // suppress retries until the user explicitly refreshes.
-            s.writeNegative(symbol, .classification);
-            return DataError.NotFound;
-        }
-
-        s.write(Wikidata.ClassificationRecord, symbol, fetched, cache.DataType.classification.ttl());
-
-        return .{ .data = fetched, .source = .fetched, .timestamp = std.Io.Timestamp.now(self.io, .real).toSeconds(), .allocator = self.allocator };
+        return self.finalizeClassification(symbol, fetched, opts);
    }

-    /// Batched classification fetch. Wikidata's SPARQL API takes a
-    /// `VALUES ?ticker { ... }` set in one query; this method runs
-    /// that query for the requested set, splits the response into
-    /// per-symbol cache writes, and returns the slice. Symbols not
-    /// in Wikidata are silently dropped from the result (the user-
-    /// facing cache for them gets a negative entry).
+    /// Common post-Wikidata path: decide if the result is useful as
+    /// returned, otherwise consult EDGAR to fill in the gaps,
+    /// otherwise negative-cache. Either way the cache gets written
+    /// and a `FetchResult` is returned (or `DataError.NotFound`).
    ///
-    /// The cache is consulted first per-symbol; only the symbols
-    /// that miss the cache (or are stale) are passed to the SPARQL
-    /// query. This minimizes the upstream load when most symbols
-    /// were already classified in a prior run.
-    pub fn getClassifications(
+    /// Takes ownership of `wikidata_records`. The slice is either
+    /// returned as the result data, freed and replaced by a
+    /// synthesized slice, or freed and the symbol negative-cached.
+    fn finalizeClassification(
        self: *DataService,
-        result_allocator: std.mem.Allocator,
-        symbols: []const []const u8,
+        symbol: []const u8,
+        wikidata_records: []Wikidata.ClassificationRecord,
        opts: FetchOptions,
-    ) DataError![]Wikidata.ClassificationRecord {
-        if (symbols.len == 0) return &.{};
+    ) DataError!FetchResult(Wikidata.ClassificationRecord) {
        var s = self.store();
+        const ttl = cache.DataType.classification.ttl();

-        // Identify cache misses.
-        var to_fetch: std.ArrayList([]const u8) = .empty;
-        defer to_fetch.deinit(self.allocator);
-        var cached_records: std.ArrayList(Wikidata.ClassificationRecord) = .empty;
-        errdefer {
-            for (cached_records.items) |*r| {
-                var m = r.*;
-                m.deinit(self.allocator);
+        // Wikidata returned a useful row -> populate geo from
+        // geoFor(country) and cache as-is.
+        if (wikidata_records.len > 0 and wikidataLooksUseful(wikidata_records[0])) {
+            // This function owns `wikidata_records`. If `populateGeo`
+            // fails, release the slice before the error propagates;
+            // otherwise nothing would ever free it. On the success
+            // path the slice is handed to the returned `FetchResult`
+            // and this errdefer does not run.
+            errdefer Wikidata.ClassificationRecord.freeSlice(self.allocator, wikidata_records);
+            try self.populateGeo(&wikidata_records[0]);
+            s.write(Wikidata.ClassificationRecord, symbol, wikidata_records, ttl);
+            return .{ .data = wikidata_records, .source = .fetched, .timestamp = std.Io.Timestamp.now(self.io, .real).toSeconds(), .allocator = self.allocator };
+        }
+
+        // Sparse or empty: try EDGAR fallback. `synthesizeClassification`
+        // takes ownership of the wikidata slice (frees it, returns a
+        // new one-element slice with the merged record). Returns
+        // `error.NotFound` when even EDGAR has nothing.
+        const merged = self.synthesizeClassification(symbol, wikidata_records, opts) catch |err| {
+            if (err == error.NotFound) {
+                s.writeNegative(symbol, .classification);
+                return DataError.NotFound;
            }
-            cached_records.deinit(self.allocator);
-        }
-
-        for (symbols) |sym| {
-            if (!opts.force_refresh) {
-                if (s.read(Wikidata.ClassificationRecord, sym, null, .fresh_only)) |cached| {
-                    // The on-disk shape is a length-1 slice.
-                    if (cached.data.len > 0) {
-                        try cached_records.append(self.allocator, cached.data[0]);
-                        // Free the rest if any (shouldn't happen for
-                        // per-symbol classification, but defensive).
-                        for (cached.data[1..]) |*r| {
-                            var m = r.*;
-                            m.deinit(self.allocator);
-                        }
-                        self.allocator.free(cached.data);
-                        continue;
-                    }
-                    self.allocator.free(cached.data);
-                }
-            }
-            try to_fetch.append(self.allocator, sym);
-        }
-
-        if (to_fetch.items.len == 0) {
-            // All cached — assemble result from cached_records.
-            const out = try result_allocator.alloc(Wikidata.ClassificationRecord, cached_records.items.len);
-            @memcpy(out, cached_records.items);
-            cached_records.clearRetainingCapacity();
-            return out;
-        }
-
-        if (opts.skip_network) {
-            // Offline mode: return what we have from cache.
-            const out = try result_allocator.alloc(Wikidata.ClassificationRecord, cached_records.items.len);
-            @memcpy(out, cached_records.items);
-            cached_records.clearRetainingCapacity();
-            return out;
-        }
-
-        log.debug("fetching {d} classifications from Wikidata", .{to_fetch.items.len});
-        self.assertNetworkAllowed("getClassifications wikidata.fetch");
-        var wd = try self.getProvider(Wikidata);
-
-        const fetched = wd.fetch(self.allocator, to_fetch.items) catch |err| {
-            log.warn("wikidata batch fetch failed: {s}", .{@errorName(err)});
            return DataError.FetchFailed;
        };
-        defer self.allocator.free(fetched);

-        // Write each fetched record to its per-symbol cache file.
-        for (fetched) |rec| {
-            const single = [_]Wikidata.ClassificationRecord{rec};
-            s.write(Wikidata.ClassificationRecord, rec.symbol, &single, cache.DataType.classification.ttl());
+        s.write(Wikidata.ClassificationRecord, symbol, merged, ttl);
+        return .{ .data = merged, .source = .fetched, .timestamp = std.Io.Timestamp.now(self.io, .real).toSeconds(), .allocator = self.allocator };
+    }
+
+    /// Fill in `record.geo` from the record's country code when no
+    /// geo is set yet. Does nothing when `geo` is already present,
+    /// when `country` is null, or when `geoFor` maps the country to
+    /// `geo.unknown`. The stored string is a copy allocated from
+    /// `self.allocator`. An allocation failure is returned to the
+    /// caller and leaves `geo` null, so the caller decides whether
+    /// to bail.
+    fn populateGeo(self: *DataService, record: *Wikidata.ClassificationRecord) !void {
+        if (record.geo) |_| return;
+        if (record.country) |country_code| {
+            const bucket = classification.geoFor(country_code);
+            const is_unknown = std.mem.eql(u8, bucket, classification.geo.unknown);
+            if (!is_unknown) record.geo = try self.allocator.dupe(u8, bucket);
+        }
+    }
+
+    /// Decide whether a Wikidata classification record is rich
+    /// enough to skip the EDGAR fallback. The record counts as
+    /// useful when `is_etf` is true or any of `asset_class`,
+    /// `country`, `sector` is set. Everything else is sparse (e.g.
+    /// SOXX getting only a `name` from Wikidata) and needs the
+    /// EDGAR ticker-map fallback to fill in `is_etf=true,
+    /// asset_class=ETF, country=US`.
+    fn wikidataLooksUseful(c: Wikidata.ClassificationRecord) bool {
+        const has_label = c.asset_class != null or c.country != null or c.sector != null;
+        return c.is_etf or has_label;
+    }
+
+    /// Synthesize a `ClassificationRecord` for a symbol that
+    /// Wikidata couldn't classify usefully. Consults the EDGAR
+    /// ticker maps; if found, also fetches `getEtfMetrics` to
+    /// recover the NPORT-P series_name (more authoritative than
+    /// the company_tickers title). Title-keyword inference fills
+    /// in `sector` and `geo` when the name carries an unambiguous
+    /// keyword.
+    ///
+    /// Takes ownership of `wikidata_records`: frees them at exit.
+    /// Wikidata's `industry`, `inception_date` and `cik` are copied
+    /// into the synthesized record when present; its `name` is used
+    /// only when neither NPORT-P nor the ticker map supplies one.
+    /// Returns `error.NotFound` when EDGAR has nothing either.
+    fn synthesizeClassification(
+        self: *DataService,
+        symbol: []const u8,
+        wikidata_records: []Wikidata.ClassificationRecord,
+        opts: FetchOptions,
+    ) !cache.Store.DataFor(Wikidata.ClassificationRecord) {
+        defer Wikidata.ClassificationRecord.freeSlice(self.allocator, wikidata_records);
+
+        const lookup = self.lookupEdgarFallback(symbol, opts);
+        defer freeEdgarLookup(self.allocator, lookup);
+        if (lookup == .none) return error.NotFound;
+
+        // For ETF/fund hits, try to get the richer series_name from
+        // NPORT-P. Cache hit is cheap; cache miss triggers an EDGAR
+        // fetch but is bounded by EDGAR's rate limiter. If the call
+        // fails (e.g. money-market funds with no NPORT-P), we fall
+        // back to the ticker-map title.
+        var etf_metrics_result: ?FetchResult(Edgar.EtfMetricRecord) = null;
+        defer if (etf_metrics_result) |*r| r.deinit();
+        etf_metrics_result = self.getEtfMetrics(symbol, opts) catch null;
+
+        // Extract series_name and cik from the etf_metrics profile row.
+        var series_name: ?[]const u8 = null;
+        var etf_cik: ?[]const u8 = null;
+        if (etf_metrics_result) |r| {
+            for (r.data) |rec| switch (rec) {
+                .profile => |p| {
+                    if (p.series_name) |sn| series_name = sn;
+                    etf_cik = p.cik;
+                    break;
+                },
+                else => {},
+            };
        }

-        // Combine cached + fetched into the result.
-        const total = cached_records.items.len + fetched.len;
-        const out = try result_allocator.alloc(Wikidata.ClassificationRecord, total);
-        @memcpy(out[0..cached_records.items.len], cached_records.items);
-        @memcpy(out[cached_records.items.len..], fetched);
-        cached_records.clearRetainingCapacity();
-        return out;
+        // Pull whatever Wikidata's sparse record carried so we
+        // don't lose data on the merge.
+        const wd: ?Wikidata.ClassificationRecord = if (wikidata_records.len > 0) wikidata_records[0] else null;
+
+        // Pick the best name source: NPORT-P series_name >
+        // EDGAR ticker-map title > Wikidata name > nothing.
+        //
+        // We're on the EDGAR-fallback path because Wikidata's
+        // record was sparse. For funds, Wikidata's `name` (when
+        // present) is frequently the underlying INDEX rather than
+        // the FUND itself -- e.g. SOXX's Wikidata `name` is "PHLX
+        // Semiconductor Sector" but the fund is "iShares
+        // Semiconductor ETF" per NPORT-P seriesName. Prefer the
+        // fund-authoritative source so downstream comments and
+        // labels show the fund name, not the index name.
+        const ticker_title: ?[]const u8 = switch (lookup) {
+            .company_or_uit => |c| c.title,
+            else => null,
+        };
+        const best_name: ?[]const u8 = blk: {
+            if (series_name) |n| break :blk n;
+            if (ticker_title) |n| break :blk n;
+            if (wd) |w| {
+                if (w.name) |n| break :blk n;
+            }
+            break :blk null;
+        };
+
+        // Name source for title-keyword inference: prefer the
+        // most-authoritative source for fund-style classification
+        // even when Wikidata supplied a (different) name. Wikidata's
+        // name for a fund is often less informative than NPORT-P's
+        // seriesName (e.g. SOXX's Wikidata name is "PHLX
+        // Semiconductor Sector" which is the index name, not the
+        // fund name).
+        const inference_name: ?[]const u8 = series_name orelse ticker_title orelse if (wd) |w| w.name else null;
+
+        const inferred_sector = classification.inferSectorFromTitle(inference_name);
+        const inferred_geo = classification.inferGeoFromTitle(inference_name);
+
+        // `is_etf` here means "this is fund-shaped, emit multi-row
+        // breakdown" -- true for ANY EDGAR-found symbol. The
+        // `tickers_funds.srf` map mixes mutual funds and
+        // series-of-trust ETFs alike. The `tickers_companies.srf`
+        // map carries operating companies, closed-end funds, and
+        // UITs; operating companies usually have Wikidata coverage
+        // and wouldn't reach this fallback, so anything that
+        // dropped here is also fund-shaped (e.g. PIMCO closed-end
+        // funds whose title says "FUND" but not "ETF" or "TRUST").
+        //
+        // The ETF/TRUST keyword in the title still drives the
+        // asset_class label below ("ETF" vs "Fund"), but the
+        // fund-shaped routing decision applies regardless.
+        const is_etf = true;
+        const asset_class: []const u8 = switch (lookup) {
+            .managed_fund => "Fund",
+            .company_or_uit => |c| if (c.is_etf) "ETF" else "Fund",
+            .none => unreachable,
+        };
+
+        // Country: prefer Wikidata's. Default to "US" for
+        // EDGAR-found symbols (they're SEC filers).
+        const country_str: []const u8 = if (wd) |w| (w.country orelse "US") else "US";
+
+        // Sector: prefer Wikidata's existing sector (rare in this
+        // sparse-fallback path), else fall back to inferred.
+        const sector_str: ?[]const u8 = blk: {
+            if (wd) |w| {
+                if (w.sector) |sec| break :blk sec;
+            }
+            break :blk inferred_sector;
+        };
+
+        // CIK: prefer Wikidata's, fall back to NPORT-P's.
+        const cik_str: ?[]const u8 = blk: {
+            if (wd) |w| {
+                if (w.cik) |c| break :blk c;
+            }
+            if (etf_cik) |c| break :blk c;
+            break :blk null;
+        };
+
+        // Geo: prefer the Wikidata-derived geo (computed from
+        // `geoFor(country)` against the country code), else use
+        // title-keyword inference. Default to "US" when neither
+        // is available -- EDGAR-found symbols are SEC filers.
+        const geo_str: []const u8 = blk: {
+            if (wd) |w| {
+                if (w.country) |c| {
+                    const g = classification.geoFor(c);
+                    if (!std.mem.eql(u8, g, classification.geo.unknown)) break :blk g;
+                }
+            }
+            if (inferred_geo) |g| break :blk g;
+            break :blk classification.geo.us;
+        };
+
+        const today = fmt.todayDate(self.io);
+        var as_of_buf: [10]u8 = undefined;
+        const as_of_str = try std.fmt.bufPrint(&as_of_buf, "{f}", .{today});
+
+        // Allocate each owned field up front with its own errdefer
+        // so a partial build on OOM doesn't leak the earlier
+        // successful dupes. The result-slice allocation that follows
+        // the dupes is the last fallible step and is covered by the
+        // same errdefers; filling in the record after it cannot fail.
+        const symbol_owned = try self.allocator.dupe(u8, symbol);
+        errdefer self.allocator.free(symbol_owned);
+        const name_owned: ?[]const u8 = if (best_name) |n| try self.allocator.dupe(u8, n) else null;
+        errdefer if (name_owned) |s| self.allocator.free(s);
+        const sector_owned: ?[]const u8 = if (sector_str) |s| try self.allocator.dupe(u8, s) else null;
+        errdefer if (sector_owned) |s| self.allocator.free(s);
+        const industry_owned: ?[]const u8 = if (wd) |w|
+            (if (w.industry) |i| try self.allocator.dupe(u8, i) else null)
+        else
+            null;
+        errdefer if (industry_owned) |s| self.allocator.free(s);
+        const country_owned = try self.allocator.dupe(u8, country_str);
+        errdefer self.allocator.free(country_owned);
+        const geo_owned = try self.allocator.dupe(u8, geo_str);
+        errdefer self.allocator.free(geo_owned);
+        const asset_class_owned = try self.allocator.dupe(u8, asset_class);
+        errdefer self.allocator.free(asset_class_owned);
+        const inception_owned: ?[]const u8 = if (wd) |w|
+            (if (w.inception_date) |i| try self.allocator.dupe(u8, i) else null)
+        else
+            null;
+        errdefer if (inception_owned) |s| self.allocator.free(s);
+        const cik_owned: ?[]const u8 = if (cik_str) |c| try self.allocator.dupe(u8, c) else null;
+        errdefer if (cik_owned) |s| self.allocator.free(s);
+        const as_of_owned = try self.allocator.dupe(u8, as_of_str);
+        errdefer self.allocator.free(as_of_owned);
+        const source_owned = try self.allocator.dupe(u8, "edgar_fallback");
+        errdefer self.allocator.free(source_owned);
+
+        const result = try self.allocator.alloc(Wikidata.ClassificationRecord, 1);
+        result[0] = .{
+            .symbol = symbol_owned,
+            .name = name_owned,
+            .sector = sector_owned,
+            .industry = industry_owned,
+            .country = country_owned,
+            .geo = geo_owned,
+            .asset_class = asset_class_owned,
+            .is_etf = is_etf,
+            .inception_date = inception_owned,
+            .cik = cik_owned,
+            .as_of = as_of_owned,
+            .source = source_owned,
+        };
+        return result;
    }

    /// Fetch XBRL-derived entity facts for a CIK (currently
@ -2955,6 +3109,326 @@ test "getClassification: cache hit returns cached data without network" {
    try std.testing.expectEqual(Source.cached, result.source);
 }

+test "populateGeo: country US -> geo US" {
+    // A record carrying country "US" and no geo is expected to come
+    // back from populateGeo with geo set to "US".
+    const test_alloc = std.testing.allocator;
+    const test_io = std.testing.io;
+
+    // Point the service's cache at a throwaway directory.
+    var scratch = std.testing.tmpDir(.{});
+    defer scratch.cleanup();
+    const cache_root = try scratch.dir.realPathFileAlloc(test_io, ".", test_alloc);
+    defer test_alloc.free(cache_root);
+
+    const svc_config: Config = .{ .cache_dir = cache_root };
+    var data_svc = DataService.init(test_io, test_alloc, svc_config);
+    defer data_svc.deinit();
+
+    // String fields are heap copies so `deinit` can release them.
+    var rec: Wikidata.ClassificationRecord = .{
+        .symbol = try test_alloc.dupe(u8, "TEST"),
+        .country = try test_alloc.dupe(u8, "US"),
+        .as_of = try test_alloc.dupe(u8, "2026-06-01"),
+        .source = try test_alloc.dupe(u8, "wikidata"),
+    };
+    defer rec.deinit(test_alloc);
+
+    try data_svc.populateGeo(&rec);
+
+    // A still-null geo fails the test before the string comparison.
+    const resolved = rec.geo orelse return error.TestUnexpectedResult;
+    try std.testing.expectEqualStrings("US", resolved);
+}
+
+test "populateGeo: country GB -> geo International Developed" {
+    // A record carrying country "GB" and no geo is expected to come
+    // back from populateGeo with geo "International Developed".
+    const test_alloc = std.testing.allocator;
+    const test_io = std.testing.io;
+
+    // Point the service's cache at a throwaway directory.
+    var scratch = std.testing.tmpDir(.{});
+    defer scratch.cleanup();
+    const cache_root = try scratch.dir.realPathFileAlloc(test_io, ".", test_alloc);
+    defer test_alloc.free(cache_root);
+
+    const svc_config: Config = .{ .cache_dir = cache_root };
+    var data_svc = DataService.init(test_io, test_alloc, svc_config);
+    defer data_svc.deinit();
+
+    // String fields are heap copies so `deinit` can release them.
+    var rec: Wikidata.ClassificationRecord = .{
+        .symbol = try test_alloc.dupe(u8, "TEST"),
+        .country = try test_alloc.dupe(u8, "GB"),
+        .as_of = try test_alloc.dupe(u8, "2026-06-01"),
+        .source = try test_alloc.dupe(u8, "wikidata"),
+    };
+    defer rec.deinit(test_alloc);
+
+    try data_svc.populateGeo(&rec);
+
+    // A still-null geo fails the test before the string comparison.
+    const resolved = rec.geo orelse return error.TestUnexpectedResult;
+    try std.testing.expectEqualStrings("International Developed", resolved);
+}
+
+test "populateGeo: null country -> noop" {
+    // The record carries no country; populateGeo is expected to
+    // succeed and leave geo null.
+    const test_alloc = std.testing.allocator;
+    const test_io = std.testing.io;
+
+    // Point the service's cache at a throwaway directory.
+    var scratch = std.testing.tmpDir(.{});
+    defer scratch.cleanup();
+    const cache_root = try scratch.dir.realPathFileAlloc(test_io, ".", test_alloc);
+    defer test_alloc.free(cache_root);
+
+    const svc_config: Config = .{ .cache_dir = cache_root };
+    var data_svc = DataService.init(test_io, test_alloc, svc_config);
+    defer data_svc.deinit();
+
+    // Neither `.country` nor `.geo` is set; both keep their null default.
+    var rec: Wikidata.ClassificationRecord = .{
+        .symbol = try test_alloc.dupe(u8, "TEST"),
+        .as_of = try test_alloc.dupe(u8, "2026-06-01"),
+        .source = try test_alloc.dupe(u8, "wikidata"),
+    };
+    defer rec.deinit(test_alloc);
+
+    try data_svc.populateGeo(&rec);
+
+    const absent: ?[]const u8 = null;
+    try std.testing.expectEqual(absent, rec.geo);
+}
+
+test "populateGeo: existing geo not overwritten" {
+    // A record that already carries a geo value must keep that exact
+    // value after populateGeo, even though a country is also present.
+    const allocator = std.testing.allocator;
+    const io = std.testing.io;
+    var tmp = std.testing.tmpDir(.{});
+    defer tmp.cleanup();
+    const dir_path = try tmp.dir.realPathFileAlloc(io, ".", allocator);
+    defer allocator.free(dir_path);
+
+    const config = Config{ .cache_dir = dir_path };
+    var svc = DataService.init(io, allocator, config);
+    defer svc.deinit();
+
+    var record: Wikidata.ClassificationRecord = .{
+        .symbol = try allocator.dupe(u8, "TEST"),
+        .country = try allocator.dupe(u8, "US"),
+        .geo = try allocator.dupe(u8, "Already Set"),
+        .as_of = try allocator.dupe(u8, "2026-06-01"),
+        .source = try allocator.dupe(u8, "wikidata"),
+    };
+    defer record.deinit(allocator);
+
+    try svc.populateGeo(&record);
+    // Guard the unwrap: if populateGeo ever cleared the field, report
+    // a normal test failure instead of panicking on `.?`.
+    const geo_after = record.geo orelse return error.TestUnexpectedResult;
+    try std.testing.expectEqualStrings("Already Set", geo_after);
+}
+
+test "getClassification: sparse Wikidata + EDGAR managed_fund hit produces merged record" {
+    // Exercises the merge of a name-only Wikidata record with an
+    // EDGAR mutual-fund ticker hit, using only pre-seeded cache data.
+    const allocator = std.testing.allocator;
+    const io = std.testing.io;
+    var tmp = std.testing.tmpDir(.{});
+    defer tmp.cleanup();
+    const dir_path = try tmp.dir.realPathFileAlloc(io, ".", allocator);
+    defer allocator.free(dir_path);
+
+    const config = Config{ .cache_dir = dir_path };
+    var svc = DataService.init(io, allocator, config);
+    defer svc.deinit();
+
+    // Seed both EDGAR ticker map caches with at least one entry
+    // each so the synthesizeClassification path doesn't try to
+    // fetch them (the load helpers treat empty cached slices as
+    // "miss" and fall through to a network fetch).
+    var s = svc.store();
+    var mf_entries = [_]Edgar.MutualFundTickerEntry{.{
+        .symbol = "FAGIX",
+        .cik = "0000275309",
+    }};
+    s.write(Edgar.MutualFundTickerEntry, "_edgar", mf_entries[0..], cache.DataType.tickers_funds.ttl());
+    var co_entries = [_]Edgar.CompanyTickerEntry{.{
+        .symbol = "DUMMY",
+        .cik = "0000000001",
+    }};
+    s.write(Edgar.CompanyTickerEntry, "_edgar", co_entries[0..], cache.DataType.tickers_companies.ttl());
+
+    // Seed an etf_metrics negative cache so getEtfMetrics doesn't
+    // try to fetch from the network.
+    s.writeNegative("FAGIX", .etf_metrics);
+
+    // Sparse Wikidata records (length 1, only name set -- not useful).
+    var sparse = try allocator.alloc(Wikidata.ClassificationRecord, 1);
+    sparse[0] = .{
+        .symbol = try allocator.dupe(u8, "FAGIX"),
+        .name = try allocator.dupe(u8, "Test Fund"),
+        .as_of = try allocator.dupe(u8, "2026-06-01"),
+        .source = try allocator.dupe(u8, "wikidata"),
+    };
+
+    // Drive directly through synthesizeClassification (skip the
+    // Wikidata fetch). It takes ownership of `sparse`.
+    svc.panic_on_network_attempt = true; // any provider call -> panic
+    const merged = try svc.synthesizeClassification("FAGIX", sparse, .{ .skip_network = true });
+    defer Wikidata.ClassificationRecord.freeSlice(allocator, merged);
+
+    try std.testing.expectEqual(@as(usize, 1), merged.len);
+    const c = merged[0];
+    try std.testing.expectEqualStrings("FAGIX", c.symbol);
+    try std.testing.expect(c.is_etf);
+
+    // Optional fields are unwrapped with `orelse` so a missing value
+    // is reported as a test failure rather than a null-unwrap panic.
+    const got_asset_class = c.asset_class orelse return error.TestUnexpectedResult;
+    try std.testing.expectEqualStrings("Fund", got_asset_class);
+    const got_country = c.country orelse return error.TestUnexpectedResult;
+    try std.testing.expectEqualStrings("US", got_country);
+    const got_geo = c.geo orelse return error.TestUnexpectedResult;
+    try std.testing.expectEqualStrings("US", got_geo);
+    try std.testing.expectEqualStrings("edgar_fallback", c.source);
+    // Wikidata's name preserved on merge.
+    const got_name = c.name orelse return error.TestUnexpectedResult;
+    try std.testing.expectEqualStrings("Test Fund", got_name);
+}
+
+test "synthesizeClassification: no EDGAR hit returns NotFound" {
+    // Neither seeded ticker map mentions the probed symbol, so the
+    // call is expected to fail with error.NotFound.
+    const test_alloc = std.testing.allocator;
+    const test_io = std.testing.io;
+
+    var scratch = std.testing.tmpDir(.{});
+    defer scratch.cleanup();
+    const cache_root = try scratch.dir.realPathFileAlloc(test_io, ".", test_alloc);
+    defer test_alloc.free(cache_root);
+
+    const svc_config: Config = .{ .cache_dir = cache_root };
+    var data_svc = DataService.init(test_io, test_alloc, svc_config);
+    defer data_svc.deinit();
+
+    // One throwaway row per ticker map: the EDGAR lookup then
+    // returns .none for the test symbol without trying to fetch
+    // the maps from the network.
+    var cache_store = data_svc.store();
+    var fund_rows = [_]Edgar.MutualFundTickerEntry{.{ .symbol = "DUMMY1", .cik = "0000000001" }};
+    cache_store.write(Edgar.MutualFundTickerEntry, "_edgar", fund_rows[0..], cache.DataType.tickers_funds.ttl());
+    var company_rows = [_]Edgar.CompanyTickerEntry{.{ .symbol = "DUMMY2", .cik = "0000000002" }};
+    cache_store.write(Edgar.CompanyTickerEntry, "_edgar", company_rows[0..], cache.DataType.tickers_companies.ttl());
+
+    // Name-only Wikidata input for a symbol absent from both maps.
+    const wikidata_rows = try test_alloc.alloc(Wikidata.ClassificationRecord, 1);
+    wikidata_rows[0] = .{
+        .symbol = try test_alloc.dupe(u8, "NEVERHEARDOFIT"),
+        .name = try test_alloc.dupe(u8, "ghost"),
+        .as_of = try test_alloc.dupe(u8, "2026-06-01"),
+        .source = try test_alloc.dupe(u8, "wikidata"),
+    };
+
+    data_svc.panic_on_network_attempt = true;
+    const outcome = data_svc.synthesizeClassification("NEVERHEARDOFIT", wikidata_rows, .{ .skip_network = true });
+    try std.testing.expectError(error.NotFound, outcome);
+}
+
+test "synthesizeClassification: company_or_uit without ETF/TRUST keyword still routes to multi-row" {
+    // PTY shape: closed-end fund whose company_tickers title is
+    // "PIMCO CORPORATE & INCOME OPPORTUNITY FUND" -- no "ETF" or
+    // "TRUST" in the title, so lookupInTickerMaps returns
+    // .company_or_uit{is_etf=false}. But it's still fund-shaped
+    // and should produce multi-row metadata in enrich.
+    //
+    // The downstream signal for "fund-like, emit multi-row" is
+    // ClassificationRecord.is_etf. Set it to true for any
+    // EDGAR-found .company_or_uit hit (even when the title
+    // doesn't carry the ETF/TRUST keyword), so PTY-shape
+    // closed-end funds get the same treatment as ETFs.
+    const allocator = std.testing.allocator;
+    const io = std.testing.io;
+    var tmp = std.testing.tmpDir(.{});
+    defer tmp.cleanup();
+    const dir_path = try tmp.dir.realPathFileAlloc(io, ".", allocator);
+    defer allocator.free(dir_path);
+
+    const config = Config{ .cache_dir = dir_path };
+    var svc = DataService.init(io, allocator, config);
+    defer svc.deinit();
+
+    var s = svc.store();
+    // Throwaway MF entry so the MF lookup returns null.
+    var mf_entries = [_]Edgar.MutualFundTickerEntry{.{
+        .symbol = "DUMMY",
+        .cik = "0000000001",
+    }};
+    s.write(Edgar.MutualFundTickerEntry, "_edgar", mf_entries[0..], cache.DataType.tickers_funds.ttl());
+    // PTY in the company map with NO ETF/TRUST in title.
+    var co_entries = [_]Edgar.CompanyTickerEntry{.{
+        .symbol = "PTY",
+        .cik = "0001202604",
+        .title = "PIMCO CORPORATE & INCOME OPPORTUNITY FUND",
+    }};
+    s.write(Edgar.CompanyTickerEntry, "_edgar", co_entries[0..], cache.DataType.tickers_companies.ttl());
+    // Negative etf_metrics entry for the symbol under test.
+    s.writeNegative("PTY", .etf_metrics);
+
+    var sparse = try allocator.alloc(Wikidata.ClassificationRecord, 1);
+    sparse[0] = .{
+        .symbol = try allocator.dupe(u8, "PTY"),
+        .name = try allocator.dupe(u8, "PIMCO Corporate & Income Opportunity Fund"),
+        .as_of = try allocator.dupe(u8, "2026-06-01"),
+        .source = try allocator.dupe(u8, "wikidata"),
+    };
+
+    svc.panic_on_network_attempt = true;
+    const merged = try svc.synthesizeClassification("PTY", sparse, .{ .skip_network = true });
+    defer Wikidata.ClassificationRecord.freeSlice(allocator, merged);
+
+    try std.testing.expectEqual(@as(usize, 1), merged.len);
+    const c = merged[0];
+    // is_etf MUST be true so enrich routes through emitEtfRows
+    // (multi-row sleeve breakdown). The asset_class stays "Fund"
+    // because no ETF/TRUST keyword in title.
+    try std.testing.expect(c.is_etf);
+    // Guarded unwrap: a missing asset_class is reported as a test
+    // failure rather than a null-unwrap panic.
+    const got_asset_class = c.asset_class orelse return error.TestUnexpectedResult;
+    try std.testing.expectEqualStrings("Fund", got_asset_class);
+}
+
+test "synthesizeClassification: NPORT-P series_name beats Wikidata's index name for funds" {
+    // SOXX shape: Wikidata returns the underlying INDEX name
+    // ("PHLX Semiconductor Sector") which is technically what the
+    // ticker symbol is for, but downstream consumers want the
+    // FUND name ("iShares Semiconductor ETF") that NPORT-P
+    // <seriesName> carries. Series_name is more authoritative
+    // for the fund itself.
+    const allocator = std.testing.allocator;
+    const io = std.testing.io;
+    var tmp = std.testing.tmpDir(.{});
+    defer tmp.cleanup();
+    const dir_path = try tmp.dir.realPathFileAlloc(io, ".", allocator);
+    defer allocator.free(dir_path);
+
+    const config = Config{ .cache_dir = dir_path };
+    var svc = DataService.init(io, allocator, config);
+    defer svc.deinit();
+
+    // Seed both ticker maps: a throwaway fund row, and SOXX in the
+    // company map.
+    var s = svc.store();
+    var mf_entries = [_]Edgar.MutualFundTickerEntry{.{
+        .symbol = "DUMMY",
+        .cik = "0000000001",
+    }};
+    s.write(Edgar.MutualFundTickerEntry, "_edgar", mf_entries[0..], cache.DataType.tickers_funds.ttl());
+    var co_entries = [_]Edgar.CompanyTickerEntry{.{
+        .symbol = "SOXX",
+        .cik = "0001100663",
+        .title = "iShares Trust",
+    }};
+    s.write(Edgar.CompanyTickerEntry, "_edgar", co_entries[0..], cache.DataType.tickers_companies.ttl());
+
+    // Pre-seed etf_metrics with a profile row carrying the
+    // NPORT-P seriesName.
+    var etf_records = [_]Edgar.EtfMetricRecord{
+        .{ .profile = .{
+            .symbol = try allocator.dupe(u8, "SOXX"),
+            .series_name = try allocator.dupe(u8, "iShares Semiconductor ETF"),
+            .cik = try allocator.dupe(u8, "0001100663"),
+            .as_of = try allocator.dupe(u8, "2026-06-01"),
+            .source = try allocator.dupe(u8, "edgar"),
+        } },
+    };
+    // The test keeps ownership of the seeded rows and frees them on exit.
+    defer for (etf_records) |r| r.deinit(allocator);
+    s.write(Edgar.EtfMetricRecord, "SOXX", etf_records[0..], cache.DataType.etf_metrics.ttl());
+
+    // Wikidata returned only the index name (sparse).
+    var sparse = try allocator.alloc(Wikidata.ClassificationRecord, 1);
+    sparse[0] = .{
+        .symbol = try allocator.dupe(u8, "SOXX"),
+        .name = try allocator.dupe(u8, "PHLX Semiconductor Sector"),
+        .as_of = try allocator.dupe(u8, "2026-06-01"),
+        .source = try allocator.dupe(u8, "wikidata"),
+    };
+
+    svc.panic_on_network_attempt = true;
+    const merged = try svc.synthesizeClassification("SOXX", sparse, .{ .skip_network = true });
+    defer Wikidata.ClassificationRecord.freeSlice(allocator, merged);
+
+    try std.testing.expectEqual(@as(usize, 1), merged.len);
+    const c = merged[0];
+    // Series_name from NPORT-P wins -- not Wikidata's index name.
+    // Guarded unwrap: a missing name is reported as a test failure
+    // rather than a null-unwrap panic.
+    const got_name = c.name orelse return error.TestUnexpectedResult;
+    try std.testing.expectEqualStrings("iShares Semiconductor ETF", got_name);
+}
+
 test "getEntityFacts: skip_network with no cache returns FetchFailed" {
    const allocator = std.testing.allocator;
    const io = std.testing.io;