zfin/src/models/classification.zig

/// Classification metadata for portfolio analysis.
///
/// Each entry maps a symbol to one or more asset class / sector / geographic allocations.
/// For individual stocks, there's typically one entry at 100%.
/// For blended funds (e.g., target date), there can be multiple entries that sum to ~100%.
///
/// Loaded from a metadata SRF file like `metadata.srf`:
///   symbol::AMZN,sector::Technology,geo::US,asset_class::US Large Cap
///   symbol::02315N600,asset_class::US Large Cap,pct:num:55
///   symbol::02315N600,asset_class::International Developed,pct:num:20
///   symbol::02315N600,asset_class::Bonds,pct:num:15
const std = @import("std");
const srf = @import("srf");

/// A single classification entry for a symbol.
pub const ClassificationEntry = struct {
    symbol: []const u8,
    /// Human-readable security name (e.g., "Amazon", "SPDR S&P 500
    /// ETF Trust"). Optional — older metadata.srf files may not
    /// have this field. Renderers fall back to `symbol` /
    /// `display_symbol` when null.
    name: ?[]const u8 = null,
    /// User-curated grouping label that overrides the auto-derived
    /// bucket for concentration / dominance checks and the
    /// analysis tab's Sector breakdown. Use this when the upstream
    /// `sector` field is the NPORT-P "Equity / Corporate" mush
    /// that doesn't actually distinguish your holdings (e.g. SPY
    /// vs FRDM vs HFXI all tagged the same way). When null,
    /// `deriveBucket` falls back to a sensible default.
    bucket: ?[]const u8 = null,
    /// Sector (e.g., "Technology", "Healthcare", "Financials")
    sector: ?[]const u8 = null,
    /// Geographic region (e.g., "US", "International Developed", "Emerging Markets")
    geo: ?[]const u8 = null,
    /// Asset class (e.g., "US Large Cap", "Bonds", "Cash")
    asset_class: ?[]const u8 = null,
    /// Percentage weight for this entry (0-100). Default 100 for single-class assets.
    pct: f64 = 100.0,
};

/// Parsed classification data for the entire portfolio.
pub const ClassificationMap = struct {
    entries: []ClassificationEntry,
    allocator: std.mem.Allocator,

    pub fn deinit(self: *ClassificationMap) void {
        for (self.entries) |e| {
            self.allocator.free(e.symbol);
            if (e.name) |n| self.allocator.free(n);
            if (e.bucket) |b| self.allocator.free(b);
            if (e.sector) |s| self.allocator.free(s);
            if (e.geo) |g| self.allocator.free(g);
            if (e.asset_class) |a| self.allocator.free(a);
        }
        self.allocator.free(self.entries);
    }
};

/// Parse a metadata SRF file into a ClassificationMap.
/// Each record has: symbol::<SYM>,name::<N>,bucket::<B>,sector::<S>,geo::<G>,asset_class::<A>,pct:num:<P>
/// All fields except symbol are optional. pct defaults to 100.
pub fn parseClassificationFile(allocator: std.mem.Allocator, data: []const u8) !ClassificationMap {
    var entries = std.ArrayList(ClassificationEntry).empty;
    errdefer {
        for (entries.items) |e| {
            allocator.free(e.symbol);
            if (e.name) |n| allocator.free(n);
            if (e.bucket) |b| allocator.free(b);
            if (e.sector) |s| allocator.free(s);
            if (e.geo) |g| allocator.free(g);
            if (e.asset_class) |a| allocator.free(a);
        }
        entries.deinit(allocator);
    }

    var reader = std.Io.Reader.fixed(data);
    var it = srf.iterator(&reader, allocator, .{ .parse_allocator = .none }) catch return error.InvalidData;
    defer it.deinit();

    while (try it.next()) |fields| {
        const entry = fields.to(ClassificationEntry, .{}) catch continue;
        // Pre-fill `bucket` if the user didn't curate one. This
        // shifts the cost of `deriveBucket` to parse time and
        // makes downstream code free to read `entry.bucket`
        // directly without juggling allocator parameters.
        const built_bucket: []const u8 = if (entry.bucket) |b|
            try allocator.dupe(u8, b)
        else
            try deriveBucket(entry, allocator);
        try entries.append(allocator, .{
            .symbol = try allocator.dupe(u8, entry.symbol),
            .name = if (entry.name) |n| try allocator.dupe(u8, n) else null,
            .bucket = built_bucket,
            .sector = if (entry.sector) |s| try allocator.dupe(u8, s) else null,
            .geo = if (entry.geo) |g| try allocator.dupe(u8, g) else null,
            .asset_class = if (entry.asset_class) |a| try allocator.dupe(u8, a) else null,
            .pct = entry.pct,
        });
    }

    return .{
        .entries = try entries.toOwnedSlice(allocator),
        .allocator = allocator,
    };
}

/// Resolve a classification entry to its display bucket. Used by
/// the review tab's Sector column, by `analyzePortfolio`'s sector
/// rollup, and by the observation engine's concentration /
/// dominance checks.
///
/// Four-tier fallback (caller owns the returned slice; allocated
/// via `allocator`):
///   1. `entry.bucket` if set — user-curated, always wins.
///   2. `entry.sector` if set AND doesn't contain '/' — GICS-style
///      sector ("Technology", "Healthcare"). The '/' rules out
///      NPORT-P fund-decomp categories ("Equity / Corporate")
///      that are noise rather than meaningful sectors.
///   3. Composite "<geo> <asset_class>" if both are set. For
///      funds without a curated bucket, this gives a meaningful
///      grouping like "International Developed Fund" or "US ETF".
///   4. Literal "Unclassified".
pub fn deriveBucket(entry: ClassificationEntry, allocator: std.mem.Allocator) ![]const u8 {
    if (entry.bucket) |b| return try allocator.dupe(u8, b);
    if (entry.sector) |s| {
        if (std.mem.indexOfScalar(u8, s, '/') == null) return try allocator.dupe(u8, s);
    }
    if (entry.geo != null and entry.asset_class != null) {
        const g = entry.geo.?;
        const ac = entry.asset_class.?;
        // Avoid duplicate-geo composites like "US US Large Cap".
        // If the asset_class starts with the geo prefix (followed
        // by a space or end-of-string), use it alone. Same for
        // common geographic-noun asset classes that already imply
        // their region ("International Developed", "Emerging
        // Markets") — these don't need a geo prefix.
        const ac_starts_with_geo = std.mem.startsWith(u8, ac, g) and
            (ac.len == g.len or ac[g.len] == ' ');
        const ac_has_implicit_geo = std.mem.startsWith(u8, ac, "International") or
            std.mem.startsWith(u8, ac, "Emerging");
        if (ac_starts_with_geo or ac_has_implicit_geo) {
            return try allocator.dupe(u8, ac);
        }
        return try std.fmt.allocPrint(allocator, "{s} {s}", .{ g, ac });
    }
    return try allocator.dupe(u8, "Unclassified");
}

test "parse classification file" {
    const data =
        \\#!srfv1
        \\# Stock: single sector
        \\symbol::AMZN,name::Amazon,sector::Technology,geo::US,asset_class::US Large Cap
        \\
        \\# Target date fund: blended
        \\symbol::TGT2035,name::Target Retirement 2035,asset_class::US Large Cap,pct:num:55
        \\symbol::TGT2035,name::Target Retirement 2035,asset_class::Bonds,pct:num:15
        \\symbol::TGT2035,name::Target Retirement 2035,asset_class::International Developed,pct:num:20
    ;
    const allocator = std.testing.allocator;
    var cm = try parseClassificationFile(allocator, data);
    defer cm.deinit();

    try std.testing.expectEqual(@as(usize, 4), cm.entries.len);
    try std.testing.expectEqualStrings("AMZN", cm.entries[0].symbol);
    try std.testing.expectEqualStrings("Amazon", cm.entries[0].name.?);
    try std.testing.expectEqualStrings("Technology", cm.entries[0].sector.?);
    try std.testing.expectEqualStrings("US", cm.entries[0].geo.?);
    try std.testing.expectApproxEqAbs(@as(f64, 100.0), cm.entries[0].pct, 0.01);

    try std.testing.expectEqualStrings("TGT2035", cm.entries[1].symbol);
    try std.testing.expectEqualStrings("Target Retirement 2035", cm.entries[1].name.?);
    try std.testing.expectEqualStrings("US Large Cap", cm.entries[1].asset_class.?);
    try std.testing.expectApproxEqAbs(@as(f64, 55.0), cm.entries[1].pct, 0.01);
}

test "parse classification file: missing name field stays null (backwards compat)" {
    // Older metadata.srf files predate the name:: field. Parsing
    // must still succeed; consumers fall back to symbol /
    // display_symbol when name is null.
    const data =
        \\#!srfv1
        \\symbol::AMZN,sector::Technology,geo::US,asset_class::US Large Cap
    ;
    const allocator = std.testing.allocator;
    var cm = try parseClassificationFile(allocator, data);
    defer cm.deinit();

    try std.testing.expectEqual(@as(usize, 1), cm.entries.len);
    try std.testing.expectEqualStrings("AMZN", cm.entries[0].symbol);
    try std.testing.expect(cm.entries[0].name == null);
    // `bucket` is pre-filled by the parser via deriveBucket. For
    // a GICS-style sector ("Technology"), it equals the sector.
    try std.testing.expectEqualStrings("Technology", cm.entries[0].bucket.?);
    try std.testing.expectEqualStrings("Technology", cm.entries[0].sector.?);
}

test "parse classification file: bucket round-trips" {
    const data =
        \\#!srfv1
        \\symbol::SPY,name::SPDR S&P 500 ETF Trust,bucket::US Large Cap,sector::Equity / Corporate,geo::US,asset_class::ETF
    ;
    const allocator = std.testing.allocator;
    var cm = try parseClassificationFile(allocator, data);
    defer cm.deinit();

    try std.testing.expectEqual(@as(usize, 1), cm.entries.len);
    try std.testing.expectEqualStrings("SPY", cm.entries[0].symbol);
    try std.testing.expectEqualStrings("US Large Cap", cm.entries[0].bucket.?);
    try std.testing.expectEqualStrings("Equity / Corporate", cm.entries[0].sector.?);
}

test "deriveBucket: returns user-curated bucket when set" {
    const e: ClassificationEntry = .{
        .symbol = "SPY",
        .bucket = "US Large Cap",
        .sector = "Equity / Corporate", // would otherwise force fallback
        .geo = "US",
        .asset_class = "ETF",
    };
    const out = try deriveBucket(e, std.testing.allocator);
    defer std.testing.allocator.free(out);
    try std.testing.expectEqualStrings("US Large Cap", out);
}

test "deriveBucket: returns sector when GICS-like (no '/')" {
    const e: ClassificationEntry = .{
        .symbol = "AMZN",
        .sector = "Technology",
        .geo = "US",
        .asset_class = "US Large Cap",
    };
    const out = try deriveBucket(e, std.testing.allocator);
    defer std.testing.allocator.free(out);
    try std.testing.expectEqualStrings("Technology", out);
}

test "deriveBucket: composite fallback when sector is NPORT-P mush" {
    const e: ClassificationEntry = .{
        .symbol = "HFXI",
        .sector = "Equity / Corporate",
        .geo = "International Developed",
        .asset_class = "Fund",
    };
    const out = try deriveBucket(e, std.testing.allocator);
    defer std.testing.allocator.free(out);
    try std.testing.expectEqualStrings("International Developed Fund", out);
}

test "deriveBucket: returns Unclassified when nothing usable is set" {
    const e: ClassificationEntry = .{
        .symbol = "UNK",
    };
    const out = try deriveBucket(e, std.testing.allocator);
    defer std.testing.allocator.free(out);
    try std.testing.expectEqualStrings("Unclassified", out);
}

test "deriveBucket: NPORT-P sector with no geo/asset_class falls through to Unclassified" {
    // Defensive: sector is NPORT-P-style (skipped by the GICS
    // filter) AND we don't have both geo and asset_class to
    // build a composite. Falls through to Unclassified.
    const e: ClassificationEntry = .{
        .symbol = "X",
        .sector = "Debt / Corporate",
        .geo = "US",
        // asset_class missing
    };
    const out = try deriveBucket(e, std.testing.allocator);
    defer std.testing.allocator.free(out);
    try std.testing.expectEqualStrings("Unclassified", out);
}

test "deriveBucket: composite avoids duplicate geo when asset_class already starts with it" {
    // Hand-written entries often have geographically-prefixed
    // asset_class values like "US Large Cap" alongside
    // geo="US". The naive composite "{geo} {asset_class}" then
    // produces "US US Large Cap" which is ugly and clusters
    // incorrectly in the breakdown. Detect the duplicate prefix
    // and use the asset_class alone.
    const e: ClassificationEntry = .{
        .symbol = "VOO",
        .geo = "US",
        .asset_class = "US Large Cap",
    };
    const out = try deriveBucket(e, std.testing.allocator);
    defer std.testing.allocator.free(out);
    try std.testing.expectEqualStrings("US Large Cap", out);
}

test "deriveBucket: composite uses asset_class alone for International/Emerging implicit-geo classes" {
    // "International Developed" and "Emerging Markets" are
    // already geographic; the composite shouldn't re-prepend
    // the geo.
    const e1: ClassificationEntry = .{
        .symbol = "VEA",
        .geo = "International Developed",
        .asset_class = "International Developed",
    };
    const out1 = try deriveBucket(e1, std.testing.allocator);
    defer std.testing.allocator.free(out1);
    try std.testing.expectEqualStrings("International Developed", out1);

    const e2: ClassificationEntry = .{
        .symbol = "VWO",
        .geo = "Emerging Markets",
        .asset_class = "Emerging Markets",
    };
    const out2 = try deriveBucket(e2, std.testing.allocator);
    defer std.testing.allocator.free(out2);
    try std.testing.expectEqualStrings("Emerging Markets", out2);
}

test "deriveBucket: composite still prepends geo when asset_class is generic (Fund/ETF/Bonds)" {
    // The whole point of the composite is to disambiguate
    // generic asset_class labels by their geo. Make sure we
    // don't accidentally regress on this case while fixing
    // the duplicate-prefix one.
    const e: ClassificationEntry = .{
        .symbol = "BND",
        .geo = "US",
        .asset_class = "Fund",
    };
    const out = try deriveBucket(e, std.testing.allocator);
    defer std.testing.allocator.free(out);
    try std.testing.expectEqualStrings("US Fund", out);
}

// ── ClassificationRecord ─────────────────────────────────────
//
// Distinct from `ClassificationEntry` above: that one represents
// a row in the user's `metadata.srf` (already-curated portfolio
// data). `ClassificationRecord` is the upstream-fetched
// per-symbol shape that flows OUT of `DataService.getClassification`.
// `enrich` reads it to write the metadata.srf row that becomes
// a `ClassificationEntry` later.
//
// Lives here (not in `providers/Wikidata.zig`) because the shape
// is provider-agnostic: any future classification source (FMP,
// Alpha Vantage, hand-written) populates the same record. The
// fact that today the only producer is Wikidata is incidental.

/// A single fetched classification result for one symbol.
///
/// All optional fields default to `null`; populators only set
/// the fields they have data for. The `source` field always
/// emits per the project's source-pure invariant.
pub const ClassificationRecord = struct {
    symbol: []const u8, // owned
    name: ?[]const u8 = null, // owned
    sector: ?[]const u8 = null, // owned
    industry: ?[]const u8 = null, // owned
    /// ISO-3166 alpha-2 country code (e.g. "US", "GB", "DE").
    country: ?[]const u8 = null, // owned
    /// Resolved geo bucket (e.g. "US", "International Developed",
    /// "Emerging Markets"). Populated either from `geoFor(country)`
    /// or from title-keyword inference (`inferGeoFromTitle`) for
    /// symbols where Wikidata didn't supply a country. Producers
    /// pick one of the `geo.*` constants above; consumers compare
    /// against the same constants.
    geo: ?[]const u8 = null, // owned
    asset_class: ?[]const u8 = null, // owned
    is_etf: bool = false,
    /// YYYY-MM-DD; trimmed from upstream's ISO-8601 date.
    inception_date: ?[]const u8 = null, // owned
    /// Wikidata's P5531 — the SEC CIK as a digit string. Already
    /// zero-padded to 10 digits, matching the project-wide CIK
    /// normalization convention.
    cik: ?[]const u8 = null, // owned
    /// YYYY-MM-DD when this provider ran, NOT when upstream last
    /// updated the underlying entity.
    as_of: []const u8, // owned
    source: []const u8, // no default — provenance always emitted

    pub fn deinit(self: ClassificationRecord, allocator: std.mem.Allocator) void {
        allocator.free(self.symbol);
        if (self.name) |s| allocator.free(s);
        if (self.sector) |s| allocator.free(s);
        if (self.industry) |s| allocator.free(s);
        if (self.country) |s| allocator.free(s);
        if (self.geo) |s| allocator.free(s);
        if (self.asset_class) |s| allocator.free(s);
        if (self.inception_date) |s| allocator.free(s);
        if (self.cik) |s| allocator.free(s);
        allocator.free(self.as_of);
        allocator.free(self.source);
    }

    /// Free a slice of records, calling deinit on each element first.
    pub fn freeSlice(allocator: std.mem.Allocator, recs: []const ClassificationRecord) void {
        for (recs) |r| r.deinit(allocator);
        allocator.free(recs);
    }
};

// ── Geographic taxonomy ──────────────────────────────────────

/// Geo-bucket constants used by the country → geo lookup. Kept
/// as named constants (rather than inline string literals in the
/// map) so callers can reference them without typo risk and the
/// taxonomy is tweakable in one place.
pub const geo = struct {
    pub const us = "US";
    pub const developed = "International Developed";
    pub const emerging = "Emerging Markets";
    pub const unknown = "Unknown";
};

// ── Sector taxonomy ──────────────────────────────────────────

/// Canonical sector taxonomy (GICS-aligned 11-sector model).
/// Producers (Wikidata's `canonicalizeSector`, enrich's
/// `inferSectorFromTitle`) emit one of these strings; consumers
/// (analysis bucketing, display) compare against them.
///
/// Lives here (not in any provider) so multiple producers can
/// share one taxonomy. Adding a 12th sector or renaming an
/// existing one is a one-place edit.
pub const sector = struct {
    pub const technology = "Technology";
    pub const communication_services = "Communication Services";
    pub const consumer_cyclical = "Consumer Cyclical";
    pub const consumer_defensive = "Consumer Defensive";
    pub const healthcare = "Healthcare";
    pub const financial_services = "Financial Services";
    pub const energy = "Energy";
    pub const industrials = "Industrials";
    pub const basic_materials = "Basic Materials";
    pub const real_estate = "Real Estate";
    pub const utilities = "Utilities";
};

/// Country-code-to-geo-bucket lookup. Producers (Wikidata today,
/// others tomorrow) hand us ISO-3166 alpha-2 codes via the
/// `ClassificationRecord.country` field; we map them to the geo
/// taxonomy (`geo.us` / `geo.developed` / `geo.emerging` /
/// `geo.unknown`).
///
/// MSCI conventions used as the developed/emerging split. Taiwan
/// and South Korea are MSCI-emerging despite FTSE classifying
/// them developed. Israel is MSCI-developed (upgraded 2010).
/// Canada is folded into International Developed (some users
/// prefer separate Canada bucket; override in `metadata.srf`).
const country_to_geo = std.StaticStringMap([]const u8).initComptime(.{
    // United States
    .{ "US", geo.us },
    // Alpha-3 fallback for entries that use the longer form.
    .{ "USA", geo.us },

    // International Developed — Europe ex-CIS
    .{ "GB", geo.developed },
    .{ "DE", geo.developed },
    .{ "FR", geo.developed },
    .{ "NL", geo.developed },
    .{ "CH", geo.developed },
    .{ "SE", geo.developed },
    .{ "DK", geo.developed },
    .{ "NO", geo.developed },
    .{ "FI", geo.developed },
    .{ "IT", geo.developed },
    .{ "ES", geo.developed },
    .{ "BE", geo.developed },
    .{ "AT", geo.developed },
    .{ "IE", geo.developed },
    .{ "LU", geo.developed },
    .{ "PT", geo.developed },
    .{ "GR", geo.developed },
    .{ "IS", geo.developed },

    // International Developed — Asia-Pacific + Israel + Canada
    .{ "JP", geo.developed },
    .{ "AU", geo.developed },
    .{ "NZ", geo.developed },
    .{ "SG", geo.developed },
    .{ "HK", geo.developed },
    .{ "IL", geo.developed },
    .{ "CA", geo.developed },

    // Emerging Markets (MSCI)
    .{ "CN", geo.emerging },
    .{ "TW", geo.emerging },
    .{ "KR", geo.emerging },
    .{ "IN", geo.emerging },
    .{ "BR", geo.emerging },
    .{ "MX", geo.emerging },
    .{ "RU", geo.emerging },
    .{ "TR", geo.emerging },
    .{ "ZA", geo.emerging },
    .{ "TH", geo.emerging },
    .{ "MY", geo.emerging },
    .{ "ID", geo.emerging },
    .{ "PH", geo.emerging },
    .{ "VN", geo.emerging },
    .{ "AR", geo.emerging },
    .{ "CL", geo.emerging },
    .{ "CO", geo.emerging },
    .{ "PE", geo.emerging },
    .{ "EG", geo.emerging },
});

/// Map an ISO-3166 alpha-2 country code to one of the geo
/// buckets. Null/empty input or an unknown code returns
/// `geo.unknown` so the user can override in `metadata.srf`.
pub fn geoFor(iso2: ?[]const u8) []const u8 {
    const code = iso2 orelse return geo.unknown;
    if (code.len == 0) return geo.unknown;
    return country_to_geo.get(code) orelse geo.unknown;
}

test "geoFor maps known ISO-3166 codes to bucket" {
    try std.testing.expectEqualStrings(geo.us, geoFor("US"));
    try std.testing.expectEqualStrings(geo.us, geoFor("USA"));
    try std.testing.expectEqualStrings(geo.developed, geoFor("GB"));
    try std.testing.expectEqualStrings(geo.developed, geoFor("DE"));
    try std.testing.expectEqualStrings(geo.developed, geoFor("CA"));
    try std.testing.expectEqualStrings(geo.developed, geoFor("IL"));
    try std.testing.expectEqualStrings(geo.emerging, geoFor("CN"));
    try std.testing.expectEqualStrings(geo.emerging, geoFor("TW"));
    try std.testing.expectEqualStrings(geo.emerging, geoFor("KR"));
}

test "geoFor returns Unknown for null/empty/unmapped" {
    try std.testing.expectEqualStrings(geo.unknown, geoFor(null));
    try std.testing.expectEqualStrings(geo.unknown, geoFor(""));
    try std.testing.expectEqualStrings(geo.unknown, geoFor("ZZ")); // unassigned ISO-2
    try std.testing.expectEqualStrings(geo.unknown, geoFor("XX"));
}

test "geo bucket labels are stable strings (not byte copies)" {
    // Callers stash these in HashMap keys without duping.
    // Verify the literal-pointer property holds across calls.
    try std.testing.expectEqual(@intFromPtr(geo.us.ptr), @intFromPtr(geoFor("US").ptr));
    try std.testing.expectEqual(@intFromPtr(geo.developed.ptr), @intFromPtr(geoFor("GB").ptr));
    try std.testing.expectEqual(@intFromPtr(geo.emerging.ptr), @intFromPtr(geoFor("CN").ptr));
    try std.testing.expectEqual(@intFromPtr(geo.unknown.ptr), @intFromPtr(geoFor(null).ptr));
}

// ── Title-keyword inference ──────────────────────────────────
//
// Pure functions over a fund/security title string. Used by
// `service.getClassification` to populate the sector / geo of a
// `ClassificationRecord` when Wikidata didn't carry one and the
// EDGAR ticker-map fallback fired. Lives here (not in any
// provider) because the inference is provider-agnostic and
// shares the canonical sector/geo taxonomy declared above.

fn titleContainsAny(haystack: []const u8, needles: []const []const u8) bool {
    for (needles) |needle| {
        if (std.mem.indexOf(u8, haystack, needle) != null) return true;
    }
    return false;
}

/// Lowercase the title into a stack buffer for case-insensitive
/// keyword matching. Truncates titles longer than the buffer
/// (returns null) — real fund names easily fit in 256 bytes.
fn lowercaseTitle(buf: []u8, title: []const u8) ?[]const u8 {
    if (title.len > buf.len) return null;
    return std.ascii.lowerString(buf[0..title.len], title);
}

/// Infer a GICS sector from a fund's title. Returns null when
/// no unambiguous keyword match — caller falls back to whatever
/// sector data the upstream source provided (typically null).
///
/// Conservative keyword set: matches only words that map
/// unambiguously to a single GICS sector. "Income" / "Dividend"
/// / "Value" / "Growth" / "Momentum" / "Total" / "Equal Weight"
/// / "International" / "Emerging" don't appear here — they
/// describe the screening methodology or geo, not the sector.
///
/// Reuses the `sector` constants above so the inference taxonomy
/// stays in lock-step with the canonicalizer.
pub fn inferSectorFromTitle(title: ?[]const u8) ?[]const u8 {
    const t = title orelse return null;
    if (t.len == 0) return null;

    var buf: [256]u8 = undefined;
    const lc = lowercaseTitle(&buf, t) orelse return null;

    // Order matters: more-specific keywords come first within
    // each sector. "Health care" before "care" (irrelevant
    // example), "semiconductor" before generic "tech" (which we
    // don't include — too broad).

    // Healthcare. "Health care" with space (XLV title), "healthcare"
    // (one word), "biotech", "pharmaceutical".
    if (titleContainsAny(lc, &.{ "health care", "healthcare", "biotech", "pharmaceutical", "medical" })) {
        return sector.healthcare;
    }

    // Technology. Specific terms only — "tech" alone is too
    // broad (matches "biotech", "fintech", "edtech" — all
    // sector-mixing).
    if (titleContainsAny(lc, &.{ "semiconductor", "software", "cloud computing", "internet" })) {
        return sector.technology;
    }

    // Financial Services. "Financial" is fairly specific in
    // fund-name conventions ("Financial Select Sector SPDR",
    // "Vanguard Financials ETF").
    if (titleContainsAny(lc, &.{ "financial", "bank" })) {
        return sector.financial_services;
    }

    // Energy. "Energy" alone is mostly unambiguous in fund
    // conventions; pair with "oil" / "gas" for redundancy.
    if (titleContainsAny(lc, &.{ "energy", "oil & gas", "oil and gas", "petroleum" })) {
        return sector.energy;
    }

    // Real Estate / REITs.
    if (titleContainsAny(lc, &.{ "real estate", "reit" })) {
        return sector.real_estate;
    }

    // Utilities. "Utilities" alone is unambiguous.
    if (titleContainsAny(lc, &.{"utilities"})) {
        return sector.utilities;
    }

    // Consumer Discretionary / Cyclical. Match the explicit
    // labels — "consumer" alone is ambiguous (could be
    // discretionary or staples).
    if (titleContainsAny(lc, &.{ "consumer discretionary", "consumer cyclical" })) {
        return sector.consumer_cyclical;
    }

    // Consumer Staples / Defensive.
    if (titleContainsAny(lc, &.{ "consumer staples", "consumer defensive" })) {
        return sector.consumer_defensive;
    }

    // Industrials. "Industrial" is more reliable than
    // "industrials" because some fund names use the singular
    // ("Industrial Select Sector SPDR").
    if (titleContainsAny(lc, &.{ "industrial", "aerospace", "defense" })) {
        return sector.industrials;
    }

    // Basic Materials.
    if (titleContainsAny(lc, &.{ "materials", "mining", "miners", "metals" })) {
        return sector.basic_materials;
    }

    // Communication Services. "Communication" / "Telecom"
    // unambiguous.
    if (titleContainsAny(lc, &.{ "communication", "telecom", "media" })) {
        return sector.communication_services;
    }

    return null;
}

/// Infer a geo bucket from a fund's title. Returns null when
/// the title doesn't carry an unambiguous international/emerging
/// keyword — caller keeps whatever default they have (typically
/// US for SEC-filed funds).
///
/// More important than sector inference: a default `geo::US` is
/// *factually wrong* for international funds (FRDM holds
/// emerging-market equities, not US), so this fix tightens
/// portfolio-level geographic-exposure reporting.
pub fn inferGeoFromTitle(title: ?[]const u8) ?[]const u8 {
    const t = title orelse return null;
    if (t.len == 0) return null;

    var buf: [256]u8 = undefined;
    const lc = lowercaseTitle(&buf, t) orelse return null;

    // Emerging markets first — most specific. "Emerging" alone
    // is rare in non-EM contexts in fund-name conventions.
    // "Frontier" likewise is conventionally only used for
    // frontier markets in fund titles.
    if (titleContainsAny(lc, &.{ "emerging market", "emerging markets", "frontier market", "frontier markets", "frontier" })) {
        return geo.emerging;
    }

    // International Developed. "International" / "Intl" /
    // "ex-US" / "World ex US" / "Developed Markets" /
    // specific developed-market regions.
    //
    // False-positive risk: a hypothetical "Vanguard Total
    // International + US Equity Fund" would mis-tag here. None
    // of the user's current portfolio holds such a hybrid
    // fund. If one ever shows up, it'll get flagged in the
    // diff-against-old-metadata.srf review and can be
    // hand-corrected.
    if (titleContainsAny(lc, &.{ "international", " intl", "ex-us", "ex us", "world ex", "developed market", "developed markets" })) {
        return geo.developed;
    }

    return null;
}

test "inferSectorFromTitle: null/empty -> null" {
    try std.testing.expectEqual(@as(?[]const u8, null), inferSectorFromTitle(null));
    try std.testing.expectEqual(@as(?[]const u8, null), inferSectorFromTitle(""));
}

test "inferSectorFromTitle: technology keywords" {
    try std.testing.expectEqual(@as(?[]const u8, sector.technology), inferSectorFromTitle("iShares Semiconductor ETF"));
    try std.testing.expectEqual(@as(?[]const u8, sector.technology), inferSectorFromTitle("Vanguard Software ETF"));
}

test "inferSectorFromTitle: healthcare keywords" {
    try std.testing.expectEqual(@as(?[]const u8, sector.healthcare), inferSectorFromTitle("Health Care Select Sector SPDR"));
    try std.testing.expectEqual(@as(?[]const u8, sector.healthcare), inferSectorFromTitle("iShares Biotech ETF"));
}

test "inferSectorFromTitle: ambiguous title -> null" {
    try std.testing.expectEqual(@as(?[]const u8, null), inferSectorFromTitle("Vanguard Total Stock Market"));
    try std.testing.expectEqual(@as(?[]const u8, null), inferSectorFromTitle("SPDR S&P 500"));
}

test "inferGeoFromTitle: null/empty -> null" {
    try std.testing.expectEqual(@as(?[]const u8, null), inferGeoFromTitle(null));
    try std.testing.expectEqual(@as(?[]const u8, null), inferGeoFromTitle(""));
}

test "inferGeoFromTitle: emerging markets" {
    try std.testing.expectEqual(@as(?[]const u8, geo.emerging), inferGeoFromTitle("Freedom 100 Emerging Markets ETF"));
    try std.testing.expectEqual(@as(?[]const u8, geo.emerging), inferGeoFromTitle("iShares Frontier Markets"));
}

test "inferGeoFromTitle: international developed" {
    try std.testing.expectEqual(@as(?[]const u8, geo.developed), inferGeoFromTitle("Vanguard FTSE Developed Markets"));
    try std.testing.expectEqual(@as(?[]const u8, geo.developed), inferGeoFromTitle("iShares MSCI EAFE International"));
    try std.testing.expectEqual(@as(?[]const u8, geo.developed), inferGeoFromTitle("Vanguard Total World ex-US"));
}

test "inferGeoFromTitle: no match -> null" {
    try std.testing.expectEqual(@as(?[]const u8, null), inferGeoFromTitle("SPDR S&P 500"));
    try std.testing.expectEqual(@as(?[]const u8, null), inferGeoFromTitle("iShares Semiconductor ETF"));
}