zfin/src/models/classification.zig

735 lines
31 KiB
Zig

/// Classification metadata for portfolio analysis.
///
/// Each entry maps a symbol to one or more asset class / sector / geographic allocations.
/// For individual stocks, there's typically one entry at 100%.
/// For blended funds (e.g., target date), there can be multiple entries that sum to ~100%.
///
/// Loaded from a metadata SRF file like `metadata.srf`:
/// symbol::AMZN,sector::Technology,geo::US,asset_class::US Large Cap
/// symbol::02315N600,asset_class::US Large Cap,pct:num:55
/// symbol::02315N600,asset_class::International Developed,pct:num:20
/// symbol::02315N600,asset_class::Bonds,pct:num:15
const std = @import("std");
const srf = @import("srf");
/// A single classification entry for a symbol.
///
/// One entry corresponds to one record of the metadata SRF file. A
/// symbol may appear in several entries (e.g. a blended fund), each
/// carrying its own `pct` weight.
///
/// When entries are held by a `ClassificationMap`, every string field
/// (including the optional ones) is freed by `ClassificationMap.deinit`.
pub const ClassificationEntry = struct {
/// Identifier the entry applies to (e.g. "AMZN", "02315N600").
/// The only field without a default, so it must always be supplied.
symbol: []const u8,
/// Human-readable security name (e.g., "Amazon", "SPDR S&P 500
/// ETF Trust"). Optional — older metadata.srf files may not
/// have this field. Renderers fall back to `symbol` /
/// `display_symbol` when null.
name: ?[]const u8 = null,
/// User-curated grouping label that overrides the auto-derived
/// bucket for concentration / dominance checks and the
/// analysis tab's Sector breakdown. Use this when the upstream
/// `sector` field is the NPORT-P "Equity / Corporate" mush
/// that doesn't actually distinguish your holdings (e.g. SPY
/// vs FRDM vs HFXI all tagged the same way). When null,
/// `deriveBucket` falls back to a sensible default.
///
/// Entries returned by `parseClassificationFile` always carry a
/// non-null bucket: the parser pre-fills it via `deriveBucket`
/// when the record did not supply one.
bucket: ?[]const u8 = null,
/// Sector (e.g., "Technology", "Healthcare", "Financials").
/// `deriveBucket` ignores values containing '/' (NPORT-P style
/// categories such as "Equity / Corporate").
sector: ?[]const u8 = null,
/// Geographic region (e.g., "US", "International Developed", "Emerging Markets")
geo: ?[]const u8 = null,
/// Asset class (e.g., "US Large Cap", "Bonds", "Cash")
asset_class: ?[]const u8 = null,
/// Percentage weight for this entry (0-100). Default 100 for single-class assets.
/// For a symbol split across several entries the weights sum to roughly 100.
pct: f64 = 100.0,
};
/// Owning container for every classification entry parsed from a
/// metadata file.
///
/// `entries` and all strings reachable from it were allocated with
/// `allocator`; call `deinit` exactly once to release them.
pub const ClassificationMap = struct {
    entries: []ClassificationEntry,
    allocator: std.mem.Allocator,

    /// Release every string owned by each entry, then the entry slice
    /// itself. The map must not be used afterwards.
    pub fn deinit(self: *ClassificationMap) void {
        const gpa = self.allocator;
        for (self.entries) |entry| {
            gpa.free(entry.symbol);
            // The remaining string fields are optional; free only those present.
            const optional_strings = [_]?[]const u8{
                entry.name,
                entry.bucket,
                entry.sector,
                entry.geo,
                entry.asset_class,
            };
            for (optional_strings) |maybe_text| {
                if (maybe_text) |text| gpa.free(text);
            }
        }
        gpa.free(self.entries);
    }
};
/// Parse the contents of a metadata SRF file into a `ClassificationMap`.
///
/// Each record has the shape
/// `symbol::<SYM>,name::<N>,bucket::<B>,sector::<S>,geo::<G>,asset_class::<A>,pct:num:<P>`.
/// All fields except `symbol` are optional; `pct` defaults to 100.
///
/// Records that cannot be converted into a `ClassificationEntry` are
/// skipped (best-effort) rather than failing the whole parse.
///
/// Every string in the result is duplicated with `allocator`, so the
/// returned map does not reference `data`. The caller owns the map and
/// must call `ClassificationMap.deinit`.
///
/// Errors: `error.InvalidData` when the SRF iterator cannot be created;
/// errors from advancing the iterator and allocation failures propagate.
/// On any error every allocation made so far is released.
pub fn parseClassificationFile(allocator: std.mem.Allocator, data: []const u8) !ClassificationMap {
    var entries = std.ArrayList(ClassificationEntry).empty;
    // Releases entries that were already appended. Strings belonging to
    // the record currently being built are covered by the per-record
    // `errdefer`s inside the loop.
    errdefer {
        for (entries.items) |e| {
            allocator.free(e.symbol);
            if (e.name) |n| allocator.free(n);
            if (e.bucket) |b| allocator.free(b);
            if (e.sector) |s| allocator.free(s);
            if (e.geo) |g| allocator.free(g);
            if (e.asset_class) |a| allocator.free(a);
        }
        entries.deinit(allocator);
    }

    var reader = std.Io.Reader.fixed(data);
    // NOTE(review): with `.parse_allocator = .none` the parsed strings
    // presumably alias `data`; either way each one is duplicated below.
    var it = srf.iterator(&reader, allocator, .{ .parse_allocator = .none }) catch return error.InvalidData;
    defer it.deinit();

    while (try it.next()) |fields| {
        const entry = fields.to(ClassificationEntry, .{}) catch continue;

        // Duplicate field by field, registering an `errdefer` after each
        // allocation so a failure part-way through (or in `append`) does
        // not leak the copies already made for this record.
        const symbol_copy = try allocator.dupe(u8, entry.symbol);
        errdefer allocator.free(symbol_copy);

        const name_copy: ?[]const u8 = if (entry.name) |n| try allocator.dupe(u8, n) else null;
        errdefer if (name_copy) |n| allocator.free(n);

        // Pre-fill `bucket` so downstream code can read `entry.bucket`
        // directly without juggling allocator parameters. `deriveBucket`
        // returns a copy of the curated bucket when one is set, so it
        // covers both the curated and the derived case.
        const bucket_copy = try deriveBucket(entry, allocator);
        errdefer allocator.free(bucket_copy);

        const sector_copy: ?[]const u8 = if (entry.sector) |s| try allocator.dupe(u8, s) else null;
        errdefer if (sector_copy) |s| allocator.free(s);

        const geo_copy: ?[]const u8 = if (entry.geo) |g| try allocator.dupe(u8, g) else null;
        errdefer if (geo_copy) |g| allocator.free(g);

        const asset_class_copy: ?[]const u8 = if (entry.asset_class) |a| try allocator.dupe(u8, a) else null;
        errdefer if (asset_class_copy) |a| allocator.free(a);

        // Must stay the last fallible statement of the iteration: once
        // the entry is in the list, the function-level `errdefer` owns it.
        try entries.append(allocator, .{
            .symbol = symbol_copy,
            .name = name_copy,
            .bucket = bucket_copy,
            .sector = sector_copy,
            .geo = geo_copy,
            .asset_class = asset_class_copy,
            .pct = entry.pct,
        });
    }

    return .{
        .entries = try entries.toOwnedSlice(allocator),
        .allocator = allocator,
    };
}
/// Compute the display bucket for a classification entry. Used by
/// the review tab's Sector column, by `analyzePortfolio`'s sector
/// rollup, and by the observation engine's concentration /
/// dominance checks.
///
/// The result is always freshly allocated with `allocator`; the
/// caller owns it and must free it. Resolution order:
///   1. `entry.bucket`, when set — the user-curated label always wins.
///   2. `entry.sector`, when set and free of '/' — a GICS-style
///      sector ("Technology", "Healthcare"). A '/' marks NPORT-P
///      fund-decomposition categories ("Equity / Corporate"), which
///      are noise rather than meaningful sectors.
///   3. When both `geo` and `asset_class` are set: the asset class
///      alone if it already names its region, otherwise the composite
///      "<geo> <asset_class>" (e.g. "International Developed Fund",
///      "US ETF").
///   4. The literal "Unclassified".
pub fn deriveBucket(entry: ClassificationEntry, allocator: std.mem.Allocator) ![]const u8 {
    // Tier 1: curated bucket.
    if (entry.bucket) |curated| return try allocator.dupe(u8, curated);

    // Tier 2: sector, unless it is an NPORT-P style "A / B" category.
    if (entry.sector) |sector_name| {
        const is_nport_category = std.mem.indexOfScalar(u8, sector_name, '/') != null;
        if (!is_nport_category) return try allocator.dupe(u8, sector_name);
    }

    // Tier 3: geo + asset class.
    if (entry.geo) |region| {
        if (entry.asset_class) |class_name| {
            // "US" + "US Large Cap" must not become "US US Large Cap":
            // the asset class repeats the region when it starts with it
            // and the prefix ends at end-of-string or a space.
            const repeats_region = blk: {
                if (!std.mem.startsWith(u8, class_name, region)) break :blk false;
                break :blk class_name.len == region.len or class_name[region.len] == ' ';
            };
            // "International ..." / "Emerging ..." asset classes already
            // imply their region and need no geo prefix.
            const implies_region = std.mem.startsWith(u8, class_name, "International") or
                std.mem.startsWith(u8, class_name, "Emerging");

            if (!repeats_region and !implies_region) {
                return try std.fmt.allocPrint(allocator, "{s} {s}", .{ region, class_name });
            }
            return try allocator.dupe(u8, class_name);
        }
    }

    // Tier 4: nothing usable.
    return try allocator.dupe(u8, "Unclassified");
}
test "parse classification file" {
    const testing = std.testing;
    const srf_text =
        \\#!srfv1
        \\# Stock: single sector
        \\symbol::AMZN,name::Amazon,sector::Technology,geo::US,asset_class::US Large Cap
        \\
        \\# Target date fund: blended
        \\symbol::TGT2035,name::Target Retirement 2035,asset_class::US Large Cap,pct:num:55
        \\symbol::TGT2035,name::Target Retirement 2035,asset_class::Bonds,pct:num:15
        \\symbol::TGT2035,name::Target Retirement 2035,asset_class::International Developed,pct:num:20
    ;

    var parsed = try parseClassificationFile(testing.allocator, srf_text);
    defer parsed.deinit();

    const rows = parsed.entries;
    try testing.expectEqual(@as(usize, 4), rows.len);

    // Single-class stock: all fields present, weight defaults to 100.
    const stock = rows[0];
    try testing.expectEqualStrings("AMZN", stock.symbol);
    try testing.expectEqualStrings("Amazon", stock.name.?);
    try testing.expectEqualStrings("Technology", stock.sector.?);
    try testing.expectEqualStrings("US", stock.geo.?);
    try testing.expectApproxEqAbs(@as(f64, 100.0), stock.pct, 0.01);

    // First slice of the blended fund carries its explicit weight.
    const first_slice = rows[1];
    try testing.expectEqualStrings("TGT2035", first_slice.symbol);
    try testing.expectEqualStrings("Target Retirement 2035", first_slice.name.?);
    try testing.expectEqualStrings("US Large Cap", first_slice.asset_class.?);
    try testing.expectApproxEqAbs(@as(f64, 55.0), first_slice.pct, 0.01);
}
test "parse classification file: missing name field stays null (backwards compat)" {
    // Metadata files written before the name:: field existed must
    // still parse; consumers then fall back to symbol /
    // display_symbol because `name` is null.
    const testing = std.testing;
    const srf_text =
        \\#!srfv1
        \\symbol::AMZN,sector::Technology,geo::US,asset_class::US Large Cap
    ;

    var parsed = try parseClassificationFile(testing.allocator, srf_text);
    defer parsed.deinit();

    try testing.expectEqual(@as(usize, 1), parsed.entries.len);
    const row = parsed.entries[0];
    try testing.expectEqualStrings("AMZN", row.symbol);
    try testing.expect(row.name == null);
    // The parser pre-fills `bucket` through deriveBucket; with a
    // GICS-style sector ("Technology") the bucket equals the sector.
    try testing.expectEqualStrings("Technology", row.bucket.?);
    try testing.expectEqualStrings("Technology", row.sector.?);
}
test "parse classification file: bucket round-trips" {
    const testing = std.testing;
    const srf_text =
        \\#!srfv1
        \\symbol::SPY,name::SPDR S&P 500 ETF Trust,bucket::US Large Cap,sector::Equity / Corporate,geo::US,asset_class::ETF
    ;

    var parsed = try parseClassificationFile(testing.allocator, srf_text);
    defer parsed.deinit();

    try testing.expectEqual(@as(usize, 1), parsed.entries.len);
    // The curated bucket survives untouched next to the raw sector.
    const row = parsed.entries[0];
    try testing.expectEqualStrings("SPY", row.symbol);
    try testing.expectEqualStrings("US Large Cap", row.bucket.?);
    try testing.expectEqualStrings("Equity / Corporate", row.sector.?);
}
test "deriveBucket: returns user-curated bucket when set" {
    const gpa = std.testing.allocator;
    // The NPORT-P sector would otherwise force the composite fallback;
    // the curated bucket must take precedence over it.
    const bucket = try deriveBucket(.{
        .symbol = "SPY",
        .bucket = "US Large Cap",
        .sector = "Equity / Corporate",
        .geo = "US",
        .asset_class = "ETF",
    }, gpa);
    defer gpa.free(bucket);

    try std.testing.expectEqualStrings("US Large Cap", bucket);
}
test "deriveBucket: returns sector when GICS-like (no '/')" {
    const gpa = std.testing.allocator;
    const bucket = try deriveBucket(.{
        .symbol = "AMZN",
        .sector = "Technology",
        .geo = "US",
        .asset_class = "US Large Cap",
    }, gpa);
    defer gpa.free(bucket);

    try std.testing.expectEqualStrings("Technology", bucket);
}
test "deriveBucket: composite fallback when sector is NPORT-P mush" {
    const gpa = std.testing.allocator;
    const bucket = try deriveBucket(.{
        .symbol = "HFXI",
        .sector = "Equity / Corporate",
        .geo = "International Developed",
        .asset_class = "Fund",
    }, gpa);
    defer gpa.free(bucket);

    try std.testing.expectEqualStrings("International Developed Fund", bucket);
}
test "deriveBucket: returns Unclassified when nothing usable is set" {
    const gpa = std.testing.allocator;
    // Only the mandatory symbol is provided.
    const bucket = try deriveBucket(.{ .symbol = "UNK" }, gpa);
    defer gpa.free(bucket);

    try std.testing.expectEqualStrings("Unclassified", bucket);
}
test "deriveBucket: NPORT-P sector with no geo/asset_class falls through to Unclassified" {
    // Defensive case: the sector is NPORT-P style (rejected by the
    // GICS filter) and `asset_class` is absent, so no composite can
    // be built. The result must be "Unclassified".
    const gpa = std.testing.allocator;
    const bucket = try deriveBucket(.{
        .symbol = "X",
        .sector = "Debt / Corporate",
        .geo = "US",
    }, gpa);
    defer gpa.free(bucket);

    try std.testing.expectEqualStrings("Unclassified", bucket);
}
test "deriveBucket: composite avoids duplicate geo when asset_class already starts with it" {
    // Hand-written entries often pair geo="US" with an asset_class
    // that is already prefixed with the region, e.g. "US Large Cap".
    // A naive "{geo} {asset_class}" composite would yield
    // "US US Large Cap", which is ugly and clusters incorrectly in
    // the breakdown. The duplicate prefix must be detected and the
    // asset_class used on its own.
    const gpa = std.testing.allocator;
    const bucket = try deriveBucket(.{
        .symbol = "VOO",
        .geo = "US",
        .asset_class = "US Large Cap",
    }, gpa);
    defer gpa.free(bucket);

    try std.testing.expectEqualStrings("US Large Cap", bucket);
}
test "deriveBucket: composite uses asset_class alone for International/Emerging implicit-geo classes" {
    // "International Developed" and "Emerging Markets" are already
    // geographic labels, so the composite must not prepend the geo
    // a second time.
    const gpa = std.testing.allocator;

    const developed = try deriveBucket(.{
        .symbol = "VEA",
        .geo = "International Developed",
        .asset_class = "International Developed",
    }, gpa);
    defer gpa.free(developed);
    try std.testing.expectEqualStrings("International Developed", developed);

    const emerging = try deriveBucket(.{
        .symbol = "VWO",
        .geo = "Emerging Markets",
        .asset_class = "Emerging Markets",
    }, gpa);
    defer gpa.free(emerging);
    try std.testing.expectEqualStrings("Emerging Markets", emerging);
}
test "deriveBucket: composite still prepends geo when asset_class is generic (Fund/ETF/Bonds)" {
    // Disambiguating generic asset_class labels by their geo is the
    // reason the composite exists; guard against regressing that
    // behavior while handling the duplicate-prefix case.
    const gpa = std.testing.allocator;
    const bucket = try deriveBucket(.{
        .symbol = "BND",
        .geo = "US",
        .asset_class = "Fund",
    }, gpa);
    defer gpa.free(bucket);

    try std.testing.expectEqualStrings("US Fund", bucket);
}
// ── ClassificationRecord ─────────────────────────────────────
//
// Distinct from `ClassificationEntry` above: that one represents
// a row in the user's `metadata.srf` (already-curated portfolio
// data). `ClassificationRecord` is the upstream-fetched
// per-symbol shape that flows OUT of `DataService.getClassification`.
// `enrich` reads it to write the metadata.srf row that becomes
// a `ClassificationEntry` later.
//
// Lives here (not in `providers/Wikidata.zig`) because the shape
// is provider-agnostic: any future classification source (FMP,
// Alpha Vantage, hand-written) populates the same record. The
// fact that today the only producer is Wikidata is incidental.
/// One fetched classification result for a single symbol.
///
/// Optional fields default to `null`; a populator sets only the
/// fields it has data for. `symbol`, `as_of` and `source` have no
/// default and must always be supplied. Every string field is owned
/// by the record and released by `deinit`.
pub const ClassificationRecord = struct {
    symbol: []const u8, // owned
    name: ?[]const u8 = null, // owned
    sector: ?[]const u8 = null, // owned
    industry: ?[]const u8 = null, // owned
    /// ISO-3166 alpha-2 country code (e.g. "US", "GB", "DE").
    country: ?[]const u8 = null, // owned
    /// Resolved geo bucket (e.g. "US", "International Developed",
    /// "Emerging Markets"). Populated either from `geoFor(country)`
    /// or from title-keyword inference (`inferGeoFromTitle`) for
    /// symbols where Wikidata didn't supply a country. Producers
    /// pick one of the `geo.*` constants declared in this file;
    /// consumers compare against the same constants.
    geo: ?[]const u8 = null, // owned
    asset_class: ?[]const u8 = null, // owned
    is_etf: bool = false,
    /// YYYY-MM-DD; trimmed from upstream's ISO-8601 date.
    inception_date: ?[]const u8 = null, // owned
    /// Wikidata's P5531 — the SEC CIK as a digit string. Already
    /// zero-padded to 10 digits, matching the project-wide CIK
    /// normalization convention.
    cik: ?[]const u8 = null, // owned
    /// YYYY-MM-DD when this provider ran, NOT when upstream last
    /// updated the underlying entity.
    as_of: []const u8, // owned
    /// Provenance label. Deliberately has no default so it is always
    /// emitted (the project's source-pure invariant).
    source: []const u8, // owned

    /// Free every string owned by this record using `allocator`,
    /// which must be the allocator the strings were created with.
    pub fn deinit(self: ClassificationRecord, allocator: std.mem.Allocator) void {
        allocator.free(self.symbol);

        // Optional strings: release only the ones that are present.
        const optional_strings = [_]?[]const u8{
            self.name,
            self.sector,
            self.industry,
            self.country,
            self.geo,
            self.asset_class,
            self.inception_date,
            self.cik,
        };
        for (optional_strings) |maybe_text| {
            if (maybe_text) |text| allocator.free(text);
        }

        allocator.free(self.as_of);
        allocator.free(self.source);
    }

    /// Deinit every record in `recs`, then free the slice itself.
    pub fn freeSlice(allocator: std.mem.Allocator, recs: []const ClassificationRecord) void {
        for (recs) |rec| {
            rec.deinit(allocator);
        }
        allocator.free(recs);
    }
};
// ── Geographic taxonomy ──────────────────────────────────────
/// Geo-bucket constants used by the country → geo lookup. Kept
/// as named constants (rather than inline string literals in the
/// map) so callers can reference them without typo risk and the
/// taxonomy is tweakable in one place.
///
/// `geoFor` returns one of these four values; `inferGeoFromTitle`
/// returns `emerging`, `developed`, or null.
pub const geo = struct {
/// United States.
pub const us = "US";
/// Developed markets outside the United States.
pub const developed = "International Developed";
/// Emerging markets. `inferGeoFromTitle` also returns this for
/// titles containing "frontier".
pub const emerging = "Emerging Markets";
/// Fallback that `geoFor` returns for null, empty, or unmapped codes.
pub const unknown = "Unknown";
};
// ── Sector taxonomy ──────────────────────────────────────────
/// Canonical sector taxonomy (GICS-aligned 11-sector model).
/// Producers (Wikidata's `canonicalizeSector`, and
/// `inferSectorFromTitle` in this file) emit one of these strings;
/// consumers (analysis bucketing, display) compare against them.
///
/// Lives here (not in any provider) so multiple producers can
/// share one taxonomy. Adding a 12th sector or renaming an
/// existing one is a one-place edit.
///
/// NOTE(review): several labels ("Consumer Cyclical", "Consumer
/// Defensive", "Financial Services", "Basic Materials") differ from
/// the official GICS sector names — confirm the intended naming
/// source before relying on exact GICS string matches.
pub const sector = struct {
pub const technology = "Technology";
pub const communication_services = "Communication Services";
/// Returned by `inferSectorFromTitle` for "consumer discretionary" /
/// "consumer cyclical" titles.
pub const consumer_cyclical = "Consumer Cyclical";
/// Returned by `inferSectorFromTitle` for "consumer staples" /
/// "consumer defensive" titles.
pub const consumer_defensive = "Consumer Defensive";
pub const healthcare = "Healthcare";
pub const financial_services = "Financial Services";
pub const energy = "Energy";
pub const industrials = "Industrials";
pub const basic_materials = "Basic Materials";
pub const real_estate = "Real Estate";
pub const utilities = "Utilities";
};
/// Country-code-to-geo-bucket lookup. Producers (Wikidata today,
/// others tomorrow) hand us ISO-3166 alpha-2 codes via the
/// `ClassificationRecord.country` field; we map them to the geo
/// taxonomy (`geo.us` / `geo.developed` / `geo.emerging`). Codes
/// that are not listed here resolve to `geo.unknown` in `geoFor`.
///
/// Keys are matched exactly as written (upper-case). Besides the
/// alpha-2 codes, the alpha-3 form "USA" is accepted as well.
///
/// MSCI conventions used as the developed/emerging split. Taiwan
/// and South Korea are MSCI-emerging despite FTSE classifying
/// them developed. Israel is MSCI-developed (upgraded 2010).
/// Canada is folded into International Developed (some users
/// prefer separate Canada bucket; override in `metadata.srf`).
///
/// NOTE(review): GR and IS are listed as developed and RU / AR as
/// emerging; confirm these against the current MSCI market
/// classification, which has changed for some of them over time.
const country_to_geo = std.StaticStringMap([]const u8).initComptime(.{
// United States
.{ "US", geo.us },
// Alpha-3 fallback for entries that use the longer form.
.{ "USA", geo.us },
// International Developed — Europe ex-CIS
.{ "GB", geo.developed },
.{ "DE", geo.developed },
.{ "FR", geo.developed },
.{ "NL", geo.developed },
.{ "CH", geo.developed },
.{ "SE", geo.developed },
.{ "DK", geo.developed },
.{ "NO", geo.developed },
.{ "FI", geo.developed },
.{ "IT", geo.developed },
.{ "ES", geo.developed },
.{ "BE", geo.developed },
.{ "AT", geo.developed },
.{ "IE", geo.developed },
.{ "LU", geo.developed },
.{ "PT", geo.developed },
.{ "GR", geo.developed },
.{ "IS", geo.developed },
// International Developed — Asia-Pacific + Israel + Canada
.{ "JP", geo.developed },
.{ "AU", geo.developed },
.{ "NZ", geo.developed },
.{ "SG", geo.developed },
.{ "HK", geo.developed },
.{ "IL", geo.developed },
.{ "CA", geo.developed },
// Emerging Markets (MSCI)
.{ "CN", geo.emerging },
.{ "TW", geo.emerging },
.{ "KR", geo.emerging },
.{ "IN", geo.emerging },
.{ "BR", geo.emerging },
.{ "MX", geo.emerging },
.{ "RU", geo.emerging },
.{ "TR", geo.emerging },
.{ "ZA", geo.emerging },
.{ "TH", geo.emerging },
.{ "MY", geo.emerging },
.{ "ID", geo.emerging },
.{ "PH", geo.emerging },
.{ "VN", geo.emerging },
.{ "AR", geo.emerging },
.{ "CL", geo.emerging },
.{ "CO", geo.emerging },
.{ "PE", geo.emerging },
.{ "EG", geo.emerging },
});
/// Resolve a country code to its geo bucket.
///
/// Looks the code up in `country_to_geo` (exact, case-sensitive
/// match). A null or empty input, or a code that is not in the
/// table, yields `geo.unknown` so the user can override the value
/// in `metadata.srf`. The returned slice is one of the `geo.*`
/// constants and must not be freed.
pub fn geoFor(iso2: ?[]const u8) []const u8 {
    if (iso2) |code| {
        if (code.len != 0) {
            if (country_to_geo.get(code)) |bucket| return bucket;
        }
    }
    return geo.unknown;
}
test "geoFor maps known ISO-3166 codes to bucket" {
    const Case = struct { code: []const u8, want: []const u8 };
    const cases = [_]Case{
        .{ .code = "US", .want = geo.us },
        .{ .code = "USA", .want = geo.us },
        .{ .code = "GB", .want = geo.developed },
        .{ .code = "DE", .want = geo.developed },
        .{ .code = "CA", .want = geo.developed },
        .{ .code = "IL", .want = geo.developed },
        .{ .code = "CN", .want = geo.emerging },
        .{ .code = "TW", .want = geo.emerging },
        .{ .code = "KR", .want = geo.emerging },
    };
    for (cases) |case| {
        try std.testing.expectEqualStrings(case.want, geoFor(case.code));
    }
}
test "geoFor returns Unknown for null/empty/unmapped" {
    // null, empty, an unassigned ISO-2 code ("ZZ"), and another
    // unmapped code ("XX").
    const inputs = [_]?[]const u8{ null, "", "ZZ", "XX" };
    for (inputs) |input| {
        try std.testing.expectEqualStrings(geo.unknown, geoFor(input));
    }
}
test "geo bucket labels are stable strings (not byte copies)" {
    // Callers keep these slices as HashMap keys without duplicating
    // them, so `geoFor` must hand back the very same literal (same
    // address) on every call.
    const Case = struct { label: []const u8, input: ?[]const u8 };
    const cases = [_]Case{
        .{ .label = geo.us, .input = "US" },
        .{ .label = geo.developed, .input = "GB" },
        .{ .label = geo.emerging, .input = "CN" },
        .{ .label = geo.unknown, .input = null },
    };
    for (cases) |case| {
        const resolved = geoFor(case.input);
        try std.testing.expectEqual(@intFromPtr(case.label.ptr), @intFromPtr(resolved.ptr));
    }
}
// ── Title-keyword inference ──────────────────────────────────
//
// Pure functions over a fund/security title string. Used by
// `service.getClassification` to populate the sector / geo of a
// `ClassificationRecord` when Wikidata didn't carry one and the
// EDGAR ticker-map fallback fired. Lives here (not in any
// provider) because the inference is provider-agnostic and
// shares the canonical sector/geo taxonomy declared above.
/// Report whether `haystack` contains at least one of `needles` as a
/// plain (case-sensitive) substring. An empty `needles` list yields
/// false.
fn titleContainsAny(haystack: []const u8, needles: []const []const u8) bool {
    return for (needles) |candidate| {
        if (std.mem.indexOf(u8, haystack, candidate)) |_| break true;
    } else false;
}
/// Write an ASCII-lowercased copy of `title` into `buf` for
/// case-insensitive keyword matching and return the filled prefix
/// of `buf`.
///
/// Returns null (nothing is truncated or written) when `title` is
/// longer than `buf`. The callers in this file pass a 256-byte
/// buffer.
fn lowercaseTitle(buf: []u8, title: []const u8) ?[]const u8 {
    if (title.len > buf.len) return null;
    const lowered = buf[0..title.len];
    for (title, lowered) |ch, *dst| {
        dst.* = std.ascii.toLower(ch);
    }
    return lowered;
}
/// Report whether `needle` occurs in `haystack` at the start of a
/// word, i.e. at index 0 or directly after a non-alphabetic byte.
fn titleContainsAtWordStart(haystack: []const u8, needle: []const u8) bool {
    var search_from: usize = 0;
    while (std.mem.indexOfPos(u8, haystack, search_from, needle)) |at| {
        if (at == 0 or !std.ascii.isAlphabetic(haystack[at - 1])) return true;
        // Mid-word hit: keep looking for a later occurrence.
        search_from = at + 1;
    }
    return false;
}

/// Infer a sector from a fund's title. Returns null when there is
/// no unambiguous keyword match — caller falls back to whatever
/// sector data the upstream source provided (typically null).
///
/// Also returns null for a null or empty title, and for titles
/// longer than the 256-byte scratch buffer used for lowercasing.
/// A non-null result is one of the `sector` constants (the same
/// slice, not a copy) and must not be freed.
///
/// Conservative keyword set: matches only words that map
/// unambiguously to a single sector. "Income" / "Dividend"
/// / "Value" / "Growth" / "Momentum" / "Total" / "Equal Weight"
/// / "International" / "Emerging" don't appear here — they
/// describe the screening methodology or geo, not the sector.
///
/// Matching is case-insensitive substring search, except for
/// "media", which must begin a word: as a raw substring it also
/// matches "intermediate" and would tag every "Intermediate-Term"
/// bond fund as Communication Services.
pub fn inferSectorFromTitle(title: ?[]const u8) ?[]const u8 {
    const t = title orelse return null;
    if (t.len == 0) return null;
    var buf: [256]u8 = undefined;
    const lc = lowercaseTitle(&buf, t) orelse return null;

    const Rule = struct { label: []const u8, keywords: []const []const u8 };
    // Rules are evaluated top to bottom; the first match wins.
    const rules = [_]Rule{
        // "health care" with a space (XLV title) and as one word.
        .{ .label = sector.healthcare, .keywords = &.{ "health care", "healthcare", "biotech", "pharmaceutical", "medical" } },
        // Specific terms only — "tech" alone is too broad (it matches
        // "biotech", "fintech", "edtech").
        .{ .label = sector.technology, .keywords = &.{ "semiconductor", "software", "cloud computing", "internet" } },
        // "Financial Select Sector SPDR", "Vanguard Financials ETF".
        .{ .label = sector.financial_services, .keywords = &.{ "financial", "bank" } },
        .{ .label = sector.energy, .keywords = &.{ "energy", "oil & gas", "oil and gas", "petroleum" } },
        .{ .label = sector.real_estate, .keywords = &.{ "real estate", "reit" } },
        .{ .label = sector.utilities, .keywords = &.{"utilities"} },
        // "consumer" alone is ambiguous (discretionary vs staples), so
        // only the explicit two-word labels are matched.
        .{ .label = sector.consumer_cyclical, .keywords = &.{ "consumer discretionary", "consumer cyclical" } },
        .{ .label = sector.consumer_defensive, .keywords = &.{ "consumer staples", "consumer defensive" } },
        // Singular "industrial" also covers "industrials".
        .{ .label = sector.industrials, .keywords = &.{ "industrial", "aerospace", "defense" } },
        .{ .label = sector.basic_materials, .keywords = &.{ "materials", "mining", "miners", "metals" } },
        // "multimedia" is listed explicitly because "media" below no
        // longer matches in the middle of a word.
        .{ .label = sector.communication_services, .keywords = &.{ "communication", "telecom", "multimedia" } },
    };
    for (rules) |rule| {
        if (titleContainsAny(lc, rule.keywords)) return rule.label;
    }

    // "media" only counts at a word start (see the doc comment).
    if (titleContainsAtWordStart(lc, "media")) return sector.communication_services;

    return null;
}

test "inferSectorFromTitle: 'media' matches only at a word start" {
    try std.testing.expectEqual(@as(?[]const u8, null), inferSectorFromTitle("Vanguard Intermediate-Term Bond ETF"));
    try std.testing.expectEqual(@as(?[]const u8, sector.communication_services), inferSectorFromTitle("Invesco Dynamic Media ETF"));
    try std.testing.expectEqual(@as(?[]const u8, sector.communication_services), inferSectorFromTitle("Global Multimedia Fund"));
}
/// Infer a geo bucket from a fund's title. Returns null when the
/// title carries no unambiguous international/emerging keyword —
/// the caller then keeps its own default (typically US for
/// SEC-filed funds).
///
/// Also returns null for a null or empty title, and for titles
/// longer than the 256-byte scratch buffer used for lowercasing.
/// A non-null result is `geo.emerging` or `geo.developed` (the
/// constant itself, not a copy) and must not be freed.
///
/// This matters more than sector inference: a default `geo::US` is
/// *factually wrong* for international funds (FRDM holds
/// emerging-market equities, not US), so the inference tightens
/// portfolio-level geographic-exposure reporting.
pub fn inferGeoFromTitle(title: ?[]const u8) ?[]const u8 {
    const raw = title orelse return null;
    if (raw.len == 0) return null;

    var scratch: [256]u8 = undefined;
    const lowered = lowercaseTitle(&scratch, raw) orelse return null;

    // Checked first because it is the most specific group. "Emerging"
    // and "frontier" are conventionally only used for those markets in
    // fund titles.
    const emerging_keywords = [_][]const u8{
        "emerging market",
        "emerging markets",
        "frontier market",
        "frontier markets",
        "frontier",
    };
    if (titleContainsAny(lowered, &emerging_keywords)) return geo.emerging;

    // "International" / "Intl" / "ex-US" / "World ex US" /
    // "Developed Markets".
    //
    // False-positive risk: a hybrid such as "Vanguard Total
    // International + US Equity Fund" would be mis-tagged here. The
    // user's current portfolio holds no such fund; if one shows up it
    // will surface in the diff-against-old-metadata.srf review and can
    // be corrected by hand.
    const developed_keywords = [_][]const u8{
        "international",
        " intl",
        "ex-us",
        "ex us",
        "world ex",
        "developed market",
        "developed markets",
    };
    if (titleContainsAny(lowered, &developed_keywords)) return geo.developed;

    return null;
}
test "inferSectorFromTitle: null/empty -> null" {
    const no_sector: ?[]const u8 = null;
    const inputs = [_]?[]const u8{ null, "" };
    for (inputs) |input| {
        try std.testing.expectEqual(no_sector, inferSectorFromTitle(input));
    }
}
test "inferSectorFromTitle: technology keywords" {
    const want: ?[]const u8 = sector.technology;
    const titles = [_][]const u8{
        "iShares Semiconductor ETF",
        "Vanguard Software ETF",
    };
    for (titles) |fund_title| {
        try std.testing.expectEqual(want, inferSectorFromTitle(fund_title));
    }
}
test "inferSectorFromTitle: healthcare keywords" {
    const want: ?[]const u8 = sector.healthcare;
    const titles = [_][]const u8{
        "Health Care Select Sector SPDR",
        "iShares Biotech ETF",
    };
    for (titles) |fund_title| {
        try std.testing.expectEqual(want, inferSectorFromTitle(fund_title));
    }
}
test "inferSectorFromTitle: ambiguous title -> null" {
    const no_sector: ?[]const u8 = null;
    const titles = [_][]const u8{
        "Vanguard Total Stock Market",
        "SPDR S&P 500",
    };
    for (titles) |fund_title| {
        try std.testing.expectEqual(no_sector, inferSectorFromTitle(fund_title));
    }
}
test "inferGeoFromTitle: null/empty -> null" {
    const no_geo: ?[]const u8 = null;
    const inputs = [_]?[]const u8{ null, "" };
    for (inputs) |input| {
        try std.testing.expectEqual(no_geo, inferGeoFromTitle(input));
    }
}
test "inferGeoFromTitle: emerging markets" {
    const want: ?[]const u8 = geo.emerging;
    const titles = [_][]const u8{
        "Freedom 100 Emerging Markets ETF",
        "iShares Frontier Markets",
    };
    for (titles) |fund_title| {
        try std.testing.expectEqual(want, inferGeoFromTitle(fund_title));
    }
}
test "inferGeoFromTitle: international developed" {
    const want: ?[]const u8 = geo.developed;
    const titles = [_][]const u8{
        "Vanguard FTSE Developed Markets",
        "iShares MSCI EAFE International",
        "Vanguard Total World ex-US",
    };
    for (titles) |fund_title| {
        try std.testing.expectEqual(want, inferGeoFromTitle(fund_title));
    }
}
test "inferGeoFromTitle: no match -> null" {
    const no_geo: ?[]const u8 = null;
    const titles = [_][]const u8{
        "SPDR S&P 500",
        "iShares Semiconductor ETF",
    };
    for (titles) |fund_title| {
        try std.testing.expectEqual(no_geo, inferGeoFromTitle(fund_title));
    }
}