735 lines
31 KiB
Zig
735 lines
31 KiB
Zig
//! Classification metadata for portfolio analysis.
//!
//! Each entry maps a symbol to one or more asset class / sector / geographic allocations.
//! For individual stocks, there's typically one entry at 100%.
//! For blended funds (e.g., target date), there can be multiple entries that sum to ~100%.
//!
//! Loaded from a metadata SRF file like `metadata.srf`:
//!   symbol::AMZN,sector::Technology,geo::US,asset_class::US Large Cap
//!   symbol::02315N600,asset_class::US Large Cap,pct:num:55
//!   symbol::02315N600,asset_class::International Developed,pct:num:20
//!   symbol::02315N600,asset_class::Bonds,pct:num:15
|
|
const std = @import("std");
|
|
const srf = @import("srf");
|
|
|
|
/// A single classification entry for a symbol.
|
|
pub const ClassificationEntry = struct {
    /// Symbol this row classifies (e.g. "AMZN"). The only field without a
    /// default, so every entry must supply it.
    symbol: []const u8,
    /// Human-readable security name (e.g., "Amazon", "SPDR S&P 500
    /// ETF Trust"). Optional — older metadata.srf files may not
    /// have this field. Renderers fall back to `symbol` /
    /// `display_symbol` when null.
    name: ?[]const u8 = null,
    /// User-curated grouping label that overrides the auto-derived
    /// bucket for concentration / dominance checks and the
    /// analysis tab's Sector breakdown. Use this when the upstream
    /// `sector` field is the NPORT-P "Equity / Corporate" mush
    /// that doesn't actually distinguish your holdings (e.g. SPY
    /// vs FRDM vs HFXI all tagged the same way). When null,
    /// `deriveBucket` falls back to a sensible default.
    ///
    /// Entries produced by `parseClassificationFile` always have this
    /// set: the parser stores either the curated value or the result
    /// of `deriveBucket`.
    bucket: ?[]const u8 = null,
    /// Sector (e.g., "Technology", "Healthcare", "Financials")
    sector: ?[]const u8 = null,
    /// Geographic region (e.g., "US", "International Developed", "Emerging Markets")
    geo: ?[]const u8 = null,
    /// Asset class (e.g., "US Large Cap", "Bonds", "Cash")
    asset_class: ?[]const u8 = null,
    /// Percentage weight for this entry (0-100). Default 100 for single-class assets.
    pct: f64 = 100.0,
};
|
|
|
|
/// Owning container for every classification row parsed for a portfolio.
///
/// `deinit` releases `entries` and all strings inside each entry through
/// `allocator`, so everything stored here must have been allocated with it.
pub const ClassificationMap = struct {
    entries: []ClassificationEntry,
    allocator: std.mem.Allocator,

    /// Free each entry's strings, then the `entries` slice itself.
    pub fn deinit(self: *ClassificationMap) void {
        const gpa = self.allocator;
        for (self.entries) |row| {
            gpa.free(row.symbol);
            // Same release order as the field declarations.
            const optional_strings = [_]?[]const u8{
                row.name,
                row.bucket,
                row.sector,
                row.geo,
                row.asset_class,
            };
            for (optional_strings) |maybe_text| {
                if (maybe_text) |text| gpa.free(text);
            }
        }
        gpa.free(self.entries);
    }
};
|
|
|
|
/// Parse the contents of a metadata SRF file into a `ClassificationMap`.
///
/// Each record has: symbol::<SYM>,name::<N>,bucket::<B>,sector::<S>,geo::<G>,asset_class::<A>,pct:num:<P>
/// All fields except symbol are optional. pct defaults to 100.
///
/// Every string in the returned map is a fresh copy made with `allocator`,
/// so the map does not borrow from `data`; release it with
/// `ClassificationMap.deinit`. `bucket` is always populated in the result:
/// the curated value when present, otherwise the output of `deriveBucket`.
///
/// Errors: `error.InvalidData` when the SRF iterator cannot be created;
/// errors from advancing the iterator and allocation failures propagate
/// unchanged. Records that cannot be converted to a `ClassificationEntry`
/// are skipped rather than failing the whole file.
pub fn parseClassificationFile(allocator: std.mem.Allocator, data: []const u8) !ClassificationMap {
    var entries = std.ArrayList(ClassificationEntry).empty;
    // Covers records that were fully built and appended.
    errdefer {
        for (entries.items) |e| {
            allocator.free(e.symbol);
            if (e.name) |n| allocator.free(n);
            if (e.bucket) |b| allocator.free(b);
            if (e.sector) |s| allocator.free(s);
            if (e.geo) |g| allocator.free(g);
            if (e.asset_class) |a| allocator.free(a);
        }
        entries.deinit(allocator);
    }

    var reader = std.Io.Reader.fixed(data);
    var it = srf.iterator(&reader, allocator, .{ .parse_allocator = .none }) catch return error.InvalidData;
    defer it.deinit();

    while (try it.next()) |fields| {
        const entry = fields.to(ClassificationEntry, .{}) catch continue;

        // Pre-fill `bucket` if the user didn't curate one. This
        // shifts the cost of `deriveBucket` to parse time and
        // makes downstream code free to read `entry.bucket`
        // directly without juggling allocator parameters.
        const bucket_copy: []const u8 = if (entry.bucket) |b|
            try allocator.dupe(u8, b)
        else
            try deriveBucket(entry, allocator);
        // The record under construction is not in `entries` yet, so the
        // function-level errdefer cannot see it. Each copy gets its own
        // errdefer so a failure part-way through (a later dupe, or the
        // append itself) releases what was already duplicated.
        errdefer allocator.free(bucket_copy);

        const symbol_copy = try allocator.dupe(u8, entry.symbol);
        errdefer allocator.free(symbol_copy);

        const name_copy: ?[]const u8 = if (entry.name) |n| try allocator.dupe(u8, n) else null;
        errdefer if (name_copy) |n| allocator.free(n);

        const sector_copy: ?[]const u8 = if (entry.sector) |s| try allocator.dupe(u8, s) else null;
        errdefer if (sector_copy) |s| allocator.free(s);

        const geo_copy: ?[]const u8 = if (entry.geo) |g| try allocator.dupe(u8, g) else null;
        errdefer if (geo_copy) |g| allocator.free(g);

        const asset_class_copy: ?[]const u8 = if (entry.asset_class) |a| try allocator.dupe(u8, a) else null;
        errdefer if (asset_class_copy) |a| allocator.free(a);

        // Last fallible step of the iteration: once this succeeds the
        // copies are owned by `entries`.
        try entries.append(allocator, .{
            .symbol = symbol_copy,
            .name = name_copy,
            .bucket = bucket_copy,
            .sector = sector_copy,
            .geo = geo_copy,
            .asset_class = asset_class_copy,
            .pct = entry.pct,
        });
    }

    return .{
        .entries = try entries.toOwnedSlice(allocator),
        .allocator = allocator,
    };
}
|
|
|
|
/// Pick the display bucket for a classification entry. Used by the
/// review tab's Sector column, by `analyzePortfolio`'s sector rollup,
/// and by the observation engine's concentration / dominance checks.
///
/// The result is always a new allocation from `allocator`; the caller
/// frees it. Tiers are tried in order and the first match wins:
///   1. `entry.bucket` — user-curated, always wins.
///   2. `entry.sector`, unless it contains '/'. The '/' filters out
///      NPORT-P fund-decomp categories ("Equity / Corporate") that are
///      noise rather than GICS-style sectors ("Technology").
///   3. When both `geo` and `asset_class` are set: the asset class on
///      its own if it already names a region (it starts with the geo
///      followed by a space or end-of-string, or starts with
///      "International" / "Emerging"), otherwise "<geo> <asset_class>",
///      e.g. "International Developed Fund" or "US ETF".
///   4. The literal "Unclassified".
pub fn deriveBucket(entry: ClassificationEntry, allocator: std.mem.Allocator) ![]const u8 {
    // Tier 1: curated label.
    if (entry.bucket) |curated| return try allocator.dupe(u8, curated);

    // Tier 2: sector, unless it looks like an NPORT-P category.
    if (entry.sector) |sector_text| {
        const is_nport_category = std.mem.indexOfScalar(u8, sector_text, '/') != null;
        if (!is_nport_category) return try allocator.dupe(u8, sector_text);
    }

    // Tier 3: geo + asset class composite.
    if (entry.geo) |region| {
        if (entry.asset_class) |asset| {
            // Guards against "US US Large Cap": the asset class already
            // leads with the region as a whole word.
            const repeats_region = std.mem.startsWith(u8, asset, region) and
                (asset.len == region.len or asset[region.len] == ' ');
            // "International Developed" / "Emerging Markets" style
            // classes are geographic on their own.
            const names_a_region = std.mem.startsWith(u8, asset, "International") or
                std.mem.startsWith(u8, asset, "Emerging");

            return if (repeats_region or names_a_region)
                try allocator.dupe(u8, asset)
            else
                try std.fmt.allocPrint(allocator, "{s} {s}", .{ region, asset });
        }
    }

    // Tier 4: nothing usable.
    return try allocator.dupe(u8, "Unclassified");
}
|
|
|
|
test "parse classification file" {
    const testing = std.testing;
    const data =
        \\#!srfv1
        \\# Stock: single sector
        \\symbol::AMZN,name::Amazon,sector::Technology,geo::US,asset_class::US Large Cap
        \\
        \\# Target date fund: blended
        \\symbol::TGT2035,name::Target Retirement 2035,asset_class::US Large Cap,pct:num:55
        \\symbol::TGT2035,name::Target Retirement 2035,asset_class::Bonds,pct:num:15
        \\symbol::TGT2035,name::Target Retirement 2035,asset_class::International Developed,pct:num:20
    ;
    var cm = try parseClassificationFile(testing.allocator, data);
    defer cm.deinit();

    // One stock row plus three slices of the blended fund.
    try testing.expectEqual(@as(usize, 4), cm.entries.len);

    const stock = cm.entries[0];
    try testing.expectEqualStrings("AMZN", stock.symbol);
    try testing.expectEqualStrings("Amazon", stock.name.?);
    try testing.expectEqualStrings("Technology", stock.sector.?);
    try testing.expectEqualStrings("US", stock.geo.?);
    try testing.expectApproxEqAbs(@as(f64, 100.0), stock.pct, 0.01);

    const first_slice = cm.entries[1];
    try testing.expectEqualStrings("TGT2035", first_slice.symbol);
    try testing.expectEqualStrings("Target Retirement 2035", first_slice.name.?);
    try testing.expectEqualStrings("US Large Cap", first_slice.asset_class.?);
    try testing.expectApproxEqAbs(@as(f64, 55.0), first_slice.pct, 0.01);
}
|
|
|
|
test "parse classification file: missing name field stays null (backwards compat)" {
    const testing = std.testing;
    // metadata.srf files written before the name:: field existed must
    // still parse; consumers fall back to symbol / display_symbol
    // when name is null.
    const data =
        \\#!srfv1
        \\symbol::AMZN,sector::Technology,geo::US,asset_class::US Large Cap
    ;
    var cm = try parseClassificationFile(testing.allocator, data);
    defer cm.deinit();

    try testing.expectEqual(@as(usize, 1), cm.entries.len);

    const row = cm.entries[0];
    try testing.expectEqualStrings("AMZN", row.symbol);
    try testing.expect(row.name == null);
    // The parser fills `bucket` through deriveBucket; a GICS-style
    // sector ("Technology") is used as the bucket verbatim.
    try testing.expectEqualStrings("Technology", row.bucket.?);
    try testing.expectEqualStrings("Technology", row.sector.?);
}
|
|
|
|
test "parse classification file: bucket round-trips" {
    const testing = std.testing;
    const data =
        \\#!srfv1
        \\symbol::SPY,name::SPDR S&P 500 ETF Trust,bucket::US Large Cap,sector::Equity / Corporate,geo::US,asset_class::ETF
    ;
    var cm = try parseClassificationFile(testing.allocator, data);
    defer cm.deinit();

    try testing.expectEqual(@as(usize, 1), cm.entries.len);

    // The curated bucket survives parsing and the raw sector is kept as-is.
    const row = cm.entries[0];
    try testing.expectEqualStrings("SPY", row.symbol);
    try testing.expectEqualStrings("US Large Cap", row.bucket.?);
    try testing.expectEqualStrings("Equity / Corporate", row.sector.?);
}
|
|
|
|
test "deriveBucket: returns user-curated bucket when set" {
    const gpa = std.testing.allocator;
    const derived = try deriveBucket(.{
        .symbol = "SPY",
        .bucket = "US Large Cap",
        // Without the curated bucket this sector would force the fallback path.
        .sector = "Equity / Corporate",
        .geo = "US",
        .asset_class = "ETF",
    }, gpa);
    defer gpa.free(derived);
    try std.testing.expectEqualStrings("US Large Cap", derived);
}
|
|
|
|
test "deriveBucket: returns sector when GICS-like (no '/')" {
    const gpa = std.testing.allocator;
    const derived = try deriveBucket(.{
        .symbol = "AMZN",
        .sector = "Technology",
        .geo = "US",
        .asset_class = "US Large Cap",
    }, gpa);
    defer gpa.free(derived);
    try std.testing.expectEqualStrings("Technology", derived);
}
|
|
|
|
test "deriveBucket: composite fallback when sector is NPORT-P mush" {
    const gpa = std.testing.allocator;
    const derived = try deriveBucket(.{
        .symbol = "HFXI",
        .sector = "Equity / Corporate",
        .geo = "International Developed",
        .asset_class = "Fund",
    }, gpa);
    defer gpa.free(derived);
    try std.testing.expectEqualStrings("International Developed Fund", derived);
}
|
|
|
|
test "deriveBucket: returns Unclassified when nothing usable is set" {
    const gpa = std.testing.allocator;
    const derived = try deriveBucket(.{ .symbol = "UNK" }, gpa);
    defer gpa.free(derived);
    try std.testing.expectEqualStrings("Unclassified", derived);
}
|
|
|
|
test "deriveBucket: NPORT-P sector with no geo/asset_class falls through to Unclassified" {
    // The sector is NPORT-P-style, so the GICS filter rejects it, and
    // with asset_class absent no composite can be built either.
    const gpa = std.testing.allocator;
    const derived = try deriveBucket(.{
        .symbol = "X",
        .sector = "Debt / Corporate",
        .geo = "US",
        // asset_class intentionally left unset
    }, gpa);
    defer gpa.free(derived);
    try std.testing.expectEqualStrings("Unclassified", derived);
}
|
|
|
|
test "deriveBucket: composite avoids duplicate geo when asset_class already starts with it" {
    // Hand-written rows often pair geo="US" with an asset_class such
    // as "US Large Cap". A naive "{geo} {asset_class}" join would give
    // "US US Large Cap", which reads badly and clusters wrongly in the
    // breakdown, so the asset_class must be used on its own.
    const gpa = std.testing.allocator;
    const derived = try deriveBucket(.{
        .symbol = "VOO",
        .geo = "US",
        .asset_class = "US Large Cap",
    }, gpa);
    defer gpa.free(derived);
    try std.testing.expectEqualStrings("US Large Cap", derived);
}
|
|
|
|
test "deriveBucket: composite uses asset_class alone for International/Emerging implicit-geo classes" {
    // "International Developed" and "Emerging Markets" are geographic
    // already, so the composite must not prepend the geo again.
    const gpa = std.testing.allocator;
    const rows = [_]ClassificationEntry{
        .{ .symbol = "VEA", .geo = "International Developed", .asset_class = "International Developed" },
        .{ .symbol = "VWO", .geo = "Emerging Markets", .asset_class = "Emerging Markets" },
    };
    for (rows) |row| {
        const derived = try deriveBucket(row, gpa);
        defer gpa.free(derived);
        try std.testing.expectEqualStrings(row.asset_class.?, derived);
    }
}
|
|
|
|
test "deriveBucket: composite still prepends geo when asset_class is generic (Fund/ETF/Bonds)" {
    // Disambiguating generic asset_class labels by geo is the reason
    // the composite exists; the duplicate-prefix handling must not
    // break that.
    const gpa = std.testing.allocator;
    const derived = try deriveBucket(.{
        .symbol = "BND",
        .geo = "US",
        .asset_class = "Fund",
    }, gpa);
    defer gpa.free(derived);
    try std.testing.expectEqualStrings("US Fund", derived);
}
|
|
|
|
// ── ClassificationRecord ─────────────────────────────────────
|
|
//
|
|
// Distinct from `ClassificationEntry` above: that one represents
|
|
// a row in the user's `metadata.srf` (already-curated portfolio
|
|
// data). `ClassificationRecord` is the upstream-fetched
|
|
// per-symbol shape that flows OUT of `DataService.getClassification`.
|
|
// `enrich` reads it to write the metadata.srf row that becomes
|
|
// a `ClassificationEntry` later.
|
|
//
|
|
// Lives here (not in `providers/Wikidata.zig`) because the shape
|
|
// is provider-agnostic: any future classification source (FMP,
|
|
// Alpha Vantage, hand-written) populates the same record. The
|
|
// fact that today the only producer is Wikidata is incidental.
|
|
|
|
/// One fetched classification result for a single symbol.
///
/// Optional fields default to `null`, so a producer fills in only what
/// it knows. `symbol`, `as_of` and `source` have no default and must
/// always be supplied. Every string field is owned by the record and is
/// released by `deinit`.
pub const ClassificationRecord = struct {
    symbol: []const u8, // owned
    name: ?[]const u8 = null, // owned
    sector: ?[]const u8 = null, // owned
    industry: ?[]const u8 = null, // owned
    /// ISO-3166 alpha-2 country code (e.g. "US", "GB", "DE").
    country: ?[]const u8 = null, // owned
    /// Resolved geo bucket (e.g. "US", "International Developed",
    /// "Emerging Markets"). Filled either from `geoFor(country)` or
    /// from title-keyword inference (`inferGeoFromTitle`) when no
    /// country was supplied. Values are expected to match one of the
    /// file-level `geo.*` constants, which consumers compare against.
    geo: ?[]const u8 = null, // owned
    asset_class: ?[]const u8 = null, // owned
    is_etf: bool = false,
    /// YYYY-MM-DD; trimmed from upstream's ISO-8601 date.
    inception_date: ?[]const u8 = null, // owned
    /// Wikidata's P5531 — the SEC CIK as a digit string. Already
    /// zero-padded to 10 digits, matching the project-wide CIK
    /// normalization convention.
    cik: ?[]const u8 = null, // owned
    /// YYYY-MM-DD when this provider ran, NOT when upstream last
    /// updated the underlying entity.
    as_of: []const u8, // owned
    source: []const u8, // owned; no default — provenance always emitted

    /// Release every string held by this record using `allocator`.
    pub fn deinit(self: ClassificationRecord, allocator: std.mem.Allocator) void {
        allocator.free(self.symbol);
        // Optional strings, in declaration order.
        const optional_strings = [_]?[]const u8{
            self.name,
            self.sector,
            self.industry,
            self.country,
            self.geo,
            self.asset_class,
            self.inception_date,
            self.cik,
        };
        for (optional_strings) |maybe_text| {
            if (maybe_text) |text| allocator.free(text);
        }
        allocator.free(self.as_of);
        allocator.free(self.source);
    }

    /// Deinit every record in `recs`, then free the slice itself.
    pub fn freeSlice(allocator: std.mem.Allocator, recs: []const ClassificationRecord) void {
        var i: usize = 0;
        while (i < recs.len) : (i += 1) {
            recs[i].deinit(allocator);
        }
        allocator.free(recs);
    }
};
|
|
|
|
// ── Geographic taxonomy ──────────────────────────────────────
|
|
|
|
/// Geo-bucket constants used by the country → geo lookup. Kept
|
|
/// as named constants (rather than inline string literals in the
|
|
/// map) so callers can reference them without typo risk and the
|
|
/// taxonomy is tweakable in one place.
|
|
pub const geo = struct {
    /// Bucket `geoFor` returns for the "US" and "USA" country codes.
    pub const us = "US";
    /// Non-US developed markets. Also what `inferGeoFromTitle` returns
    /// for international / ex-US style titles.
    pub const developed = "International Developed";
    /// Emerging markets. `inferGeoFromTitle` also returns this for
    /// frontier-market titles.
    pub const emerging = "Emerging Markets";
    /// What `geoFor` returns for null, empty, or unmapped country codes.
    pub const unknown = "Unknown";
};
|
|
|
|
// ── Sector taxonomy ──────────────────────────────────────────
|
|
|
|
/// Canonical sector taxonomy (GICS-aligned 11-sector model).
|
|
/// Producers (Wikidata's `canonicalizeSector`, enrich's
|
|
/// `inferSectorFromTitle`) emit one of these strings; consumers
|
|
/// (analysis bucketing, display) compare against them.
|
|
///
|
|
/// Lives here (not in any provider) so multiple producers can
|
|
/// share one taxonomy. Adding a 12th sector or renaming an
|
|
/// existing one is a one-place edit.
|
|
pub const sector = struct {
    // The string values are the labels that get stored and compared;
    // the identifiers exist only so code can refer to them without
    // retyping the text.
    pub const technology = "Technology";
    pub const communication_services = "Communication Services";
    /// Also the label `inferSectorFromTitle` assigns to titles that say
    /// "Consumer Discretionary".
    pub const consumer_cyclical = "Consumer Cyclical";
    /// Also the label `inferSectorFromTitle` assigns to titles that say
    /// "Consumer Staples".
    pub const consumer_defensive = "Consumer Defensive";
    pub const healthcare = "Healthcare";
    pub const financial_services = "Financial Services";
    pub const energy = "Energy";
    pub const industrials = "Industrials";
    pub const basic_materials = "Basic Materials";
    pub const real_estate = "Real Estate";
    pub const utilities = "Utilities";
};
|
|
|
|
/// Country-code-to-geo-bucket lookup. Producers (Wikidata today,
/// others tomorrow) hand us ISO-3166 alpha-2 codes via the
/// `ClassificationRecord.country` field; we map them to the geo
/// taxonomy (`geo.us` / `geo.developed` / `geo.emerging`). Codes
/// missing from this table resolve to `geo.unknown` in `geoFor`.
///
/// MSCI conventions used as the developed/emerging split. Taiwan
/// and South Korea are MSCI-emerging despite FTSE classifying
/// them developed. Israel is MSCI-developed (upgraded 2010).
/// Canada is folded into International Developed (some users
/// prefer separate Canada bucket; override in `metadata.srf`).
///
/// NOTE(review): a few entries look inconsistent with the stated MSCI
/// convention and are left untouched pending confirmation: GR and IS
/// are bucketed as developed here (MSCI appears to list Greece as
/// Emerging and Iceland as Frontier), and RU, AR and VN are bucketed as
/// emerging although they do not appear to be current MSCI Emerging
/// Markets constituents.
const country_to_geo = std.StaticStringMap([]const u8).initComptime(.{
    // United States
    .{ "US", geo.us },
    // Alpha-3 fallback for entries that use the longer form.
    .{ "USA", geo.us },

    // International Developed — Europe ex-CIS
    .{ "GB", geo.developed },
    .{ "DE", geo.developed },
    .{ "FR", geo.developed },
    .{ "NL", geo.developed },
    .{ "CH", geo.developed },
    .{ "SE", geo.developed },
    .{ "DK", geo.developed },
    .{ "NO", geo.developed },
    .{ "FI", geo.developed },
    .{ "IT", geo.developed },
    .{ "ES", geo.developed },
    .{ "BE", geo.developed },
    .{ "AT", geo.developed },
    .{ "IE", geo.developed },
    .{ "LU", geo.developed },
    .{ "PT", geo.developed },
    .{ "GR", geo.developed },
    .{ "IS", geo.developed },

    // International Developed — Asia-Pacific + Israel + Canada
    .{ "JP", geo.developed },
    .{ "AU", geo.developed },
    .{ "NZ", geo.developed },
    .{ "SG", geo.developed },
    .{ "HK", geo.developed },
    .{ "IL", geo.developed },
    .{ "CA", geo.developed },

    // Emerging Markets (MSCI)
    .{ "CN", geo.emerging },
    .{ "TW", geo.emerging },
    .{ "KR", geo.emerging },
    .{ "IN", geo.emerging },
    .{ "BR", geo.emerging },
    .{ "MX", geo.emerging },
    .{ "RU", geo.emerging },
    .{ "TR", geo.emerging },
    .{ "ZA", geo.emerging },
    .{ "TH", geo.emerging },
    .{ "MY", geo.emerging },
    .{ "ID", geo.emerging },
    .{ "PH", geo.emerging },
    .{ "VN", geo.emerging },
    .{ "AR", geo.emerging },
    .{ "CL", geo.emerging },
    .{ "CO", geo.emerging },
    .{ "PE", geo.emerging },
    .{ "EG", geo.emerging },

    // Emerging Markets (MSCI) — Central/Eastern Europe and Gulf
    // members that previously fell through to Unknown.
    .{ "PL", geo.emerging },
    .{ "CZ", geo.emerging },
    .{ "HU", geo.emerging },
    .{ "SA", geo.emerging },
    .{ "AE", geo.emerging },
    .{ "QA", geo.emerging },
    .{ "KW", geo.emerging },
});
|
|
|
|
/// Resolve a country code to its geo bucket label.
///
/// The code is looked up as given in `country_to_geo`. Null, the empty
/// string, and any code missing from the table all yield `geo.unknown`,
/// leaving the user free to override in `metadata.srf`. The returned
/// slice is one of the `geo.*` constants, not an allocation.
pub fn geoFor(iso2: ?[]const u8) []const u8 {
    if (iso2) |code| {
        if (code.len != 0) {
            if (country_to_geo.get(code)) |bucket| return bucket;
        }
    }
    return geo.unknown;
}
|
|
|
|
test "geoFor maps known ISO-3166 codes to bucket" {
    const Case = struct { code: []const u8, want: []const u8 };
    const cases = [_]Case{
        .{ .code = "US", .want = geo.us },
        .{ .code = "USA", .want = geo.us },
        .{ .code = "GB", .want = geo.developed },
        .{ .code = "DE", .want = geo.developed },
        .{ .code = "CA", .want = geo.developed },
        .{ .code = "IL", .want = geo.developed },
        .{ .code = "CN", .want = geo.emerging },
        .{ .code = "TW", .want = geo.emerging },
        .{ .code = "KR", .want = geo.emerging },
    };
    for (cases) |case| {
        try std.testing.expectEqualStrings(case.want, geoFor(case.code));
    }
}
|
|
|
|
test "geoFor returns Unknown for null/empty/unmapped" {
    const inputs = [_]?[]const u8{
        null,
        "",
        "ZZ", // unassigned ISO-2
        "XX",
    };
    for (inputs) |input| {
        try std.testing.expectEqualStrings(geo.unknown, geoFor(input));
    }
}
|
|
|
|
test "geo bucket labels are stable strings (not byte copies)" {
    // Callers keep these slices as HashMap keys without duplicating
    // them, so geoFor must hand back the constants' own storage.
    const address_pairs = [_][2]usize{
        .{ @intFromPtr(geo.us.ptr), @intFromPtr(geoFor("US").ptr) },
        .{ @intFromPtr(geo.developed.ptr), @intFromPtr(geoFor("GB").ptr) },
        .{ @intFromPtr(geo.emerging.ptr), @intFromPtr(geoFor("CN").ptr) },
        .{ @intFromPtr(geo.unknown.ptr), @intFromPtr(geoFor(null).ptr) },
    };
    for (address_pairs) |pair| {
        try std.testing.expectEqual(pair[0], pair[1]);
    }
}
|
|
|
|
// ── Title-keyword inference ──────────────────────────────────
|
|
//
|
|
// Pure functions over a fund/security title string. Used by
|
|
// `service.getClassification` to populate the sector / geo of a
|
|
// `ClassificationRecord` when Wikidata didn't carry one and the
|
|
// EDGAR ticker-map fallback fired. Lives here (not in any
|
|
// provider) because the inference is provider-agnostic and
|
|
// shares the canonical sector/geo taxonomy declared above.
|
|
|
|
/// Report whether `haystack` contains at least one of `needles` as a
/// byte-exact substring. An empty `needles` list yields false.
fn titleContainsAny(haystack: []const u8, needles: []const []const u8) bool {
    var i: usize = 0;
    while (i < needles.len) : (i += 1) {
        if (std.mem.indexOf(u8, haystack, needles[i])) |_| return true;
    }
    return false;
}
|
|
|
|
/// ASCII-lowercase `title` into `buf` for case-insensitive keyword
/// matching and return the filled prefix of `buf`.
///
/// Returns null when `title` is longer than `buf`; nothing is
/// truncated. Callers in this file pass a 256-byte stack buffer.
fn lowercaseTitle(buf: []u8, title: []const u8) ?[]const u8 {
    if (title.len > buf.len) return null;
    const lowered = buf[0..title.len];
    for (lowered, title) |*out, ch| {
        out.* = std.ascii.toLower(ch);
    }
    return lowered;
}
|
|
|
|
/// Infer a sector from a fund's title. Returns one of the `sector`
/// constants (a static string, not an allocation), or null when no
/// keyword matches — the caller then keeps whatever sector data the
/// upstream source provided (typically null).
///
/// Also returns null for a null or empty title and for titles longer
/// than the 256-byte scratch buffer. Matching is ASCII
/// case-insensitive.
///
/// Conservative keyword set: only words that map to a single sector.
/// "Income" / "Dividend" / "Value" / "Growth" / "Momentum" / "Total" /
/// "Equal Weight" / "International" / "Emerging" don't appear here —
/// they describe the screening methodology or geo, not the sector.
///
/// Two keywords get extra care because plain substring matching
/// misfires on common fund names:
///   - "media" must begin a word, otherwise every "Intermediate-Term"
///     bond fund would be tagged Communication Services.
///   - "industrial" is ignored inside "Industrial Average" (the Dow
///     Jones index name), which is a broad index rather than the sector.
///
/// Reuses the `sector` constants so the inference taxonomy stays in
/// lock-step with the canonicalizer.
pub fn inferSectorFromTitle(title: ?[]const u8) ?[]const u8 {
    const t = title orelse return null;
    if (t.len == 0) return null;

    var buf: [256]u8 = undefined;
    const lc = lowercaseTitle(&buf, t) orelse return null;

    // True when `word` occurs in `text` at the start of a word, i.e. at
    // offset 0 or right after a non-alphanumeric byte.
    const startsWord = struct {
        fn check(text: []const u8, word: []const u8) bool {
            var from: usize = 0;
            while (std.mem.indexOfPos(u8, text, from, word)) |hit| {
                if (hit == 0 or !std.ascii.isAlphanumeric(text[hit - 1])) return true;
                from = hit + 1;
            }
            return false;
        }
    }.check;

    // Sectors are tested in a fixed order and the first hit wins, so a
    // title mentioning several sectors resolves to the earliest one
    // listed below.

    // Healthcare. "Health care" with space (XLV title), "healthcare"
    // (one word), "biotech", "pharmaceutical".
    if (titleContainsAny(lc, &.{ "health care", "healthcare", "biotech", "pharmaceutical", "medical" })) {
        return sector.healthcare;
    }

    // Technology. Specific terms only — "tech" alone is too
    // broad (matches "biotech", "fintech", "edtech" — all
    // sector-mixing).
    if (titleContainsAny(lc, &.{ "semiconductor", "software", "cloud computing", "internet" })) {
        return sector.technology;
    }

    // Financial Services. "Financial" is fairly specific in
    // fund-name conventions ("Financial Select Sector SPDR",
    // "Vanguard Financials ETF").
    if (titleContainsAny(lc, &.{ "financial", "bank" })) {
        return sector.financial_services;
    }

    // Energy. "Energy" alone is mostly unambiguous in fund
    // conventions; pair with "oil" / "gas" for redundancy.
    if (titleContainsAny(lc, &.{ "energy", "oil & gas", "oil and gas", "petroleum" })) {
        return sector.energy;
    }

    // Real Estate / REITs.
    if (titleContainsAny(lc, &.{ "real estate", "reit" })) {
        return sector.real_estate;
    }

    // Utilities. "Utilities" alone is unambiguous.
    if (titleContainsAny(lc, &.{"utilities"})) {
        return sector.utilities;
    }

    // Consumer Discretionary / Cyclical. Match the explicit
    // labels — "consumer" alone is ambiguous (could be
    // discretionary or staples).
    if (titleContainsAny(lc, &.{ "consumer discretionary", "consumer cyclical" })) {
        return sector.consumer_cyclical;
    }

    // Consumer Staples / Defensive.
    if (titleContainsAny(lc, &.{ "consumer staples", "consumer defensive" })) {
        return sector.consumer_defensive;
    }

    // Industrials. The singular "industrial" also covers "industrials"
    // ("Industrial Select Sector SPDR"). It is skipped when the title
    // names the "Industrial Average" index, e.g. "SPDR Dow Jones
    // Industrial Average ETF Trust".
    const names_industrial_average = titleContainsAny(lc, &.{"industrial average"});
    if (titleContainsAny(lc, &.{ "aerospace", "defense" }) or
        (!names_industrial_average and titleContainsAny(lc, &.{"industrial"})))
    {
        return sector.industrials;
    }

    // Basic Materials.
    if (titleContainsAny(lc, &.{ "materials", "mining", "miners", "metals" })) {
        return sector.basic_materials;
    }

    // Communication Services. "media" has to begin a word so that
    // "intermediate" does not count; "multimedia" is listed explicitly
    // to keep matching as it did under plain substring search.
    if (titleContainsAny(lc, &.{ "communication", "telecom", "multimedia" }) or
        startsWord(lc, "media"))
    {
        return sector.communication_services;
    }

    return null;
}
|
|
|
|
/// Infer a geo bucket from a fund's title. Returns `geo.emerging` or
/// `geo.developed` (static strings, not allocations), or null when the
/// title carries no international/emerging keyword — the caller keeps
/// whatever default it has (typically US for SEC-filed funds).
///
/// Also returns null for a null or empty title and for titles longer
/// than the 256-byte scratch buffer. Matching is ASCII
/// case-insensitive.
///
/// More important than sector inference: a default `geo::US` is
/// *factually wrong* for international funds (FRDM holds
/// emerging-market equities, not US), so this tightens
/// portfolio-level geographic-exposure reporting.
///
/// Written for fund titles. Company names are not filtered out, so a
/// title such as "International Business Machines" would also match
/// the "international" keyword.
pub fn inferGeoFromTitle(title: ?[]const u8) ?[]const u8 {
    const t = title orelse return null;
    if (t.len == 0) return null;

    var buf: [256]u8 = undefined;
    const lc = lowercaseTitle(&buf, t) orelse return null;

    // True when `word` occurs in `text` at the start of a word, i.e. at
    // offset 0 or right after a non-alphanumeric byte.
    const startsWord = struct {
        fn check(text: []const u8, word: []const u8) bool {
            var from: usize = 0;
            while (std.mem.indexOfPos(u8, text, from, word)) |hit| {
                if (hit == 0 or !std.ascii.isAlphanumeric(text[hit - 1])) return true;
                from = hit + 1;
            }
            return false;
        }
    }.check;

    // Emerging markets first — most specific. "emerging market" also
    // covers the plural, and bare "frontier" covers "frontier
    // market(s)". Bare "emerging" is deliberately not a keyword.
    if (titleContainsAny(lc, &.{ "emerging market", "frontier" })) {
        return geo.emerging;
    }

    // International Developed. "International" / "Intl" /
    // "World ex ..." / "Developed Markets".
    //
    // False-positive risk: a hypothetical "Vanguard Total
    // International + US Equity Fund" would mis-tag here. None
    // of the user's current portfolio holds such a hybrid
    // fund. If one ever shows up, it'll get flagged in the
    // diff-against-old-metadata.srf review and can be
    // hand-corrected.
    if (titleContainsAny(lc, &.{ "international", " intl", "world ex", "developed market" })) {
        return geo.developed;
    }

    // "ex-US" / "ex US" / "ex U.S." markers. The "ex" has to begin a
    // word: as a bare substring, "ex us" also occurs inside
    // "... Index USD" or "... Index US ...", which are not ex-US funds.
    const ex_us_markers = [_][]const u8{ "ex-us", "ex us", "ex-u.s", "ex u.s" };
    for (ex_us_markers) |marker| {
        if (startsWord(lc, marker)) return geo.developed;
    }

    return null;
}
|
|
|
|
test "inferSectorFromTitle: null/empty -> null" {
    const inputs = [_]?[]const u8{ null, "" };
    for (inputs) |input| {
        try std.testing.expectEqual(@as(?[]const u8, null), inferSectorFromTitle(input));
    }
}
|
|
|
|
test "inferSectorFromTitle: technology keywords" {
    const titles = [_][]const u8{
        "iShares Semiconductor ETF",
        "Vanguard Software ETF",
    };
    for (titles) |fund_title| {
        try std.testing.expectEqual(@as(?[]const u8, sector.technology), inferSectorFromTitle(fund_title));
    }
}
|
|
|
|
test "inferSectorFromTitle: healthcare keywords" {
    const titles = [_][]const u8{
        "Health Care Select Sector SPDR",
        "iShares Biotech ETF",
    };
    for (titles) |fund_title| {
        try std.testing.expectEqual(@as(?[]const u8, sector.healthcare), inferSectorFromTitle(fund_title));
    }
}
|
|
|
|
test "inferSectorFromTitle: ambiguous title -> null" {
    const titles = [_][]const u8{
        "Vanguard Total Stock Market",
        "SPDR S&P 500",
    };
    for (titles) |fund_title| {
        try std.testing.expectEqual(@as(?[]const u8, null), inferSectorFromTitle(fund_title));
    }
}
|
|
|
|
test "inferGeoFromTitle: null/empty -> null" {
    const inputs = [_]?[]const u8{ null, "" };
    for (inputs) |input| {
        try std.testing.expectEqual(@as(?[]const u8, null), inferGeoFromTitle(input));
    }
}
|
|
|
|
test "inferGeoFromTitle: emerging markets" {
    const titles = [_][]const u8{
        "Freedom 100 Emerging Markets ETF",
        "iShares Frontier Markets",
    };
    for (titles) |fund_title| {
        try std.testing.expectEqual(@as(?[]const u8, geo.emerging), inferGeoFromTitle(fund_title));
    }
}
|
|
|
|
test "inferGeoFromTitle: international developed" {
    const titles = [_][]const u8{
        "Vanguard FTSE Developed Markets",
        "iShares MSCI EAFE International",
        "Vanguard Total World ex-US",
    };
    for (titles) |fund_title| {
        try std.testing.expectEqual(@as(?[]const u8, geo.developed), inferGeoFromTitle(fund_title));
    }
}
|
|
|
|
test "inferGeoFromTitle: no match -> null" {
    const titles = [_][]const u8{
        "SPDR S&P 500",
        "iShares Semiconductor ETF",
    };
    for (titles) |fund_title| {
        try std.testing.expectEqual(@as(?[]const u8, null), inferGeoFromTitle(fund_title));
    }
}
|