move Wikidata -> edgar logic into getClassification
All checks were successful
Generic zig build / build (push) Successful in 4m20s
Generic zig build / deploy (push) Successful in 17s
Generic zig build / publish-macos (push) Successful in 41s

This commit is contained in:
Emil Lerch 2026-06-01 16:11:09 -07:00
parent 4d65cc45f4
commit b796a46699
Signed by: lobo
GPG key ID: A7B62D657EF764F8
3 changed files with 953 additions and 1206 deletions

File diff suppressed because it is too large Load diff

View file

@ -129,6 +129,13 @@ pub const ClassificationRecord = struct {
industry: ?[]const u8 = null, // owned
/// ISO-3166 alpha-2 country code (e.g. "US", "GB", "DE").
country: ?[]const u8 = null, // owned
/// Resolved geo bucket (e.g. "US", "International Developed",
/// "Emerging Markets"). Populated either from `geoFor(country)`
/// or from title-keyword inference (`inferGeoFromTitle`) for
/// symbols where Wikidata didn't supply a country. Producers
/// pick one of the `geo.*` constants above; consumers compare
/// against the same constants.
geo: ?[]const u8 = null, // owned
asset_class: ?[]const u8 = null, // owned
is_etf: bool = false,
/// YYYY-MM-DD; trimmed from upstream's ISO-8601 date.
@ -148,6 +155,7 @@ pub const ClassificationRecord = struct {
if (self.sector) |s| allocator.free(s);
if (self.industry) |s| allocator.free(s);
if (self.country) |s| allocator.free(s);
if (self.geo) |s| allocator.free(s);
if (self.asset_class) |s| allocator.free(s);
if (self.inception_date) |s| allocator.free(s);
if (self.cik) |s| allocator.free(s);
@ -303,3 +311,202 @@ test "geo bucket labels are stable strings (not byte copies)" {
try std.testing.expectEqual(@intFromPtr(geo.emerging.ptr), @intFromPtr(geoFor("CN").ptr));
try std.testing.expectEqual(@intFromPtr(geo.unknown.ptr), @intFromPtr(geoFor(null).ptr));
}
// Title-keyword inference
//
// Pure functions over a fund/security title string. Used by
// `service.getClassification` to populate the sector / geo of a
// `ClassificationRecord` when Wikidata didn't carry one and the
// EDGAR ticker-map fallback fired. Lives here (not in any
// provider) because the inference is provider-agnostic and
// shares the canonical sector/geo taxonomy declared above.
/// Report whether at least one entry of `needles` occurs as a
/// substring of `haystack`. Matching is byte-exact; callers that
/// want case-insensitive matching lowercase both sides first.
/// An empty `needles` list yields false.
fn titleContainsAny(haystack: []const u8, needles: []const []const u8) bool {
    return for (needles) |candidate| {
        if (std.mem.indexOf(u8, haystack, candidate)) |_| break true;
    } else false;
}
/// ASCII-lowercase `title` into the caller-supplied `buf` so keyword
/// matching can be case-insensitive without allocating.
///
/// Returns the lowercased slice (a prefix of `buf`, same length as
/// `title`), or null when `title` does not fit in `buf`. Over-long
/// titles are rejected rather than truncated.
fn lowercaseTitle(buf: []u8, title: []const u8) ?[]const u8 {
    if (buf.len < title.len) return null;
    const lowered = buf[0..title.len];
    for (title, lowered) |src_byte, *dst_byte| {
        dst_byte.* = std.ascii.toLower(src_byte);
    }
    return lowered;
}
/// Report whether `needle` occurs in `haystack` at the start of a
/// word: at offset 0 or directly after a non-alphanumeric byte.
/// An empty needle never matches.
///
/// Used for short keywords that also occur inside unrelated words,
/// e.g. "media" inside "intermediate" or "immediate".
fn titleHasWordStartingWith(haystack: []const u8, needle: []const u8) bool {
    if (needle.len == 0) return false;
    var offset: usize = 0;
    while (offset < haystack.len) {
        const rel = std.mem.indexOf(u8, haystack[offset..], needle) orelse return false;
        const at = offset + rel;
        if (at == 0 or !std.ascii.isAlphanumeric(haystack[at - 1])) return true;
        // Hit was in the middle of a word; keep scanning after it.
        offset = at + 1;
    }
    return false;
}

/// Infer a GICS sector from a fund's title. Returns null when the
/// title is null, empty, longer than the 256-byte scratch buffer,
/// or carries no recognised keyword; the caller then keeps whatever
/// sector data the upstream source provided.
///
/// Conservative keyword set: only words that map to a single GICS
/// sector. "Income" / "Dividend" / "Value" / "Growth" / "Momentum" /
/// "Total" / "Equal Weight" / "International" / "Emerging" are
/// deliberately absent; they describe the screening methodology or
/// geo, not the sector.
///
/// Matching is case-insensitive (ASCII). Sector groups are checked
/// top to bottom and the first match wins. The returned slice is one
/// of the `sector.*` constants, not a copy.
pub fn inferSectorFromTitle(title: ?[]const u8) ?[]const u8 {
    const t = title orelse return null;
    if (t.len == 0) return null;
    var buf: [256]u8 = undefined;
    const lc = lowercaseTitle(&buf, t) orelse return null;

    // Healthcare. "Health care" with space (XLV title), "healthcare"
    // (one word), "biotech", "pharmaceutical". Checked before
    // Technology so "biotech" can never be read as a tech keyword.
    if (titleContainsAny(lc, &.{ "health care", "healthcare", "biotech", "pharmaceutical", "medical" })) {
        return sector.healthcare;
    }
    // Technology. Specific terms only; "tech" alone is too broad
    // (matches "biotech", "fintech", "edtech", all sector-mixing).
    if (titleContainsAny(lc, &.{ "semiconductor", "software", "cloud computing", "internet" })) {
        return sector.technology;
    }
    // Financial Services. "Financial" is fairly specific in
    // fund-name conventions ("Financial Select Sector SPDR",
    // "Vanguard Financials ETF").
    if (titleContainsAny(lc, &.{ "financial", "bank" })) {
        return sector.financial_services;
    }
    // Energy. "Energy" alone is mostly unambiguous in fund
    // conventions; the oil/gas/petroleum forms are there for
    // redundancy.
    if (titleContainsAny(lc, &.{ "energy", "oil & gas", "oil and gas", "petroleum" })) {
        return sector.energy;
    }
    // Real Estate / REITs.
    if (titleContainsAny(lc, &.{ "real estate", "reit" })) {
        return sector.real_estate;
    }
    // Utilities.
    if (titleContainsAny(lc, &.{"utilities"})) {
        return sector.utilities;
    }
    // Consumer Discretionary / Cyclical. Match the explicit labels;
    // "consumer" alone is ambiguous (discretionary or staples).
    if (titleContainsAny(lc, &.{ "consumer discretionary", "consumer cyclical" })) {
        return sector.consumer_cyclical;
    }
    // Consumer Staples / Defensive.
    if (titleContainsAny(lc, &.{ "consumer staples", "consumer defensive" })) {
        return sector.consumer_defensive;
    }
    // Industrials. The singular "industrial" also covers the plural
    // and fund names such as "Industrial Select Sector SPDR".
    // NOTE(review): as a plain substring this also matches index
    // names like "Dow Jones Industrial Average"; confirm whether
    // such broad-market funds can reach this path.
    if (titleContainsAny(lc, &.{ "industrial", "aerospace", "defense" })) {
        return sector.industrials;
    }
    // Basic Materials.
    if (titleContainsAny(lc, &.{ "materials", "mining", "miners", "metals" })) {
        return sector.basic_materials;
    }
    // Communication Services. "communication" / "telecom" are plain
    // substrings. "media" must start a word, otherwise bond funds
    // named "Intermediate-Term ..." (inter-MEDIA-te) would be tagged
    // as Communication Services.
    if (titleContainsAny(lc, &.{ "communication", "telecom" }) or
        titleHasWordStartingWith(lc, "media"))
    {
        return sector.communication_services;
    }
    return null;
}
/// Infer a geo bucket from a fund's title.
///
/// Returns `geo.emerging` or `geo.developed` when the title carries
/// one of the keywords below, and null otherwise (null title, empty
/// title, title longer than the 256-byte scratch buffer, or no
/// keyword). On null the caller keeps whatever default it has.
///
/// This matters more than sector inference: a default US geo is
/// factually wrong for international funds (FRDM holds
/// emerging-market equities, not US), so the result feeds
/// portfolio-level geographic-exposure reporting.
///
/// Matching is case-insensitive (ASCII) plain-substring matching.
/// NOTE(review): because of that, "ex us" also matches inside a
/// phrase such as "index us"; confirm whether any real title hits
/// this.
pub fn inferGeoFromTitle(title: ?[]const u8) ?[]const u8 {
    // Emerging / frontier markets are tested first as the most
    // specific bucket. "Frontier" is conventionally only used for
    // frontier markets in fund titles.
    const emerging_keywords = [_][]const u8{
        "emerging market",
        "emerging markets",
        "frontier market",
        "frontier markets",
        "frontier",
    };
    // International Developed: "International" / "Intl" / "ex-US" /
    // "World ex US" / "Developed Markets".
    //
    // False-positive risk: a hypothetical hybrid such as "Vanguard
    // Total International + US Equity Fund" would be tagged here.
    // Such a fund is expected to surface in the
    // diff-against-old-metadata.srf review and be hand-corrected.
    const developed_keywords = [_][]const u8{
        "international",
        " intl",
        "ex-us",
        "ex us",
        "world ex",
        "developed market",
        "developed markets",
    };

    const raw = title orelse return null;
    if (raw.len == 0) return null;

    var scratch: [256]u8 = undefined;
    const lowered = lowercaseTitle(&scratch, raw) orelse return null;

    if (titleContainsAny(lowered, &emerging_keywords)) return geo.emerging;
    if (titleContainsAny(lowered, &developed_keywords)) return geo.developed;
    return null;
}
test "inferSectorFromTitle: null/empty -> null" {
    // Neither a missing title nor an empty one may produce a sector.
    const no_sector: ?[]const u8 = null;
    for ([_]?[]const u8{ null, "" }) |input| {
        try std.testing.expectEqual(no_sector, inferSectorFromTitle(input));
    }
}
test "inferSectorFromTitle: technology keywords" {
    // Each title carries one technology keyword in mixed case.
    const want: ?[]const u8 = sector.technology;
    for ([_][]const u8{ "iShares Semiconductor ETF", "Vanguard Software ETF" }) |fund_title| {
        try std.testing.expectEqual(want, inferSectorFromTitle(fund_title));
    }
}
test "inferSectorFromTitle: healthcare keywords" {
    // Covers the two-word "Health Care" form and "Biotech".
    const want: ?[]const u8 = sector.healthcare;
    for ([_][]const u8{ "Health Care Select Sector SPDR", "iShares Biotech ETF" }) |fund_title| {
        try std.testing.expectEqual(want, inferSectorFromTitle(fund_title));
    }
}
test "inferSectorFromTitle: ambiguous title -> null" {
    // Broad-market titles carry no sector keyword at all.
    const no_sector: ?[]const u8 = null;
    for ([_][]const u8{ "Vanguard Total Stock Market", "SPDR S&P 500" }) |fund_title| {
        try std.testing.expectEqual(no_sector, inferSectorFromTitle(fund_title));
    }
}
test "inferGeoFromTitle: null/empty -> null" {
    // Neither a missing title nor an empty one may produce a geo bucket.
    const no_geo: ?[]const u8 = null;
    for ([_]?[]const u8{ null, "" }) |input| {
        try std.testing.expectEqual(no_geo, inferGeoFromTitle(input));
    }
}
test "inferGeoFromTitle: emerging markets" {
    // "Emerging Markets" and "Frontier Markets" share the emerging bucket.
    const want: ?[]const u8 = geo.emerging;
    for ([_][]const u8{ "Freedom 100 Emerging Markets ETF", "iShares Frontier Markets" }) |fund_title| {
        try std.testing.expectEqual(want, inferGeoFromTitle(fund_title));
    }
}
test "inferGeoFromTitle: international developed" {
    // One title per keyword family: "Developed Markets",
    // "International", and "World ex-US".
    const want: ?[]const u8 = geo.developed;
    const titles = [_][]const u8{
        "Vanguard FTSE Developed Markets",
        "iShares MSCI EAFE International",
        "Vanguard Total World ex-US",
    };
    for (titles) |fund_title| {
        try std.testing.expectEqual(want, inferGeoFromTitle(fund_title));
    }
}
test "inferGeoFromTitle: no match -> null" {
    // Titles without any geo keyword leave the decision to the caller.
    const no_geo: ?[]const u8 = null;
    for ([_][]const u8{ "SPDR S&P 500", "iShares Semiconductor ETF" }) |fund_title| {
        try std.testing.expectEqual(no_geo, inferGeoFromTitle(fund_title));
    }
}

View file

@ -35,6 +35,7 @@ const Yahoo = @import("providers/yahoo.zig").Yahoo;
const Tiingo = @import("providers/tiingo.zig").Tiingo;
const Wikidata = @import("providers/Wikidata.zig");
const Edgar = @import("providers/Edgar.zig");
const classification = @import("models/classification.zig");
const fmt = @import("format.zig");
const performance = @import("analytics/performance.zig");
const http = @import("net/http.zig");
@ -1037,9 +1038,9 @@ pub const DataService = struct {
// higher-quality name. Best-effort: if the fetch fails we
// still return the EDGAR-only profile.
var inception_date: ?Date = null;
if (self.getClassification(symbol, opts)) |classification| {
defer classification.deinit();
for (classification.data) |c| {
if (self.getClassification(symbol, opts)) |class_result| {
defer class_result.deinit();
for (class_result.data) |c| {
if (c.inception_date) |idate_str| {
if (Date.parse(idate_str)) |d| inception_date = d else |_| {}
}
@ -1092,11 +1093,6 @@ pub const DataService = struct {
/// `opts.skip_network = true` returns cached data even if stale,
/// `FetchFailed` on cache miss. `opts.force_refresh = true`
/// ignores the cache and re-fetches.
///
/// Callers fetching classifications for many symbols should use
/// `getClassifications(symbols)` instead: Wikidata's SPARQL API
/// is naturally batched, and one query for N symbols is much
/// cheaper than N queries for 1 symbol each.
pub fn getClassification(self: *DataService, symbol: []const u8, opts: FetchOptions) DataError!FetchResult(Wikidata.ClassificationRecord) {
var s = self.store();
@ -1131,124 +1127,282 @@ pub const DataService = struct {
const fetched = wd.fetch(self.allocator, &symbols) catch |err| {
if (err == error.RateLimited) {
self.rateLimitBackoff();
break_blk: {
const retried = wd.fetch(self.allocator, &symbols) catch break :break_blk;
if (retried.len > 0) {
s.write(Wikidata.ClassificationRecord, symbol, retried, cache.DataType.classification.ttl());
return .{ .data = retried, .source = .fetched, .timestamp = std.Io.Timestamp.now(self.io, .real).toSeconds(), .allocator = self.allocator };
}
self.allocator.free(retried);
}
if (wd.fetch(self.allocator, &symbols)) |retried| {
return self.finalizeClassification(symbol, retried, opts);
} else |_| {}
}
log.warn("{s}: wikidata fetch failed: {s}", .{ symbol, @errorName(err) });
return DataError.FetchFailed;
};
if (fetched.len == 0) {
self.allocator.free(fetched);
// Wikidata had no row for this symbol. Negative-cache to
// suppress retries until the user explicitly refreshes.
s.writeNegative(symbol, .classification);
return DataError.NotFound;
}
s.write(Wikidata.ClassificationRecord, symbol, fetched, cache.DataType.classification.ttl());
return .{ .data = fetched, .source = .fetched, .timestamp = std.Io.Timestamp.now(self.io, .real).toSeconds(), .allocator = self.allocator };
return self.finalizeClassification(symbol, fetched, opts);
}
/// Batched classification fetch. Wikidata's SPARQL API takes a
/// `VALUES ?ticker { ... }` set in one query; this method runs
/// that query for the requested set, splits the response into
/// per-symbol cache writes, and returns the slice. Symbols not
/// in Wikidata are silently dropped from the result (the user-
/// facing cache for them gets a negative entry).
/// Common post-Wikidata path: decide if the result is useful as
/// returned, otherwise consult EDGAR to fill in the gaps,
/// otherwise negative-cache. Either way the cache gets written
/// and a `FetchResult` is returned (or `DataError.NotFound`).
///
/// The cache is consulted first per-symbol; only the symbols
/// that miss the cache (or are stale) are passed to the SPARQL
/// query. This minimizes the upstream load when most symbols
/// were already classified in a prior run.
pub fn getClassifications(
/// Takes ownership of `wikidata_records`. The slice is either
/// returned as the result data, freed and replaced by a
/// synthesized slice, or freed and the symbol negative-cached.
fn finalizeClassification(
self: *DataService,
result_allocator: std.mem.Allocator,
symbols: []const []const u8,
symbol: []const u8,
wikidata_records: []Wikidata.ClassificationRecord,
opts: FetchOptions,
) DataError![]Wikidata.ClassificationRecord {
if (symbols.len == 0) return &.{};
) DataError!FetchResult(Wikidata.ClassificationRecord) {
var s = self.store();
const ttl = cache.DataType.classification.ttl();
// Identify cache misses.
var to_fetch: std.ArrayList([]const u8) = .empty;
defer to_fetch.deinit(self.allocator);
var cached_records: std.ArrayList(Wikidata.ClassificationRecord) = .empty;
errdefer {
for (cached_records.items) |*r| {
var m = r.*;
m.deinit(self.allocator);
// Wikidata returned a useful row -> populate geo from
// geoFor(country) and cache as-is.
if (wikidata_records.len > 0 and wikidataLooksUseful(wikidata_records[0])) {
try self.populateGeo(&wikidata_records[0]);
s.write(Wikidata.ClassificationRecord, symbol, wikidata_records, ttl);
return .{ .data = wikidata_records, .source = .fetched, .timestamp = std.Io.Timestamp.now(self.io, .real).toSeconds(), .allocator = self.allocator };
}
// Sparse or empty: try EDGAR fallback. `synthesizeClassification`
// takes ownership of the wikidata slice (frees it, returns a
// new one-element slice with the merged record). Returns
// `error.NotFound` when even EDGAR has nothing.
const merged = self.synthesizeClassification(symbol, wikidata_records, opts) catch |err| {
if (err == error.NotFound) {
s.writeNegative(symbol, .classification);
return DataError.NotFound;
}
cached_records.deinit(self.allocator);
}
for (symbols) |sym| {
if (!opts.force_refresh) {
if (s.read(Wikidata.ClassificationRecord, sym, null, .fresh_only)) |cached| {
// The on-disk shape is a length-1 slice.
if (cached.data.len > 0) {
try cached_records.append(self.allocator, cached.data[0]);
// Free the rest if any (shouldn't happen for
// per-symbol classification, but defensive).
for (cached.data[1..]) |*r| {
var m = r.*;
m.deinit(self.allocator);
}
self.allocator.free(cached.data);
continue;
}
self.allocator.free(cached.data);
}
}
try to_fetch.append(self.allocator, sym);
}
if (to_fetch.items.len == 0) {
// All cached assemble result from cached_records.
const out = try result_allocator.alloc(Wikidata.ClassificationRecord, cached_records.items.len);
@memcpy(out, cached_records.items);
cached_records.clearRetainingCapacity();
return out;
}
if (opts.skip_network) {
// Offline mode: return what we have from cache.
const out = try result_allocator.alloc(Wikidata.ClassificationRecord, cached_records.items.len);
@memcpy(out, cached_records.items);
cached_records.clearRetainingCapacity();
return out;
}
log.debug("fetching {d} classifications from Wikidata", .{to_fetch.items.len});
self.assertNetworkAllowed("getClassifications wikidata.fetch");
var wd = try self.getProvider(Wikidata);
const fetched = wd.fetch(self.allocator, to_fetch.items) catch |err| {
log.warn("wikidata batch fetch failed: {s}", .{@errorName(err)});
return DataError.FetchFailed;
};
defer self.allocator.free(fetched);
// Write each fetched record to its per-symbol cache file.
for (fetched) |rec| {
const single = [_]Wikidata.ClassificationRecord{rec};
s.write(Wikidata.ClassificationRecord, rec.symbol, &single, cache.DataType.classification.ttl());
s.write(Wikidata.ClassificationRecord, symbol, merged, ttl);
return .{ .data = merged, .source = .fetched, .timestamp = std.Io.Timestamp.now(self.io, .real).toSeconds(), .allocator = self.allocator };
}
/// Fill in `record.geo` from `classification.geoFor(record.country)`.
///
/// No-op when `geo` is already set, when `country` is null, or when
/// the country maps to the `geo.unknown` bucket. Otherwise stores a
/// copy of the bucket label allocated with `self.allocator`. If that
/// allocation fails, the field stays null and the error is returned
/// so the caller can decide whether to bail.
fn populateGeo(self: *DataService, record: *Wikidata.ClassificationRecord) !void {
    if (record.geo) |_| return;
    const country_code = record.country orelse return;

    const bucket = classification.geoFor(country_code);
    const is_unknown = std.mem.eql(u8, bucket, classification.geo.unknown);
    if (!is_unknown) {
        record.geo = try self.allocator.dupe(u8, bucket);
    }
}
/// Decide whether a Wikidata classification record carries enough
/// downstream-usable data to skip the EDGAR fallback.
///
/// A record counts as useful when `is_etf` is true or any of
/// `asset_class`, `country`, or `sector` is set. Sparse records
/// (e.g. SOXX getting only a `name` from Wikidata) are not useful
/// and need the EDGAR ticker-map fallback to fill in
/// `is_etf=true, asset_class=ETF, country=US`.
fn wikidataLooksUseful(c: Wikidata.ClassificationRecord) bool {
    return c.is_etf or
        c.asset_class != null or
        c.country != null or
        c.sector != null;
}
/// Synthesize a `ClassificationRecord` for a symbol that
/// Wikidata couldn't classify usefully. Consults the EDGAR
/// ticker maps; if found, also fetches `getEtfMetrics` to
/// recover the NPORT-P series_name (more authoritative than
/// the company_tickers title). Title-keyword inference fills
/// in `sector` and `geo` when the name carries an unambiguous
/// keyword.
///
/// Takes ownership of `wikidata_records`: frees them at exit.
/// Wikidata's `name`/`industry`/`inception_date`/`cik` fields
/// are preserved into the synthesized record when present.
/// Returns `error.NotFound` when EDGAR has nothing either.
fn synthesizeClassification(
self: *DataService,
symbol: []const u8,
wikidata_records: []Wikidata.ClassificationRecord,
opts: FetchOptions,
) !cache.Store.DataFor(Wikidata.ClassificationRecord) {
defer Wikidata.ClassificationRecord.freeSlice(self.allocator, wikidata_records);
const lookup = self.lookupEdgarFallback(symbol, opts);
defer freeEdgarLookup(self.allocator, lookup);
if (lookup == .none) return error.NotFound;
// For ETF/fund hits, try to get the richer series_name from
// NPORT-P. Cache hit is cheap; cache miss triggers an EDGAR
// fetch but is bounded by EDGAR's rate limiter. If the call
// fails (e.g. money-market funds with no NPORT-P), we fall
// back to the ticker-map title.
var etf_metrics_result: ?FetchResult(Edgar.EtfMetricRecord) = null;
defer if (etf_metrics_result) |*r| r.deinit();
etf_metrics_result = self.getEtfMetrics(symbol, opts) catch null;
// Extract series_name and cik from the etf_metrics profile row.
var series_name: ?[]const u8 = null;
var etf_cik: ?[]const u8 = null;
if (etf_metrics_result) |r| {
for (r.data) |rec| switch (rec) {
.profile => |p| {
if (p.series_name) |sn| series_name = sn;
etf_cik = p.cik;
break;
},
else => {},
};
}
// Combine cached + fetched into the result.
const total = cached_records.items.len + fetched.len;
const out = try result_allocator.alloc(Wikidata.ClassificationRecord, total);
@memcpy(out[0..cached_records.items.len], cached_records.items);
@memcpy(out[cached_records.items.len..], fetched);
cached_records.clearRetainingCapacity();
return out;
// Pull whatever Wikidata's sparse record carried so we
// don't lose data on the merge.
const wd: ?Wikidata.ClassificationRecord = if (wikidata_records.len > 0) wikidata_records[0] else null;
// Pick the best name source: NPORT-P series_name >
// EDGAR ticker-map title > Wikidata name > nothing.
//
// We're on the EDGAR-fallback path because Wikidata's
// record was sparse. For funds, Wikidata's `name` (when
// present) is frequently the underlying INDEX rather than
// the FUND itself -- e.g. SOXX's Wikidata `name` is "PHLX
// Semiconductor Sector" but the fund is "iShares
// Semiconductor ETF" per NPORT-P seriesName. Prefer the
// fund-authoritative source so downstream comments and
// labels show the fund name, not the index name.
const ticker_title: ?[]const u8 = switch (lookup) {
.company_or_uit => |c| c.title,
else => null,
};
const best_name: ?[]const u8 = blk: {
if (series_name) |n| break :blk n;
if (ticker_title) |n| break :blk n;
if (wd) |w| {
if (w.name) |n| break :blk n;
}
break :blk null;
};
// Name source for title-keyword inference: prefer the
// most-authoritative source for fund-style classification
// even when Wikidata supplied a (different) name. Wikidata's
// name for a fund is often less informative than NPORT-P's
// seriesName (e.g. SOXX's Wikidata name is "PHLX
// Semiconductor Sector" which is the index name, not the
// fund name).
const inference_name: ?[]const u8 = series_name orelse ticker_title orelse if (wd) |w| w.name else null;
const inferred_sector = classification.inferSectorFromTitle(inference_name);
const inferred_geo = classification.inferGeoFromTitle(inference_name);
// `is_etf` here means "this is fund-shaped, emit multi-row
// breakdown" -- true for ANY EDGAR-found symbol. The
// `tickers_funds.srf` map mixes mutual funds and
// series-of-trust ETFs alike. The `tickers_companies.srf`
// map carries operating companies, closed-end funds, and
// UITs; operating companies usually have Wikidata coverage
// and wouldn't reach this fallback, so anything that
// dropped here is also fund-shaped (e.g. PIMCO closed-end
// funds whose title says "FUND" but not "ETF" or "TRUST").
//
// The ETF/TRUST keyword in the title still drives the
// asset_class label below ("ETF" vs "Fund"), but the
// fund-shaped routing decision applies regardless.
const is_etf = true;
const asset_class: []const u8 = switch (lookup) {
.managed_fund => "Fund",
.company_or_uit => |c| if (c.is_etf) "ETF" else "Fund",
.none => unreachable,
};
// Country: prefer Wikidata's. Default to "US" for
// EDGAR-found symbols (they're SEC filers).
const country_str: []const u8 = if (wd) |w| (w.country orelse "US") else "US";
// Sector: prefer Wikidata's existing sector (rare in this
// sparse-fallback path), else fall back to inferred.
const sector_str: ?[]const u8 = blk: {
if (wd) |w| {
if (w.sector) |sec| break :blk sec;
}
break :blk inferred_sector;
};
// CIK: prefer Wikidata's, fall back to NPORT-P's.
const cik_str: ?[]const u8 = blk: {
if (wd) |w| {
if (w.cik) |c| break :blk c;
}
if (etf_cik) |c| break :blk c;
break :blk null;
};
// Geo: prefer the Wikidata-derived geo (computed from
// `geoFor(country)` against the country code), else use
// title-keyword inference. Default to "US" when neither
// is available -- EDGAR-found symbols are SEC filers.
const geo_str: []const u8 = blk: {
if (wd) |w| {
if (w.country) |c| {
const g = classification.geoFor(c);
if (!std.mem.eql(u8, g, classification.geo.unknown)) break :blk g;
}
}
if (inferred_geo) |g| break :blk g;
break :blk classification.geo.us;
};
const today = fmt.todayDate(self.io);
var as_of_buf: [10]u8 = undefined;
const as_of_str = try std.fmt.bufPrint(&as_of_buf, "{f}", .{today});
// Allocate each owned field up front with its own errdefer
// so a partial-build on OOM doesn't leak the earlier
// successful dupes. Once all dupes succeed we assemble the
// record (no fallible ops below this point).
const symbol_owned = try self.allocator.dupe(u8, symbol);
errdefer self.allocator.free(symbol_owned);
const name_owned: ?[]const u8 = if (best_name) |n| try self.allocator.dupe(u8, n) else null;
errdefer if (name_owned) |s| self.allocator.free(s);
const sector_owned: ?[]const u8 = if (sector_str) |s| try self.allocator.dupe(u8, s) else null;
errdefer if (sector_owned) |s| self.allocator.free(s);
const industry_owned: ?[]const u8 = if (wd) |w|
(if (w.industry) |i| try self.allocator.dupe(u8, i) else null)
else
null;
errdefer if (industry_owned) |s| self.allocator.free(s);
const country_owned = try self.allocator.dupe(u8, country_str);
errdefer self.allocator.free(country_owned);
const geo_owned = try self.allocator.dupe(u8, geo_str);
errdefer self.allocator.free(geo_owned);
const asset_class_owned = try self.allocator.dupe(u8, asset_class);
errdefer self.allocator.free(asset_class_owned);
const inception_owned: ?[]const u8 = if (wd) |w|
(if (w.inception_date) |i| try self.allocator.dupe(u8, i) else null)
else
null;
errdefer if (inception_owned) |s| self.allocator.free(s);
const cik_owned: ?[]const u8 = if (cik_str) |c| try self.allocator.dupe(u8, c) else null;
errdefer if (cik_owned) |s| self.allocator.free(s);
const as_of_owned = try self.allocator.dupe(u8, as_of_str);
errdefer self.allocator.free(as_of_owned);
const source_owned = try self.allocator.dupe(u8, "edgar_fallback");
errdefer self.allocator.free(source_owned);
const result = try self.allocator.alloc(Wikidata.ClassificationRecord, 1);
result[0] = .{
.symbol = symbol_owned,
.name = name_owned,
.sector = sector_owned,
.industry = industry_owned,
.country = country_owned,
.geo = geo_owned,
.asset_class = asset_class_owned,
.is_etf = is_etf,
.inception_date = inception_owned,
.cik = cik_owned,
.as_of = as_of_owned,
.source = source_owned,
};
return result;
}
/// Fetch XBRL-derived entity facts for a CIK (currently
@ -2955,6 +3109,326 @@ test "getClassification: cache hit returns cached data without network" {
try std.testing.expectEqual(Source.cached, result.source);
}
test "populateGeo: country US -> geo US" {
    const gpa = std.testing.allocator;
    const test_io = std.testing.io;

    // Point the service at a throwaway cache directory.
    var scratch = std.testing.tmpDir(.{});
    defer scratch.cleanup();
    const cache_root = try scratch.dir.realPathFileAlloc(test_io, ".", gpa);
    defer gpa.free(cache_root);

    const cfg = Config{ .cache_dir = cache_root };
    var service = DataService.init(test_io, gpa, cfg);
    defer service.deinit();

    // Record with a country but no geo yet.
    var rec: Wikidata.ClassificationRecord = .{
        .symbol = try gpa.dupe(u8, "TEST"),
        .country = try gpa.dupe(u8, "US"),
        .as_of = try gpa.dupe(u8, "2026-06-01"),
        .source = try gpa.dupe(u8, "wikidata"),
    };
    defer rec.deinit(gpa);

    try service.populateGeo(&rec);

    const resolved = rec.geo orelse return error.TestUnexpectedResult;
    try std.testing.expectEqualStrings("US", resolved);
}
test "populateGeo: country GB -> geo International Developed" {
    const gpa = std.testing.allocator;
    const test_io = std.testing.io;

    // Point the service at a throwaway cache directory.
    var scratch = std.testing.tmpDir(.{});
    defer scratch.cleanup();
    const cache_root = try scratch.dir.realPathFileAlloc(test_io, ".", gpa);
    defer gpa.free(cache_root);

    const cfg = Config{ .cache_dir = cache_root };
    var service = DataService.init(test_io, gpa, cfg);
    defer service.deinit();

    // A non-US developed-market country code.
    var rec: Wikidata.ClassificationRecord = .{
        .symbol = try gpa.dupe(u8, "TEST"),
        .country = try gpa.dupe(u8, "GB"),
        .as_of = try gpa.dupe(u8, "2026-06-01"),
        .source = try gpa.dupe(u8, "wikidata"),
    };
    defer rec.deinit(gpa);

    try service.populateGeo(&rec);

    const resolved = rec.geo orelse return error.TestUnexpectedResult;
    try std.testing.expectEqualStrings("International Developed", resolved);
}
test "populateGeo: null country -> noop" {
    const gpa = std.testing.allocator;
    const test_io = std.testing.io;

    // Point the service at a throwaway cache directory.
    var scratch = std.testing.tmpDir(.{});
    defer scratch.cleanup();
    const cache_root = try scratch.dir.realPathFileAlloc(test_io, ".", gpa);
    defer gpa.free(cache_root);

    const cfg = Config{ .cache_dir = cache_root };
    var service = DataService.init(test_io, gpa, cfg);
    defer service.deinit();

    // No country on the record, so there is nothing to derive geo from.
    var rec: Wikidata.ClassificationRecord = .{
        .symbol = try gpa.dupe(u8, "TEST"),
        .as_of = try gpa.dupe(u8, "2026-06-01"),
        .source = try gpa.dupe(u8, "wikidata"),
    };
    defer rec.deinit(gpa);

    try service.populateGeo(&rec);

    const untouched: ?[]const u8 = null;
    try std.testing.expectEqual(untouched, rec.geo);
}
test "populateGeo: existing geo not overwritten" {
    const gpa = std.testing.allocator;
    const test_io = std.testing.io;

    // Point the service at a throwaway cache directory.
    var scratch = std.testing.tmpDir(.{});
    defer scratch.cleanup();
    const cache_root = try scratch.dir.realPathFileAlloc(test_io, ".", gpa);
    defer gpa.free(cache_root);

    const cfg = Config{ .cache_dir = cache_root };
    var service = DataService.init(test_io, gpa, cfg);
    defer service.deinit();

    // geo is pre-populated with a value that geoFor("US") would not
    // produce, so any overwrite is detectable.
    var rec: Wikidata.ClassificationRecord = .{
        .symbol = try gpa.dupe(u8, "TEST"),
        .country = try gpa.dupe(u8, "US"),
        .geo = try gpa.dupe(u8, "Already Set"),
        .as_of = try gpa.dupe(u8, "2026-06-01"),
        .source = try gpa.dupe(u8, "wikidata"),
    };
    defer rec.deinit(gpa);

    try service.populateGeo(&rec);

    try std.testing.expectEqualStrings("Already Set", rec.geo.?);
}
test "getClassification: sparse Wikidata + EDGAR managed_fund hit produces merged record" {
// Despite the name, this test calls synthesizeClassification
// directly with a hand-built sparse Wikidata slice; no Wikidata
// fetch and no getClassification call happens here.
const allocator = std.testing.allocator;
const io = std.testing.io;
// Fresh temp directory used as the service's cache root.
var tmp = std.testing.tmpDir(.{});
defer tmp.cleanup();
const dir_path = try tmp.dir.realPathFileAlloc(io, ".", allocator);
defer allocator.free(dir_path);
const config = Config{ .cache_dir = dir_path };
var svc = DataService.init(io, allocator, config);
defer svc.deinit();
// Seed both EDGAR ticker map caches with at least one entry
// each so the synthesizeClassification path doesn't try to
// fetch them (the load helpers treat empty cached slices as
// "miss" and fall through to a network fetch).
var s = svc.store();
// FAGIX lives in the mutual-fund map, so the assertions below
// expect the "Fund" asset class.
var mf_entries = [_]Edgar.MutualFundTickerEntry{.{
.symbol = "FAGIX",
.cik = "0000275309",
}};
s.write(Edgar.MutualFundTickerEntry, "_edgar", mf_entries[0..], cache.DataType.tickers_funds.ttl());
// The company map only holds an unrelated placeholder symbol.
var co_entries = [_]Edgar.CompanyTickerEntry{.{
.symbol = "DUMMY",
.cik = "0000000001",
}};
s.write(Edgar.CompanyTickerEntry, "_edgar", co_entries[0..], cache.DataType.tickers_companies.ttl());
// Seed an etf_metrics negative cache so getEtfMetrics doesn't
// try to fetch from the network.
s.writeNegative("FAGIX", .etf_metrics);
// Sparse Wikidata records (length 1, only name set -- not useful).
var sparse = try allocator.alloc(Wikidata.ClassificationRecord, 1);
sparse[0] = .{
.symbol = try allocator.dupe(u8, "FAGIX"),
.name = try allocator.dupe(u8, "Test Fund"),
.as_of = try allocator.dupe(u8, "2026-06-01"),
.source = try allocator.dupe(u8, "wikidata"),
};
// Drive directly through synthesizeClassification (skip the
// Wikidata fetch). It takes ownership of `sparse`.
svc.panic_on_network_attempt = true; // any provider call -> panic
const merged = try svc.synthesizeClassification("FAGIX", sparse, .{ .skip_network = true });
defer Wikidata.ClassificationRecord.freeSlice(allocator, merged);
// Exactly one synthesized record, tagged as coming from the
// EDGAR fallback with US country and geo.
try std.testing.expectEqual(@as(usize, 1), merged.len);
const c = merged[0];
try std.testing.expectEqualStrings("FAGIX", c.symbol);
try std.testing.expect(c.is_etf);
try std.testing.expectEqualStrings("Fund", c.asset_class.?);
try std.testing.expectEqualStrings("US", c.country.?);
try std.testing.expectEqualStrings("US", c.geo.?);
try std.testing.expectEqualStrings("edgar_fallback", c.source);
// Wikidata's name preserved on merge.
try std.testing.expectEqualStrings("Test Fund", c.name.?);
}
test "synthesizeClassification: no EDGAR hit returns NotFound" {
// Verifies that a symbol absent from both seeded EDGAR ticker
// maps yields error.NotFound.
const allocator = std.testing.allocator;
const io = std.testing.io;
// Fresh temp directory used as the service's cache root.
var tmp = std.testing.tmpDir(.{});
defer tmp.cleanup();
const dir_path = try tmp.dir.realPathFileAlloc(io, ".", allocator);
defer allocator.free(dir_path);
const config = Config{ .cache_dir = dir_path };
var svc = DataService.init(io, allocator, config);
defer svc.deinit();
// Seed both ticker maps with throwaway entries so the
// EDGAR lookup returns .none for our test symbol but doesn't
// try to fetch the maps from the network.
var s = svc.store();
var mf_entries = [_]Edgar.MutualFundTickerEntry{.{
.symbol = "DUMMY1",
.cik = "0000000001",
}};
s.write(Edgar.MutualFundTickerEntry, "_edgar", mf_entries[0..], cache.DataType.tickers_funds.ttl());
var co_entries = [_]Edgar.CompanyTickerEntry{.{
.symbol = "DUMMY2",
.cik = "0000000002",
}};
s.write(Edgar.CompanyTickerEntry, "_edgar", co_entries[0..], cache.DataType.tickers_companies.ttl());
// Sparse Wikidata slice for a symbol that is in neither map.
// Nothing in this test frees `sparse`; the callee is documented
// as taking ownership of it.
var sparse = try allocator.alloc(Wikidata.ClassificationRecord, 1);
sparse[0] = .{
.symbol = try allocator.dupe(u8, "NEVERHEARDOFIT"),
.name = try allocator.dupe(u8, "ghost"),
.as_of = try allocator.dupe(u8, "2026-06-01"),
.source = try allocator.dupe(u8, "wikidata"),
};
// Any provider call from here on panics, so the NotFound must
// come from the seeded caches alone.
svc.panic_on_network_attempt = true;
try std.testing.expectError(error.NotFound, svc.synthesizeClassification("NEVERHEARDOFIT", sparse, .{ .skip_network = true }));
}
test "synthesizeClassification: company_or_uit without ETF/TRUST keyword still routes to multi-row" {
    // Scenario modelled on PTY, a closed-end fund that appears in
    // company_tickers as "PIMCO CORPORATE & INCOME OPPORTUNITY FUND".
    // The title contains neither "ETF" nor "TRUST", so
    // lookupInTickerMaps yields .company_or_uit{is_etf=false} --
    // yet the security is fund-shaped and enrich should emit
    // multi-row metadata for it.
    //
    // Downstream code keys "fund-like, emit multi-row" off
    // ClassificationRecord.is_etf. Every EDGAR-found
    // .company_or_uit hit is therefore expected to set it, with or
    // without the keyword, so PTY-shaped closed-end funds are
    // handled the same way as ETFs.
    const gpa = std.testing.allocator;
    const io = std.testing.io;
    const sym = "PTY";

    var scratch = std.testing.tmpDir(.{});
    defer scratch.cleanup();
    const cache_root = try scratch.dir.realPathFileAlloc(io, ".", gpa);
    defer gpa.free(cache_root);
    const cfg = Config{ .cache_dir = cache_root };
    var svc = DataService.init(io, gpa, cfg);
    defer svc.deinit();

    var cache_store = svc.store();

    // Mutual-fund map: one unrelated entry, so the MF lookup for
    // PTY returns null.
    var fund_rows = [_]Edgar.MutualFundTickerEntry{
        .{ .symbol = "DUMMY", .cik = "0000000001" },
    };
    cache_store.write(Edgar.MutualFundTickerEntry, "_edgar", &fund_rows, cache.DataType.tickers_funds.ttl());

    // Company map: PTY is present, title WITHOUT "ETF"/"TRUST".
    var company_rows = [_]Edgar.CompanyTickerEntry{
        .{
            .symbol = "PTY",
            .cik = "0001202604",
            .title = "PIMCO CORPORATE & INCOME OPPORTUNITY FUND",
        },
    };
    cache_store.write(Edgar.CompanyTickerEntry, "_edgar", &company_rows, cache.DataType.tickers_companies.ttl());
    cache_store.writeNegative(sym, .etf_metrics);

    // Sparse Wikidata input, handed over to synthesizeClassification.
    const wikidata_rows = try gpa.alloc(Wikidata.ClassificationRecord, 1);
    wikidata_rows[0] = .{
        .symbol = try gpa.dupe(u8, sym),
        .name = try gpa.dupe(u8, "PIMCO Corporate & Income Opportunity Fund"),
        .as_of = try gpa.dupe(u8, "2026-06-01"),
        .source = try gpa.dupe(u8, "wikidata"),
    };

    svc.panic_on_network_attempt = true;
    const result = try svc.synthesizeClassification(sym, wikidata_rows, .{ .skip_network = true });
    defer Wikidata.ClassificationRecord.freeSlice(gpa, result);

    try std.testing.expectEqual(@as(usize, 1), result.len);
    const rec = result[0];
    // is_etf has to be true for enrich to go through emitEtfRows
    // (the multi-row sleeve breakdown). asset_class remains "Fund"
    // since the title lacks the ETF/TRUST keyword.
    try std.testing.expect(rec.is_etf);
    try std.testing.expectEqualStrings("Fund", rec.asset_class.?);
}
test "synthesizeClassification: NPORT-P series_name beats Wikidata's index name for funds" {
    // Scenario modelled on SOXX. Wikidata supplies the name of the
    // underlying INDEX ("PHLX Semiconductor Sector"), whereas
    // downstream consumers want the FUND name
    // ("iShares Semiconductor ETF") carried by the NPORT-P
    // <seriesName>. For the fund itself the series name is treated
    // as the more authoritative of the two.
    const gpa = std.testing.allocator;
    const io = std.testing.io;
    const sym = "SOXX";

    var scratch = std.testing.tmpDir(.{});
    defer scratch.cleanup();
    const cache_root = try scratch.dir.realPathFileAlloc(io, ".", gpa);
    defer gpa.free(cache_root);
    const cfg = Config{ .cache_dir = cache_root };
    var svc = DataService.init(io, gpa, cfg);
    defer svc.deinit();

    var cache_store = svc.store();

    // Ticker maps: an unrelated mutual-fund entry, plus SOXX in the
    // company map under the "iShares Trust" title.
    var fund_rows = [_]Edgar.MutualFundTickerEntry{
        .{ .symbol = "DUMMY", .cik = "0000000001" },
    };
    cache_store.write(Edgar.MutualFundTickerEntry, "_edgar", &fund_rows, cache.DataType.tickers_funds.ttl());
    var company_rows = [_]Edgar.CompanyTickerEntry{
        .{
            .symbol = "SOXX",
            .cik = "0001100663",
            .title = "iShares Trust",
        },
    };
    cache_store.write(Edgar.CompanyTickerEntry, "_edgar", &company_rows, cache.DataType.tickers_companies.ttl());

    // Cached etf_metrics: a single profile row whose series_name is
    // the NPORT-P seriesName. The strings are owned by this test and
    // released by the deferred loop.
    var etf_rows = [_]Edgar.EtfMetricRecord{
        .{ .profile = .{
            .symbol = try gpa.dupe(u8, "SOXX"),
            .series_name = try gpa.dupe(u8, "iShares Semiconductor ETF"),
            .cik = try gpa.dupe(u8, "0001100663"),
            .as_of = try gpa.dupe(u8, "2026-06-01"),
            .source = try gpa.dupe(u8, "edgar"),
        } },
    };
    defer for (etf_rows) |row| row.deinit(gpa);
    cache_store.write(Edgar.EtfMetricRecord, sym, &etf_rows, cache.DataType.etf_metrics.ttl());

    // Sparse Wikidata input carrying only the index name; handed
    // over to synthesizeClassification.
    const wikidata_rows = try gpa.alloc(Wikidata.ClassificationRecord, 1);
    wikidata_rows[0] = .{
        .symbol = try gpa.dupe(u8, sym),
        .name = try gpa.dupe(u8, "PHLX Semiconductor Sector"),
        .as_of = try gpa.dupe(u8, "2026-06-01"),
        .source = try gpa.dupe(u8, "wikidata"),
    };

    svc.panic_on_network_attempt = true;
    const result = try svc.synthesizeClassification(sym, wikidata_rows, .{ .skip_network = true });
    defer Wikidata.ClassificationRecord.freeSlice(gpa, result);

    try std.testing.expectEqual(@as(usize, 1), result.len);
    const rec = result[0];
    // The NPORT-P series_name must win over Wikidata's index name.
    try std.testing.expectEqualStrings("iShares Semiconductor ETF", rec.name.?);
}
test "getEntityFacts: skip_network with no cache returns FetchFailed" {
const allocator = std.testing.allocator;
const io = std.testing.io;