move Wikidata -> edgar logic into getClassification
This commit is contained in:
parent
4d65cc45f4
commit
b796a46699
3 changed files with 953 additions and 1206 deletions
File diff suppressed because it is too large
Load diff
|
|
@ -129,6 +129,13 @@ pub const ClassificationRecord = struct {
|
|||
industry: ?[]const u8 = null, // owned
|
||||
/// ISO-3166 alpha-2 country code (e.g. "US", "GB", "DE").
|
||||
country: ?[]const u8 = null, // owned
|
||||
/// Resolved geo bucket (e.g. "US", "International Developed",
|
||||
/// "Emerging Markets"). Populated either from `geoFor(country)`
|
||||
/// or from title-keyword inference (`inferGeoFromTitle`) for
|
||||
/// symbols where Wikidata didn't supply a country. Producers
|
||||
/// pick one of the `geo.*` constants above; consumers compare
|
||||
/// against the same constants.
|
||||
geo: ?[]const u8 = null, // owned
|
||||
asset_class: ?[]const u8 = null, // owned
|
||||
is_etf: bool = false,
|
||||
/// YYYY-MM-DD; trimmed from upstream's ISO-8601 date.
|
||||
|
|
@ -148,6 +155,7 @@ pub const ClassificationRecord = struct {
|
|||
if (self.sector) |s| allocator.free(s);
|
||||
if (self.industry) |s| allocator.free(s);
|
||||
if (self.country) |s| allocator.free(s);
|
||||
if (self.geo) |s| allocator.free(s);
|
||||
if (self.asset_class) |s| allocator.free(s);
|
||||
if (self.inception_date) |s| allocator.free(s);
|
||||
if (self.cik) |s| allocator.free(s);
|
||||
|
|
@ -303,3 +311,202 @@ test "geo bucket labels are stable strings (not byte copies)" {
|
|||
try std.testing.expectEqual(@intFromPtr(geo.emerging.ptr), @intFromPtr(geoFor("CN").ptr));
|
||||
try std.testing.expectEqual(@intFromPtr(geo.unknown.ptr), @intFromPtr(geoFor(null).ptr));
|
||||
}
|
||||
|
||||
// ── Title-keyword inference ──────────────────────────────────
|
||||
//
|
||||
// Pure functions over a fund/security title string. Used by
|
||||
// `service.getClassification` to populate the sector / geo of a
|
||||
// `ClassificationRecord` when Wikidata didn't carry one and the
|
||||
// EDGAR ticker-map fallback fired. Lives here (not in any
|
||||
// provider) because the inference is provider-agnostic and
|
||||
// shares the canonical sector/geo taxonomy declared above.
|
||||
|
||||
/// Report whether `haystack` contains at least one entry of `needles`
/// as a byte-exact substring. The comparison is case-sensitive, so
/// callers that want case-insensitive matching must normalize both
/// sides first. An empty `needles` list yields `false`.
fn titleContainsAny(haystack: []const u8, needles: []const []const u8) bool {
    var idx: usize = 0;
    while (idx < needles.len) : (idx += 1) {
        if (std.mem.indexOf(u8, haystack, needles[idx])) |_| return true;
    }
    return false;
}
|
||||
|
||||
/// Lowercase `title` (ASCII only) into `buf` for case-insensitive
/// keyword matching and return the written prefix of `buf`.
///
/// Titles longer than `buf` are truncated to `buf.len` bytes instead
/// of being rejected: a keyword found in the truncated prefix is also
/// present in the full title, so truncation can only miss matches,
/// never invent them. Real fund names easily fit in 256 bytes.
///
/// The return type stays optional for the existing callers, which
/// unwrap it with `orelse`; this implementation never returns null.
/// The returned slice aliases `buf` and is only valid while `buf` is.
fn lowercaseTitle(buf: []u8, title: []const u8) ?[]const u8 {
    const len = @min(title.len, buf.len);
    return std.ascii.lowerString(buf[0..len], title[0..len]);
}
|
||||
|
||||
/// Report whether any of `needles` occurs in `haystack` at the start
/// of a word: at offset 0, or directly after a byte that is not an
/// ASCII letter or digit. Used for short keywords that would otherwise
/// match inside unrelated words ("media" inside "intermediate",
/// "ex us" inside "index us"). Case-sensitive, like `titleContainsAny`.
fn titleContainsWordStart(haystack: []const u8, needles: []const []const u8) bool {
    for (needles) |needle| {
        var from: usize = 0;
        while (std.mem.indexOf(u8, haystack[from..], needle)) |rel| {
            const at = from + rel;
            if (at == 0 or !std.ascii.isAlphanumeric(haystack[at - 1])) return true;
            // Mid-word hit; keep scanning for a later occurrence.
            from = at + 1;
        }
    }
    return false;
}

/// Infer a GICS sector from a fund's title. Returns null when the
/// title is null/empty, when `lowercaseTitle` cannot produce a
/// lowercase copy, or when no keyword matches — the caller then falls
/// back to whatever sector data the upstream source provided.
///
/// Conservative keyword set: matches only words that map
/// unambiguously to a single GICS sector. "Income" / "Dividend" /
/// "Value" / "Growth" / "Momentum" / "Total" / "Equal Weight" /
/// "International" / "Emerging" don't appear here — they describe the
/// screening methodology or geo, not the sector.
///
/// The returned slice is one of the `sector.*` constants (not a
/// copy), so the inference taxonomy stays in lock-step with the
/// canonicalizer.
pub fn inferSectorFromTitle(title: ?[]const u8) ?[]const u8 {
    const t = title orelse return null;
    if (t.len == 0) return null;

    var buf: [256]u8 = undefined;
    const lc = lowercaseTitle(&buf, t) orelse return null;

    // The checks run top to bottom and the first hit wins, so a title
    // that carries keywords of two sectors resolves to the earlier one.

    // Healthcare. "Health care" with a space (XLV title), "healthcare"
    // (one word), "biotech", "pharmaceutical", "medical".
    if (titleContainsAny(lc, &.{ "health care", "healthcare", "biotech", "pharmaceutical", "medical" })) {
        return sector.healthcare;
    }

    // Technology. Specific terms only — "tech" alone is too broad
    // (matches "biotech", "fintech", "edtech" — all sector-mixing).
    if (titleContainsAny(lc, &.{ "semiconductor", "software", "cloud computing", "internet" })) {
        return sector.technology;
    }

    // Financial Services. "Financial" is fairly specific in fund-name
    // conventions ("Financial Select Sector SPDR", "Vanguard
    // Financials ETF").
    if (titleContainsAny(lc, &.{ "financial", "bank" })) {
        return sector.financial_services;
    }

    // Energy. "Energy" alone is mostly unambiguous in fund
    // conventions; the oil/gas/petroleum spellings add redundancy.
    if (titleContainsAny(lc, &.{ "energy", "oil & gas", "oil and gas", "petroleum" })) {
        return sector.energy;
    }

    // Real Estate / REITs.
    if (titleContainsAny(lc, &.{ "real estate", "reit" })) {
        return sector.real_estate;
    }

    // Utilities. "Utilities" alone is unambiguous.
    if (titleContainsAny(lc, &.{"utilities"})) {
        return sector.utilities;
    }

    // Consumer Discretionary / Cyclical. Match the explicit labels —
    // "consumer" alone is ambiguous (discretionary or staples).
    if (titleContainsAny(lc, &.{ "consumer discretionary", "consumer cyclical" })) {
        return sector.consumer_cyclical;
    }

    // Consumer Staples / Defensive.
    if (titleContainsAny(lc, &.{ "consumer staples", "consumer defensive" })) {
        return sector.consumer_defensive;
    }

    // Industrials. The singular "industrial" also covers the plural
    // and names such as "Industrial Select Sector SPDR".
    if (titleContainsAny(lc, &.{ "industrial", "aerospace", "defense" })) {
        return sector.industrials;
    }

    // Basic Materials.
    if (titleContainsAny(lc, &.{ "materials", "mining", "miners", "metals" })) {
        return sector.basic_materials;
    }

    // Communication Services. "media" is only accepted at the start of
    // a word: as a plain substring it also hits "intermediate" and
    // "immediate", which would tag every "Intermediate-Term Bond" fund
    // as Communication Services. "multimedia" is listed explicitly so
    // it keeps matching.
    if (titleContainsAny(lc, &.{ "communication", "telecom", "multimedia" }) or
        titleContainsWordStart(lc, &.{"media"}))
    {
        return sector.communication_services;
    }

    return null;
}

/// Infer a geo bucket from a fund's title. Returns null when the
/// title is null/empty, when `lowercaseTitle` cannot produce a
/// lowercase copy, or when the title doesn't carry an unambiguous
/// international/emerging keyword — the caller keeps whatever default
/// it has (typically US for SEC-filed funds).
///
/// More important than sector inference: a default `geo.us` is
/// *factually wrong* for international funds (FRDM holds
/// emerging-market equities, not US), so this tightens
/// portfolio-level geographic-exposure reporting.
///
/// The returned slice is one of the `geo.*` constants (not a copy).
pub fn inferGeoFromTitle(title: ?[]const u8) ?[]const u8 {
    const t = title orelse return null;
    if (t.len == 0) return null;

    var buf: [256]u8 = undefined;
    const lc = lowercaseTitle(&buf, t) orelse return null;

    // Emerging markets first — most specific. "emerging market" also
    // covers the plural; "frontier" is conventionally only used for
    // frontier markets in fund titles and covers "frontier market(s)".
    if (titleContainsAny(lc, &.{ "emerging market", "frontier" })) {
        return geo.emerging;
    }

    // International Developed. "International" / "Intl" /
    // "World ex ..." / "Developed Market(s)" as plain substrings, plus
    // the "ex-US" family. The "ex-US" spellings must start a word so
    // that "Index US ..." is not mistaken for "ex US"; the "U.S."
    // variants cover titles such as "MSCI ACWI ex U.S.".
    //
    // False-positive risk: a hypothetical "Total International + US
    // Equity Fund" would mis-tag here. Such hybrids are expected to be
    // caught in the diff-against-old-metadata.srf review and
    // hand-corrected.
    if (titleContainsAny(lc, &.{ "international", " intl", "world ex", "developed market" }) or
        titleContainsWordStart(lc, &.{ "ex-us", "ex us", "ex-u.s", "ex u.s" }))
    {
        return geo.developed;
    }

    return null;
}
|
||||
|
||||
test "inferSectorFromTitle: null/empty -> null" {
    // Both "no title" shapes must short-circuit to null.
    const inputs = [_]?[]const u8{ null, "" };
    for (inputs) |title| {
        try std.testing.expectEqual(@as(?[]const u8, null), inferSectorFromTitle(title));
    }
}
|
||||
|
||||
test "inferSectorFromTitle: technology keywords" {
    // Each title carries exactly one technology keyword.
    const want: ?[]const u8 = sector.technology;
    const titles = [_][]const u8{
        "iShares Semiconductor ETF",
        "Vanguard Software ETF",
    };
    for (titles) |title| {
        try std.testing.expectEqual(want, inferSectorFromTitle(title));
    }
}
|
||||
|
||||
test "inferSectorFromTitle: healthcare keywords" {
    // Covers the two-word "Health Care" spelling and "Biotech".
    const want: ?[]const u8 = sector.healthcare;
    const titles = [_][]const u8{
        "Health Care Select Sector SPDR",
        "iShares Biotech ETF",
    };
    for (titles) |title| {
        try std.testing.expectEqual(want, inferSectorFromTitle(title));
    }
}
|
||||
|
||||
test "inferSectorFromTitle: ambiguous title -> null" {
    // Broad-market titles carry no sector keyword at all.
    const titles = [_][]const u8{
        "Vanguard Total Stock Market",
        "SPDR S&P 500",
    };
    for (titles) |title| {
        try std.testing.expectEqual(@as(?[]const u8, null), inferSectorFromTitle(title));
    }
}
|
||||
|
||||
test "inferGeoFromTitle: null/empty -> null" {
    // Both "no title" shapes must short-circuit to null.
    const inputs = [_]?[]const u8{ null, "" };
    for (inputs) |title| {
        try std.testing.expectEqual(@as(?[]const u8, null), inferGeoFromTitle(title));
    }
}
|
||||
|
||||
test "inferGeoFromTitle: emerging markets" {
    // "Emerging Markets" and "Frontier" both land in the emerging bucket.
    const want: ?[]const u8 = geo.emerging;
    const titles = [_][]const u8{
        "Freedom 100 Emerging Markets ETF",
        "iShares Frontier Markets",
    };
    for (titles) |title| {
        try std.testing.expectEqual(want, inferGeoFromTitle(title));
    }
}
|
||||
|
||||
test "inferGeoFromTitle: international developed" {
    // One title per keyword family: "Developed Markets",
    // "International", and "ex-US".
    const want: ?[]const u8 = geo.developed;
    const titles = [_][]const u8{
        "Vanguard FTSE Developed Markets",
        "iShares MSCI EAFE International",
        "Vanguard Total World ex-US",
    };
    for (titles) |title| {
        try std.testing.expectEqual(want, inferGeoFromTitle(title));
    }
}
|
||||
|
||||
test "inferGeoFromTitle: no match -> null" {
    // Domestic / sector-only titles carry no geo keyword.
    const titles = [_][]const u8{
        "SPDR S&P 500",
        "iShares Semiconductor ETF",
    };
    for (titles) |title| {
        try std.testing.expectEqual(@as(?[]const u8, null), inferGeoFromTitle(title));
    }
}
|
||||
|
|
|
|||
690
src/service.zig
690
src/service.zig
|
|
@ -35,6 +35,7 @@ const Yahoo = @import("providers/yahoo.zig").Yahoo;
|
|||
const Tiingo = @import("providers/tiingo.zig").Tiingo;
|
||||
const Wikidata = @import("providers/Wikidata.zig");
|
||||
const Edgar = @import("providers/Edgar.zig");
|
||||
const classification = @import("models/classification.zig");
|
||||
const fmt = @import("format.zig");
|
||||
const performance = @import("analytics/performance.zig");
|
||||
const http = @import("net/http.zig");
|
||||
|
|
@ -1037,9 +1038,9 @@ pub const DataService = struct {
|
|||
// higher-quality name. Best-effort: if the fetch fails we
|
||||
// still return the EDGAR-only profile.
|
||||
var inception_date: ?Date = null;
|
||||
if (self.getClassification(symbol, opts)) |classification| {
|
||||
defer classification.deinit();
|
||||
for (classification.data) |c| {
|
||||
if (self.getClassification(symbol, opts)) |class_result| {
|
||||
defer class_result.deinit();
|
||||
for (class_result.data) |c| {
|
||||
if (c.inception_date) |idate_str| {
|
||||
if (Date.parse(idate_str)) |d| inception_date = d else |_| {}
|
||||
}
|
||||
|
|
@ -1092,11 +1093,6 @@ pub const DataService = struct {
|
|||
/// `opts.skip_network = true` returns cached data even if stale,
|
||||
/// `FetchFailed` on cache miss. `opts.force_refresh = true`
|
||||
/// ignores the cache and re-fetches.
|
||||
///
|
||||
/// Callers fetching classifications for many symbols should use
|
||||
/// `getClassifications(symbols)` instead — Wikidata's SPARQL API
|
||||
/// is naturally batched, and one query for N symbols is much
|
||||
/// cheaper than N queries for 1 symbol each.
|
||||
pub fn getClassification(self: *DataService, symbol: []const u8, opts: FetchOptions) DataError!FetchResult(Wikidata.ClassificationRecord) {
|
||||
var s = self.store();
|
||||
|
||||
|
|
@ -1131,124 +1127,282 @@ pub const DataService = struct {
|
|||
const fetched = wd.fetch(self.allocator, &symbols) catch |err| {
|
||||
if (err == error.RateLimited) {
|
||||
self.rateLimitBackoff();
|
||||
break_blk: {
|
||||
const retried = wd.fetch(self.allocator, &symbols) catch break :break_blk;
|
||||
if (retried.len > 0) {
|
||||
s.write(Wikidata.ClassificationRecord, symbol, retried, cache.DataType.classification.ttl());
|
||||
return .{ .data = retried, .source = .fetched, .timestamp = std.Io.Timestamp.now(self.io, .real).toSeconds(), .allocator = self.allocator };
|
||||
}
|
||||
self.allocator.free(retried);
|
||||
}
|
||||
if (wd.fetch(self.allocator, &symbols)) |retried| {
|
||||
return self.finalizeClassification(symbol, retried, opts);
|
||||
} else |_| {}
|
||||
}
|
||||
log.warn("{s}: wikidata fetch failed: {s}", .{ symbol, @errorName(err) });
|
||||
return DataError.FetchFailed;
|
||||
};
|
||||
|
||||
if (fetched.len == 0) {
|
||||
self.allocator.free(fetched);
|
||||
// Wikidata had no row for this symbol. Negative-cache to
|
||||
// suppress retries until the user explicitly refreshes.
|
||||
s.writeNegative(symbol, .classification);
|
||||
return DataError.NotFound;
|
||||
}
|
||||
|
||||
s.write(Wikidata.ClassificationRecord, symbol, fetched, cache.DataType.classification.ttl());
|
||||
|
||||
return .{ .data = fetched, .source = .fetched, .timestamp = std.Io.Timestamp.now(self.io, .real).toSeconds(), .allocator = self.allocator };
|
||||
return self.finalizeClassification(symbol, fetched, opts);
|
||||
}
|
||||
|
||||
/// Batched classification fetch. Wikidata's SPARQL API takes a
|
||||
/// `VALUES ?ticker { ... }` set in one query; this method runs
|
||||
/// that query for the requested set, splits the response into
|
||||
/// per-symbol cache writes, and returns the slice. Symbols not
|
||||
/// in Wikidata are silently dropped from the result (the user-
|
||||
/// facing cache for them gets a negative entry).
|
||||
/// Common post-Wikidata path: decide if the result is useful as
|
||||
/// returned, otherwise consult EDGAR to fill in the gaps,
|
||||
/// otherwise negative-cache. Either way the cache gets written
|
||||
/// and a `FetchResult` is returned (or `DataError.NotFound`).
|
||||
///
|
||||
/// The cache is consulted first per-symbol; only the symbols
|
||||
/// that miss the cache (or are stale) are passed to the SPARQL
|
||||
/// query. This minimizes the upstream load when most symbols
|
||||
/// were already classified in a prior run.
|
||||
pub fn getClassifications(
|
||||
/// Takes ownership of `wikidata_records`. The slice is either
|
||||
/// returned as the result data, freed and replaced by a
|
||||
/// synthesized slice, or freed and the symbol negative-cached.
|
||||
fn finalizeClassification(
|
||||
self: *DataService,
|
||||
result_allocator: std.mem.Allocator,
|
||||
symbols: []const []const u8,
|
||||
symbol: []const u8,
|
||||
wikidata_records: []Wikidata.ClassificationRecord,
|
||||
opts: FetchOptions,
|
||||
) DataError![]Wikidata.ClassificationRecord {
|
||||
if (symbols.len == 0) return &.{};
|
||||
) DataError!FetchResult(Wikidata.ClassificationRecord) {
|
||||
var s = self.store();
|
||||
const ttl = cache.DataType.classification.ttl();
|
||||
|
||||
// Identify cache misses.
|
||||
var to_fetch: std.ArrayList([]const u8) = .empty;
|
||||
defer to_fetch.deinit(self.allocator);
|
||||
var cached_records: std.ArrayList(Wikidata.ClassificationRecord) = .empty;
|
||||
errdefer {
|
||||
for (cached_records.items) |*r| {
|
||||
var m = r.*;
|
||||
m.deinit(self.allocator);
|
||||
// Wikidata returned a useful row -> populate geo from
|
||||
// geoFor(country) and cache as-is.
|
||||
if (wikidata_records.len > 0 and wikidataLooksUseful(wikidata_records[0])) {
|
||||
try self.populateGeo(&wikidata_records[0]);
|
||||
s.write(Wikidata.ClassificationRecord, symbol, wikidata_records, ttl);
|
||||
return .{ .data = wikidata_records, .source = .fetched, .timestamp = std.Io.Timestamp.now(self.io, .real).toSeconds(), .allocator = self.allocator };
|
||||
}
|
||||
|
||||
// Sparse or empty: try EDGAR fallback. `synthesizeClassification`
|
||||
// takes ownership of the wikidata slice (frees it, returns a
|
||||
// new one-element slice with the merged record). Returns
|
||||
// `error.NotFound` when even EDGAR has nothing.
|
||||
const merged = self.synthesizeClassification(symbol, wikidata_records, opts) catch |err| {
|
||||
if (err == error.NotFound) {
|
||||
s.writeNegative(symbol, .classification);
|
||||
return DataError.NotFound;
|
||||
}
|
||||
cached_records.deinit(self.allocator);
|
||||
}
|
||||
|
||||
for (symbols) |sym| {
|
||||
if (!opts.force_refresh) {
|
||||
if (s.read(Wikidata.ClassificationRecord, sym, null, .fresh_only)) |cached| {
|
||||
// The on-disk shape is a length-1 slice.
|
||||
if (cached.data.len > 0) {
|
||||
try cached_records.append(self.allocator, cached.data[0]);
|
||||
// Free the rest if any (shouldn't happen for
|
||||
// per-symbol classification, but defensive).
|
||||
for (cached.data[1..]) |*r| {
|
||||
var m = r.*;
|
||||
m.deinit(self.allocator);
|
||||
}
|
||||
self.allocator.free(cached.data);
|
||||
continue;
|
||||
}
|
||||
self.allocator.free(cached.data);
|
||||
}
|
||||
}
|
||||
try to_fetch.append(self.allocator, sym);
|
||||
}
|
||||
|
||||
if (to_fetch.items.len == 0) {
|
||||
// All cached — assemble result from cached_records.
|
||||
const out = try result_allocator.alloc(Wikidata.ClassificationRecord, cached_records.items.len);
|
||||
@memcpy(out, cached_records.items);
|
||||
cached_records.clearRetainingCapacity();
|
||||
return out;
|
||||
}
|
||||
|
||||
if (opts.skip_network) {
|
||||
// Offline mode: return what we have from cache.
|
||||
const out = try result_allocator.alloc(Wikidata.ClassificationRecord, cached_records.items.len);
|
||||
@memcpy(out, cached_records.items);
|
||||
cached_records.clearRetainingCapacity();
|
||||
return out;
|
||||
}
|
||||
|
||||
log.debug("fetching {d} classifications from Wikidata", .{to_fetch.items.len});
|
||||
self.assertNetworkAllowed("getClassifications wikidata.fetch");
|
||||
var wd = try self.getProvider(Wikidata);
|
||||
|
||||
const fetched = wd.fetch(self.allocator, to_fetch.items) catch |err| {
|
||||
log.warn("wikidata batch fetch failed: {s}", .{@errorName(err)});
|
||||
return DataError.FetchFailed;
|
||||
};
|
||||
defer self.allocator.free(fetched);
|
||||
|
||||
// Write each fetched record to its per-symbol cache file.
|
||||
for (fetched) |rec| {
|
||||
const single = [_]Wikidata.ClassificationRecord{rec};
|
||||
s.write(Wikidata.ClassificationRecord, rec.symbol, &single, cache.DataType.classification.ttl());
|
||||
s.write(Wikidata.ClassificationRecord, symbol, merged, ttl);
|
||||
return .{ .data = merged, .source = .fetched, .timestamp = std.Io.Timestamp.now(self.io, .real).toSeconds(), .allocator = self.allocator };
|
||||
}
|
||||
|
||||
/// Fill in `record.geo` from `classification.geoFor(record.country)`.
///
/// No-op when `geo` is already set, when `country` is null, or when
/// the country maps to the `geo.unknown` bucket. Otherwise the bucket
/// label is duplicated with `self.allocator` and stored on the record.
/// If that allocation fails the field stays null and the error is
/// returned so the caller can decide whether to bail.
fn populateGeo(self: *DataService, record: *Wikidata.ClassificationRecord) !void {
    if (record.geo) |_| return;
    if (record.country) |country_code| {
        const bucket = classification.geoFor(country_code);
        const is_unknown = std.mem.eql(u8, bucket, classification.geo.unknown);
        if (!is_unknown) record.geo = try self.allocator.dupe(u8, bucket);
    }
}
|
||||
|
||||
/// Decide whether a Wikidata classification record carries enough
/// downstream-usable data to skip the EDGAR fallback.
///
/// A record counts as useful when `is_etf` is true or any of
/// `asset_class`, `country`, `sector` is set. Sparse records (e.g.
/// SOXX getting only a `name` from Wikidata) return false and need the
/// EDGAR ticker-map fallback to fill in `is_etf=true,
/// asset_class=ETF, country=US`.
fn wikidataLooksUseful(c: Wikidata.ClassificationRecord) bool {
    return c.is_etf or
        c.asset_class != null or
        c.country != null or
        c.sector != null;
}
|
||||
|
||||
/// Synthesize a `ClassificationRecord` for a symbol that Wikidata
/// couldn't classify usefully. Consults the EDGAR ticker maps; if
/// found, also calls `getEtfMetrics` to recover the NPORT-P
/// series_name (more authoritative than the company_tickers title).
/// Title-keyword inference fills in `sector` and `geo` when the name
/// carries an unambiguous keyword.
///
/// Takes ownership of `wikidata_records`: they are freed on every exit
/// path. Wikidata's `name`/`industry`/`inception_date`/`cik` fields
/// are copied into the synthesized record when present.
///
/// Returns a one-element slice allocated with `self.allocator`; every
/// string field of the record is an owned copy. Returns
/// `error.NotFound` when EDGAR has nothing either; allocation and
/// formatting errors propagate.
fn synthesizeClassification(
    self: *DataService,
    symbol: []const u8,
    wikidata_records: []Wikidata.ClassificationRecord,
    opts: FetchOptions,
) !cache.Store.DataFor(Wikidata.ClassificationRecord) {
    defer Wikidata.ClassificationRecord.freeSlice(self.allocator, wikidata_records);

    const lookup = self.lookupEdgarFallback(symbol, opts);
    defer freeEdgarLookup(self.allocator, lookup);
    if (lookup == .none) return error.NotFound;

    // For ETF/fund hits, try to get the richer series_name from
    // NPORT-P. Cache hit is cheap; cache miss triggers an EDGAR fetch
    // but is bounded by EDGAR's rate limiter. Deliberately
    // best-effort: if the call fails (e.g. money-market funds with no
    // NPORT-P) we fall back to the ticker-map title.
    var etf_metrics_result: ?FetchResult(Edgar.EtfMetricRecord) = null;
    defer if (etf_metrics_result) |*r| r.deinit();
    etf_metrics_result = self.getEtfMetrics(symbol, opts) catch null;

    // Extract series_name and cik from the first profile row. Both
    // borrow from `etf_metrics_result`, which outlives every use below.
    var series_name: ?[]const u8 = null;
    var etf_cik: ?[]const u8 = null;
    if (etf_metrics_result) |r| {
        for (r.data) |rec| switch (rec) {
            .profile => |p| {
                if (p.series_name) |sn| series_name = sn;
                etf_cik = p.cik;
                break;
            },
            else => {},
        };
    }

    // Pull whatever Wikidata's sparse record carried so we don't lose
    // data on the merge. This is a shallow copy: its strings are still
    // owned by `wikidata_records` and stay valid until the deferred
    // `freeSlice` runs at function exit.
    const wd: ?Wikidata.ClassificationRecord = if (wikidata_records.len > 0) wikidata_records[0] else null;

    // Pick the best name source: NPORT-P series_name > EDGAR
    // ticker-map title > Wikidata name > nothing.
    //
    // We're on the EDGAR-fallback path because Wikidata's record was
    // sparse. For funds, Wikidata's `name` (when present) is
    // frequently the underlying INDEX rather than the FUND itself --
    // e.g. SOXX's Wikidata `name` is "PHLX Semiconductor Sector" but
    // the fund is "iShares Semiconductor ETF" per NPORT-P seriesName.
    // Prefer the fund-authoritative source so downstream comments and
    // labels show the fund name, not the index name.
    const ticker_title: ?[]const u8 = switch (lookup) {
        .company_or_uit => |c| c.title,
        else => null,
    };
    const best_name: ?[]const u8 = blk: {
        if (series_name) |n| break :blk n;
        if (ticker_title) |n| break :blk n;
        if (wd) |w| {
            if (w.name) |n| break :blk n;
        }
        break :blk null;
    };

    // Title-keyword inference runs on the same name, with the same
    // source precedence: the fund-authoritative title is also the
    // most informative one for sector/geo keywords.
    const inferred_sector = classification.inferSectorFromTitle(best_name);
    const inferred_geo = classification.inferGeoFromTitle(best_name);

    // `is_etf` here means "this is fund-shaped, emit multi-row
    // breakdown" -- true for ANY EDGAR-found symbol. The
    // `tickers_funds.srf` map mixes mutual funds and series-of-trust
    // ETFs alike. The `tickers_companies.srf` map carries operating
    // companies, closed-end funds, and UITs; operating companies
    // usually have Wikidata coverage and wouldn't reach this
    // fallback, so anything that dropped here is also fund-shaped
    // (e.g. PIMCO closed-end funds whose title says "FUND" but not
    // "ETF" or "TRUST").
    //
    // The lookup's own ETF flag still drives the asset_class label
    // below ("ETF" vs "Fund"), but the fund-shaped routing decision
    // applies regardless.
    const is_etf = true;
    const asset_class: []const u8 = switch (lookup) {
        .managed_fund => "Fund",
        .company_or_uit => |c| if (c.is_etf) "ETF" else "Fund",
        // Excluded by the early `error.NotFound` return above.
        .none => unreachable,
    };

    // Country: prefer Wikidata's. Default to "US" for EDGAR-found
    // symbols (they're SEC filers).
    const country_str: []const u8 = if (wd) |w| (w.country orelse "US") else "US";

    // Sector: prefer Wikidata's existing sector (rare in this
    // sparse-fallback path), else fall back to inferred.
    const sector_str: ?[]const u8 = blk: {
        if (wd) |w| {
            if (w.sector) |sec| break :blk sec;
        }
        break :blk inferred_sector;
    };

    // CIK: prefer Wikidata's, fall back to NPORT-P's.
    const cik_str: ?[]const u8 = blk: {
        if (wd) |w| {
            if (w.cik) |c| break :blk c;
        }
        if (etf_cik) |c| break :blk c;
        break :blk null;
    };

    // Geo: prefer the Wikidata-derived geo (computed from
    // `geoFor(country)` against the country code), else use
    // title-keyword inference. Default to "US" when neither is
    // available -- EDGAR-found symbols are SEC filers.
    const geo_str: []const u8 = blk: {
        if (wd) |w| {
            if (w.country) |c| {
                const g = classification.geoFor(c);
                if (!std.mem.eql(u8, g, classification.geo.unknown)) break :blk g;
            }
        }
        if (inferred_geo) |g| break :blk g;
        break :blk classification.geo.us;
    };

    const today = fmt.todayDate(self.io);
    var as_of_buf: [10]u8 = undefined;
    const as_of_str = try std.fmt.bufPrint(&as_of_buf, "{f}", .{today});

    // Allocate each owned field up front with its own errdefer so a
    // partial build on OOM doesn't leak the earlier successful dupes.
    // The result-slice allocation further down is the last fallible
    // step and is covered by the same errdefers.
    const symbol_owned = try self.allocator.dupe(u8, symbol);
    errdefer self.allocator.free(symbol_owned);
    const name_owned: ?[]const u8 = if (best_name) |n| try self.allocator.dupe(u8, n) else null;
    errdefer if (name_owned) |s| self.allocator.free(s);
    const sector_owned: ?[]const u8 = if (sector_str) |s| try self.allocator.dupe(u8, s) else null;
    errdefer if (sector_owned) |s| self.allocator.free(s);
    const industry_owned: ?[]const u8 = if (wd) |w|
        (if (w.industry) |i| try self.allocator.dupe(u8, i) else null)
    else
        null;
    errdefer if (industry_owned) |s| self.allocator.free(s);
    const country_owned = try self.allocator.dupe(u8, country_str);
    errdefer self.allocator.free(country_owned);
    const geo_owned = try self.allocator.dupe(u8, geo_str);
    errdefer self.allocator.free(geo_owned);
    const asset_class_owned = try self.allocator.dupe(u8, asset_class);
    errdefer self.allocator.free(asset_class_owned);
    const inception_owned: ?[]const u8 = if (wd) |w|
        (if (w.inception_date) |i| try self.allocator.dupe(u8, i) else null)
    else
        null;
    errdefer if (inception_owned) |s| self.allocator.free(s);
    const cik_owned: ?[]const u8 = if (cik_str) |c| try self.allocator.dupe(u8, c) else null;
    errdefer if (cik_owned) |s| self.allocator.free(s);
    const as_of_owned = try self.allocator.dupe(u8, as_of_str);
    errdefer self.allocator.free(as_of_owned);
    const source_owned = try self.allocator.dupe(u8, "edgar_fallback");
    errdefer self.allocator.free(source_owned);

    const result = try self.allocator.alloc(Wikidata.ClassificationRecord, 1);
    result[0] = .{
        .symbol = symbol_owned,
        .name = name_owned,
        .sector = sector_owned,
        .industry = industry_owned,
        .country = country_owned,
        .geo = geo_owned,
        .asset_class = asset_class_owned,
        .is_etf = is_etf,
        .inception_date = inception_owned,
        .cik = cik_owned,
        .as_of = as_of_owned,
        .source = source_owned,
    };
    return result;
}
|
||||
|
||||
/// Fetch XBRL-derived entity facts for a CIK (currently
|
||||
|
|
@ -2955,6 +3109,326 @@ test "getClassification: cache hit returns cached data without network" {
|
|||
try std.testing.expectEqual(Source.cached, result.source);
|
||||
}
|
||||
|
||||
test "populateGeo: country US -> geo US" {
    const gpa = std.testing.allocator;
    const io = std.testing.io;

    // Throwaway cache directory for the service under test.
    var scratch = std.testing.tmpDir(.{});
    defer scratch.cleanup();
    const cache_dir = try scratch.dir.realPathFileAlloc(io, ".", gpa);
    defer gpa.free(cache_dir);

    var svc = DataService.init(io, gpa, Config{ .cache_dir = cache_dir });
    defer svc.deinit();

    // Record with a country but no geo yet.
    var rec: Wikidata.ClassificationRecord = .{
        .symbol = try gpa.dupe(u8, "TEST"),
        .country = try gpa.dupe(u8, "US"),
        .as_of = try gpa.dupe(u8, "2026-06-01"),
        .source = try gpa.dupe(u8, "wikidata"),
    };
    defer rec.deinit(gpa);

    try svc.populateGeo(&rec);

    const got = rec.geo orelse return error.TestUnexpectedResult;
    try std.testing.expectEqualStrings("US", got);
}
|
||||
|
||||
test "populateGeo: country GB -> geo International Developed" {
    const gpa = std.testing.allocator;
    const io = std.testing.io;

    // Throwaway cache directory for the service under test.
    var scratch = std.testing.tmpDir(.{});
    defer scratch.cleanup();
    const cache_dir = try scratch.dir.realPathFileAlloc(io, ".", gpa);
    defer gpa.free(cache_dir);

    var svc = DataService.init(io, gpa, Config{ .cache_dir = cache_dir });
    defer svc.deinit();

    // Record with a non-US developed-market country and no geo yet.
    var rec: Wikidata.ClassificationRecord = .{
        .symbol = try gpa.dupe(u8, "TEST"),
        .country = try gpa.dupe(u8, "GB"),
        .as_of = try gpa.dupe(u8, "2026-06-01"),
        .source = try gpa.dupe(u8, "wikidata"),
    };
    defer rec.deinit(gpa);

    try svc.populateGeo(&rec);

    const got = rec.geo orelse return error.TestUnexpectedResult;
    try std.testing.expectEqualStrings("International Developed", got);
}
|
||||
|
||||
test "populateGeo: null country -> noop" {
    // With no country on the record, `populateGeo` has nothing to
    // map from and must leave geo unset.
    const gpa = std.testing.allocator;
    const io = std.testing.io;

    // Throwaway cache directory backing the service under test.
    var scratch = std.testing.tmpDir(.{});
    defer scratch.cleanup();
    const cache_path = try scratch.dir.realPathFileAlloc(io, ".", gpa);
    defer gpa.free(cache_path);

    const cfg: Config = .{ .cache_dir = cache_path };
    var service = DataService.init(io, gpa, cfg);
    defer service.deinit();

    // `country` and `geo` are deliberately left at their null defaults.
    var rec: Wikidata.ClassificationRecord = .{
        .symbol = try gpa.dupe(u8, "TEST"),
        .as_of = try gpa.dupe(u8, "2026-06-01"),
        .source = try gpa.dupe(u8, "wikidata"),
    };
    defer rec.deinit(gpa);

    try service.populateGeo(&rec);

    const untouched: ?[]const u8 = null;
    try std.testing.expectEqual(untouched, rec.geo);
}
|
||||
|
||||
test "populateGeo: existing geo not overwritten" {
    // When geo is already populated, `populateGeo` must keep it even
    // though the record's country ("US") would map to another label.
    const gpa = std.testing.allocator;
    const io = std.testing.io;

    // Throwaway cache directory backing the service under test.
    var scratch = std.testing.tmpDir(.{});
    defer scratch.cleanup();
    const cache_path = try scratch.dir.realPathFileAlloc(io, ".", gpa);
    defer gpa.free(cache_path);

    const cfg: Config = .{ .cache_dir = cache_path };
    var service = DataService.init(io, gpa, cfg);
    defer service.deinit();

    // The sentinel geo value is not one of the real bucket labels, so
    // any overwrite would be visible in the assertion below.
    var rec: Wikidata.ClassificationRecord = .{
        .symbol = try gpa.dupe(u8, "TEST"),
        .country = try gpa.dupe(u8, "US"),
        .geo = try gpa.dupe(u8, "Already Set"),
        .as_of = try gpa.dupe(u8, "2026-06-01"),
        .source = try gpa.dupe(u8, "wikidata"),
    };
    defer rec.deinit(gpa);

    try service.populateGeo(&rec);

    try std.testing.expectEqualStrings("Already Set", rec.geo.?);
}
|
||||
|
||||
test "getClassification: sparse Wikidata + EDGAR managed_fund hit produces merged record" {
    // Exercises the merge step via `synthesizeClassification`
    // directly, so no Wikidata fetch is involved.
    const gpa = std.testing.allocator;
    const io = std.testing.io;

    // Throwaway cache directory backing the service under test.
    var scratch = std.testing.tmpDir(.{});
    defer scratch.cleanup();
    const cache_path = try scratch.dir.realPathFileAlloc(io, ".", gpa);
    defer gpa.free(cache_path);

    const cfg: Config = .{ .cache_dir = cache_path };
    var service = DataService.init(io, gpa, cfg);
    defer service.deinit();

    // Both EDGAR ticker-map caches get at least one entry. An empty
    // cached slice is treated as a miss by the load helpers, which
    // would then fall through to a network fetch.
    var st = service.store();
    var fund_map = [_]Edgar.MutualFundTickerEntry{.{
        .symbol = "FAGIX",
        .cik = "0000275309",
    }};
    st.write(Edgar.MutualFundTickerEntry, "_edgar", &fund_map, cache.DataType.tickers_funds.ttl());
    var company_map = [_]Edgar.CompanyTickerEntry{.{
        .symbol = "DUMMY",
        .cik = "0000000001",
    }};
    st.write(Edgar.CompanyTickerEntry, "_edgar", &company_map, cache.DataType.tickers_companies.ttl());

    // A negative etf_metrics cache entry keeps getEtfMetrics away
    // from the network as well.
    st.writeNegative("FAGIX", .etf_metrics);

    // Single sparse Wikidata row: only the name is set, which is not
    // enough to classify on its own.
    const wikidata_rows = try gpa.alloc(Wikidata.ClassificationRecord, 1);
    wikidata_rows[0] = .{
        .symbol = try gpa.dupe(u8, "FAGIX"),
        .name = try gpa.dupe(u8, "Test Fund"),
        .as_of = try gpa.dupe(u8, "2026-06-01"),
        .source = try gpa.dupe(u8, "wikidata"),
    };

    // From here on any provider call panics. Ownership of
    // `wikidata_rows` passes to synthesizeClassification.
    service.panic_on_network_attempt = true;
    const merged = try service.synthesizeClassification("FAGIX", wikidata_rows, .{ .skip_network = true });
    defer Wikidata.ClassificationRecord.freeSlice(gpa, merged);

    try std.testing.expectEqual(@as(usize, 1), merged.len);
    const row = merged[0];
    try std.testing.expectEqualStrings("FAGIX", row.symbol);
    try std.testing.expect(row.is_etf);
    try std.testing.expectEqualStrings("Fund", row.asset_class.?);
    try std.testing.expectEqualStrings("US", row.country.?);
    try std.testing.expectEqualStrings("US", row.geo.?);
    try std.testing.expectEqualStrings("edgar_fallback", row.source);
    // The name supplied by Wikidata survives the merge.
    try std.testing.expectEqualStrings("Test Fund", row.name.?);
}
|
||||
|
||||
test "synthesizeClassification: no EDGAR hit returns NotFound" {
    // A symbol absent from both EDGAR ticker maps must surface as
    // error.NotFound rather than a synthesized record.
    const gpa = std.testing.allocator;
    const io = std.testing.io;

    // Throwaway cache directory backing the service under test.
    var scratch = std.testing.tmpDir(.{});
    defer scratch.cleanup();
    const cache_path = try scratch.dir.realPathFileAlloc(io, ".", gpa);
    defer gpa.free(cache_path);

    const cfg: Config = .{ .cache_dir = cache_path };
    var service = DataService.init(io, gpa, cfg);
    defer service.deinit();

    // Placeholder entries in both ticker maps: the lookup for the
    // test symbol comes back empty, yet the maps are never fetched
    // from the network.
    var st = service.store();
    var fund_map = [_]Edgar.MutualFundTickerEntry{.{
        .symbol = "DUMMY1",
        .cik = "0000000001",
    }};
    st.write(Edgar.MutualFundTickerEntry, "_edgar", &fund_map, cache.DataType.tickers_funds.ttl());
    var company_map = [_]Edgar.CompanyTickerEntry{.{
        .symbol = "DUMMY2",
        .cik = "0000000002",
    }};
    st.write(Edgar.CompanyTickerEntry, "_edgar", &company_map, cache.DataType.tickers_companies.ttl());

    const wikidata_rows = try gpa.alloc(Wikidata.ClassificationRecord, 1);
    wikidata_rows[0] = .{
        .symbol = try gpa.dupe(u8, "NEVERHEARDOFIT"),
        .name = try gpa.dupe(u8, "ghost"),
        .as_of = try gpa.dupe(u8, "2026-06-01"),
        .source = try gpa.dupe(u8, "wikidata"),
    };

    service.panic_on_network_attempt = true;
    const outcome = service.synthesizeClassification("NEVERHEARDOFIT", wikidata_rows, .{ .skip_network = true });
    try std.testing.expectError(error.NotFound, outcome);
}
|
||||
|
||||
test "synthesizeClassification: company_or_uit without ETF/TRUST keyword still routes to multi-row" {
    // Models PTY: a closed-end fund listed in company_tickers as
    // "PIMCO CORPORATE & INCOME OPPORTUNITY FUND". The title has
    // neither "ETF" nor "TRUST", so lookupInTickerMaps reports
    // .company_or_uit{is_etf=false}. It is still fund-shaped, and
    // enrich should emit multi-row metadata for it.
    //
    // Downstream, "fund-like, emit multi-row" is signalled by
    // ClassificationRecord.is_etf. That flag is therefore expected
    // to be true for any EDGAR-found .company_or_uit hit, keyword
    // or not, so PTY-shaped closed-end funds are handled like ETFs.
    const gpa = std.testing.allocator;
    const io = std.testing.io;

    // Throwaway cache directory backing the service under test.
    var scratch = std.testing.tmpDir(.{});
    defer scratch.cleanup();
    const cache_path = try scratch.dir.realPathFileAlloc(io, ".", gpa);
    defer gpa.free(cache_path);

    const cfg: Config = .{ .cache_dir = cache_path };
    var service = DataService.init(io, gpa, cfg);
    defer service.deinit();

    var st = service.store();
    // Placeholder mutual-fund entry: the MF lookup for PTY misses.
    var fund_map = [_]Edgar.MutualFundTickerEntry{.{
        .symbol = "DUMMY",
        .cik = "0000000001",
    }};
    st.write(Edgar.MutualFundTickerEntry, "_edgar", &fund_map, cache.DataType.tickers_funds.ttl());
    // PTY lives in the company map, title free of ETF/TRUST.
    var company_map = [_]Edgar.CompanyTickerEntry{.{
        .symbol = "PTY",
        .cik = "0001202604",
        .title = "PIMCO CORPORATE & INCOME OPPORTUNITY FUND",
    }};
    st.write(Edgar.CompanyTickerEntry, "_edgar", &company_map, cache.DataType.tickers_companies.ttl());
    st.writeNegative("PTY", .etf_metrics);

    const wikidata_rows = try gpa.alloc(Wikidata.ClassificationRecord, 1);
    wikidata_rows[0] = .{
        .symbol = try gpa.dupe(u8, "PTY"),
        .name = try gpa.dupe(u8, "PIMCO Corporate & Income Opportunity Fund"),
        .as_of = try gpa.dupe(u8, "2026-06-01"),
        .source = try gpa.dupe(u8, "wikidata"),
    };

    service.panic_on_network_attempt = true;
    const merged = try service.synthesizeClassification("PTY", wikidata_rows, .{ .skip_network = true });
    defer Wikidata.ClassificationRecord.freeSlice(gpa, merged);

    try std.testing.expectEqual(@as(usize, 1), merged.len);
    const row = merged[0];
    // is_etf has to be true for enrich to go through emitEtfRows
    // (the multi-row sleeve breakdown). asset_class remains "Fund"
    // since the title carries no ETF/TRUST keyword.
    try std.testing.expect(row.is_etf);
    try std.testing.expectEqualStrings("Fund", row.asset_class.?);
}
|
||||
|
||||
test "synthesizeClassification: NPORT-P series_name beats Wikidata's index name for funds" {
    // Models SOXX: Wikidata hands back the name of the underlying
    // INDEX ("PHLX Semiconductor Sector"), whereas consumers want
    // the FUND name ("iShares Semiconductor ETF") carried by the
    // NPORT-P <seriesName>. For the fund itself the series name is
    // treated as the more authoritative of the two.
    const gpa = std.testing.allocator;
    const io = std.testing.io;

    // Throwaway cache directory backing the service under test.
    var scratch = std.testing.tmpDir(.{});
    defer scratch.cleanup();
    const cache_path = try scratch.dir.realPathFileAlloc(io, ".", gpa);
    defer gpa.free(cache_path);

    const cfg: Config = .{ .cache_dir = cache_path };
    var service = DataService.init(io, gpa, cfg);
    defer service.deinit();

    var st = service.store();
    var fund_map = [_]Edgar.MutualFundTickerEntry{.{
        .symbol = "DUMMY",
        .cik = "0000000001",
    }};
    st.write(Edgar.MutualFundTickerEntry, "_edgar", &fund_map, cache.DataType.tickers_funds.ttl());
    var company_map = [_]Edgar.CompanyTickerEntry{.{
        .symbol = "SOXX",
        .cik = "0001100663",
        .title = "iShares Trust",
    }};
    st.write(Edgar.CompanyTickerEntry, "_edgar", &company_map, cache.DataType.tickers_companies.ttl());

    // Cached etf_metrics profile row that carries the NPORT-P
    // seriesName for SOXX.
    var metric_rows = [_]Edgar.EtfMetricRecord{
        .{ .profile = .{
            .symbol = try gpa.dupe(u8, "SOXX"),
            .series_name = try gpa.dupe(u8, "iShares Semiconductor ETF"),
            .cik = try gpa.dupe(u8, "0001100663"),
            .as_of = try gpa.dupe(u8, "2026-06-01"),
            .source = try gpa.dupe(u8, "edgar"),
        } },
    };
    defer for (metric_rows) |entry| entry.deinit(gpa);
    st.write(Edgar.EtfMetricRecord, "SOXX", &metric_rows, cache.DataType.etf_metrics.ttl());

    // Sparse Wikidata result: nothing beyond the index name.
    const wikidata_rows = try gpa.alloc(Wikidata.ClassificationRecord, 1);
    wikidata_rows[0] = .{
        .symbol = try gpa.dupe(u8, "SOXX"),
        .name = try gpa.dupe(u8, "PHLX Semiconductor Sector"),
        .as_of = try gpa.dupe(u8, "2026-06-01"),
        .source = try gpa.dupe(u8, "wikidata"),
    };

    service.panic_on_network_attempt = true;
    const merged = try service.synthesizeClassification("SOXX", wikidata_rows, .{ .skip_network = true });
    defer Wikidata.ClassificationRecord.freeSlice(gpa, merged);

    try std.testing.expectEqual(@as(usize, 1), merged.len);
    const row = merged[0];
    // The NPORT-P series name must win over Wikidata's index name.
    try std.testing.expectEqualStrings("iShares Semiconductor ETF", row.name.?);
}
|
||||
|
||||
test "getEntityFacts: skip_network with no cache returns FetchFailed" {
|
||||
const allocator = std.testing.allocator;
|
||||
const io = std.testing.io;
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue