diff --git a/README.md b/README.md index b75c2fe..b312632 100644 --- a/README.md +++ b/README.md @@ -922,6 +922,20 @@ zig build run -- # build and run The compiled binary is at `zig-out/bin/zfin`. +## Vendored code + +A small amount of third-party source is vendored directly into the +tree (rather than added as a Zig package dependency) where the +upstream is small, stable, and not packaged for `build.zig.zon`: + +| File | Source | Purpose | +|-------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------| +| `src/providers/xml.zig` | [Snektron/vulkan-zig](https://github.com/Snektron/vulkan-zig/blob/797ae8af88e84753af9640266de61a985b76b580/generator/xml.zig), via [aws-zig](https://github.com/elerch/aws-sdk-for-zig) | XML DOM parser used by the EDGAR provider for NPORT-P primary documents. | + +Each vendored file carries a `// VENDORED - see README.md` header +identifying its upstream source. When updating, copy the new +upstream verbatim and re-add the header. 
+ ## License MIT diff --git a/build.zig.zon b/build.zig.zon index a711d4d..b4b08cb 100644 --- a/build.zig.zon +++ b/build.zig.zon @@ -13,8 +13,8 @@ .hash = "z2d-0.11.0-j5P_HtLzDwBGyQt49DrT0v4BuVqI_SRs6CXsuj7eBVhR", }, .srf = .{ - .url = "git+https://git.lerch.org/lobo/srf.git?ref=master#512eab0db082f1679af4de77b1f1713409766fcf", - .hash = "srf-0.0.0-qZj57-7CAQBdAFgdiSB2bE5Socq8QNId8PFzynVQbSUN", + .url = "git+https://git.lerch.org/lobo/srf#12b755660e96ed65c645975110214fcc9c66ca4d", + .hash = "srf-0.0.0-qZj5743KAQAykeIHzFJdRDwgAA-Yy1RLaj0Lw4W5Rphx", }, }, .paths = .{ diff --git a/src/Config.zig b/src/Config.zig index 0f580b9..3250650 100644 --- a/src/Config.zig +++ b/src/Config.zig @@ -38,6 +38,11 @@ fmp_key: ?[]const u8 = null, alphavantage_key: ?[]const u8 = null, tiingo_key: ?[]const u8 = null, openfigi_key: ?[]const u8 = null, +/// User contact email used as the User-Agent / From header for +/// open-data providers that require politeness identification +/// (Wikidata SPARQL, EDGAR). No API-key authentication semantics — +/// just identifies the operator. Sourced from `ZFIN_USER_EMAIL`. +user_email: ?[]const u8 = null, /// URL of a zfin-server instance for lazy cache sync (e.g. "https://zfin.lerch.org") server_url: ?[]const u8 = null, cache_dir: []const u8, @@ -92,6 +97,7 @@ pub fn fromEnv(io: std.Io, allocator: std.mem.Allocator, environ_map: *const std self.alphavantage_key = self.resolve("ALPHAVANTAGE_API_KEY"); self.tiingo_key = self.resolve("TIINGO_API_KEY"); self.openfigi_key = self.resolve("OPENFIGI_API_KEY"); + self.user_email = self.resolve("ZFIN_USER_EMAIL"); self.server_url = self.resolve("ZFIN_SERVER"); const env_cache = self.resolve("ZFIN_CACHE_DIR"); diff --git a/src/cache/store.zig b/src/cache/store.zig index 5ec4159..2c91041 100644 --- a/src/cache/store.zig +++ b/src/cache/store.zig @@ -63,6 +63,15 @@ pub const Ttl = struct { /// Refreshes on quarterly filing cadence; 30-day TTL gives a /// fortnightly margin around each fiscal-quarter boundary. 
pub const entity_facts: i64 = 30 * s_per_day; + + /// EDGAR ticker-map indexes (`company_tickers.json` and the MF + /// equivalent). SEC updates these daily upstream, but the + /// ticker→CIK mapping is extremely stable (changes are rare + /// rename events). 30-day TTL with jitter keeps the load + /// reasonable while still picking up new listings within a + /// month. + pub const tickers_funds: i64 = 30 * s_per_day; + pub const tickers_companies: i64 = 30 * s_per_day; }; /// Cache TTL specification with optional per-key expiration jitter. @@ -175,6 +184,15 @@ pub const DataType = enum { /// symbol-keyed, so a single dual-class issuer (BRK.A / BRK.B) /// has one shared facts file. entity_facts, + /// EDGAR's `company_tickers_mf.json` index, cached at + /// `/_edgar/tickers_funds.srf`. Single-record file + /// (one MutualFundTickerMapBlob) under a synthetic `_edgar` key. + /// Updated daily upstream; refreshes monthly with jitter. + tickers_funds, + /// EDGAR's `company_tickers.json` index, cached at + /// `/_edgar/tickers_companies.srf`. Same shape as + /// `tickers_funds`. 
+ tickers_companies, pub fn fileName(self: DataType) []const u8 { return switch (self) { @@ -189,6 +207,8 @@ pub const DataType = enum { .classification => "classification.srf", .etf_metrics => "etf_metrics.srf", .entity_facts => "entity_facts.srf", + .tickers_funds => "tickers_funds.srf", + .tickers_companies => "tickers_companies.srf", }; } @@ -202,6 +222,8 @@ pub const DataType = enum { .classification => Ttl.classification, .etf_metrics => Ttl.etf_metrics, .entity_facts => Ttl.entity_facts, + .tickers_funds => Ttl.tickers_funds, + .tickers_companies => Ttl.tickers_companies, .candles_daily, .candles_meta, .meta => 0, }; } @@ -2331,6 +2353,9 @@ test "TTL constants are reasonable" { try std.testing.expectEqual(@as(i64, 90 * std.time.s_per_day), Ttl.classification); try std.testing.expectEqual(@as(i64, 90 * std.time.s_per_day), Ttl.etf_metrics); try std.testing.expectEqual(@as(i64, 30 * std.time.s_per_day), Ttl.entity_facts); + // EDGAR ticker-map indexes refresh monthly with jitter. + try std.testing.expectEqual(@as(i64, 30 * std.time.s_per_day), Ttl.tickers_funds); + try std.testing.expectEqual(@as(i64, 30 * std.time.s_per_day), Ttl.tickers_companies); } test "DataType.ttl returns correct values" { @@ -2342,6 +2367,8 @@ test "DataType.ttl returns correct values" { try std.testing.expectEqual(Ttl.classification, DataType.classification.ttl()); try std.testing.expectEqual(Ttl.etf_metrics, DataType.etf_metrics.ttl()); try std.testing.expectEqual(Ttl.entity_facts, DataType.entity_facts.ttl()); + try std.testing.expectEqual(Ttl.tickers_funds, DataType.tickers_funds.ttl()); + try std.testing.expectEqual(Ttl.tickers_companies, DataType.tickers_companies.ttl()); // These types have no TTL (0 = managed elsewhere) try std.testing.expectEqual(@as(i64, 0), DataType.candles_daily.ttl()); @@ -2361,6 +2388,8 @@ test "DataType.fileName returns correct file names" { try std.testing.expectEqualStrings("classification.srf", DataType.classification.fileName()); try 
std.testing.expectEqualStrings("etf_metrics.srf", DataType.etf_metrics.fileName()); try std.testing.expectEqualStrings("entity_facts.srf", DataType.entity_facts.fileName()); + try std.testing.expectEqualStrings("tickers_funds.srf", DataType.tickers_funds.fileName()); + try std.testing.expectEqualStrings("tickers_companies.srf", DataType.tickers_companies.fileName()); } test "negative_cache_content format" { diff --git a/src/main.zig b/src/main.zig index 966b9f1..2b3df0f 100644 --- a/src/main.zig +++ b/src/main.zig @@ -721,4 +721,10 @@ test "looksLikeUnquotedGlob: empty arg returns false" { test { std.testing.refAllDecls(@This()); + // Wikidata and EDGAR providers aren't yet imported via + // `service.zig`; pull them in here for test discovery in the + // meantime. Drop these once the providers are wired through + // the data service. + _ = @import("providers/Wikidata.zig"); + _ = @import("providers/Edgar.zig"); } diff --git a/src/providers/Edgar.zig b/src/providers/Edgar.zig new file mode 100644 index 0000000..f0aa602 --- /dev/null +++ b/src/providers/Edgar.zig @@ -0,0 +1,1826 @@ +//! EDGAR provider — SEC's electronic filing system as a data source. +//! +//! ## What this provider does +//! +//! Given a stock or fund symbol, EDGAR can answer: +//! +//! * "What's this fund made of?" — the latest portfolio holdings, +//! sector breakdown, and net assets, parsed from the fund's most +//! recent NPORT-P filing. +//! * "How many shares does this company have outstanding?" — read +//! from XBRL-tagged fields on the company's most recent 10-K / +//! 10-Q / 40-F cover page. Combined with a price quote (from +//! elsewhere) this gives market cap. +//! * "Where in EDGAR does this symbol live?" — symbol → CIK +//! lookup via SEC's two ticker-map indexes. +//! +//! ## Workflow when a caller asks about one symbol +//! +//! Symbols don't carry CIKs, so the first step is always a +//! ticker-map lookup. From there the path forks: +//! +//! AAPL (operating company) +//! 1. 
Look up "AAPL" in the company ticker map → CIK 320193. +//! 2. Fetch the submissions feed for CIK 320193 → entityType +//! "operating", no NPORT-P. Classify as `not_a_fund`. +//! 3. (Optional) fetch shares-outstanding from the XBRL +//! companyconcept endpoint for use in market cap math. +//! +//! VTI (mutual-fund-trust ETF) +//! 1. Look up "VTI" in the mutual-fund ticker map → CIK 36405, +//! seriesId S000002848. +//! 2. Run the EDGAR full-text search for that seriesId, filtered +//! to NPORT-P. Get the URL of the most recent filing. +//! 3. Download the NPORT-P primary_doc.xml. Parse profile, +//! sectors, holdings. +//! +//! SPY (unit-investment-trust ETF) +//! 1. Not in mutual-fund ticker map. Look up "SPY" in the +//! company ticker map → CIK 884394. +//! 2. Fetch the submissions feed → entityType "other", has a +//! NPORT-P at trust-CIK level (UITs don't have a seriesId). +//! 3. Download that NPORT-P. Parse like a fund. +//! +//! GLD (commodity trust) +//! 1. Not in mutual-fund ticker map. Look up "GLD" in the +//! company ticker map → CIK 1222333. +//! 2. Submissions feed → entityType "operating", SIC describes a +//! commodity trust. No NPORT-P. Return profile-only metrics +//! (the trust exists but has no portfolio to disclose). +//! +//! ## Glossary +//! +//! CIK Central Index Key. SEC's primary identifier for a +//! filer. 10-digit zero-padded number; we normalize to +//! that shape at the boundary so all callers can +//! assume it. +//! NPORT-P Form NPORT-P (public). Quarterly portfolio +//! disclosure filed by registered investment companies +//! (mutual funds, most ETFs). Contains every position, +//! aggregated valuation, and asset/issuer classifiers. +//! 10-K Annual report filed by US-domiciled operating +//! companies. Cover page carries shares-outstanding. +//! 10-Q Quarterly equivalent of 10-K. +//! 40-F Annual report filed by Canadian companies that +//! participate in the SEC's MJDS regime. Same XBRL +//! 
cover-page fields as 10-K — the dei taxonomy +//! handles both. Barrick Mining, Shopify, etc. +//! 20-F Annual report filed by other foreign private +//! issuers (BP, Toyota, Sony, ...). Covers the same +//! financial-statement ground as 10-K but the SEC +//! doesn't require dei-tagged shares-outstanding here, +//! so the XBRL companyconcept endpoint returns 404 for +//! many of them. Caller treats this as "shares unknown." +//! XBRL Structured-data tagging for SEC filings. Makes +//! specific fields (revenue, shares outstanding, etc.) +//! machine-readable across forms. +//! dei Document and Entity Information — XBRL taxonomy for +//! cover-page metadata (entity name, registrant info, +//! shares outstanding). Cross-form, cross-jurisdiction. +//! us-gaap XBRL taxonomy for US GAAP financial concepts. +//! Carries fallback shares-outstanding for dual-class +//! issuers (GOOGL, META) that don't tag the dei field. +//! UIT Unit Investment Trust. A specific kind of fund +//! structure (SPY, IVV, ...) that files NPORT-P +//! at the trust-CIK level rather than under a +//! series-of-trust seriesId like mutual funds do. +//! SIC Standard Industrial Classification. Four-digit +//! industry code on the submissions feed; we use it to +//! distinguish commodity trusts (SIC 6221) from +//! operating companies (most other codes). +//! +//! ## SEC endpoints used +//! +//! 1. https://www.sec.gov/files/company_tickers_mf.json +//! Mutual fund and ETF ticker map: (ticker → CIK, seriesId, +//! classId). One file, ~3 MB. +//! +//! 2. https://www.sec.gov/files/company_tickers.json +//! Stocks and unit-investment-trust ETFs: (ticker → CIK, +//! title). One file, ~5 MB. +//! +//! 3. https://efts.sec.gov/LATEST/search-index?q=%22{seriesId}%22&forms=NPORT-P +//! Full-text search for NPORT-P filings referencing +//! `seriesId`. Necessary because the submissions feed only +//! lists at trust-CIK level — a trust hosting hundreds of +//! series would otherwise force us to download every NPORT-P +//!
to find the one we want. +//! +//! 4. https://data.sec.gov/submissions/CIK{cik:0>10}.json +//! Per-CIK submissions feed. Carries entityType, +//! sicDescription, ticker list, and the most-recent NPORT-P URL +//! for UIT-style ETFs that lack a seriesId. +//! +//! 5. https://www.sec.gov/Archives/edgar/data/{cik}/{accession}/primary_doc.xml +//! The actual NPORT-P document. XML, ~50-100 MB depending on +//! fund size. +//! +//! 6. https://data.sec.gov/api/xbrl/companyconcept/CIK{cik:0>10}/{taxonomy}/{Concept}.json +//! XBRL companyconcept endpoint. Used for shares-outstanding +//! via `dei:EntityCommonStockSharesOutstanding` (single-class +//! issuers) with fallback to `us-gaap:CommonStockSharesOutstanding` +//! (dual-class issuers like GOOGL, META). +//! +//! ## Politeness +//! +//! SEC requires a descriptive User-Agent + From: header on every +//! request, populated from `Config.user_email` (env +//! `ZFIN_USER_EMAIL`). The provider takes the email as a non-null +//! constructor argument; callers must surface a clear error if the +//! env var is missing rather than letting requests go out +//! un-identified. SEC's documented ceiling is 10 req/s per IP; we +//! throttle at 8 req/s via a `RateLimiter`, leaving a 20% margin +//! against timing jitter and retry bursts. A per-symbol loop over a +//! typical portfolio reaches this ceiling quickly without it. +//! +//! ## Caching +//! +//! `Edgar` carries no cache state of its own. Every method does HTTP +//! + parse and returns a typed result; the `DataService` layer +//! writes the parsed results to the user-facing cache files +//! (`classification.srf`, `etf_metrics.srf`, `entity_facts.srf`) +//! and reads them back on subsequent calls. +//! +//! Ticker maps (`company_tickers*.json`) are the one upstream +//! document we cache through `Store` — typed `MutualFundTickerMapBlob` +//! / `CompanyTickerMapBlob` records under a synthetic `_edgar` key — +//! because they're refreshed at SEC's daily cadence rather than per +//! symbol.
Everything else gets parsed into typed records and +//! written to the user-facing per-symbol or per-CIK cache files. + +const std = @import("std"); +const http = @import("../net/http.zig"); +const RateLimiter = @import("../net/RateLimiter.zig"); +const fmt = @import("../format.zig"); +const xml = @import("xml.zig"); + +const tickers_funds_url = "https://www.sec.gov/files/company_tickers_mf.json"; +const tickers_companies_url = "https://www.sec.gov/files/company_tickers.json"; +const search_url_prefix = "https://efts.sec.gov/LATEST/search-index?"; + +// ── Edgar provider state ───────────────────────────────────────── +// +// File-as-struct: the file's top-level fields and methods together +// form the `Edgar` provider. Callers do +// `const Edgar = @import("providers/Edgar.zig");` followed by +// `var ed = Edgar.init(...);` and `ed.fetchMutualFundTickerMap(...)` +// etc. + +client: http.Client, +/// Contact email for the User-Agent + From headers SEC requires on +/// every request. Sourced from `Config.user_email`. Required, not +/// optional: callers must surface a clear missing-config error +/// before constructing this provider rather than letting requests +/// go out un-identified. +user_email: []const u8, +/// Token-bucket throttle keeping us under SEC's 10 req/s ceiling. +/// Sized at 8 req/s to leave a 20% margin against timing jitter and +/// any retry bursts. Per-symbol fetch loops over a portfolio reach +/// this ceiling quickly without it. 
+rate_limiter: RateLimiter,
+allocator: std.mem.Allocator,
+
+const Edgar = @This();
+
+/// Construct a provider that identifies itself to SEC with
+/// `user_email`. The slice is stored as-is (not copied), so it must
+/// outlive the provider. The rate limiter is built with
+/// `(8, std.time.ns_per_s)`.
+pub fn init(io: std.Io, allocator: std.mem.Allocator, user_email: []const u8) Edgar {
+    const provider: Edgar = .{
+        .client = http.Client.init(io, allocator),
+        .user_email = user_email,
+        .rate_limiter = RateLimiter.init(io, 8, std.time.ns_per_s),
+        .allocator = allocator,
+    };
+    return provider;
+}
+
+/// Release the HTTP client. `user_email` is borrowed and is not
+/// freed here.
+pub fn deinit(self: *Edgar) void {
+    self.client.deinit();
+}
+
+/// Rate-limited GET. A limiter token is acquired first, then the
+/// request is sent with the User-Agent and From headers SEC asks
+/// for, plus `Accept-Encoding: identity`. Returns
+/// `error.UserEmailTooLong` when the formatted User-Agent does not
+/// fit the 256-byte stack buffer.
+fn httpGet(self: *Edgar, url: []const u8) !http.Response {
+    self.rate_limiter.acquire();
+
+    var agent_storage: [256]u8 = undefined;
+    const agent = std.fmt.bufPrint(
+        &agent_storage,
+        "zfin/0.1 ({s})",
+        .{self.user_email},
+    ) catch return error.UserEmailTooLong;
+
+    const extra_headers = [_]std.http.Header{
+        .{ .name = "User-Agent", .value = agent },
+        .{ .name = "From", .value = self.user_email },
+        .{ .name = "Accept-Encoding", .value = "identity" },
+    };
+    return self.client.request(.GET, url, null, &extra_headers);
+}
+
+/// Download SEC's mutual-fund/ETF ticker index
+/// (`company_tickers_mf.json`) and hand the body to
+/// `parseTickerMap`. The result maps ticker → CIK + seriesId +
+/// classId and is allocated with `allocator`; persisting it is left
+/// to the DataService.
+pub fn fetchMutualFundTickerMap(self: *Edgar, allocator: std.mem.Allocator) !TickerMap {
+    var response = try self.httpGet(tickers_funds_url);
+    defer response.deinit();
+    return parseTickerMap(allocator, response.body);
+}
+
+/// Fetch and parse SEC's stocks-and-UITs ticker map
+/// (`company_tickers.json`). Despite the filename, this file covers
+/// operating companies AND unit investment trust ETFs (SPY, GLD,
+/// IVV) — anything that doesn't file under a series-of-trust shape.
+/// Returns the parsed map.
+pub fn fetchCompanyTickerMap(self: *Edgar, allocator: std.mem.Allocator) !TickerMap {
+    var resp = try self.httpGet(tickers_companies_url);
+    defer resp.deinit();
+    return parseStockTickerMap(allocator, resp.body);
+}
+
+/// Find the most recent NPORT-P filing for `series_id` via the
+/// full-text search endpoint. The id is sent as a quoted phrase
+/// (`%22…%22`) and is interpolated verbatim, so it must already be
+/// URL-safe. Returns null if no filing exists. Caller owns the
+/// returned URL and frees it with `allocator`.
+pub fn findLatestNportP(
+    self: *Edgar,
+    allocator: std.mem.Allocator,
+    series_id: []const u8,
+) !?[]u8 {
+    const url = try std.fmt.allocPrint(
+        allocator,
+        "{s}q=%22{s}%22&forms=NPORT-P",
+        .{ search_url_prefix, series_id },
+    );
+    defer allocator.free(url);
+    var resp = try self.httpGet(url);
+    defer resp.deinit();
+
+    return parseLatestNportPFromSearch(allocator, resp.body);
+}
+
+/// Find the most recent NPORT-P filing for a CIK. Used for UIT-style
+/// ETFs (SPY, etc.) that file at the trust-CIK level rather than a
+/// series. Returns null if the CIK has no NPORT-P. Caller owns the
+/// returned URL and frees it with `allocator`.
+pub fn findLatestNportPByCik(
+    self: *Edgar,
+    allocator: std.mem.Allocator,
+    cik: []const u8,
+) !?[]u8 {
+    var sub = try self.fetchSubmissionsFeed(allocator, cik);
+    // The summary owns several strings besides the URL; release them
+    // here so only the URL escapes to the caller.
+    defer sub.deinit(allocator);
+
+    // Detach the URL before the deferred `deinit` runs so it is not
+    // freed out from under the caller.
+    const url = sub.latest_nport_p_url;
+    sub.latest_nport_p_url = null;
+    return url;
+}
+
+/// Fetch and parse the per-CIK submissions feed. `cik` is left-padded
+/// with zeros to 10 characters when building the URL. Caller owns the
+/// returned summary and must release it with
+/// `SubmissionsSummary.deinit`.
+pub fn fetchSubmissionsFeed(
+    self: *Edgar,
+    allocator: std.mem.Allocator,
+    cik: []const u8,
+) !SubmissionsSummary {
+    const url = try std.fmt.allocPrint(
+        allocator,
+        "https://data.sec.gov/submissions/CIK{s:0>10}.json",
+        .{cik},
+    );
+    defer allocator.free(url);
+    var resp = try self.httpGet(url);
+    defer resp.deinit();
+
+    return parseSubmissionsFeed(allocator, resp.body, cik);
+}
+
+/// Fetch the most recent shares-outstanding value for a CIK. Returns
+/// null on 404 (e.g. 20-F-only filers). Cascades through
+/// `dei:EntityCommonStockSharesOutstanding` (single-class) then
+/// `us-gaap:CommonStockSharesOutstanding` (dual-class fallback,
+/// e.g. GOOGL, META).
+pub fn fetchSharesOutstanding(
+    self: *Edgar,
+    allocator: std.mem.Allocator,
+    cik: []const u8,
+) !?SharesOutstanding {
+    if (try self.fetchSharesConcept(allocator, cik, "dei", "EntityCommonStockSharesOutstanding")) |so| {
+        return so;
+    }
+    return try self.fetchSharesConcept(allocator, cik, "us-gaap", "CommonStockSharesOutstanding");
+}
+
+/// Fetch one XBRL companyconcept document (`taxonomy`/`concept`) for
+/// `cik` and parse it. `error.NotFound` from the HTTP layer is mapped
+/// to a null result; every other error propagates.
+fn fetchSharesConcept(
+    self: *Edgar,
+    allocator: std.mem.Allocator,
+    cik: []const u8,
+    taxonomy: []const u8,
+    concept: []const u8,
+) !?SharesOutstanding {
+    const url = try std.fmt.allocPrint(
+        allocator,
+        "https://data.sec.gov/api/xbrl/companyconcept/CIK{s:0>10}/{s}/{s}.json",
+        .{ cik, taxonomy, concept },
+    );
+    defer allocator.free(url);
+    var resp = self.httpGet(url) catch |err| {
+        if (err == error.NotFound) return null;
+        return err;
+    };
+    defer resp.deinit();
+
+    return parseSharesOutstanding(allocator, resp.body);
+}
+
+/// Fetch and parse N-PORT-P metrics for one ETF/MF ticker. The
+/// return value describes what was found: full holdings,
+/// profile-only, not-a-fund, or not-in-EDGAR when `symbol` is in
+/// neither ticker map. `top_n_holdings` caps holdings emitted
+/// (sorted by pctVal descending).
+pub fn fetchEtfMetrics(
+    self: *Edgar,
+    io: std.Io,
+    allocator: std.mem.Allocator,
+    mf_ticker_map: *const TickerMap,
+    stock_ticker_map: *const TickerMap,
+    symbol: []const u8,
+    top_n_holdings: usize,
+) !EtfMetricsResult {
+    // MF/ETF map first — authoritative for symbols filed under a
+    // series. Series-keyed full-text search; CIK fallback would
+    // yield arbitrary other series under the same trust.
+    if (mf_ticker_map.map.get(symbol)) |entry| {
+        const filing_url = (try self.findLatestNportP(allocator, entry.series_id.?)) orelse {
+            return .not_a_fund;
+        };
+        defer allocator.free(filing_url);
+        const m = try self.fetchAndParseNportP(io, allocator, &entry, filing_url, symbol, top_n_holdings);
+        return .{ .full = m };
+    }
+
+    // Stock map: probe the submissions feed (one extra HTTP per
+    // unique CIK) to classify the entity. Branches:
+    //   - fund_shaped + has NPORT-P → full holdings (SPY)
+    //   - fund_shaped + no NPORT-P → profile-only (SLVO ETN issuer)
+    //   - trust_shaped → profile-only (GLD commodity)
+    //   - operating → not-a-fund (AAPL, MSFT)
+    if (stock_ticker_map.map.get(symbol)) |entry| {
+        var sub = try self.fetchSubmissionsFeed(allocator, entry.cik);
+        defer sub.deinit(allocator);
+
+        const class = classifyByEntityType(&sub);
+        switch (class) {
+            .operating => return .not_a_fund,
+            .fund_shaped => {
+                if (sub.latest_nport_p_url) |url| {
+                    const m = try self.fetchAndParseNportP(io, allocator, &entry, url, symbol, top_n_holdings);
+                    return .{ .full = m };
+                }
+                const profile = try buildProfileOnlyMetrics(io, allocator, &entry, &sub, symbol);
+                return .{ .profile_only = profile };
+            },
+            .trust_shaped => {
+                // Skip the NPORT-P probe — by definition these
+                // don't file one. Saves an HTTP roundtrip.
+                const profile = try buildProfileOnlyMetrics(io, allocator, &entry, &sub, symbol);
+                return .{ .profile_only = profile };
+            },
+        }
+    }
+
+    return .not_in_edgar;
+}
+
+/// Download and parse a NPORT-P primary_doc.xml at `filing_url`.
+/// Used by both the MF and UIT paths in `fetchEtfMetrics`. The
+/// parsed `EtfMetrics` is the cacheable artifact; the XML bytes are
+/// discarded after parsing — no provider-internal XML cache, so
+/// re-fetches always re-download.
+fn fetchAndParseNportP(
+    self: *Edgar,
+    io: std.Io,
+    allocator: std.mem.Allocator,
+    entry: *const TickerEntry,
+    filing_url: []const u8,
+    symbol: []const u8,
+    top_n_holdings: usize,
+) !EtfMetrics {
+    var resp = try self.httpGet(filing_url);
+    defer resp.deinit();
+
+    return parseNportP(io, allocator, resp.body, symbol, entry, top_n_holdings);
+}
+
+// ── Free types and helpers (no `self`) ───────────────────────────
+
+pub const SectorWeight = struct {
+    code: []const u8, // owned; raw NPORT-P code, e.g. "EC/CORP"
+    description: []const u8, // owned; human-readable, e.g. "Equity / Corporate"
+    pct_of_portfolio: f64,
+};
+
+pub const Holding = struct {
+    name: []const u8, // owned
+    ticker: ?[]const u8 = null, // owned; present for some equity holdings
+    cusip: ?[]const u8 = null, // owned
+    lei: ?[]const u8 = null, // owned; ISO 17442 Legal Entity Identifier
+    country: ?[]const u8 = null, // owned; ISO-3166 alpha-2 country code
+    pct_of_portfolio: f64,
+};
+
+pub const EtfMetrics = struct {
+    symbol: []const u8, // owned
+    series_name: ?[]const u8 = null, // owned
+    cik: []const u8, // owned
+    /// Null for unit-investment-trust ETFs (SPY, etc.) that file
+    /// NPORT-P at the trust-CIK level without a series identifier.
+    series_id: ?[]const u8 = null, // owned
+    net_assets: ?f64 = null,
+    period_end: ?[]const u8 = null, // owned
+    as_of: []const u8, // owned (date scraper ran)
+    holdings: []Holding, // owned
+    sectors: []SectorWeight, // owned
+
+    /// Free every owned string, each holding and sector record, and
+    /// the two backing slices. `allocator` must be the one that
+    /// produced them.
+    pub fn deinit(self: *EtfMetrics, allocator: std.mem.Allocator) void {
+        allocator.free(self.symbol);
+        if (self.series_name) |s| allocator.free(s);
+        allocator.free(self.cik);
+        if (self.series_id) |s| allocator.free(s);
+        if (self.period_end) |s| allocator.free(s);
+        allocator.free(self.as_of);
+        for (self.holdings) |*h| {
+            allocator.free(h.name);
+            if (h.ticker) |t| allocator.free(t);
+            if (h.cusip) |c| allocator.free(c);
+            if (h.lei) |l| allocator.free(l);
+            if (h.country) |c| allocator.free(c);
+        }
+        allocator.free(self.holdings);
+        for (self.sectors) |*s| {
+            allocator.free(s.code);
+            allocator.free(s.description);
+        }
+        allocator.free(self.sectors);
+    }
+};
+
+pub const TickerEntry = struct {
+    cik: []const u8, // owned
+    /// Series identifier — present for ETFs/MFs filing as a series of a
+    /// trust (sourced from `company_tickers_mf.json`). Null for stocks
+    /// and unit-investment-trust ETFs (sourced from `company_tickers.json`),
+    /// which file at the trust-CIK level without a series.
+    series_id: ?[]const u8 = null, // owned
+    class_id: ?[]const u8 = null, // owned
+    /// Trust / company name from the ticker map. Useful as a friendly
+    /// label for symbols where Wikidata didn't surface anything.
+    title: ?[]const u8 = null, // owned
+};
+
+pub const TickerMap = struct {
+    map: std.StringHashMap(TickerEntry),
+    allocator: std.mem.Allocator,
+
+    /// Free every key and every owned string in each entry, then the
+    /// hash map itself.
+    pub fn deinit(self: *TickerMap) void {
+        var it = self.map.iterator();
+        while (it.next()) |entry| {
+            self.allocator.free(entry.key_ptr.*);
+            self.allocator.free(entry.value_ptr.cik);
+            if (entry.value_ptr.series_id) |s| self.allocator.free(s);
+            if (entry.value_ptr.class_id) |s| self.allocator.free(s);
+            if (entry.value_ptr.title) |s| self.allocator.free(s);
+        }
+        self.map.deinit();
+    }
+};
+
+/// Cache shape for the SEC's `company_tickers_mf.json` document.
+/// Held under a synthetic `_edgar` key in the typed `Store` (one
+/// record per cache file), which gives us:
+///   - `#!expires=` freshness via TtlSpec → DataType.tickers_funds
+///   - Atomic write + temp-file-rename via Store.writeRaw
+///   - SRF length-prefix encoding handles the JSON body's commas /
+///     newlines / `::` without escaping
+///
+/// The provider deserializes from a fresh-fetched HTTP response; the
+/// DataService writes to cache, reads back as `MutualFundTickerMapBlob`,
+/// and parses the `.json` field via `parseTickerMap`. The blob is
+/// the on-disk shape; `TickerMap` is the in-memory shape.
+pub const MutualFundTickerMapBlob = struct {
+    json: []const u8, // owned (post-process duped in cache reads)
+};
+
+/// Cache shape for the SEC's `company_tickers.json` document.
+/// Same structure as `MutualFundTickerMapBlob`; the two are distinct
+/// types because `Store.dataTypeFor(T)` keys on Zig type, not on a
+/// string argument.
+pub const CompanyTickerMapBlob = struct {
+    json: []const u8, // owned (post-process duped in cache reads)
+};
+
+/// Parse the SEC's `company_tickers_mf.json` shape into a TickerMap.
+/// Exposed publicly so cache-hit paths in DataService can call this
+/// directly on bytes loaded from `Store`.
+///
+/// Expects a root object whose `data` member is an array of rows
+/// `[cik, seriesId, classId, symbol]`; anything else at the root
+/// yields `error.InvalidTickerMap`. Rows that are malformed (wrong
+/// types, fewer than four fields, negative CIK) are skipped. Every
+/// string in the result is allocated with `allocator`; caller
+/// releases the map with `TickerMap.deinit`.
+pub fn parseTickerMap(allocator: std.mem.Allocator, json_bytes: []const u8) !TickerMap {
+    var out: TickerMap = .{
+        .map = .init(allocator),
+        .allocator = allocator,
+    };
+    errdefer out.deinit();
+
+    const parsed = try std.json.parseFromSlice(std.json.Value, allocator, json_bytes, .{});
+    defer parsed.deinit();
+
+    const root = switch (parsed.value) {
+        .object => |o| o,
+        else => return error.InvalidTickerMap,
+    };
+    const data_array = switch (root.get("data") orelse return error.InvalidTickerMap) {
+        .array => |a| a.items,
+        else => return error.InvalidTickerMap,
+    };
+
+    for (data_array) |row| {
+        const fields = switch (row) {
+            .array => |a| a.items,
+            else => continue,
+        };
+        if (fields.len < 4) continue;
+
+        const cik_n = switch (fields[0]) {
+            .integer => |n| n,
+            else => continue,
+        };
+        const series_id = switch (fields[1]) {
+            .string => |s| s,
+            else => continue,
+        };
+        const class_id = switch (fields[2]) {
+            .string => |s| s,
+            else => continue,
+        };
+        const symbol = switch (fields[3]) {
+            .string => |s| s,
+            else => continue,
+        };
+
+        // CIKs are normalized to 10-digit zero-padded strings at
+        // every boundary. Wikidata's P5531 uses this convention, so
+        // downstream merge logic can join on the same key shape.
+        // EDGAR ticker-map JSON delivers them as bare integers, so
+        // we pad here. Convert to u64 first because signed `{d:0>10}`
+        // reserves a slot for the sign character and produces
+        // "0000+36405". The conversion is checked: a negative value
+        // in the upstream document is treated like any other
+        // malformed row and skipped, rather than tripping the
+        // `@intCast` safety check.
+        const cik_unsigned = std.math.cast(u64, cik_n) orelse continue;
+        const cik_str = try std.fmt.allocPrint(allocator, "{d:0>10}", .{cik_unsigned});
+        errdefer allocator.free(cik_str);
+        const symbol_owned = try allocator.dupe(u8, symbol);
+        errdefer allocator.free(symbol_owned);
+        const series_owned = try allocator.dupe(u8, series_id);
+        errdefer allocator.free(series_owned);
+        const class_owned = try allocator.dupe(u8, class_id);
+        errdefer allocator.free(class_owned);
+
+        const gop = try out.map.getOrPut(symbol_owned);
+        if (gop.found_existing) {
+            // Multiple class IDs share a ticker — take the first.
+            // A more sophisticated rule (prefer lowest-cost class)
+            // would need expense-ratio data this provider doesn't
+            // currently load.
+            allocator.free(symbol_owned);
+            allocator.free(cik_str);
+            allocator.free(series_owned);
+            allocator.free(class_owned);
+            continue;
+        }
+        gop.value_ptr.* = .{
+            .cik = cik_str,
+            .series_id = series_owned,
+            .class_id = class_owned,
+            .title = null,
+        };
+    }
+    return out;
+}
+
+/// Parser for the stocks-and-UITs `company_tickers.json` shape, which
+/// is keyed by integer-string indices rather than the array-of-arrays
+/// shape used by `company_tickers_mf.json`. Each entry has
+/// `cik_str`, `ticker`, `title`.
pub fn parseStockTickerMap(allocator: std.mem.Allocator, json_bytes: []const u8) !TickerMap {
    // Builds a ticker -> entry map from a JSON object whose values are
    // `{"cik_str": <int>, "ticker": "...", "title": "..."}` objects.
    // A non-object root is `error.InvalidTickerMap`; individual rows
    // with missing keys, wrong value types, or a CIK that does not fit
    // in a u64 are skipped. When a ticker repeats, the first row wins.
    var out: TickerMap = .{
        .map = .init(allocator),
        .allocator = allocator,
    };
    errdefer out.deinit();

    const parsed = try std.json.parseFromSlice(std.json.Value, allocator, json_bytes, .{});
    defer parsed.deinit();

    const root = switch (parsed.value) {
        .object => |o| o,
        else => return error.InvalidTickerMap,
    };

    var it = root.iterator();
    while (it.next()) |entry| {
        const obj = switch (entry.value_ptr.*) {
            .object => |o| o,
            else => continue,
        };
        // The JSON is external input: a negative `cik_str` must not
        // reach an unchecked `@intCast` (safety panic / UB in
        // ReleaseFast). `std.math.cast` returns null instead, and the
        // row is skipped like any other malformed row.
        const cik_n: u64 = switch (obj.get("cik_str") orelse continue) {
            .integer => |n| std.math.cast(u64, n) orelse continue,
            else => continue,
        };
        const symbol = switch (obj.get("ticker") orelse continue) {
            .string => |s| s,
            else => continue,
        };
        const title = if (obj.get("title")) |v| switch (v) {
            .string => |s| s,
            else => null,
        } else null;

        // CIKs are normalized to 10-digit zero-padded strings at
        // every boundary. Wikidata's P5531 uses this convention, so
        // downstream merge logic can join on the same key shape.
        // EDGAR ticker-map JSON delivers them as bare integers, so
        // we pad here. The value is formatted as a u64 because signed
        // `{d:0>10}` reserves a slot for the sign character and
        // produces "0000+36405".
        const cik_str = try std.fmt.allocPrint(allocator, "{d:0>10}", .{cik_n});
        errdefer allocator.free(cik_str);
        const symbol_owned = try allocator.dupe(u8, symbol);
        errdefer allocator.free(symbol_owned);
        const title_owned = if (title) |t| try allocator.dupe(u8, t) else null;
        errdefer if (title_owned) |t| allocator.free(t);

        const gop = try out.map.getOrPut(symbol_owned);
        if (gop.found_existing) {
            // Duplicate ticker: the map keeps its original key, so the
            // freshly duplicated strings are ours to release. `continue`
            // is not an error exit, hence the explicit frees.
            allocator.free(symbol_owned);
            allocator.free(cik_str);
            if (title_owned) |t| allocator.free(t);
            continue;
        }
        gop.value_ptr.* = .{
            .cik = cik_str,
            .series_id = null,
            .class_id = null,
            .title = title_owned,
        };
    }
    return out;
}

/// Lightweight summary of a CIK's `submissions/CIK*.json` feed.
/// Pulls out the four fields callers need (entity_name, entity_type,
/// sic_description, latest_nport_p_url) so they can branch without
/// re-parsing the full JSON. All owned strings allocated by the
/// caller's allocator; caller must free via `deinit`.
pub const SubmissionsSummary = struct {
    entity_name: ?[]u8 = null,
    entity_type: ?[]u8 = null,
    sic_description: ?[]u8 = null,
    /// URL to the most-recent NPORT-P primary_doc.xml, if any.
    latest_nport_p_url: ?[]u8 = null,

    /// Free every string that is set. Must be called with the same
    /// allocator that produced them.
    pub fn deinit(self: *SubmissionsSummary, allocator: std.mem.Allocator) void {
        if (self.entity_name) |s| allocator.free(s);
        if (self.entity_type) |s| allocator.free(s);
        if (self.sic_description) |s| allocator.free(s);
        if (self.latest_nport_p_url) |s| allocator.free(s);
    }
};

/// Parse a submissions-feed JSON document into a `SubmissionsSummary`.
/// `cik` is only used to build the NPORT-P URL. A root that is not a
/// JSON object yields an all-null summary rather than an error; fields
/// that are missing or not strings stay null, and an empty
/// `sicDescription` is treated as absent. Caller owns the result and
/// must `deinit` it with `allocator`.
fn parseSubmissionsFeed(
    allocator: std.mem.Allocator,
    json_bytes: []const u8,
    cik: []const u8,
) !SubmissionsSummary {
    var out: SubmissionsSummary = .{};
    // Covers every string assigned below if a later step fails.
    errdefer out.deinit(allocator);

    const parsed = try std.json.parseFromSlice(std.json.Value, allocator, json_bytes, .{});
    defer parsed.deinit();

    const root = switch (parsed.value) {
        .object => |o| o,
        else => return out,
    };
    if (root.get("name")) |v| switch (v) {
        .string => |s| out.entity_name = try allocator.dupe(u8, s),
        else => {},
    };
    if (root.get("entityType")) |v| switch (v) {
        .string => |s| out.entity_type = try allocator.dupe(u8, s),
        else => {},
    };
    if (root.get("sicDescription")) |v| switch (v) {
        .string => |s| if (s.len > 0) {
            out.sic_description = try allocator.dupe(u8, s);
        },
        else => {},
    };

    out.latest_nport_p_url = try findNportPUrlInSubmissions(allocator, root, cik);
    return out;
}

/// Shares-outstanding from EDGAR XBRL companyconcept endpoint.
/// Sourced from the `dei:EntityCommonStockSharesOutstanding` concept,
/// which the SEC's Document and Entity Information taxonomy mandates
/// on the cover page of 10-K, 10-Q, 40-F, and similar forms.
///
/// The dei concept is preferred over `us-gaap:CommonStockSharesOutstanding`
/// because it covers Canadian 40-F filers (e.g. Barrick Mining) that
/// don't file under us-gaap. EU 20-F filers (e.g. BP) are still NOT
/// covered — they use pure ifrs-full without dei tagging — so callers
/// must tolerate `null` returns.
///
/// `value` is the share count from the most recent reporting period.
/// `period_end` is the `end` date that count was reported as-of, in
/// `YYYY-MM-DD` form. `form` is the SEC form name (`10-K`, `10-Q`,
/// `40-F`, etc.) that supplied the number, useful for staleness
/// reasoning ("a 10-Q is 3 months stale, a 40-F is 12 months stale").
pub const SharesOutstanding = struct {
    value: u64,
    period_end: []u8, // owned
    form: []u8, // owned

    /// Release both owned strings with the allocator that created them.
    pub fn deinit(self: *SharesOutstanding, allocator: std.mem.Allocator) void {
        const owned_strings = [_][]u8{ self.period_end, self.form };
        for (owned_strings) |owned| allocator.free(owned);
    }
};

/// One shares-outstanding row per symbol, shaped for SRF emission. It
/// combines the CIK-level `SharesOutstanding` fetch result with the
/// `symbol` and `as_of` supplied by the caller, so every emitted row
/// carries the provenance that downstream merge logic needs.
///
/// `source` deliberately has no default: provenance is always written
/// out (the project's source-pure invariant — each row in a shared
/// classification file must say which source produced it).
pub const SharesRecord = struct {
    symbol: []u8, // owned
    shares_outstanding: u64,
    period_end: []u8, // owned, YYYY-MM-DD
    form: ?[]u8 = null, // owned (e.g. "10-Q", "40-F")
    cik: []u8, // owned
    as_of: []u8, // owned (date scraper ran)
    source: []const u8, // no default

    /// Free every owned string. `source` is not freed.
    pub fn deinit(self: *SharesRecord, allocator: std.mem.Allocator) void {
        const always_set = [_][]u8{ self.symbol, self.period_end, self.cik, self.as_of };
        for (always_set) |owned| allocator.free(owned);
        if (self.form) |owned| allocator.free(owned);
    }
};

/// SRF-emit shape for the `profile` variant of an ETF metrics record.
/// One per fund. Disjoint from the internal `EtfMetrics` struct, which
/// holds the whole fund's data (profile + N sectors + M holdings) in
/// nested arrays for parsing convenience.
pub const EtfProfileRecord = struct {
    symbol: []u8, // owned
    series_name: ?[]u8 = null, // owned
    cik: []u8, // owned
    series_id: ?[]u8 = null, // owned
    net_assets: ?f64 = null,
    period_end: ?[]u8 = null, // owned, YYYY-MM-DD
    as_of: []u8, // owned
    source: []const u8, // no default

    /// Free every owned string that is set. `source` is not freed.
    pub fn deinit(self: *EtfProfileRecord, allocator: std.mem.Allocator) void {
        const always_set = [_][]u8{ self.symbol, self.cik, self.as_of };
        for (always_set) |owned| allocator.free(owned);

        const maybe_set = [_]?[]u8{ self.series_name, self.series_id, self.period_end };
        for (maybe_set) |maybe| {
            if (maybe) |owned| allocator.free(owned);
        }
    }
};

/// SRF-emit shape for the `sector` variant: one row per (assetCat,
/// issuerCat) pair within a fund. `code` holds the raw NPORT-P
/// abbreviation and `description` its human-readable translation as
/// produced by `sectorDescription`.
pub const EtfSectorRecord = struct {
    symbol: []u8, // owned
    code: []u8, // owned, e.g. "EC/CORP"
    description: []u8, // owned, e.g. "Equity / Corporate"
    pct_of_portfolio: f64,
    as_of: []u8, // owned
    source: []const u8, // no default

    /// Free the four owned strings. `source` is not freed.
    pub fn deinit(self: *EtfSectorRecord, allocator: std.mem.Allocator) void {
        const owned_strings = [_][]u8{ self.symbol, self.code, self.description, self.as_of };
        for (owned_strings) |owned| allocator.free(owned);
    }
};

/// SRF-emit shape for the `holding` variant. One per top-N holding
/// retained from NPORT-P. Carries the full identifier inventory so
/// downstream display can prefer ticker > cusip > lei without
/// refetching.
pub const EtfHoldingRecord = struct {
    symbol: []u8, // owned; the FUND's symbol
    name: []u8, // owned; holding's company / instrument name
    ticker: ?[]u8 = null, // owned
    cusip: ?[]u8 = null, // owned
    lei: ?[]u8 = null, // owned
    country: ?[]u8 = null, // owned, ISO-3166 alpha-2
    pct_of_portfolio: f64,
    as_of: []u8, // owned
    source: []const u8, // no default

    /// Free every owned string that is set. `source` is not freed.
    pub fn deinit(self: *EtfHoldingRecord, allocator: std.mem.Allocator) void {
        const always_set = [_][]u8{ self.symbol, self.name, self.as_of };
        for (always_set) |owned| allocator.free(owned);

        const identifiers = [_]?[]u8{ self.ticker, self.cusip, self.lei, self.country };
        for (identifiers) |maybe| {
            if (maybe) |owned| allocator.free(owned);
        }
    }
};

/// Tagged union over the three row kinds of `etf_metrics.srf`. No
/// `srf_tag_field` override is declared because SRF's default `type`
/// discriminator is the desired one. Builders (in `main.zig`) fill the
/// slice per fund with one `.profile`, then N `.sector`, then M
/// `.holding` variants.
pub const EtfMetricRecord = union(enum) {
    profile: EtfProfileRecord,
    sector: EtfSectorRecord,
    holding: EtfHoldingRecord,

    /// Forward to the active variant's own `deinit`.
    pub fn deinit(self: *EtfMetricRecord, allocator: std.mem.Allocator) void {
        switch (self.*) {
            inline else => |*payload| payload.deinit(allocator),
        }
    }
};

/// Decompose one fund's internal `EtfMetrics` struct into the SRF-
/// emit-shaped union slice. Appends one `.profile` variant then N
/// `.sector` variants then M `.holding` variants to `out`. All
/// strings on the resulting union values are freshly duped so the
/// caller can deinit `metrics` independently. Caller owns the
/// appended union values and must deinit them.
pub fn appendEtfMetricRecords(
    allocator: std.mem.Allocator,
    out: *std.ArrayList(EtfMetricRecord),
    metrics: EtfMetrics,
) !void {
    // Reserve every slot first. After this the only fallible step per
    // record is string duplication, and each record's strings are
    // guarded by `errdefer` until the record is handed to `out`, so a
    // mid-way allocation failure no longer leaks the strings that were
    // already duplicated for the record under construction. Records
    // appended before the failure stay in `out` for the caller to
    // deinit.
    try out.ensureUnusedCapacity(allocator, 1 + metrics.sectors.len + metrics.holdings.len);

    {
        const symbol_owned = try allocator.dupe(u8, metrics.symbol);
        errdefer allocator.free(symbol_owned);
        const series_name_owned: ?[]u8 = if (metrics.series_name) |s| try allocator.dupe(u8, s) else null;
        errdefer if (series_name_owned) |s| allocator.free(s);
        const cik_owned = try allocator.dupe(u8, metrics.cik);
        errdefer allocator.free(cik_owned);
        const series_id_owned: ?[]u8 = if (metrics.series_id) |s| try allocator.dupe(u8, s) else null;
        errdefer if (series_id_owned) |s| allocator.free(s);
        const period_end_owned: ?[]u8 = if (metrics.period_end) |s| try allocator.dupe(u8, s) else null;
        errdefer if (period_end_owned) |s| allocator.free(s);
        const as_of_owned = try allocator.dupe(u8, metrics.as_of);

        out.appendAssumeCapacity(.{ .profile = .{
            .symbol = symbol_owned,
            .series_name = series_name_owned,
            .cik = cik_owned,
            .series_id = series_id_owned,
            .net_assets = metrics.net_assets,
            .period_end = period_end_owned,
            .as_of = as_of_owned,
            .source = "edgar",
        } });
    }

    for (metrics.sectors) |s| {
        const symbol_owned = try allocator.dupe(u8, metrics.symbol);
        errdefer allocator.free(symbol_owned);
        const code_owned = try allocator.dupe(u8, s.code);
        errdefer allocator.free(code_owned);
        const description_owned = try allocator.dupe(u8, s.description);
        errdefer allocator.free(description_owned);
        const as_of_owned = try allocator.dupe(u8, metrics.as_of);

        out.appendAssumeCapacity(.{ .sector = .{
            .symbol = symbol_owned,
            .code = code_owned,
            .description = description_owned,
            .pct_of_portfolio = s.pct_of_portfolio,
            .as_of = as_of_owned,
            .source = "edgar",
        } });
    }

    for (metrics.holdings) |h| {
        const symbol_owned = try allocator.dupe(u8, metrics.symbol);
        errdefer allocator.free(symbol_owned);
        const name_owned = try allocator.dupe(u8, h.name);
        errdefer allocator.free(name_owned);
        const ticker_owned: ?[]u8 = if (h.ticker) |t| try allocator.dupe(u8, t) else null;
        errdefer if (ticker_owned) |t| allocator.free(t);
        const cusip_owned: ?[]u8 = if (h.cusip) |c| try allocator.dupe(u8, c) else null;
        errdefer if (cusip_owned) |c| allocator.free(c);
        const lei_owned: ?[]u8 = if (h.lei) |l| try allocator.dupe(u8, l) else null;
        errdefer if (lei_owned) |l| allocator.free(l);
        const country_owned: ?[]u8 = if (h.country) |c| try allocator.dupe(u8, c) else null;
        errdefer if (country_owned) |c| allocator.free(c);
        const as_of_owned = try allocator.dupe(u8, metrics.as_of);

        out.appendAssumeCapacity(.{ .holding = .{
            .symbol = symbol_owned,
            .name = name_owned,
            .ticker = ticker_owned,
            .cusip = cusip_owned,
            .lei = lei_owned,
            .country = country_owned,
            .pct_of_portfolio = h.pct_of_portfolio,
            .as_of = as_of_owned,
            .source = "edgar",
        } });
    }
}

/// Parse a companyconcept JSON response into a `SharesOutstanding`.
/// Returns null when the document has no usable `units` rows, when no
/// row carries a string `end` date, or when the chosen row's `val` is
/// missing, negative, non-numeric, or does not fit in a u64. A missing
/// or non-string `form` becomes the empty string. Caller owns the
/// result and must `deinit` it with `allocator`.
fn parseSharesOutstanding(
    allocator: std.mem.Allocator,
    json_bytes: []const u8,
) !?SharesOutstanding {
    const parsed = try std.json.parseFromSlice(std.json.Value, allocator, json_bytes, .{});
    defer parsed.deinit();

    const root = switch (parsed.value) {
        .object => |o| o,
        else => return null,
    };
    const units = switch (root.get("units") orelse return null) {
        .object => |o| o,
        else => return null,
    };
    // The unit key is "shares". Defensive: take the first units
    // entry whose array has at least one row.
    var rows: []std.json.Value = &.{};
    var unit_it = units.iterator();
    while (unit_it.next()) |entry| {
        switch (entry.value_ptr.*) {
            .array => |a| if (a.items.len > 0) {
                rows = a.items;
                break;
            },
            else => continue,
        }
    }
    if (rows.len == 0) return null;

    // Pick the row with the latest `end` date. EDGAR usually returns
    // them in chronological order but don't rely on that.
    var best_idx: usize = 0;
    var best_end: []const u8 = "";
    for (rows, 0..) |row, i| {
        const obj = switch (row) {
            .object => |o| o,
            else => continue,
        };
        const end = switch (obj.get("end") orelse continue) {
            .string => |s| s,
            else => continue,
        };
        if (std.mem.order(u8, end, best_end) == .gt) {
            best_end = end;
            best_idx = i;
        }
    }
    if (best_end.len == 0) return null;

    const obj = switch (rows[best_idx]) {
        .object => |o| o,
        else => return null,
    };
    const val_node = obj.get("val") orelse return null;
    const val: u64 = switch (val_node) {
        .integer => |n| std.math.cast(u64, n) orelse return null,
        .float => |f| blk: {
            // `@intFromFloat` is only defined for in-range, non-NaN
            // inputs. `!(f >= 0)` also rejects NaN; the upper bound is
            // 2^64, the first value a u64 cannot hold.
            if (!(f >= 0) or f >= 18446744073709551616.0) return null;
            break :blk @as(u64, @intFromFloat(f));
        },
        else => return null,
    };
    const form_str: []const u8 = switch (obj.get("form") orelse .null) {
        .string => |s| s,
        else => "",
    };

    const period_end_owned = try allocator.dupe(u8, best_end);
    // Without this, a failed `form` allocation would leak `period_end`.
    errdefer allocator.free(period_end_owned);
    const form_owned = try allocator.dupe(u8, form_str);

    return .{
        .value = val,
        .period_end = period_end_owned,
        .form = form_owned,
    };
}

/// Build the `primary_doc.xml` URL of the most recently filed NPORT-P
/// listed under `filings.recent` of a submissions-feed root object.
/// `cik` has its leading zeros stripped for the URL path. Returns null
/// when the expected arrays are absent or no usable NPORT-P row exists.
/// Caller owns the returned string.
fn findNportPUrlInSubmissions(
    allocator: std.mem.Allocator,
    root: std.json.ObjectMap,
    cik: []const u8,
) !?[]u8 {
    const filings = switch (root.get("filings") orelse return null) {
        .object => |o| o,
        else => return null,
    };
    const recent = switch (filings.get("recent") orelse return null) {
        .object => |o| o,
        else => return null,
    };
    const forms = switch (recent.get("form") orelse return null) {
        .array => |a| a.items,
        else => return null,
    };
    const accessions = switch (recent.get("accessionNumber") orelse return null) {
        .array => |a| a.items,
        else => return null,
    };
    const dates = switch (recent.get("filingDate") orelse return null) {
        .array => |a| a.items,
        else => return null,
    };

    var best_idx: ?usize = null;
    var best_date: []const u8 = "";
    for (forms, 0..) |form, i| {
        const fname = switch (form) {
            .string => |s| s,
            else => continue,
        };
        if (!std.mem.eql(u8, fname, "NPORT-P")) continue;
        // The three arrays are parallel but come from external JSON;
        // a row is only usable if both its date and its accession
        // number exist. Skipping here keeps `accessions[idx]` below
        // in bounds.
        if (i >= dates.len or i >= accessions.len) continue;
        const fd = switch (dates[i]) {
            .string => |s| s,
            else => continue,
        };
        if (std.mem.order(u8, fd, best_date) == .gt) {
            best_date = fd;
            best_idx = i;
        }
    }
    const idx = best_idx orelse return null;

    const acc = switch (accessions[idx]) {
        .string => |s| s,
        else => return null,
    };

    // Strip leading zeros but always keep at least one character.
    var cik_no_zeros = cik;
    while (cik_no_zeros.len > 1 and cik_no_zeros[0] == '0') cik_no_zeros = cik_no_zeros[1..];

    // The archive path uses the accession number without dashes.
    var adsh_buf: std.ArrayList(u8) = .empty;
    defer adsh_buf.deinit(allocator);
    for (acc) |c| if (c != '-') try adsh_buf.append(allocator, c);

    return try std.fmt.allocPrint(
        allocator,
        "https://www.sec.gov/Archives/edgar/data/{s}/{s}/primary_doc.xml",
        .{ cik_no_zeros, adsh_buf.items },
    );
}

/// Extract the most-recent filing URL from an EDGAR full-text search
/// response. Used by `findLatestNportP` (series-keyed search). Lifted
/// out so the same parser can be reused if we add more search calls.
fn parseLatestNportPFromSearch(allocator: std.mem.Allocator, json_bytes: []const u8) !?[]u8 {
    const parsed = try std.json.parseFromSlice(std.json.Value, allocator, json_bytes, .{});
    defer parsed.deinit();

    // Drill down to the `hits.hits` array; any shape mismatch along
    // the way means "no result" rather than an error.
    if (parsed.value != .object) return null;
    const outer_hits = parsed.value.object.get("hits") orelse return null;
    if (outer_hits != .object) return null;
    const inner_hits = outer_hits.object.get("hits") orelse return null;
    if (inner_hits != .array) return null;
    const hit_items = inner_hits.array.items;
    if (hit_items.len == 0) return null;

    // Select the hit with the lexicographically greatest `file_date`.
    // Hits without a usable date are ignored; if none has one, the
    // first hit stays selected.
    var newest_idx: usize = 0;
    var newest_date: []const u8 = "";
    for (hit_items, 0..) |hit, i| {
        if (hit != .object) continue;
        const hit_src = hit.object.get("_source") orelse continue;
        if (hit_src != .object) continue;
        const date_val = hit_src.object.get("file_date") orelse continue;
        if (date_val != .string) continue;
        if (std.mem.order(u8, date_val.string, newest_date) != .gt) continue;
        newest_date = date_val.string;
        newest_idx = i;
    }

    // Pull the accession number and the first CIK off the chosen hit.
    const chosen = hit_items[newest_idx];
    if (chosen != .object) return null;
    const chosen_src = chosen.object.get("_source") orelse return null;
    if (chosen_src != .object) return null;
    const adsh_val = chosen_src.object.get("adsh") orelse return null;
    if (adsh_val != .string) return null;
    const ciks_val = chosen_src.object.get("ciks") orelse return null;
    if (ciks_val != .array) return null;
    const cik_items = ciks_val.array.items;
    if (cik_items.len == 0) return null;
    if (cik_items[0] != .string) return null;
    const cik_padded = cik_items[0].string;

    // Skip leading zeros, always leaving at least one character.
    var first_kept: usize = 0;
    while (first_kept + 1 < cik_padded.len and cik_padded[first_kept] == '0') first_kept += 1;
    const cik_trimmed = cik_padded[first_kept..];

    // The archive path uses the accession number with dashes removed.
    var adsh_compact: std.ArrayList(u8) = .empty;
    defer adsh_compact.deinit(allocator);
    for (adsh_val.string) |ch| {
        if (ch == '-') continue;
        try adsh_compact.append(allocator, ch);
    }

    return try std.fmt.allocPrint(
        allocator,
        "https://www.sec.gov/Archives/edgar/data/{s}/{s}/primary_doc.xml",
        .{ cik_trimmed, adsh_compact.items },
    );
}

/// Classify a CIK based on its submissions-feed metadata. Decides
/// whether the symbol is a registered fund (probe NPORT-P), a
/// trust/ETN-style instrument (profile-only), or a plain operating
/// company (skip).
///
/// Decision rules — kept in one place because they're load-bearing
/// for what `EtfMetricsResult` variant `fetchEtfMetrics` returns.
/// Rules are based on observation across ~100 real symbols:
///
///   1. Has NPORT-P filing → fund_shaped.
///      The presence of a NPORT-P is the unambiguous signal that
///      the entity is a registered investment company. Catches all
///      ETFs and mutual funds regardless of entityType / SIC.
///
///   2. entityType == "other" AND SIC indicates
///      a securities issuer or commodity dealer → trust_shaped.
///      Catches ETN issuers (Credit Suisse AG → SLVO), commodity
///      brokers (some smaller commodity trusts), without a NPORT-P.
///      Does NOT catch foreign issuers like BP/Barrick (entityType
///      "other" but SIC is industry-specific, not securities-related).
///
///   3. entityType == "operating" AND SIC contains
///      "Commodity" → trust_shaped.
///      Catches commodity grantor trusts (GLD, SLV, IAU, GBTC).
///      `entityType` is "operating" for these despite their
///      trust-like nature — SEC classifies them as commodity-
///      contracts brokers because they hold physical commodities.
///
///   4. otherwise → operating.
///      Plain operating companies (AAPL, NFLX, BRK.B, BP, etc.).
///      No fund records emitted; Wikidata covers their classification.
///
/// Note: REITs (e.g. Realty Income, O) are `operating` + SIC
/// "Real Estate Investment Trusts". They are operating companies
/// that distribute rental income, not registered investment
/// companies. They get bucketed under `operating` — Wikidata is
/// the right source for them.
fn classifyByEntityType(sub: *const SubmissionsSummary) enum {
    fund_shaped,
    trust_shaped,
    operating,
} {
    // Rule 1: NPORT-P presence is the strongest fund signal.
    if (sub.latest_nport_p_url != null) return .fund_shaped;

    const et = sub.entity_type orelse return .operating;
    const sic_opt = sub.sic_description;

    // Rule 2: securities issuers (ETN sponsor banks).
    if (std.mem.eql(u8, et, "other")) {
        const sic = sic_opt orelse return .operating;

        // REITs are operating companies, never trust-shaped. This
        // check must come before the hint scan: "Investment Trust" is
        // a substring of "Real Estate Investment Trusts", so without
        // it a REIT with entityType "other" would be misclassified.
        if (std.mem.indexOf(u8, sic, "Real Estate Investment Trust") != null) return .operating;

        const securities_hints = [_][]const u8{
            "Security Brokers", // "Security Brokers, Dealers..."
            "Commodity Contracts",
            "Investment Trust", // REITs are excluded above
        };
        for (securities_hints) |h| {
            if (std.mem.indexOf(u8, sic, h) != null) return .trust_shaped;
        }
        return .operating;
    }

    // Rule 3: commodity grantor trusts classified as "operating".
    if (std.mem.eql(u8, et, "operating")) {
        if (sic_opt) |sic| {
            if (std.mem.indexOf(u8, sic, "Commodity") != null) {
                return .trust_shaped;
            }
        }
    }
    return .operating;
}

test "classifyByEntityType buckets real-world entities" {
    const T = std.testing;

    // SPY: NPORT-P present → fund_shaped (regardless of other fields).
    {
        var s: SubmissionsSummary = .{};
        defer s.deinit(T.allocator);
        s.entity_type = try T.allocator.dupe(u8, "other");
        s.latest_nport_p_url = try T.allocator.dupe(u8, "https://example/primary_doc.xml");
        try T.expectEqual(.fund_shaped, classifyByEntityType(&s));
    }
    // SLVO/GLDI/USOI issuer (Credit Suisse AG): no NPORT-P, "other"
    // entityType, SIC = "Security Brokers..." → trust_shaped.
    {
        var s: SubmissionsSummary = .{};
        defer s.deinit(T.allocator);
        s.entity_type = try T.allocator.dupe(u8, "other");
        s.sic_description = try T.allocator.dupe(u8, "Security Brokers, Dealers & Flotation Companies");
        try T.expectEqual(.trust_shaped, classifyByEntityType(&s));
    }
    // BP plc: foreign issuer, "other" entityType, SIC = industry.
    // Should be `operating`, not trust_shaped.
    {
        var s: SubmissionsSummary = .{};
        defer s.deinit(T.allocator);
        s.entity_type = try T.allocator.dupe(u8, "other");
        s.sic_description = try T.allocator.dupe(u8, "Petroleum Refining");
        try T.expectEqual(.operating, classifyByEntityType(&s));
    }
    // Barrick: same shape as BP.
    {
        var s: SubmissionsSummary = .{};
        defer s.deinit(T.allocator);
        s.entity_type = try T.allocator.dupe(u8, "other");
        s.sic_description = try T.allocator.dupe(u8, "Gold and Silver Ores");
        try T.expectEqual(.operating, classifyByEntityType(&s));
    }
    // REIT SIC with "other" entityType: must stay `operating` even
    // though the SIC text contains "Investment Trust".
    {
        var s: SubmissionsSummary = .{};
        defer s.deinit(T.allocator);
        s.entity_type = try T.allocator.dupe(u8, "other");
        s.sic_description = try T.allocator.dupe(u8, "Real Estate Investment Trusts");
        try T.expectEqual(.operating, classifyByEntityType(&s));
    }
    // GLD: "operating" entityType but SIC is commodity-contracts.
    {
        var s: SubmissionsSummary = .{};
        defer s.deinit(T.allocator);
        s.entity_type = try T.allocator.dupe(u8, "operating");
        s.sic_description = try T.allocator.dupe(u8, "Commodity Contracts Brokers & Dealers");
        try T.expectEqual(.trust_shaped, classifyByEntityType(&s));
    }
    // AAPL.
    {
        var s: SubmissionsSummary = .{};
        defer s.deinit(T.allocator);
        s.entity_type = try T.allocator.dupe(u8, "operating");
        s.sic_description = try T.allocator.dupe(u8, "Electronic Computers");
        try T.expectEqual(.operating, classifyByEntityType(&s));
    }
    // NFLX.
    {
        var s: SubmissionsSummary = .{};
        defer s.deinit(T.allocator);
        s.entity_type = try T.allocator.dupe(u8, "operating");
        s.sic_description = try T.allocator.dupe(u8, "Services-Video Tape Rental");
        try T.expectEqual(.operating, classifyByEntityType(&s));
    }
    // Realty Income (O): REIT, operating company.
    {
        var s: SubmissionsSummary = .{};
        defer s.deinit(T.allocator);
        s.entity_type = try T.allocator.dupe(u8, "operating");
        s.sic_description = try T.allocator.dupe(u8, "Real Estate Investment Trusts");
        try T.expectEqual(.operating, classifyByEntityType(&s));
    }
}

/// Result kind for `fetchEtfMetrics`. The caller — see `main.zig` —
/// distinguishes a full holdings record from a profile-only record so
/// it can log the right thing and produce accurate coverage stats.
pub const EtfMetricsResult = union(enum) {
    /// Full NPORT-P parse with holdings + sectors.
    full: EtfMetrics,
    /// Submissions-feed metadata only. Used for unit-investment trusts
    /// that file 10-K instead of NPORT-P (commodity trusts like GLD,
    /// some grantor trusts).
    profile_only: EtfMetrics,
    /// Symbol is in the stock-ticker map but is a plain operating
    /// company (AAPL, MSFT, …). Not a fund. Caller should skip.
    not_a_fund: void,
    /// Symbol isn't in either ticker map. Caller should skip.
    not_in_edgar: void,
};

/// Construct an EtfMetrics record from submissions-feed metadata
/// alone, with no holdings or sectors. Used for trust entities (e.g.
/// commodity trusts) that lack a NPORT-P filing but for which we
/// still want to surface name + CIK in `etf_metrics.srf`.
///
/// `as_of` is today's date as reported by `fmt.todayDate(io)`. Every
/// string on the result is freshly duplicated with `allocator`; on
/// failure nothing allocated here is left behind.
fn buildProfileOnlyMetrics(
    io: std.Io,
    allocator: std.mem.Allocator,
    entry: *const TickerEntry,
    sub: *const SubmissionsSummary,
    symbol: []const u8,
) !EtfMetrics {
    var as_of_buf: [10]u8 = undefined;
    const today_date = fmt.todayDate(io);
    const as_of = try std.fmt.bufPrint(&as_of_buf, "{f}", .{today_date});

    // Prefer the submissions-feed name (canonical, "SPDR GOLD TRUST")
    // over the company_tickers.json title (less authoritative).
    const name_src: ?[]const u8 = sub.entity_name orelse entry.title;
    const series_name: ?[]u8 = if (name_src) |n| try allocator.dupe(u8, n) else null;
    errdefer if (series_name) |s| allocator.free(s);

    // Duplicate one string at a time, each guarded by `errdefer`, so
    // that a failed later allocation releases the earlier ones.
    // Allocating inside the return literal would leak them.
    const symbol_owned = try allocator.dupe(u8, symbol);
    errdefer allocator.free(symbol_owned);
    const cik_owned = try allocator.dupe(u8, entry.cik);
    errdefer allocator.free(cik_owned);
    const as_of_owned = try allocator.dupe(u8, as_of);

    return .{
        .symbol = symbol_owned,
        .series_name = series_name,
        .cik = cik_owned,
        .series_id = null,
        .net_assets = null,
        .period_end = null,
        .as_of = as_of_owned,
        .holdings = &.{},
        .sectors = &.{},
    };
}

/// Parse N-PORT-P bytes into an EtfMetrics struct. Heavy XML — we use
/// the vendored `xml.zig` DOM parser.
fn parseNportP(
    io: std.Io,
    allocator: std.mem.Allocator,
    xml_bytes: []const u8,
    symbol: []const u8,
    entry: *const TickerEntry,
    top_n_holdings: usize,
) !EtfMetrics {
    // Ownership strategy: every allocation made here stays covered by
    // an `errdefer` until the final `return`, and the fallible steps
    // all happen before the result literal is built. A failure at any
    // point therefore releases everything allocated so far.
    var as_of_buf: [10]u8 = undefined;
    const today_date = fmt.todayDate(io);
    const as_of = try std.fmt.bufPrint(&as_of_buf, "{f}", .{today_date});

    var doc = try xml.parse(allocator, xml_bytes);
    defer doc.deinit();
    const root = doc.root;

    // Walk: edgarSubmission > formData > genInfo and fundInfo.
    const form_data = (try root.findChildByTag("formData")) orelse return error.MissingFormData;
    const gen_info = try form_data.findChildByTag("genInfo");
    const fund_info = try form_data.findChildByTag("fundInfo");
    const invst_or_secs = try form_data.findChildByTag("invstOrSecs");

    // These errdefers read the variables at exit time, so they free
    // whatever was assigned by then.
    var series_name: ?[]const u8 = null;
    errdefer if (series_name) |s| allocator.free(s);
    var period_end: ?[]const u8 = null;
    errdefer if (period_end) |s| allocator.free(s);

    if (gen_info) |gi| {
        if (try gi.findChildByTag("seriesName")) |e| {
            if (e.children.items.len > 0) {
                if (e.children.items[0] == .CharData) {
                    const sn = e.children.items[0].CharData;
                    // Single-series trusts (SPY, IVV, …) write
                    // "N/A" here — drop it so we fall through to the
                    // ticker-map title below.
                    if (!std.mem.eql(u8, sn, "N/A") and sn.len > 0) {
                        series_name = try allocator.dupe(u8, sn);
                    }
                }
            }
        }
    }
    // Fall back to the ticker-map title (e.g. "SPDR S&P 500 ETF Trust"
    // for SPY) if NPORT-P didn't supply a useful series name. The
    // title comes from `company_tickers.json` for stock-map entries.
    if (series_name == null) {
        if (entry.title) |t| {
            series_name = try allocator.dupe(u8, t);
        }
    }
    if (gen_info) |gi| {
        if (try gi.findChildByTag("repPdEnd")) |e| {
            if (e.children.items.len > 0) {
                if (e.children.items[0] == .CharData) {
                    period_end = try allocator.dupe(u8, e.children.items[0].CharData);
                }
            }
        }
    }

    var net_assets: ?f64 = null;
    if (fund_info) |fi| {
        if (try fi.findChildByTag("netAssets")) |e| {
            if (e.children.items.len > 0) {
                if (e.children.items[0] == .CharData) {
                    net_assets = std.fmt.parseFloat(f64, e.children.items[0].CharData) catch null;
                }
            }
        }
    }

    // Frees the strings owned by one holding.
    const releaseHolding = struct {
        fn call(a: std.mem.Allocator, h: Holding) void {
            a.free(h.name);
            if (h.ticker) |t| a.free(t);
            if (h.cusip) |c| a.free(c);
            if (h.lei) |l| a.free(l);
            if (h.country) |c| a.free(c);
        }
    }.call;

    // Holdings + sector breakdown.
    var holdings_list: std.ArrayList(Holding) = .empty;
    errdefer {
        for (holdings_list.items) |h| releaseHolding(allocator, h);
        holdings_list.deinit(allocator);
    }

    // Sector aggregation: assetCat × issuerCat → cumulative weight
    var sector_map: std.StringHashMap(f64) = .init(allocator);
    defer {
        var it = sector_map.iterator();
        while (it.next()) |entry2| allocator.free(entry2.key_ptr.*);
        sector_map.deinit();
    }

    if (invst_or_secs) |secs| {
        for (secs.children.items) |child| {
            if (child != .Element) continue;
            const sec = child.Element;
            if (!std.mem.eql(u8, sec.tag, "invstOrSec")) continue;

            const name_text = elementText(sec, "name") orelse continue;
            const pct_text = elementText(sec, "pctVal") orelse continue;
            const pct = std.fmt.parseFloat(f64, pct_text) catch continue;

            // Build the holding in its own scope so the per-string
            // errdefers expire once the list owns the record;
            // otherwise a later failure in this iteration would free
            // the strings twice.
            {
                try holdings_list.ensureUnusedCapacity(allocator, 1);
                const name_owned = try allocator.dupe(u8, name_text);
                errdefer allocator.free(name_owned);
                const ticker_owned: ?[]u8 = if (elementAttrValue(sec, "identifiers", "ticker")) |t| try allocator.dupe(u8, t) else null;
                errdefer if (ticker_owned) |t| allocator.free(t);
                const cusip_owned: ?[]u8 = if (elementText(sec, "cusip")) |c| try allocator.dupe(u8, c) else null;
                errdefer if (cusip_owned) |c| allocator.free(c);
                const lei_owned: ?[]u8 = if (elementTextOptional(sec, "lei")) |l| try allocator.dupe(u8, l) else null;
                errdefer if (lei_owned) |l| allocator.free(l);
                const country_owned: ?[]u8 = if (elementTextOptional(sec, "invCountry")) |c| try allocator.dupe(u8, c) else null;

                holdings_list.appendAssumeCapacity(.{
                    .name = name_owned,
                    .ticker = ticker_owned,
                    .cusip = cusip_owned,
                    .lei = lei_owned,
                    .country = country_owned,
                    .pct_of_portfolio = pct,
                });
            }

            // Aggregate by (assetCat, issuerCat).
            const asset_cat = elementText(sec, "assetCat") orelse "?";
            const issuer_cat = elementText(sec, "issuerCat") orelse "?";
            const key = try std.fmt.allocPrint(allocator, "{s}/{s}", .{ asset_cat, issuer_cat });
            // The map only takes ownership of `key` on a fresh insert.
            const gop = sector_map.getOrPut(key) catch |err| {
                allocator.free(key);
                return err;
            };
            if (gop.found_existing) {
                allocator.free(key);
                gop.value_ptr.* += pct;
            } else {
                gop.value_ptr.* = pct;
            }
        }
    }

    // Top N holdings by pct_of_portfolio. Sort and trim in place so
    // the list's errdefer keeps covering the survivors.
    std.mem.sort(Holding, holdings_list.items, {}, struct {
        fn gt(_: void, a: Holding, b: Holding) bool {
            return a.pct_of_portfolio > b.pct_of_portfolio;
        }
    }.gt);
    const keep = @min(holdings_list.items.len, top_n_holdings);
    for (holdings_list.items[keep..]) |h| releaseHolding(allocator, h);
    holdings_list.shrinkRetainingCapacity(keep);

    // Sector list.
    var sectors_list: std.ArrayList(SectorWeight) = .empty;
    errdefer {
        for (sectors_list.items) |s| {
            allocator.free(s.code);
            allocator.free(s.description);
        }
        sectors_list.deinit(allocator);
    }
    try sectors_list.ensureTotalCapacity(allocator, sector_map.count());
    var s_it = sector_map.iterator();
    while (s_it.next()) |s_entry| {
        const code_owned = try allocator.dupe(u8, s_entry.key_ptr.*);
        errdefer allocator.free(code_owned);
        const description_owned = try allocator.dupe(u8, sectorDescription(code_owned));
        sectors_list.appendAssumeCapacity(.{
            .code = code_owned,
            .description = description_owned,
            .pct_of_portfolio = s_entry.value_ptr.*,
        });
    }
    std.mem.sort(SectorWeight, sectors_list.items, {}, struct {
        fn gt(_: void, a: SectorWeight, b: SectorWeight) bool {
            return a.pct_of_portfolio > b.pct_of_portfolio;
        }
    }.gt);

    // Remaining fallible steps, each guarded, before the result is
    // assembled.
    const symbol_owned = try allocator.dupe(u8, symbol);
    errdefer allocator.free(symbol_owned);
    const cik_owned = try allocator.dupe(u8, entry.cik);
    errdefer allocator.free(cik_owned);
    const series_id_owned: ?[]u8 = if (entry.series_id) |sid| try allocator.dupe(u8, sid) else null;
    errdefer if (series_id_owned) |sid| allocator.free(sid);
    const as_of_owned = try allocator.dupe(u8, as_of);
    errdefer allocator.free(as_of_owned);

    // `toOwnedSlice` empties the list on success, so the slice needs
    // its own guard for the one fallible step that follows.
    const top = try holdings_list.toOwnedSlice(allocator);
    errdefer {
        for (top) |h| releaseHolding(allocator, h);
        allocator.free(top);
    }
    const sectors = try sectors_list.toOwnedSlice(allocator);

    return .{
        .symbol = symbol_owned,
        .series_name = series_name,
        .cik = cik_owned,
        .series_id = series_id_owned,
        .net_assets = net_assets,
        .period_end = period_end,
        .as_of = as_of_owned,
        .holdings = top,
        .sectors = sectors,
    };
}

/// Translation table for NPORT-P `assetCat/issuerCat` codes. The
/// values are the SEC's own form-instruction abbreviations; the
/// descriptions are condensed-but-accurate human readings used to
/// populate `SectorWeight.description`.
///
/// Coverage targets the codes observed across a representative
/// real-world portfolio (~32 distinct codes seen across stock /
/// bond / blended ETFs). Unrecognized codes round-trip raw (the
/// lookup falls back to the code itself) so unknowns surface for
/// table extension rather than silently corrupting downstream
/// classification.
///
/// AssetCat values per SEC form instructions:
///   EC        Equity (common)          DE    Derivative
///   EP        Equity Preferred         DFE   Derivative — Foreign Exchange
///   DBT       Debt                     DIR   Direct Investment in Real Property
///   ABS-MBS   Asset-Backed Mortgage    DCR   Direct Credit Risk
///   ABS-O     Asset-Backed Other       LON   Loan
///   ABS-CBDO  Asset-Backed CBO/CDO     STIV  Short-Term Investment Vehicle
///   RA        Repurchase Agreement     ?     Other / Unknown
///
/// IssuerCat values per SEC form instructions:
///   CORP   Corporate                  MUN   Municipal
///   UST    US Treasury                NUSS  Non-US Sovereign
///   USGA   US Government Agency       RF    Registered Fund
///   USGSE  US Government-Sponsored Enterprise
///   PF     Private Fund               ?     Other / Unknown
const sector_code_descriptions = [_]struct {
    code: []const u8,
    description: []const u8,
}{
    // Equity (common)
    .{ .code = "EC/CORP", .description = "Equity / Corporate" },
    .{ .code = "EC/RF", .description = "Equity / Registered Fund" },
    .{ .code = "EC/NUSS", .description = "Equity / Non-US Sovereign" },
    .{ .code = "EC/?", .description = "Equity / Other" },

    // Equity preferred
    .{ .code = "EP/CORP", .description = "Equity Preferred / Corporate" },
    .{ .code = "EP/NUSS", .description = "Equity Preferred / Non-US Sovereign" },

    // Debt
    .{ .code = "DBT/UST", .description = "Debt / US Treasury" },
    .{ .code = "DBT/USGA", .description = "Debt / US Gov Agency" },
    .{ .code = "DBT/USGSE", .description = "Debt / US GSE" },
    .{ .code = "DBT/CORP", .description = "Debt / Corporate" },
    .{ .code = "DBT/MUN", .description = "Debt / Municipal" },
    .{ .code = "DBT/NUSS", .description = "Debt / Non-US Sovereign" },
    .{ .code = "DBT/?", .description = "Debt / Other" },

    // Asset-backed
    .{ .code = "ABS-MBS/USGSE", .description = "Asset-Backed / US GSE Mortgage" },
    .{ .code = "ABS-MBS/USGA", .description = "Asset-Backed / US Gov Agency Mortgage" },
    .{ .code = "ABS-MBS/CORP", .description = "Asset-Backed / Corporate Mortgage" },
    .{ .code = "ABS-O/CORP", .description = "Asset-Backed Other / Corporate" },
    .{ .code = "ABS-CBDO/CORP", .description = "Asset-Backed CBO/CDO / Corporate" },

    // Repurchase agreements
    .{ .code = "RA/CORP", .description = "Repurchase Agreement / Corporate" },
    .{ .code = "RA/?", .description = "Repurchase Agreement / Other" },

    // Loans
    .{ .code = "LON/CORP", .description = "Loan / Corporate" },

    // Short-term investment vehicles
    .{ .code = "STIV/CORP", .description = "Short-Term Investment Vehicle / Corporate" },
    .{ .code = "STIV/RF", .description = "Short-Term Investment Vehicle / Registered Fund" },
    .{ .code = "STIV/PF", .description = "Short-Term Investment Vehicle / Private Fund" },

    // Derivatives
    .{ .code = "DE/CORP", .description = "Derivative / Corporate" },
    .{ .code = "DE/?", .description = "Derivative / Other" },
    .{ .code = "DFE/CORP", .description = "Derivative-FX / Corporate" },
    .{ .code = "DFE/?", .description = "Derivative-FX / Other" },

    // Direct investment / direct credit risk
    .{ .code = "DIR/?", .description = "Direct Real Property / Other" },
    .{ .code = "DCR/?", .description = "Direct Credit Risk / Other" },

    // Catch-all unknowns. We translate "?/X" to a more readable
    // shape but preserve the structure (issuer is known even if
    // asset class isn't).
    .{ .code = "?/CORP", .description = "Other / Corporate" },
    .{ .code = "?/?", .description = "Other / Other" },
};

/// Look up an NPORT-P sector code's human-readable description. For
/// unknown codes returns the code itself, so the caller can always
/// `dupe` the result without conditional handling.
pub fn sectorDescription(code: []const u8) []const u8 {
    for (sector_code_descriptions) |entry| {
        if (std.mem.eql(u8, entry.code, code)) return entry.description;
    }
    return code;
}

/// Return the text of the first child of `parent` named `tag`. Yields
/// null when the lookup fails or finds nothing, when the child has no
/// children, or when its first child is not character data. The
/// returned slice is not copied here.
fn elementText(parent: *xml.Element, tag: []const u8) ?[]const u8 {
    const child = (parent.findChildByTag(tag) catch return null) orelse return null;
    if (child.children.items.len == 0) return null;
    if (child.children.items[0] != .CharData) return null;
    return child.children.items[0].CharData;
}

/// Read the `value` attribute of a child element identified by `tag`,
/// optionally nested inside `parent_tag` (use `null` for direct
/// children). Returns null when either path step fails.
Used to pull +/// `` out of NPORT-P +/// holding records, where the ticker is encoded as an attribute on a +/// nested element rather than as text content. +fn elementAttrValue(parent: *xml.Element, parent_tag: ?[]const u8, tag: []const u8) ?[]const u8 { + const container: *xml.Element = if (parent_tag) |pt| + (parent.findChildByTag(pt) catch return null) orelse return null + else + parent; + const child = (container.findChildByTag(tag) catch return null) orelse return null; + return child.getAttribute("value"); +} + +/// Like `elementText` but treats NPORT-P's `"N/A"` sentinel and the +/// empty string as absent. NPORT-P uses literal `"N/A"` for missing +/// LEIs on issuers without one, and empty `` for +/// holdings whose country can't be determined; both should round-trip +/// as null in Zig. +fn elementTextOptional(parent: *xml.Element, tag: []const u8) ?[]const u8 { + const text = elementText(parent, tag) orelse return null; + if (text.len == 0) return null; + if (std.mem.eql(u8, text, "N/A")) return null; + return text; +} + +test "parseTickerMap parses fixture rows" { + const fixture = + \\{"fields":["cik","seriesId","classId","symbol"],"data":[ + \\ [36405,"S000002848","C000007808","VTI"], + \\ [1100663,"S000004362","C000012092","AGG"] + \\]} + ; + const allocator = std.testing.allocator; + var map = try parseTickerMap(allocator, fixture); + defer map.deinit(); + + const vti = map.map.get("VTI") orelse return error.TestFailed; + try std.testing.expectEqualStrings("0000036405", vti.cik); + try std.testing.expectEqualStrings("S000002848", vti.series_id orelse return error.TestFailed); + + const agg = map.map.get("AGG") orelse return error.TestFailed; + try std.testing.expectEqualStrings("0001100663", agg.cik); +} + +test "parseStockTickerMap parses fixture" { + const fixture = + \\{ + \\ "0":{"cik_str":78462,"ticker":"SPY","title":"SPDR S&P 500 ETF Trust"}, + \\ "1":{"cik_str":1222333,"ticker":"GLD","title":"SPDR GOLD TRUST"} + \\} + ; + const allocator = 
std.testing.allocator; + var map = try parseStockTickerMap(allocator, fixture); + defer map.deinit(); + + const spy = map.map.get("SPY") orelse return error.TestFailed; + try std.testing.expectEqualStrings("0000078462", spy.cik); + try std.testing.expect(spy.series_id == null); + try std.testing.expectEqualStrings("SPDR S&P 500 ETF Trust", spy.title orelse return error.TestFailed); + + const gld = map.map.get("GLD") orelse return error.TestFailed; + try std.testing.expectEqualStrings("0001222333", gld.cik); +} + +test "sectorDescription translates known codes and round-trips unknown" { + // Known codes get translated. + try std.testing.expectEqualStrings("Equity / Corporate", sectorDescription("EC/CORP")); + try std.testing.expectEqualStrings("Debt / US Treasury", sectorDescription("DBT/UST")); + try std.testing.expectEqualStrings("Asset-Backed / US GSE Mortgage", sectorDescription("ABS-MBS/USGSE")); + + // Codes added to cover real-world NPORT-P output. + try std.testing.expectEqualStrings("Debt / Municipal", sectorDescription("DBT/MUN")); + try std.testing.expectEqualStrings("Short-Term Investment Vehicle / Registered Fund", sectorDescription("STIV/RF")); + try std.testing.expectEqualStrings("Repurchase Agreement / Corporate", sectorDescription("RA/CORP")); + try std.testing.expectEqualStrings("Other / Other", sectorDescription("?/?")); + + // Unknown codes round-trip raw so future additions surface for + // table extension rather than getting silently mistranslated. + try std.testing.expectEqualStrings("MADE/UPCODE", sectorDescription("MADE/UPCODE")); + try std.testing.expectEqualStrings("", sectorDescription("")); +} + +test "parseNportP holdings: ticker/lei/country populated when present" { + const allocator = std.testing.allocator; + // Minimal NPORT-P fixture covering the holding-identifier shapes. + // Two holdings: first has all identifiers, second is bare-bones + // with the "N/A" LEI sentinel and an empty . 
+ const xml_fixture = + \\ + \\ + \\ + \\ + \\ + \\ Argan Inc + \\ 529900E4KZWBV9KGBS83 + \\ 04010E109 + \\ + \\ + \\ + \\ + \\ 4.89 + \\ EC + \\ CORP + \\ US + \\ + \\ + \\ Mystery Bond + \\ N/A + \\ 000000000 + \\ 0.50 + \\ DBT + \\ CORP + \\ + \\ + \\ + \\ + \\ + ; + const entry = TickerEntry{ + .cik = "0000000000", + .series_id = null, + .class_id = null, + .title = null, + }; + var metrics = try parseNportP(std.testing.io, allocator, xml_fixture, "TEST", &entry, 10); + defer metrics.deinit(allocator); + + try std.testing.expectEqual(@as(usize, 2), metrics.holdings.len); + + // Holdings are sorted by pct descending — Argan first. + const argan = metrics.holdings[0]; + try std.testing.expectEqualStrings("Argan Inc", argan.name); + try std.testing.expectEqualStrings("AGX", argan.ticker orelse return error.TickerMissing); + try std.testing.expectEqualStrings("04010E109", argan.cusip orelse return error.CusipMissing); + try std.testing.expectEqualStrings("529900E4KZWBV9KGBS83", argan.lei orelse return error.LeiMissing); + try std.testing.expectEqualStrings("US", argan.country orelse return error.CountryMissing); + + // Mystery Bond: no , "N/A" lei, empty . + const mystery = metrics.holdings[1]; + try std.testing.expectEqualStrings("Mystery Bond", mystery.name); + try std.testing.expect(mystery.ticker == null); + try std.testing.expect(mystery.lei == null); + try std.testing.expect(mystery.country == null); + try std.testing.expectEqualStrings("000000000", mystery.cusip orelse return error.CusipMissing); +} + +test "appendEtfMetricRecords decomposes one fund into profile + sectors + holdings" { + const allocator = std.testing.allocator; + + // Build a minimal EtfMetrics by hand. Strings are owned; deinit + // matches what `parseNportP` would do. 
+ const sectors = try allocator.alloc(SectorWeight, 2); + sectors[0] = .{ + .code = try allocator.dupe(u8, "EC/CORP"), + .description = try allocator.dupe(u8, "Equity / Corporate"), + .pct_of_portfolio = 98.5, + }; + sectors[1] = .{ + .code = try allocator.dupe(u8, "STIV/CORP"), + .description = try allocator.dupe(u8, "Short-Term Investment Vehicle / Corporate"), + .pct_of_portfolio = 1.5, + }; + const holdings = try allocator.alloc(Holding, 3); + holdings[0] = .{ + .name = try allocator.dupe(u8, "Apple Inc"), + .ticker = try allocator.dupe(u8, "AAPL"), + .cusip = try allocator.dupe(u8, "037833100"), + .lei = null, + .country = try allocator.dupe(u8, "US"), + .pct_of_portfolio = 7.0, + }; + holdings[1] = .{ + .name = try allocator.dupe(u8, "Microsoft Corp"), + .ticker = try allocator.dupe(u8, "MSFT"), + .cusip = try allocator.dupe(u8, "594918104"), + .lei = null, + .country = try allocator.dupe(u8, "US"), + .pct_of_portfolio = 6.0, + }; + holdings[2] = .{ + .name = try allocator.dupe(u8, "NVIDIA Corp"), + .ticker = try allocator.dupe(u8, "NVDA"), + .cusip = try allocator.dupe(u8, "67066G104"), + .lei = null, + .country = try allocator.dupe(u8, "US"), + .pct_of_portfolio = 5.0, + }; + var metrics = EtfMetrics{ + .symbol = try allocator.dupe(u8, "VTI"), + .series_name = try allocator.dupe(u8, "VANGUARD TOTAL STOCK MARKET INDEX FUND"), + .cik = try allocator.dupe(u8, "0000036405"), + .series_id = try allocator.dupe(u8, "S000002848"), + .net_assets = 2_000_000_000_000.0, + .period_end = try allocator.dupe(u8, "2025-12-31"), + .as_of = try allocator.dupe(u8, "2026-05-25"), + .holdings = holdings, + .sectors = sectors, + }; + defer metrics.deinit(allocator); + + var out: std.ArrayList(EtfMetricRecord) = .empty; + defer { + for (out.items) |*r| r.deinit(allocator); + out.deinit(allocator); + } + + try appendEtfMetricRecords(allocator, &out, metrics); + + // Expect 1 profile + 2 sectors + 3 holdings = 6 records. 
+ try std.testing.expectEqual(@as(usize, 6), out.items.len); + + // First is profile. + try std.testing.expect(out.items[0] == .profile); + try std.testing.expectEqualStrings("VTI", out.items[0].profile.symbol); + try std.testing.expectEqualStrings("0000036405", out.items[0].profile.cik); + try std.testing.expectEqualStrings("S000002848", out.items[0].profile.series_id orelse return error.SeriesIdMissing); + + // Next two are sectors. + try std.testing.expect(out.items[1] == .sector); + try std.testing.expectEqualStrings("EC/CORP", out.items[1].sector.code); + try std.testing.expectEqualStrings("Equity / Corporate", out.items[1].sector.description); + try std.testing.expect(out.items[2] == .sector); + + // Last three are holdings. + try std.testing.expect(out.items[3] == .holding); + try std.testing.expect(out.items[4] == .holding); + try std.testing.expect(out.items[5] == .holding); + try std.testing.expectEqualStrings("AAPL", out.items[3].holding.ticker orelse return error.TickerMissing); + try std.testing.expectEqualStrings("VTI", out.items[3].holding.symbol); // fund symbol, not holding's +} diff --git a/src/providers/Wikidata.zig b/src/providers/Wikidata.zig new file mode 100644 index 0000000..e8eb3f0 --- /dev/null +++ b/src/providers/Wikidata.zig @@ -0,0 +1,618 @@ +//! Wikidata SPARQL classification provider. +//! +//! ## What this provider does +//! +//! Given a stock symbol, Wikidata can answer: +//! +//! * "What kind of entity is this?" — name, industry, sector, +//! country of incorporation, inception date, instance-of +//! classification (operating company / mutual fund / ETF / …). +//! * "Does this match the SEC's CIK?" — Wikidata's P5531 already +//! stores the 10-digit zero-padded CIK matching SEC's convention. +//! +//! ## Workflow +//! +//! `fetch(symbols)` runs ONE batched SPARQL query that returns +//! per-ticker rows. The query is keyed on the US-listing (NYSE / +//! Nasdaq / NYSE Arca / OTC Markets) of each ticker — without that +//! 
filter, common US tickers silently resolve to whichever +//! foreign-exchange company happens to share the symbol (`MRK` → +//! Merck KGaA on Frankfurt; `PG` → People's Garment on SET; etc.). +//! +//! The provider is stateless. Caching belongs to the data service, +//! which writes per-symbol `classification.srf` files after this +//! provider returns and reads them back on subsequent calls. +//! +//! ## Glossary +//! +//! SPARQL Query language for RDF-shaped data. Wikidata's +//! primary read API. +//! P-number Property identifier in Wikidata (P249 = ticker symbol, +//! P414 = stock exchange, P31 = instance of, ...). +//! Q-number Entity identifier in Wikidata (Q40244 = ETF as a +//! concept, Q13677 = NYSE the entity, Q312 = Apple Inc. +//! the entity). +//! wdt:Pxxx Truthy/direct property statement — the simple shape. +//! p:Pxxx Reified property statement — lets a statement carry +//! qualifiers (e.g. ticker symbol AS A QUALIFIER on the +//! stock-exchange statement, rather than as a direct +//! property of the company). +//! ps:Pxxx "Statement value" predicate — within a reified +//! statement, points to the statement's main value. +//! pq:Pxxx "Qualifier" predicate — within a reified statement, +//! points to a qualifier on that statement. +//! +//! Why the reified statement matters here: Wikidata stores tickers +//! as P249 qualifiers on a P414 (stock exchange) statement, NOT as +//! a direct `wdt:P249` property. Querying naively returns zero rows +//! for nearly every US-listed equity. + +const std = @import("std"); +const http = @import("../net/http.zig"); +const fmt = @import("../format.zig"); + +const sparql_endpoint = "https://query.wikidata.org/sparql"; + +/// Per-symbol classification record produced by parsing a Wikidata +/// SPARQL response. Fields are nullable when Wikidata has no value +/// for that property; the `source` field always emits per the +/// project's source-pure invariant. 
+pub const ClassificationRecord = struct {
+    // Ownership: every field marked "owned" is freed by `deinit` with
+    // the allocator passed to it, so all of them must come from that
+    // same allocator.
+    symbol: []const u8, // owned
+    name: ?[]const u8 = null, // owned
+    // `parse` in this file fills `sector` and `industry` from the same
+    // `industryLabel` binding, as two separate allocations.
+    sector: ?[]const u8 = null, // owned
+    industry: ?[]const u8 = null, // owned
+    /// ISO-3166 alpha-2 country code (e.g. "US", "GB", "DE").
+    country: ?[]const u8 = null, // owned
+    asset_class: ?[]const u8 = null, // owned
+    // NOTE(review): `parse` sets this flag for mutual-fund Q-IDs as
+    // well as ETF Q-IDs — confirm whether it means "is a fund".
+    is_etf: bool = false,
+    /// YYYY-MM-DD; trimmed from Wikidata's ISO-8601 date.
+    inception_date: ?[]const u8 = null, // owned
+    /// Wikidata's P5531 — the SEC CIK as a digit string. Wikidata
+    /// already zero-pads to 10 digits, matching the project-wide
+    /// CIK normalization convention.
+    cik: ?[]const u8 = null, // owned
+    /// YYYY-MM-DD when this provider ran, NOT when Wikidata last
+    /// updated the underlying entity.
+    as_of: []const u8, // owned
+    // NOT freed by `deinit`: must outlive the record (`parse` stores a
+    // string literal here).
+    source: []const u8, // no default — provenance always emitted
+
+    /// Free every owned string with `allocator`. `source` is left
+    /// untouched. The record must not be used afterwards: its slices
+    /// are not reset and would dangle.
+    pub fn deinit(self: *ClassificationRecord, allocator: std.mem.Allocator) void {
+        allocator.free(self.symbol);
+        if (self.name) |s| allocator.free(s);
+        if (self.sector) |s| allocator.free(s);
+        if (self.industry) |s| allocator.free(s);
+        if (self.country) |s| allocator.free(s);
+        if (self.asset_class) |s| allocator.free(s);
+        if (self.inception_date) |s| allocator.free(s);
+        if (self.cik) |s| allocator.free(s);
+        allocator.free(self.as_of);
+    }
+};
+
+/// Geo-bucket constants used by the country → geo lookup. Kept as
+/// named constants (rather than inline string literals in the map)
+/// so callers can reference them without typo risk and the
+/// taxonomy is tweakable in one place.
+pub const geo = struct {
+    pub const us = "US";
+    pub const developed = "International Developed";
+    pub const emerging = "Emerging Markets";
+    // Returned by `geoFor` for null, empty, and unmapped codes.
+    pub const unknown = "Unknown";
+};
+
+/// Wikidata Q-IDs we test against `instance of` (P31) to classify
+/// fund-shaped securities. Curated, not exhaustive.
+const etf_q_ids = [_][]const u8{
+    "Q40244", // exchange-traded fund
+    "Q4118901", // exchange-traded bond fund
+    "Q104638128", // ETF tracking specific index
+};
+// Bare Q-IDs (no `wd:` prefix): `parse` compares these against the
+// last path segment of the `instance` IRI.
+const mutual_fund_q_ids = [_][]const u8{
+    "Q1752230", // mutual fund
+    "Q11644608", // open-end fund
+};
+
+/// US stock exchanges accepted by the SPARQL exchange filter.
+/// Without this filter, ticker collisions across global exchanges
+/// silently return the wrong company.
+///
+/// Entries carry the `wd:` prefix because `buildQuery` writes them
+/// verbatim into a `VALUES ?exchange { ... }` clause.
+///
+/// Q-IDs:
+///   Q13677   New York Stock Exchange (NYSE)
+///   Q82059   Nasdaq
+///   Q4527260 NYSE Arca
+///   Q1666011 OTC Markets Group / Pink Sheets
+const us_exchanges = [_][]const u8{
+    "wd:Q13677",
+    "wd:Q82059",
+    "wd:Q4527260",
+    "wd:Q1666011",
+};
+
+/// Country-code-to-geo-bucket lookup. Wikidata returns ISO-3166
+/// alpha-2 codes via P17 → P297; we map them to the geo taxonomy
+/// (`geo.us` / `geo.developed` / `geo.emerging` / `geo.unknown`).
+///
+/// MSCI conventions used as the developed/emerging split. Taiwan
+/// and South Korea are MSCI-emerging despite FTSE classifying them
+/// developed. Israel is MSCI-developed (upgraded 2010). Canada is
+/// folded into International Developed (some users prefer separate
+/// Canada bucket; override in `metadata.srf` if so).
+///
+/// NOTE(review): three entries below look inconsistent with the MSCI
+/// convention stated above — Greece (GR) is believed to be
+/// MSCI-emerging since 2013, Iceland (IS) MSCI-frontier, and Russia
+/// (RU) was removed from MSCI's emerging indexes in 2022. Confirm
+/// against MSCI's current market classification before relying on
+/// these buckets.
+const country_to_geo = std.StaticStringMap([]const u8).initComptime(.{
+    // United States
+    .{ "US", geo.us },
+    // Alpha-3 fallback for entries that use the longer form.
+    .{ "USA", geo.us },
+
+    // International Developed — Europe ex-CIS
+    .{ "GB", geo.developed },
+    .{ "DE", geo.developed },
+    .{ "FR", geo.developed },
+    .{ "NL", geo.developed },
+    .{ "CH", geo.developed },
+    .{ "SE", geo.developed },
+    .{ "DK", geo.developed },
+    .{ "NO", geo.developed },
+    .{ "FI", geo.developed },
+    .{ "IT", geo.developed },
+    .{ "ES", geo.developed },
+    .{ "BE", geo.developed },
+    .{ "AT", geo.developed },
+    .{ "IE", geo.developed },
+    .{ "LU", geo.developed },
+    .{ "PT", geo.developed },
+    .{ "GR", geo.developed },
+    .{ "IS", geo.developed },
+
+    // International Developed — Asia-Pacific + Israel + Canada
+    .{ "JP", geo.developed },
+    .{ "AU", geo.developed },
+    .{ "NZ", geo.developed },
+    .{ "SG", geo.developed },
+    .{ "HK", geo.developed },
+    .{ "IL", geo.developed },
+    .{ "CA", geo.developed },
+
+    // Emerging Markets (MSCI)
+    .{ "CN", geo.emerging },
+    .{ "TW", geo.emerging },
+    .{ "KR", geo.emerging },
+    .{ "IN", geo.emerging },
+    .{ "BR", geo.emerging },
+    .{ "MX", geo.emerging },
+    .{ "RU", geo.emerging },
+    .{ "TR", geo.emerging },
+    .{ "ZA", geo.emerging },
+    .{ "TH", geo.emerging },
+    .{ "MY", geo.emerging },
+    .{ "ID", geo.emerging },
+    .{ "PH", geo.emerging },
+    .{ "VN", geo.emerging },
+    .{ "AR", geo.emerging },
+    .{ "CL", geo.emerging },
+    .{ "CO", geo.emerging },
+    .{ "PE", geo.emerging },
+    .{ "EG", geo.emerging },
+});
+
+/// Map an ISO-3166 alpha-2 country code to one of the geo buckets.
+/// Null/empty input or an unknown code returns `geo.unknown` so the
+/// user can override in `metadata.srf`.
+///
+/// The table also accepts the alpha-3 spelling "USA". The returned
+/// slice is one of the `geo` constants and must not be freed.
+/// NOTE(review): the lookup looks case-sensitive ("us" would map to
+/// `geo.unknown`) — confirm callers always pass upper-case codes.
+pub fn geoFor(iso2: ?[]const u8) []const u8 {
+    const code = iso2 orelse return geo.unknown;
+    if (code.len == 0) return geo.unknown;
+    return country_to_geo.get(code) orelse geo.unknown;
+}
+
+// ── Wikidata provider state (file-as-struct) ─────────────────────
+//
+// Callers do `const wikidata = @import("providers/Wikidata.zig");`
+// followed by `var wd = wikidata.init(...);` and `wd.fetch(...)`.
+
+/// HTTP client created in `init` and released in `deinit`.
+client: http.Client,
+/// Scratch allocator for the query text and the response copy made
+/// inside `fetch`; results are allocated with the caller-supplied
+/// `result_allocator` instead.
+allocator: std.mem.Allocator,
+io: std.Io,
+/// Contact email for User-Agent / From headers, sourced from
+/// `Config.user_email`. Required; callers must surface a clear
+/// missing-config error before constructing this provider.
+/// Borrowed, not copied: must outlive the provider.
+user_email: []const u8,
+
+const Wikidata = @This();
+
+/// Construct a provider. Pair with `deinit`.
+pub fn init(
+    io: std.Io,
+    allocator: std.mem.Allocator,
+    user_email: []const u8,
+) Wikidata {
+    return .{
+        .client = http.Client.init(io, allocator),
+        .allocator = allocator,
+        .io = io,
+        .user_email = user_email,
+    };
+}
+
+/// Release the HTTP client. `user_email` is borrowed and not freed.
+pub fn deinit(self: *Wikidata) void {
+    self.client.deinit();
+}
+
+/// Fetch and parse Wikidata classifications for `symbols`.
+/// Runs a single batched SPARQL query and parses the response.
+/// Caller owns the returned slice and each record (allocated with
+/// `result_allocator`). An empty `symbols` slice returns an empty
+/// result without touching the network.
+pub fn fetch(
+    self: *Wikidata,
+    result_allocator: std.mem.Allocator,
+    symbols: []const []const u8,
+) ![]ClassificationRecord {
+    if (symbols.len == 0) return &.{};
+
+    const query = try buildQuery(self.allocator, symbols);
+    defer self.allocator.free(query);
+
+    const json = try self.postSparql(query);
+    defer self.allocator.free(json);
+
+    return parse(self.io, result_allocator, json, symbols);
+}
+
+/// POST a SPARQL query. Sets the User-Agent + From headers from
+/// `user_email` for politeness; Wikidata explicitly recommends
+/// descriptive User-Agent strings.
+///
+/// Returns a copy of the response body owned by `self.allocator`.
+/// Fails with `error.UserEmailTooLong` when the User-Agent string
+/// does not fit the 256-byte stack buffer.
+fn postSparql(self: *Wikidata, query: []const u8) ![]u8 {
+    var form_buf: std.Io.Writer.Allocating = .init(self.allocator);
+    defer form_buf.deinit();
+    try form_buf.writer.writeAll("query=");
+    // `Component.formatEscaped` percent-encodes everything outside
+    // RFC 3986's unreserved set — exactly the contract for the
+    // `application/x-www-form-urlencoded` body we're building.
+    try (std.Uri.Component{ .raw = query }).formatEscaped(&form_buf.writer);
+
+    var ua_buf: [256]u8 = undefined;
+    const ua = std.fmt.bufPrint(&ua_buf, "zfin/0.1 ({s})", .{self.user_email}) catch return error.UserEmailTooLong;
+
+    const headers = [_]std.http.Header{
+        .{ .name = "User-Agent", .value = ua },
+        .{ .name = "Accept", .value = "application/sparql-results+json" },
+        .{ .name = "Content-Type", .value = "application/x-www-form-urlencoded" },
+        .{ .name = "From", .value = self.user_email },
+    };
+
+    var resp = try self.client.request(.POST, sparql_endpoint, form_buf.written(), &headers);
+    defer resp.deinit();
+    // Copy out: `resp.body` is released by the deferred `resp.deinit()`.
+    return self.allocator.dupe(u8, resp.body);
+}
+
+/// Write `text` as a double-quoted SPARQL string literal, escaping
+/// the four characters the SPARQL grammar does not allow raw inside
+/// one (`"`, `\`, LF, CR). Keeps a stray quote or backslash in a
+/// symbol from terminating the literal and altering the query.
+fn writeSparqlString(writer: *std.Io.Writer, text: []const u8) std.Io.Writer.Error!void {
+    try writer.writeAll("\"");
+    var run_start: usize = 0;
+    for (text, 0..) |ch, i| {
+        const escaped: []const u8 = switch (ch) {
+            '"' => "\\\"",
+            '\\' => "\\\\",
+            '\n' => "\\n",
+            '\r' => "\\r",
+            else => continue,
+        };
+        // Flush the unescaped run before this character, then the escape.
+        try writer.writeAll(text[run_start..i]);
+        try writer.writeAll(escaped);
+        run_start = i + 1;
+    }
+    try writer.writeAll(text[run_start..]);
+    try writer.writeAll("\"");
+}
+
+/// Build the batched SPARQL query for a slice of ticker symbols.
+/// Caller owns the returned bytes. Symbols interpolated via
+/// `VALUES ?ticker { "AAPL" "MSFT" ... }`, each written as an
+/// escaped string literal (see `writeSparqlString`).
+///
+/// Wikidata's ticker storage is non-obvious: tickers are stored as
+/// `P249` qualifiers on a `P414` (stock exchange) statement. Naive
+/// `?security wdt:P249 ?ticker` returns zero rows for nearly every
+/// US-listed equity. The query reaches them via:
+///
+///     ?security p:P414 ?stmt .
+///     ?stmt ps:P414 ?exchange .
+///     ?stmt pq:P249 ?ticker .
+///
+/// `?exchange` is filtered to a small set of US exchanges to avoid
+/// ticker collisions with foreign listings.
+fn buildQuery(allocator: std.mem.Allocator, symbols: []const []const u8) ![]u8 {
+    var aw: std.Io.Writer.Allocating = .init(allocator);
+    defer aw.deinit();
+
+    try aw.writer.writeAll(
+        \\SELECT ?ticker ?security ?securityLabel ?industryLabel ?countryCode ?inception ?cik ?instance WHERE {
+        \\  VALUES ?ticker {
+    );
+    for (symbols) |s| {
+        try aw.writer.writeAll(" ");
+        try writeSparqlString(&aw.writer, s);
+    }
+    try aw.writer.writeAll(" }\n");
+    try aw.writer.writeAll("  VALUES ?exchange {");
+    for (us_exchanges) |x| {
+        try aw.writer.print(" {s}", .{x});
+    }
+    try aw.writer.writeAll(" }\n");
+    try aw.writer.writeAll(
+        \\  ?security p:P414 ?exchstmt .
+        \\  ?exchstmt ps:P414 ?exchange .
+        \\  ?exchstmt pq:P249 ?ticker .
+        \\  OPTIONAL { ?security wdt:P452 ?industry . }
+        \\  OPTIONAL { ?security wdt:P17 ?country . ?country wdt:P297 ?countryCode . }
+        \\  OPTIONAL { ?security wdt:P571 ?inception . }
+        \\  OPTIONAL { ?security wdt:P5531 ?cik . }
+        \\  OPTIONAL { ?security wdt:P31 ?instance . }
+        \\  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
+        \\}
+    );
+    return aw.toOwnedSlice();
+}
+
+/// True when `q_id` equals one of the bare Q-IDs in `set`.
+fn containsQId(set: []const []const u8, q_id: []const u8) bool {
+    for (set) |candidate| {
+        if (std.mem.eql(u8, candidate, q_id)) return true;
+    }
+    return false;
+}
+
+/// Parse the SPARQL JSON response into `ClassificationRecord` values.
+/// Multiple bindings for the same ticker (e.g. multiple `instance of`
+/// values) get merged into one record — first-non-null wins.
+///
+/// Caller owns the returned slice and every record in it; both are
+/// allocated with `allocator`. Malformed or unexpectedly shaped JSON
+/// yields an empty slice; `error.OutOfMemory` is propagated. Bindings
+/// whose ticker is not in `expected_symbols` (ASCII case-insensitive)
+/// are dropped. Record order follows hash-map iteration and is not
+/// tied to `expected_symbols`.
+fn parse(
+    io: std.Io,
+    allocator: std.mem.Allocator,
+    json_bytes: []const u8,
+    expected_symbols: []const []const u8,
+) ![]ClassificationRecord {
+    const today = fmt.todayDate(io);
+    var as_of_buf: [10]u8 = undefined;
+    const as_of = try std.fmt.bufPrint(&as_of_buf, "{f}", .{today});
+
+    // Bad JSON means "no results", but running out of memory is not a
+    // property of the response and must not be reported as such.
+    const parsed = std.json.parseFromSlice(std.json.Value, allocator, json_bytes, .{}) catch |err| switch (err) {
+        error.OutOfMemory => return err,
+        else => return &.{},
+    };
+    defer parsed.deinit();
+
+    const root = switch (parsed.value) {
+        .object => |o| o,
+        else => return &.{},
+    };
+    const results = switch (root.get("results") orelse return &.{}) {
+        .object => |o| o,
+        else => return &.{},
+    };
+    const bindings = switch (results.get("bindings") orelse return &.{}) {
+        .array => |a| a.items,
+        else => return &.{},
+    };
+
+    // Map symbol → record; merge multiple bindings. Each key aliases
+    // its record's owned `symbol`, so freeing the records on the error
+    // path also releases the keys — there is no separate key storage
+    // to leak.
+    var by_symbol: std.StringHashMap(ClassificationRecord) = .init(allocator);
+    defer {
+        var pending = by_symbol.valueIterator();
+        while (pending.next()) |r| r.deinit(allocator);
+        by_symbol.deinit();
+    }
+
+    for (bindings) |b| {
+        const obj = switch (b) {
+            .object => |o| o,
+            else => continue,
+        };
+        const ticker = sparqlValue(obj, "ticker") orelse continue;
+
+        // Verify ticker is one we asked for. Wikidata can return
+        // surprising matches (foreign exchanges); skip those.
+        var matched = false;
+        for (expected_symbols) |s| {
+            if (std.ascii.eqlIgnoreCase(s, ticker)) {
+                matched = true;
+                break;
+            }
+        }
+        if (!matched) continue;
+
+        if (!by_symbol.contains(ticker)) {
+            // Build the record completely before inserting so the map
+            // never holds a half-initialised value that the cleanup
+            // `defer` above would try to free.
+            const symbol = try allocator.dupe(u8, ticker);
+            errdefer allocator.free(symbol);
+            const as_of_owned = try allocator.dupe(u8, as_of);
+            errdefer allocator.free(as_of_owned);
+            try by_symbol.put(symbol, .{
+                .symbol = symbol,
+                .as_of = as_of_owned,
+                .source = "wikidata",
+            });
+        }
+        const rec = by_symbol.getPtr(ticker).?;
+
+        if (rec.name == null) {
+            if (sparqlValue(obj, "securityLabel")) |label| {
+                rec.name = try allocator.dupe(u8, label);
+            }
+        }
+        if (rec.industry == null) {
+            if (sparqlValue(obj, "industryLabel")) |ind| {
+                // The query has no separate sector binding; the
+                // industry label is stored in both fields.
+                rec.industry = try allocator.dupe(u8, ind);
+                rec.sector = try allocator.dupe(u8, ind);
+            }
+        }
+        if (rec.country == null) {
+            if (sparqlValue(obj, "countryCode")) |c| {
+                rec.country = try allocator.dupe(u8, c);
+            }
+        }
+        if (rec.inception_date == null) {
+            if (sparqlValue(obj, "inception")) |d| {
+                // Keep only the leading YYYY-MM-DD of the timestamp.
+                if (d.len >= 10) {
+                    rec.inception_date = try allocator.dupe(u8, d[0..10]);
+                }
+            }
+        }
+        if (rec.cik == null) {
+            if (sparqlValue(obj, "cik")) |c| {
+                rec.cik = try allocator.dupe(u8, c);
+            }
+        }
+        if (sparqlValue(obj, "instance")) |inst_iri| {
+            // The "instance" value is a Q-ID URI like
+            // "http://www.wikidata.org/entity/Q40244". Extract the
+            // Q-ID suffix and test against our known sets.
+            const last_slash = std.mem.lastIndexOfScalar(u8, inst_iri, '/');
+            const q_id = if (last_slash) |i| inst_iri[i + 1 ..] else inst_iri;
+            if (containsQId(&etf_q_ids, q_id)) {
+                rec.is_etf = true;
+                if (rec.asset_class == null) {
+                    rec.asset_class = try allocator.dupe(u8, "ETF (uncategorized)");
+                }
+            }
+            if (containsQId(&mutual_fund_q_ids, q_id)) {
+                // NOTE(review): mutual funds also set `is_etf`. Kept
+                // as-is because callers may rely on the flag meaning
+                // "fund-shaped" — confirm the intent.
+                rec.is_etf = true;
+                if (rec.asset_class == null) {
+                    rec.asset_class = try allocator.dupe(u8, "Mutual Fund (uncategorized)");
+                }
+            }
+        }
+    }
+
+    // Drain map into owned slice. Caller takes ownership; our defer
+    // above calls deinit on values, so clear the map before returning
+    // to avoid double-free. If the allocation fails the map is still
+    // populated and the defer releases every record.
+    const out = try allocator.alloc(ClassificationRecord, by_symbol.count());
+    var idx: usize = 0;
+    var values = by_symbol.valueIterator();
+    while (values.next()) |rec| : (idx += 1) {
+        out[idx] = rec.*;
+    }
+    by_symbol.clearRetainingCapacity();
+    return out;
+}
+
+/// Pull the `.value` string out of a SPARQL JSON binding object's
+/// named field. Returns null if absent or non-string.
+fn sparqlValue(obj: std.json.ObjectMap, field: []const u8) ?[]const u8 { + const slot = obj.get(field) orelse return null; + const slot_obj = switch (slot) { + .object => |o| o, + else => return null, + }; + const val = slot_obj.get("value") orelse return null; + return switch (val) { + .string => |s| s, + else => null, + }; +} + +// ── Tests ──────────────────────────────────────────────────────── + +test "buildQuery includes all symbols and required SELECT vars" { + const allocator = std.testing.allocator; + const syms = [_][]const u8{ "AAPL", "VTI" }; + const q = try buildQuery(allocator, &syms); + defer allocator.free(q); + + try std.testing.expect(std.mem.indexOf(u8, q, "\"AAPL\"") != null); + try std.testing.expect(std.mem.indexOf(u8, q, "\"VTI\"") != null); + try std.testing.expect(std.mem.indexOf(u8, q, "p:P414") != null); + try std.testing.expect(std.mem.indexOf(u8, q, "pq:P249") != null); + try std.testing.expect(std.mem.indexOf(u8, q, "wdt:P452") != null); + try std.testing.expect(std.mem.indexOf(u8, q, "wdt:P17") != null); + // US-exchange filter must be present — without it, US tickers + // collide with foreign exchanges (MRK→Merck KGaA, PG→People's + // Garment, etc.). See `us_exchanges` doc-block. 
+ try std.testing.expect(std.mem.indexOf(u8, q, "wd:Q13677") != null); // NYSE + try std.testing.expect(std.mem.indexOf(u8, q, "wd:Q82059") != null); // Nasdaq + try std.testing.expect(std.mem.indexOf(u8, q, "ps:P414 ?exchange") != null); +} + +test "parse: AAPL fixture round-trips name + industry + country" { + const fixture = + \\{ + \\ "head": {"vars": ["ticker", "security", "securityLabel", "industryLabel", "countryCode", "inception", "cik", "instance"]}, + \\ "results": { + \\ "bindings": [ + \\ { + \\ "ticker": {"type": "literal", "value": "AAPL"}, + \\ "security": {"type": "uri", "value": "http://www.wikidata.org/entity/Q312"}, + \\ "securityLabel": {"type": "literal", "value": "Apple Inc."}, + \\ "industryLabel": {"type": "literal", "value": "consumer electronics"}, + \\ "countryCode": {"type": "literal", "value": "US"}, + \\ "instance": {"type": "uri", "value": "http://www.wikidata.org/entity/Q4830453"} + \\ } + \\ ] + \\ } + \\} + ; + + const allocator = std.testing.allocator; + const expected = [_][]const u8{"AAPL"}; + const recs = try parse(std.testing.io, allocator, fixture, &expected); + defer { + for (recs) |*r| { + var m = r.*; + m.deinit(allocator); + } + allocator.free(recs); + } + + try std.testing.expectEqual(@as(usize, 1), recs.len); + try std.testing.expectEqualStrings("AAPL", recs[0].symbol); + try std.testing.expectEqualStrings("Apple Inc.", recs[0].name.?); + try std.testing.expectEqualStrings("consumer electronics", recs[0].industry.?); + try std.testing.expectEqualStrings("consumer electronics", recs[0].sector.?); + try std.testing.expectEqualStrings("US", recs[0].country.?); + try std.testing.expect(!recs[0].is_etf); +} + +test "parse: ETF fixture sets is_etf=true and asset_class" { + const fixture = + \\{ + \\ "head": {"vars": ["ticker", "security", "securityLabel", "instance"]}, + \\ "results": { + \\ "bindings": [ + \\ { + \\ "ticker": {"type": "literal", "value": "VTI"}, + \\ "security": {"type": "uri", "value": 
"http://www.wikidata.org/entity/Q1809462"}, + \\ "securityLabel": {"type": "literal", "value": "Vanguard Total Stock Market ETF"}, + \\ "instance": {"type": "uri", "value": "http://www.wikidata.org/entity/Q40244"} + \\ } + \\ ] + \\ } + \\} + ; + + const allocator = std.testing.allocator; + const expected = [_][]const u8{"VTI"}; + const recs = try parse(std.testing.io, allocator, fixture, &expected); + defer { + for (recs) |*r| { + var m = r.*; + m.deinit(allocator); + } + allocator.free(recs); + } + + try std.testing.expectEqual(@as(usize, 1), recs.len); + try std.testing.expect(recs[0].is_etf); + try std.testing.expectEqualStrings("ETF (uncategorized)", recs[0].asset_class.?); +} + +test "parse: bindings for symbols not requested are dropped" { + const fixture = + \\{ + \\ "head": {"vars": ["ticker", "security", "securityLabel"]}, + \\ "results": { + \\ "bindings": [ + \\ {"ticker": {"type": "literal", "value": "WRONG"}, + \\ "security": {"type": "uri", "value": "http://example/Q1"}, + \\ "securityLabel": {"type": "literal", "value": "Wrong Co"}} + \\ ] + \\ } + \\} + ; + + const allocator = std.testing.allocator; + const expected = [_][]const u8{"AAPL"}; + const recs = try parse(std.testing.io, allocator, fixture, &expected); + defer allocator.free(recs); + + try std.testing.expectEqual(@as(usize, 0), recs.len); +} + +test "geoFor maps known ISO-3166 codes to bucket" { + try std.testing.expectEqualStrings(geo.us, geoFor("US")); + try std.testing.expectEqualStrings(geo.us, geoFor("USA")); + try std.testing.expectEqualStrings(geo.developed, geoFor("GB")); + try std.testing.expectEqualStrings(geo.developed, geoFor("DE")); + try std.testing.expectEqualStrings(geo.developed, geoFor("CA")); + try std.testing.expectEqualStrings(geo.developed, geoFor("IL")); + try std.testing.expectEqualStrings(geo.emerging, geoFor("CN")); + try std.testing.expectEqualStrings(geo.emerging, geoFor("TW")); + try std.testing.expectEqualStrings(geo.emerging, geoFor("KR")); +} + +test 
"geoFor returns Unknown for null/empty/unmapped" { + try std.testing.expectEqualStrings(geo.unknown, geoFor(null)); + try std.testing.expectEqualStrings(geo.unknown, geoFor("")); + try std.testing.expectEqualStrings(geo.unknown, geoFor("ZZ")); // unassigned ISO-2 + try std.testing.expectEqualStrings(geo.unknown, geoFor("XX")); +} diff --git a/src/providers/xml.zig b/src/providers/xml.zig new file mode 100644 index 0000000..a2eca82 --- /dev/null +++ b/src/providers/xml.zig @@ -0,0 +1,713 @@ +// VENDORED - see README.md. +// File sourced from: +// https://github.com/Snektron/vulkan-zig/blob/797ae8af88e84753af9640266de61a985b76b580/generator/xml.zig +// via ~/shared/aws-zig/src/xml.zig +const std = @import("std"); +const mem = std.mem; +const testing = std.testing; +const Allocator = mem.Allocator; +const ArenaAllocator = std.heap.ArenaAllocator; +const ArrayList = std.ArrayList; + +pub const Attribute = struct { + name: []const u8, + value: []const u8, +}; + +pub const Content = union(enum) { + CharData: []const u8, + Comment: []const u8, + Element: *Element, +}; + +pub const Element = struct { + pub const AttributeList = ArrayList(*Attribute); + pub const ContentList = ArrayList(Content); + + tag: []const u8, + attributes: AttributeList, + children: ContentList, + next_sibling: ?*Element = null, + allocator: std.mem.Allocator, + + fn init(tag: []const u8, alloc: Allocator) Element { + return .{ + .tag = tag, + .attributes = .empty, + .children = .empty, + .allocator = alloc, + }; + } + + pub fn getAttribute(self: *Element, attrib_name: []const u8) ?[]const u8 { + for (self.attributes.items) |child| { + if (mem.eql(u8, child.name, attrib_name)) { + return child.value; + } + } + + return null; + } + + pub fn getCharData(self: *Element, child_tag: []const u8) ?[]const u8 { + const child = (self.findChildByTag(child_tag) catch return null) orelse return null; + if (child.children.items.len != 1) { + return null; + } + + return switch (child.children.items[0]) { + 
.CharData => |char_data| char_data, + else => null, + }; + } + + /// Iterates over every child content item: character data, comments and elements. + pub fn iterator(self: *Element) ChildIterator { + return .{ + .items = self.children.items, + .i = 0, + }; + } + + /// Iterates over child elements only; other content items are skipped. + pub fn elements(self: *Element) ChildElementIterator { + return .{ + .inner = self.iterator(), + }; + } + + /// Returns the first child element whose tag equals `tag`, or null if there is none. + pub fn findChildByTag(self: *Element, tag: []const u8) !?*Element { + var it = self.findChildrenByTag(tag); + return try it.next(); + } + + /// Iterates over the child elements whose tag equals `tag` (default predicate: exact byte comparison). + pub fn findChildrenByTag(self: *Element, tag: []const u8) FindChildrenByTagIterator { + return .{ + .inner = self.elements(), + .tag = tag, + }; + } + + /// Cursor over a slice of content items; `next` yields a pointer to each item in order, then null. + pub const ChildIterator = struct { + items: []Content, + i: usize, + + pub fn next(self: *ChildIterator) ?*Content { + if (self.i < self.items.len) { + self.i += 1; + return &self.items[self.i - 1]; + } + + return null; + } + }; + + /// Wraps `ChildIterator` and yields only the items whose active tag is `.Element`. + pub const ChildElementIterator = struct { + inner: ChildIterator, + + pub fn next(self: *ChildElementIterator) ?*Element { + while (self.inner.next()) |child| { + if (child.* != .Element) { + continue; + } + + return child.*.Element; + } + + return null; + } + }; + + /// Default tag predicate: byte-for-byte equality; the options argument is ignored. + fn strictEqual(a: []const u8, b: []const u8, _: PredicateOptions) !bool { + return mem.eql(u8, a, b); + } + /// Yields the child elements for which `predicate(child.tag, tag, predicate_options)` is true. + /// An error returned by the predicate propagates out of `next`. + pub const FindChildrenByTagIterator = struct { + inner: ChildElementIterator, + tag: []const u8, + predicate: *const fn (a: []const u8, b: []const u8, options: PredicateOptions) anyerror!bool = strictEqual, + predicate_options: PredicateOptions = .{}, + + pub fn next(self: *FindChildrenByTagIterator) !?*Element { + while (self.inner.next()) |child| { + if (!try self.predicate(child.tag, self.tag, self.predicate_options)) { + continue; + } + + return child; + } + + return null; + } + }; +}; + +/// Extra arguments handed to a tag predicate; the default predicate (`strictEqual`) does not read them. +pub const PredicateOptions = struct { + allocator: ?std.mem.Allocator = null, +}; +/// Fields of an XML declaration: `version`, plus optional `encoding` and `standalone`. +pub const XmlDecl = struct { + version: []const u8, + encoding: ?[]const u8, + standalone: ?bool, +}; + +/// A parsed document: the arena that backs it, the optional XML declaration and the root element. +pub const Document = struct { + arena: ArenaAllocator, + xml_decl: ?*XmlDecl, + root: *Element, + + pub fn deinit(self:
Document) void { + var arena = self.arena; // Copy to stack so self can be taken by value. + arena.deinit(); + } +}; + +/// Read cursor over `source`: a byte `offset` plus `line` / `column` counters that start at 0 +/// and are advanced only by `consumeNoEof`. +const ParseContext = struct { + source: []const u8, + offset: usize, + line: usize, + column: usize, + + /// Starts a cursor at the beginning of `source`. + fn init(source: []const u8) ParseContext { + return .{ + .source = source, + .offset = 0, + .line = 0, + .column = 0, + }; + } + + /// Returns the byte at the cursor without advancing, or null at end of input. + fn peek(self: *ParseContext) ?u8 { + return if (self.offset < self.source.len) self.source[self.offset] else null; + } + + /// Returns the byte at the cursor and advances; `error.UnexpectedEof` at end of input. + fn consume(self: *ParseContext) !u8 { + if (self.offset < self.source.len) { + return self.consumeNoEof(); + } + + return error.UnexpectedEof; + } + + /// Like `consume`, but asserts that input remains instead of returning an error. + /// A '\n' increments `line` and resets `column` to 0; any other byte increments `column`. + fn consumeNoEof(self: *ParseContext) u8 { + std.debug.assert(self.offset < self.source.len); + const c = self.source[self.offset]; + self.offset += 1; + + if (c == '\n') { + self.line += 1; + self.column = 0; + } else { + self.column += 1; + } + + return c; + } + + /// Consumes `char` if it is the next byte and returns true; otherwise returns false without advancing. + fn eat(self: *ParseContext, char: u8) bool { + self.expect(char) catch return false; + return true; + } + + /// Consumes `expected` if it is the next byte. Otherwise nothing is consumed and the result is + /// `error.UnexpectedCharacter`, or `error.UnexpectedEof` at end of input. + fn expect(self: *ParseContext, expected: u8) !void { + if (self.peek()) |actual| { + if (expected != actual) { + return error.UnexpectedCharacter; + } + + _ = self.consumeNoEof(); + return; + } + + return error.UnexpectedEof; + } + + /// Consumes `text` if the input continues with it and returns true; otherwise returns false without advancing. + fn eatStr(self: *ParseContext, text: []const u8) bool { + self.expectStr(text) catch return false; + return true; + } + + /// Consumes `text` if the input continues with it. The remaining-length check runs first, so + /// fewer than `text.len` bytes left is `error.UnexpectedEof` even when the available bytes + /// already differ; a mismatch with enough input is `error.UnexpectedCharacter`. Nothing is + /// consumed on failure. + fn expectStr(self: *ParseContext, text: []const u8) !void { + if (self.source.len < self.offset + text.len) { + return error.UnexpectedEof; + } else if (std.mem.startsWith(u8, self.source[self.offset..], text)) { + var i: usize = 0; + while (i < text.len) : (i += 1) { + _ = self.consumeNoEof(); + } + + return; + } + + return error.UnexpectedCharacter; + } + + /// Skips spaces, tabs, newlines and carriage returns; returns whether at least one byte was skipped. + fn eatWs(self: *ParseContext) bool { + var ws = false; + + while (self.peek()) |ch| { + switch (ch) { + ' ', '\t', '\n', '\r' => { + ws = true; + _ = self.consumeNoEof(); + }, + else => break, + } + } + + return ws; + } + + /// Requires at least one whitespace byte at the cursor (see `eatWs`), else `error.UnexpectedCharacter`. + fn expectWs(self: *ParseContext) !void { + if
(!self.eatWs()) return error.UnexpectedCharacter; + } + + fn currentLine(self: ParseContext) []const u8 { + var begin: usize = 0; + if (mem.lastIndexOfScalar(u8, self.source[0..self.offset], '\n')) |prev_nl| { + begin = prev_nl + 1; + } + + const end = mem.indexOfScalarPos(u8, self.source, self.offset, '\n') orelse self.source.len; + return self.source[begin..end]; + } +}; + +test "ParseContext" { + { + var ctx = ParseContext.init("I like pythons"); + try testing.expectEqual(@as(?u8, 'I'), ctx.peek()); + try testing.expectEqual(@as(u8, 'I'), ctx.consumeNoEof()); + try testing.expectEqual(@as(?u8, ' '), ctx.peek()); + try testing.expectEqual(@as(u8, ' '), try ctx.consume()); + + try testing.expect(ctx.eat('l')); + try testing.expectEqual(@as(?u8, 'i'), ctx.peek()); + try testing.expectEqual(false, ctx.eat('a')); + try testing.expectEqual(@as(?u8, 'i'), ctx.peek()); + + try ctx.expect('i'); + try testing.expectEqual(@as(?u8, 'k'), ctx.peek()); + try testing.expectError(error.UnexpectedCharacter, ctx.expect('a')); + try testing.expectEqual(@as(?u8, 'k'), ctx.peek()); + + try testing.expect(ctx.eatStr("ke")); + try testing.expectEqual(@as(?u8, ' '), ctx.peek()); + + try testing.expect(ctx.eatWs()); + try testing.expectEqual(@as(?u8, 'p'), ctx.peek()); + try testing.expectEqual(false, ctx.eatWs()); + try testing.expectEqual(@as(?u8, 'p'), ctx.peek()); + + try testing.expectEqual(false, ctx.eatStr("aaaaaaaaa")); + try testing.expectEqual(@as(?u8, 'p'), ctx.peek()); + + try testing.expectError(error.UnexpectedEof, ctx.expectStr("aaaaaaaaa")); + try testing.expectEqual(@as(?u8, 'p'), ctx.peek()); + try testing.expectError(error.UnexpectedCharacter, ctx.expectStr("pytn")); + try testing.expectEqual(@as(?u8, 'p'), ctx.peek()); + try ctx.expectStr("python"); + try testing.expectEqual(@as(?u8, 's'), ctx.peek()); + } + + { + var ctx = ParseContext.init(""); + try testing.expectEqual(ctx.peek(), null); + try testing.expectError(error.UnexpectedEof, ctx.consume()); + try 
testing.expectEqual(ctx.eat('p'), false); + try testing.expectError(error.UnexpectedEof, ctx.expect('p')); + } +} + +pub const ParseError = error{ + IllegalCharacter, + UnexpectedEof, + UnexpectedCharacter, + UnclosedValue, + UnclosedComment, + InvalidName, + InvalidEntity, + InvalidStandaloneValue, + NonMatchingClosingTag, + InvalidDocument, + OutOfMemory, +}; + +pub fn parse(backing_allocator: Allocator, source: []const u8) !Document { + var ctx = ParseContext.init(source); + return try parseDocument(&ctx, backing_allocator); +} + +fn parseDocument(ctx: *ParseContext, backing_allocator: Allocator) !Document { + var doc = Document{ + .arena = ArenaAllocator.init(backing_allocator), + .xml_decl = null, + // SAFETY: assigned below from `tryParseElement(ctx, allocator, null)` + // before `doc` is returned to the caller. If the parse fails, + // we propagate the error and the caller sees an error, not + // a half-initialized doc. The `errdefer doc.deinit()` that runs + // on that path only touches `doc.arena`, never `root`. + .root = undefined, + }; + + errdefer doc.deinit(); + + const allocator = doc.arena.allocator(); + + try trySkipComments(ctx, allocator); + + doc.xml_decl = try tryParseProlog(ctx, allocator); + _ = ctx.eatWs(); + try trySkipComments(ctx, allocator); + + doc.root = (try tryParseElement(ctx, allocator, null)) orelse return error.InvalidDocument; + _ = ctx.eatWs(); + try trySkipComments(ctx, allocator); + + // Anything left after the root element and trailing comments makes the document invalid. + if (ctx.peek() != null) return error.InvalidDocument; + + return doc; +} + +/// Reads a value delimited by a matching pair of `"` or `'` quotes and returns an unescaped +/// copy of the text between them (via `dupeAndUnescape`). Any other first byte is +/// `error.UnexpectedCharacter`; reaching end of input before the closing quote is +/// `error.UnclosedValue`. +fn parseAttrValue(ctx: *ParseContext, alloc: Allocator) ![]const u8 { + const quote = try ctx.consume(); + if (quote != '"' and quote != '\'') return error.UnexpectedCharacter; + + const begin = ctx.offset; + + while (true) { + const c = ctx.consume() catch return error.UnclosedValue; + if (c == quote) break; + } + + // The closing quote has already been consumed, so step back one byte to exclude it. + const end = ctx.offset - 1; + + return try dupeAndUnescape(alloc, ctx.source[begin..end]); +} + +/// Parses `=` followed by a quoted value, allowing whitespace on either side of the `=`. +fn parseEqAttrValue(ctx: *ParseContext, alloc: Allocator) ![]const u8 { + _ = ctx.eatWs(); + try ctx.expect('='); + _ = ctx.eatWs(); + + return try
parseAttrValue(ctx, alloc); +} + +fn parseNameNoDupe(ctx: *ParseContext) ![]const u8 { + // XML's spec on names is very long, so to make this easier + // we just take any character that is not special and not whitespace. + // The returned slice points into `ctx.source` (nothing is copied); + // an empty name is `error.InvalidName`. + const begin = ctx.offset; + + while (ctx.peek()) |ch| { + switch (ch) { + ' ', '\t', '\n', '\r' => break, + '&', '"', '\'', '<', '>', '?', '=', '/' => break, + else => _ = ctx.consumeNoEof(), + } + } + + const end = ctx.offset; + if (begin == end) return error.InvalidName; + + return ctx.source[begin..end]; +} + +/// Consumes everything up to the next '<' (or end of input) and returns an unescaped copy of it. +/// Returns null when nothing was consumed. +fn tryParseCharData(ctx: *ParseContext, alloc: Allocator) !?[]const u8 { + const begin = ctx.offset; + + while (ctx.peek()) |ch| { + switch (ch) { + '<' => break, + else => _ = ctx.consumeNoEof(), + } + } + + const end = ctx.offset; + if (begin == end) return null; + + return try dupeAndUnescape(alloc, ctx.source[begin..end]); +} + +/// Parses the next content item, trying character data first, then a comment, then an element. +/// `error.UnexpectedCharacter` when none of the three applies. +fn parseContent(ctx: *ParseContext, alloc: Allocator, parent: ?*Element) ParseError!Content { + if (try tryParseCharData(ctx, alloc)) |cd| { + return Content{ .CharData = cd }; + } else if (try tryParseComment(ctx, alloc)) |comment| { + return Content{ .Comment = comment }; + } else if (try tryParseElement(ctx, alloc, parent)) |elem| { + return Content{ .Element = elem }; + } else { + return error.UnexpectedCharacter; + } +} + +/// Parses one `name = value` attribute. Returns null when no name can be read at the cursor; +/// once a name has been read, a missing `=` or a malformed value is an error. The name is +/// copied with `alloc`. +fn tryParseAttr(ctx: *ParseContext, alloc: Allocator) !?*Attribute { + const name = parseNameNoDupe(ctx) catch return null; + _ = ctx.eatWs(); + try ctx.expect('='); + _ = ctx.eatWs(); + const value = try parseAttrValue(ctx, alloc); + + const attr = try alloc.create(Attribute); + attr.name = try alloc.dupe(u8, name); + attr.value = value; + return attr; +} + +fn tryParseElement(ctx: *ParseContext, alloc: Allocator, parent: ?*Element) !?*Element { + const start = ctx.offset; + if (!ctx.eat('<')) return null; + const tag = parseNameNoDupe(ctx) catch { + // NOTE(review): this rewind restores `offset` only; `column` keeps the + // increment made when '<' was consumed. + ctx.offset = start; + return null; + }; + + const element = try alloc.create(Element); + element.* = Element.init(try alloc.dupe(u8,
tag), alloc); + + while (ctx.eatWs()) { + const attr = (try tryParseAttr(ctx, alloc)) orelse break; + try element.attributes.append(element.allocator, attr); + } + + if (ctx.eatStr("/>")) { + return element; + } + + try ctx.expect('>'); + + while (true) { + if (ctx.peek() == null) { + return error.UnexpectedEof; + } else if (ctx.eatStr("'); + + if (parent) |p| { + var last_element: ?*Element = null; + + for (0..p.children.items.len) |i| { + const child = p.children.items[p.children.items.len - i - 1]; + if (child == .Element) { + last_element = child.Element; + break; + } + } + + if (last_element) |lc| { + lc.next_sibling = element; + } + } + + return element; +} + +test "tryParseElement" { + var arena = std.heap.ArenaAllocator.init(testing.allocator); + defer arena.deinit(); + const alloc = arena.allocator(); + + { + var ctx = ParseContext.init("<= a='b'/>"); + try testing.expectEqual(@as(?*Element, null), try tryParseElement(&ctx, alloc, null)); + try testing.expectEqual(@as(?u8, '<'), ctx.peek()); + } + + { + var ctx = ParseContext.init(""); + const elem = try tryParseElement(&ctx, alloc, null); + try testing.expectEqualSlices(u8, elem.?.tag, "python"); + + const size_attr = elem.?.attributes.items[0]; + try testing.expectEqualSlices(u8, size_attr.name, "size"); + try testing.expectEqualSlices(u8, size_attr.value, "15"); + + const color_attr = elem.?.attributes.items[1]; + try testing.expectEqualSlices(u8, color_attr.name, "color"); + try testing.expectEqualSlices(u8, color_attr.value, "green"); + } + + { + var ctx = ParseContext.init("test"); + const elem = try tryParseElement(&ctx, alloc, null); + try testing.expectEqualSlices(u8, elem.?.tag, "python"); + try testing.expectEqualSlices(u8, elem.?.children.items[0].CharData, "test"); + } + + { + var ctx = ParseContext.init("bdf"); + const elem = try tryParseElement(&ctx, alloc, null); + try testing.expectEqualSlices(u8, elem.?.tag, "a"); + try testing.expectEqualSlices(u8, elem.?.children.items[0].CharData, 
"b"); + try testing.expectEqualSlices(u8, elem.?.children.items[1].Element.tag, "c"); + try testing.expectEqualSlices(u8, elem.?.children.items[2].CharData, "d"); + try testing.expectEqualSlices(u8, elem.?.children.items[3].Element.tag, "e"); + try testing.expectEqualSlices(u8, elem.?.children.items[4].CharData, "f"); + try testing.expectEqualSlices(u8, elem.?.children.items[5].Comment, "g"); + } +} + +fn tryParseProlog(ctx: *ParseContext, alloc: Allocator) !?*XmlDecl { + const start = ctx.offset; + if (!ctx.eatStr(""); + return decl; +} + +test "tryParseProlog" { + var arena = std.heap.ArenaAllocator.init(testing.allocator); + defer arena.deinit(); + const alloc = arena.allocator(); + + { + var ctx = ParseContext.init(""); + try testing.expectEqual(@as(?*XmlDecl, null), try tryParseProlog(&ctx, alloc)); + try testing.expectEqual(@as(?u8, '<'), ctx.peek()); + } + + { + var ctx = ParseContext.init(""); + const decl = try tryParseProlog(&ctx, alloc); + try testing.expectEqualSlices(u8, "aa", decl.?.version); + try testing.expectEqual(@as(?[]const u8, null), decl.?.encoding); + try testing.expectEqual(@as(?bool, null), decl.?.standalone); + } + + { + var ctx = ParseContext.init(""); + const decl = try tryParseProlog(&ctx, alloc); + try testing.expectEqualSlices(u8, "aa", decl.?.version); + try testing.expectEqualSlices(u8, "bbb", decl.?.encoding.?); + try testing.expectEqual(@as(?bool, true), decl.?.standalone.?); + } +} + +fn trySkipComments(ctx: *ParseContext, alloc: Allocator) !void { + while (try tryParseComment(ctx, alloc)) |_| { + _ = ctx.eatWs(); + } +} + +fn tryParseComment(ctx: *ParseContext, alloc: Allocator) !?[]const u8 { + if (!ctx.eatStr("")) { + _ = ctx.consume() catch return error.UnclosedComment; + } + + const end = ctx.offset - "-->".len; + return try alloc.dupe(u8, ctx.source[begin..end]); +} + +fn unescapeEntity(text: []const u8) !u8 { + const EntitySubstition = struct { text: []const u8, replacement: u8 }; + + const entities = [_]EntitySubstition{ 
+ .{ .text = "<", .replacement = '<' }, + .{ .text = ">", .replacement = '>' }, + .{ .text = "&", .replacement = '&' }, + .{ .text = "'", .replacement = '\'' }, + .{ .text = """, .replacement = '"' }, + }; + + for (entities) |entity| { + if (std.mem.eql(u8, text, entity.text)) return entity.replacement; + } + + return error.InvalidEntity; +} + +/// Returns a copy of `text` in which every `&...;` span is replaced by the single byte that +/// `unescapeEntity` maps it to. The caller owns the returned slice and frees it with `alloc`. +/// An `&` with no following `;`, or a span `unescapeEntity` rejects, is `error.InvalidEntity`; +/// allocation failure is `error.OutOfMemory`. Nothing stays allocated on an error return. +fn dupeAndUnescape(alloc: Allocator, text: []const u8) ![]const u8 { + // Every loop step writes one byte and reads at least one, so `text.len` bytes always suffice. + const str = try alloc.alloc(u8, text.len); + // Release the scratch buffer on every error exit below. Without this, an + // `error.InvalidEntity` exit leaks it unless `alloc` is an arena. + errdefer alloc.free(str); + + var j: usize = 0; + var i: usize = 0; + while (i < text.len) : (j += 1) { + if (text[i] == '&') { + const entity_end = 1 + (mem.indexOfScalarPos(u8, text, i, ';') orelse return error.InvalidEntity); + str[j] = try unescapeEntity(text[i..entity_end]); + i = entity_end; + } else { + str[j] = text[i]; + i += 1; + } + } + + // Shrink in place when the allocator allows it. + if (alloc.resize(str, j)) return str[0..j]; + + // Otherwise return an exact-size copy. If the copy fails, the `errdefer` above frees `str`; + // on success it is freed here, so the buffer is released exactly once on both paths. + const exact = try alloc.dupe(u8, str[0..j]); + alloc.free(str); + return exact; +} + +test "dupeAndUnescape" { + var arena = std.heap.ArenaAllocator.init(testing.allocator); + defer arena.deinit(); + const alloc = arena.allocator(); + + const duped = try dupeAndUnescape(testing.allocator, "test"); + defer testing.allocator.free(duped); + try testing.expectEqualSlices(u8, "test", duped); + const duped2 = try dupeAndUnescape(testing.allocator, "a<b&c>d"e'f<"); + defer testing.allocator.free(duped2); + try testing.expectEqualSlices(u8, "ad\"e'f<", duped2); + try testing.expectError(error.InvalidEntity, dupeAndUnescape(alloc, "python&")); + try testing.expectError(error.InvalidEntity, dupeAndUnescape(alloc, "python&&")); + try testing.expectError(error.InvalidEntity, dupeAndUnescape(alloc, "python&test;")); + try testing.expectError(error.InvalidEntity, dupeAndUnescape(alloc, "python&boa")); +} + +test "Top level comments" { + var arena =
std.heap.ArenaAllocator.init(testing.allocator); + defer arena.deinit(); + const alloc = arena.allocator(); + + const doc = try parse(alloc, ""); + try testing.expectEqualSlices(u8, "python", doc.root.tag); +} diff --git a/src/service.zig b/src/service.zig index 98db5e2..ef7beba 100644 --- a/src/service.zig +++ b/src/service.zig @@ -1798,12 +1798,16 @@ pub const DataService = struct { .splits => "/splits", .etf_profile => return false, // not served .meta => return false, - // New variants wired into the endpoint mapping by - // Milestone 1 chunk 3 (DataService methods). For now - // they're not yet served; clients fall through to live - // provider fetch via getClassification / getEntityFacts / - // getEtfMetrics, which don't exist yet. + // Endpoint mapping for these will be wired when the + // corresponding `getClassification` / `getEntityFacts` / + // `getEtfMetrics` service methods land. Until then, + // server sync is a no-op for them. .classification, .etf_metrics, .entity_facts => return false, + // Provider-internal cache files (ticker-map indexes) + // are not served — clients fetch them directly from + // the SEC. The DataService caches the JSON via + // `Store` after fetching; the server has no role. + .tickers_funds, .tickers_companies => return false, }; const full_url = std.fmt.allocPrint(self.allocator, "{s}/{s}{s}", .{ server_url, symbol, endpoint }) catch return false;