add Edgar/Wikidata providers that will replace AlphaVantage

2026-05-27 10:14:06 -07:00 · 2026-05-27 10:14:06 -07:00 · cc2087fd07
commit cc2087fd07
parent dfd64bf511
9 changed files with 3223 additions and 7 deletions
--- a/README.md
+++ b/README.md
@ -922,6 +922,20 @@ zig build run -- <args> # build and run

 The compiled binary is at `zig-out/bin/zfin`.

+## Vendored code
+
+A small amount of third-party source is vendored directly into the
+tree (rather than added as a Zig package dependency) where the
+upstream is small, stable, and not packaged for `build.zig.zon`:
+
+|          File           |                                                                                  Source                                                                                                 |                                         Purpose                                           |
+|-------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------|
+| `src/providers/xml.zig` | [Snektron/vulkan-zig](https://github.com/Snektron/vulkan-zig/blob/797ae8af88e84753af9640266de61a985b76b580/generator/xml.zig), via [aws-zig](https://github.com/elerch/aws-sdk-for-zig) | XML DOM parser used by the EDGAR provider for NPORT-P primary documents. |
+
+Each vendored file carries a `// VENDORED - see README.md` header
+identifying its upstream source. When updating, copy the new
+upstream verbatim and re-add the header.
+
 ## License

 MIT
--- a/build.zig.zon
+++ b/build.zig.zon
@ -13,8 +13,8 @@
            .hash = "z2d-0.11.0-j5P_HtLzDwBGyQt49DrT0v4BuVqI_SRs6CXsuj7eBVhR",
        },
        .srf = .{
-            .url = "git+https://git.lerch.org/lobo/srf.git?ref=master#512eab0db082f1679af4de77b1f1713409766fcf",
-            .hash = "srf-0.0.0-qZj57-7CAQBdAFgdiSB2bE5Socq8QNId8PFzynVQbSUN",
+            .url = "git+https://git.lerch.org/lobo/srf#12b755660e96ed65c645975110214fcc9c66ca4d",
+            .hash = "srf-0.0.0-qZj5743KAQAykeIHzFJdRDwgAA-Yy1RLaj0Lw4W5Rphx",
        },
    },
    .paths = .{
--- a/src/Config.zig
+++ b/src/Config.zig
@ -38,6 +38,11 @@ fmp_key: ?[]const u8 = null,
 alphavantage_key: ?[]const u8 = null,
 tiingo_key: ?[]const u8 = null,
 openfigi_key: ?[]const u8 = null,
+/// User contact email used as the User-Agent / From header for
+/// open-data providers that require politeness identification
+/// (Wikidata SPARQL, EDGAR). No API-key authentication semantics —
+/// just identifies the operator. Sourced from `ZFIN_USER_EMAIL`.
+user_email: ?[]const u8 = null,
 /// URL of a zfin-server instance for lazy cache sync (e.g. "https://zfin.lerch.org")
 server_url: ?[]const u8 = null,
 cache_dir: []const u8,
@ -92,6 +97,7 @@ pub fn fromEnv(io: std.Io, allocator: std.mem.Allocator, environ_map: *const std
    self.alphavantage_key = self.resolve("ALPHAVANTAGE_API_KEY");
    self.tiingo_key = self.resolve("TIINGO_API_KEY");
    self.openfigi_key = self.resolve("OPENFIGI_API_KEY");
+    self.user_email = self.resolve("ZFIN_USER_EMAIL");
    self.server_url = self.resolve("ZFIN_SERVER");

    const env_cache = self.resolve("ZFIN_CACHE_DIR");
--- a/src/cache/store.zig
+++ b/src/cache/store.zig
@ -63,6 +63,15 @@ pub const Ttl = struct {
    /// Refreshes on quarterly filing cadence; 30-day TTL gives a
    /// fortnightly margin around each fiscal-quarter boundary.
    pub const entity_facts: i64 = 30 * s_per_day;
+
+    /// EDGAR ticker-map indexes (`company_tickers.json` and the MF
+    /// equivalent). SEC updates these daily upstream, but the
+    /// ticker→CIK mapping is extremely stable (changes are rare
+    /// rename events). 30-day TTL with jitter keeps the load
+    /// reasonable while still picking up new listings within a
+    /// month.
+    pub const tickers_funds: i64 = 30 * s_per_day;
+    pub const tickers_companies: i64 = 30 * s_per_day;
 };

 /// Cache TTL specification with optional per-key expiration jitter.
@ -175,6 +184,15 @@ pub const DataType = enum {
    /// symbol-keyed, so a single dual-class issuer (BRK.A / BRK.B)
    /// has one shared facts file.
    entity_facts,
+    /// EDGAR's `company_tickers_mf.json` index, cached at
+    /// `<cache_dir>/_edgar/tickers_funds.srf`. Single-record file
+    /// (one MutualFundTickerMapBlob) under a synthetic `_edgar` key.
+    /// Updated daily upstream; refreshes monthly with jitter.
+    tickers_funds,
+    /// EDGAR's `company_tickers.json` index, cached at
+    /// `<cache_dir>/_edgar/tickers_companies.srf`. Same shape as
+    /// `tickers_funds`.
+    tickers_companies,

    pub fn fileName(self: DataType) []const u8 {
        return switch (self) {
@ -189,6 +207,8 @@ pub const DataType = enum {
            .classification => "classification.srf",
            .etf_metrics => "etf_metrics.srf",
            .entity_facts => "entity_facts.srf",
+            .tickers_funds => "tickers_funds.srf",
+            .tickers_companies => "tickers_companies.srf",
        };
    }

@ -202,6 +222,8 @@ pub const DataType = enum {
            .classification => Ttl.classification,
            .etf_metrics => Ttl.etf_metrics,
            .entity_facts => Ttl.entity_facts,
+            .tickers_funds => Ttl.tickers_funds,
+            .tickers_companies => Ttl.tickers_companies,
            .candles_daily, .candles_meta, .meta => 0,
        };
    }
@ -2331,6 +2353,9 @@ test "TTL constants are reasonable" {
    try std.testing.expectEqual(@as(i64, 90 * std.time.s_per_day), Ttl.classification);
    try std.testing.expectEqual(@as(i64, 90 * std.time.s_per_day), Ttl.etf_metrics);
    try std.testing.expectEqual(@as(i64, 30 * std.time.s_per_day), Ttl.entity_facts);
+    // EDGAR ticker-map indexes refresh monthly with jitter.
+    try std.testing.expectEqual(@as(i64, 30 * std.time.s_per_day), Ttl.tickers_funds);
+    try std.testing.expectEqual(@as(i64, 30 * std.time.s_per_day), Ttl.tickers_companies);
 }

 test "DataType.ttl returns correct values" {
@ -2342,6 +2367,8 @@ test "DataType.ttl returns correct values" {
    try std.testing.expectEqual(Ttl.classification, DataType.classification.ttl());
    try std.testing.expectEqual(Ttl.etf_metrics, DataType.etf_metrics.ttl());
    try std.testing.expectEqual(Ttl.entity_facts, DataType.entity_facts.ttl());
+    try std.testing.expectEqual(Ttl.tickers_funds, DataType.tickers_funds.ttl());
+    try std.testing.expectEqual(Ttl.tickers_companies, DataType.tickers_companies.ttl());

    // These types have no TTL (0 = managed elsewhere)
    try std.testing.expectEqual(@as(i64, 0), DataType.candles_daily.ttl());
@ -2361,6 +2388,8 @@ test "DataType.fileName returns correct file names" {
    try std.testing.expectEqualStrings("classification.srf", DataType.classification.fileName());
    try std.testing.expectEqualStrings("etf_metrics.srf", DataType.etf_metrics.fileName());
    try std.testing.expectEqualStrings("entity_facts.srf", DataType.entity_facts.fileName());
+    try std.testing.expectEqualStrings("tickers_funds.srf", DataType.tickers_funds.fileName());
+    try std.testing.expectEqualStrings("tickers_companies.srf", DataType.tickers_companies.fileName());
 }

 test "negative_cache_content format" {
--- a/src/main.zig
+++ b/src/main.zig
@ -721,4 +721,10 @@ test "looksLikeUnquotedGlob: empty arg returns false" {

 test {
    std.testing.refAllDecls(@This());
+    // Wikidata and EDGAR providers aren't yet imported via
+    // `service.zig`; pull them in here for test discovery in the
+    // meantime. Drop these once the providers are wired through
+    // the data service.
+    _ = @import("providers/Wikidata.zig");
+    _ = @import("providers/Edgar.zig");
 }
--- a/src/providers/Edgar.zig
+++ b/src/providers/Edgar.zig
--- a/src/providers/Wikidata.zig
+++ b/src/providers/Wikidata.zig
@ -0,0 +1,618 @@
+//! Wikidata SPARQL classification provider.
+//!
+//! ## What this provider does
+//!
+//! Given a stock symbol, Wikidata can answer:
+//!
+//!   * "What kind of entity is this?" — name, industry, sector,
+//!     country of incorporation, inception date, instance-of
+//!     classification (operating company / mutual fund / ETF / …).
+//!   * "Does this match the SEC's CIK?" — Wikidata's P5531 already
+//!     stores the 10-digit zero-padded CIK matching SEC's convention.
+//!
+//! ## Workflow
+//!
+//! `fetch(symbols)` runs ONE batched SPARQL query that returns
+//! per-ticker rows. The query is keyed on the US-listing (NYSE /
+//! Nasdaq / NYSE Arca / OTC Markets) of each ticker — without that
+//! filter, common US tickers silently resolve to whichever
+//! foreign-exchange company happens to share the symbol (`MRK` →
+//! Merck KGaA on Frankfurt; `PG` → People's Garment on SET; etc.).
+//!
+//! The provider is stateless. Caching belongs to the data service,
+//! which writes per-symbol `classification.srf` files after this
+//! provider returns and reads them back on subsequent calls.
+//!
+//! ## Glossary
+//!
+//!   SPARQL    Query language for RDF-shaped data. Wikidata's
+//!             primary read API.
+//!   P-number  Property identifier in Wikidata (P249 = ticker symbol,
+//!             P414 = stock exchange, P31 = instance of, ...).
+//!   Q-number  Entity identifier in Wikidata (Q40244 = ETF as a
+//!             concept, Q13677 = NYSE the entity, Q312 = Apple Inc.
+//!             the entity).
+//!   wdt:Pxxx  Truthy/direct property statement — the simple shape.
+//!   p:Pxxx    Reified property statement — lets a statement carry
+//!             qualifiers (e.g. ticker symbol AS A QUALIFIER on the
+//!             stock-exchange statement, rather than as a direct
+//!             property of the company).
+//!   ps:Pxxx   "Statement value" predicate — within a reified
+//!             statement, points to the statement's main value.
+//!   pq:Pxxx   "Qualifier" predicate — within a reified statement,
+//!             points to a qualifier on that statement.
+//!
+//! Why the reified statement matters here: Wikidata stores tickers
+//! as P249 qualifiers on a P414 (stock exchange) statement, NOT as
+//! a direct `wdt:P249` property. Querying naively returns zero rows
+//! for nearly every US-listed equity.
+
+const std = @import("std");
+const http = @import("../net/http.zig");
+const fmt = @import("../format.zig");
+
+const sparql_endpoint = "https://query.wikidata.org/sparql";
+
+/// Classification of a single symbol, assembled from the rows of a
+/// Wikidata SPARQL response. Optional fields stay null when Wikidata
+/// has no value for that property. `source` deliberately has no
+/// default so provenance is always emitted, per the project's
+/// source-pure invariant.
+///
+/// Every field marked "owned" is freed by `deinit`; `source` is not
+/// freed there.
+pub const ClassificationRecord = struct {
+    symbol: []const u8, // owned
+    name: ?[]const u8 = null, // owned
+    sector: ?[]const u8 = null, // owned
+    industry: ?[]const u8 = null, // owned
+    /// ISO-3166 alpha-2 country code (e.g. "US", "GB", "DE").
+    country: ?[]const u8 = null, // owned
+    asset_class: ?[]const u8 = null, // owned
+    is_etf: bool = false,
+    /// YYYY-MM-DD; trimmed from Wikidata's ISO-8601 date.
+    inception_date: ?[]const u8 = null, // owned
+    /// Wikidata's P5531 — the SEC CIK as a digit string. Wikidata
+    /// already zero-pads to 10 digits, matching the project-wide
+    /// CIK normalization convention.
+    cik: ?[]const u8 = null, // owned
+    /// YYYY-MM-DD when this provider ran, NOT when Wikidata last
+    /// updated the underlying entity.
+    as_of: []const u8, // owned
+    source: []const u8, // no default — provenance always emitted
+
+    /// Free every owned string with `allocator`, which must be the
+    /// allocator the strings were created with.
+    pub fn deinit(self: *ClassificationRecord, allocator: std.mem.Allocator) void {
+        allocator.free(self.symbol);
+        const optional_strings = [_]?[]const u8{
+            self.name,
+            self.sector,
+            self.industry,
+            self.country,
+            self.asset_class,
+            self.inception_date,
+            self.cik,
+        };
+        for (optional_strings) |maybe_bytes| {
+            if (maybe_bytes) |bytes| allocator.free(bytes);
+        }
+        allocator.free(self.as_of);
+    }
+};
+
+/// Geo-bucket constants used by the country → geo lookup. Kept as
+/// named constants (rather than inline string literals in the map)
+/// so callers can reference them without typo risk and the
+/// taxonomy is tweakable in one place.
+pub const geo = struct {
+    pub const us = "US";
+    pub const developed = "International Developed";
+    pub const emerging = "Emerging Markets";
+    pub const unknown = "Unknown";
+};
+
+/// Wikidata Q-IDs we test against `instance of` (P31) to classify
+/// fund-shaped securities. Curated, not exhaustive.
+const etf_q_ids = [_][]const u8{
+    "Q40244", // exchange-traded fund
+    "Q4118901", // exchange-traded bond fund
+    "Q104638128", // ETF tracking specific index
+};
+const mutual_fund_q_ids = [_][]const u8{
+    "Q1752230", // mutual fund
+    "Q11644608", // open-end fund
+};
+
+/// US stock exchanges accepted by the SPARQL exchange filter.
+/// Without this filter, ticker collisions across global exchanges
+/// silently return the wrong company.
+///
+/// Q-IDs:
+///   Q13677     New York Stock Exchange (NYSE)
+///   Q82059     Nasdaq
+///   Q4527260   NYSE Arca
+///   Q1666011   OTC Markets Group / Pink Sheets
+const us_exchanges = [_][]const u8{
+    "wd:Q13677",
+    "wd:Q82059",
+    "wd:Q4527260",
+    "wd:Q1666011",
+};
+
+/// Country-code-to-geo-bucket lookup. Wikidata returns ISO-3166
+/// alpha-2 codes via P17 → P297; we map them to the geo taxonomy
+/// (`geo.us` / `geo.developed` / `geo.emerging` / `geo.unknown`).
+///
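+/// MSCI conventions are used for the developed/emerging split. Taiwan
+/// and South Korea are MSCI-emerging despite FTSE classifying them
+/// developed. Israel is MSCI-developed (upgraded 2010). Canada is
+/// folded into International Developed (some users prefer a separate
+/// Canada bucket; override in `metadata.srf` if so).
+///
+/// NOTE(review): a few entries (e.g. GR, IS, RU, AR, VN) may not match
+/// MSCI's current developed/emerging lists — confirm against the
+/// latest MSCI market classification before relying on them.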
+const country_to_geo = std.StaticStringMap([]const u8).initComptime(.{
+    // United States
+    .{ "US", geo.us },
+    // Alpha-3 fallback for entries that use the longer form.
+    .{ "USA", geo.us },
+
+    // International Developed — Europe ex-CIS
+    .{ "GB", geo.developed },
+    .{ "DE", geo.developed },
+    .{ "FR", geo.developed },
+    .{ "NL", geo.developed },
+    .{ "CH", geo.developed },
+    .{ "SE", geo.developed },
+    .{ "DK", geo.developed },
+    .{ "NO", geo.developed },
+    .{ "FI", geo.developed },
+    .{ "IT", geo.developed },
+    .{ "ES", geo.developed },
+    .{ "BE", geo.developed },
+    .{ "AT", geo.developed },
+    .{ "IE", geo.developed },
+    .{ "LU", geo.developed },
+    .{ "PT", geo.developed },
+    .{ "GR", geo.developed },
+    .{ "IS", geo.developed },
+
+    // International Developed — Asia-Pacific + Israel + Canada
+    .{ "JP", geo.developed },
+    .{ "AU", geo.developed },
+    .{ "NZ", geo.developed },
+    .{ "SG", geo.developed },
+    .{ "HK", geo.developed },
+    .{ "IL", geo.developed },
+    .{ "CA", geo.developed },
+
+    // Emerging Markets (MSCI)
+    .{ "CN", geo.emerging },
+    .{ "TW", geo.emerging },
+    .{ "KR", geo.emerging },
+    .{ "IN", geo.emerging },
+    .{ "BR", geo.emerging },
+    .{ "MX", geo.emerging },
+    .{ "RU", geo.emerging },
+    .{ "TR", geo.emerging },
+    .{ "ZA", geo.emerging },
+    .{ "TH", geo.emerging },
+    .{ "MY", geo.emerging },
+    .{ "ID", geo.emerging },
+    .{ "PH", geo.emerging },
+    .{ "VN", geo.emerging },
+    .{ "AR", geo.emerging },
+    .{ "CL", geo.emerging },
+    .{ "CO", geo.emerging },
+    .{ "PE", geo.emerging },
+    .{ "EG", geo.emerging },
+});
+
+/// Resolve a country code to the name of its geo bucket.
+///
+/// The lookup is an exact string match against `country_to_geo`.
+/// A null code, an empty code, or a code missing from the table all
+/// yield `geo.unknown`, leaving the user free to override the bucket
+/// in `metadata.srf`.
+pub fn geoFor(iso2: ?[]const u8) []const u8 {
+    if (iso2) |code| {
+        if (code.len != 0) {
+            if (country_to_geo.get(code)) |bucket| return bucket;
+        }
+    }
+    return geo.unknown;
+}
+
+// ── Wikidata provider state (file-as-struct) ─────────────────────
+//
+// Callers do `const wikidata = @import("providers/Wikidata.zig");`
+// followed by `var wd = wikidata.init(...);` and `wd.fetch(...)`.
+
+client: http.Client,
+allocator: std.mem.Allocator,
+io: std.Io,
+/// Contact email for User-Agent / From headers, sourced from
+/// `Config.user_email`. Required; callers must surface a clear
+/// missing-config error before constructing this provider.
+user_email: []const u8,
+
+const Wikidata = @This();
+
+/// Construct a provider with a fresh HTTP client.
+///
+/// `user_email` is stored as-is (not copied), so it must outlive the
+/// returned value. Pair every `init` with a `deinit`.
+pub fn init(
+    io: std.Io,
+    allocator: std.mem.Allocator,
+    user_email: []const u8,
+) Wikidata {
+    const client = http.Client.init(io, allocator);
+    return Wikidata{
+        .io = io,
+        .allocator = allocator,
+        .user_email = user_email,
+        .client = client,
+    };
+}
+
+/// Release the HTTP client created by `init`. The `user_email`
+/// slice and the allocator are not touched here.
+pub fn deinit(self: *Wikidata) void {
+    self.client.deinit();
+}
+
+/// Look up Wikidata classifications for `symbols` with one batched
+/// SPARQL round trip.
+///
+/// The query text and the raw response are scratch data allocated
+/// from the provider's own allocator and released before returning.
+/// The records are allocated from `result_allocator`; the caller owns
+/// the returned slice and every record in it. An empty `symbols`
+/// slice short-circuits to an empty result without any network call.
+pub fn fetch(
+    self: *Wikidata,
+    result_allocator: std.mem.Allocator,
+    symbols: []const []const u8,
+) ![]ClassificationRecord {
+    if (symbols.len == 0) return &.{};
+
+    const scratch = self.allocator;
+
+    const sparql = try buildQuery(scratch, symbols);
+    defer scratch.free(sparql);
+
+    const response_body = try self.postSparql(sparql);
+    defer scratch.free(response_body);
+
+    return parse(self.io, result_allocator, response_body, symbols);
+}
+
+/// Send `query` to the Wikidata SPARQL endpoint as a form-encoded
+/// POST and return a copy of the response body.
+///
+/// The User-Agent and From headers are derived from `user_email` for
+/// politeness; Wikidata explicitly recommends descriptive User-Agent
+/// strings. Returns `error.UserEmailTooLong` when the formatted
+/// User-Agent does not fit the 256-byte stack buffer. The returned
+/// bytes come from the provider's allocator and the caller frees them.
+fn postSparql(self: *Wikidata, query: []const u8) ![]u8 {
+    const gpa = self.allocator;
+
+    // Body is `query=<escaped query>`.
+    var body: std.Io.Writer.Allocating = .init(gpa);
+    defer body.deinit();
+    try body.writer.writeAll("query=");
+    // `Component.formatEscaped` percent-encodes everything outside
+    // RFC 3986's unreserved set, which is what an
+    // `application/x-www-form-urlencoded` body needs.
+    const raw_query: std.Uri.Component = .{ .raw = query };
+    try raw_query.formatEscaped(&body.writer);
+
+    var user_agent_storage: [256]u8 = undefined;
+    const user_agent = std.fmt.bufPrint(
+        &user_agent_storage,
+        "zfin/0.1 ({s})",
+        .{self.user_email},
+    ) catch return error.UserEmailTooLong;
+
+    const request_headers = [_]std.http.Header{
+        .{ .name = "User-Agent", .value = user_agent },
+        .{ .name = "Accept", .value = "application/sparql-results+json" },
+        .{ .name = "Content-Type", .value = "application/x-www-form-urlencoded" },
+        .{ .name = "From", .value = self.user_email },
+    };
+
+    var response = try self.client.request(.POST, sparql_endpoint, body.written(), &request_headers);
+    defer response.deinit();
+    // Copy out of the response before the deferred deinit runs.
+    return gpa.dupe(u8, response.body);
+}
+
+/// Build the batched SPARQL query for a slice of ticker symbols.
+/// Caller owns the returned bytes. Symbols are interpolated via
+/// `VALUES ?ticker { "AAPL" "MSFT" ... }`; each symbol is escaped as
+/// a SPARQL string literal so a stray quote, backslash, or line break
+/// in the input cannot terminate the literal or alter the query.
+///
+/// Wikidata's ticker storage is non-obvious: tickers are stored as
+/// `P249` qualifiers on a `P414` (stock exchange) statement. Naive
+/// `?security wdt:P249 ?ticker` returns zero rows for nearly every
+/// US-listed equity. The query reaches them via:
+///
+///   ?security p:P414 ?stmt .
+///   ?stmt ps:P414 ?exchange .
+///   ?stmt pq:P249 ?ticker .
+///
+/// `?exchange` is filtered to a small set of US exchanges to avoid
+/// ticker collisions with foreign listings.
+fn buildQuery(allocator: std.mem.Allocator, symbols: []const []const u8) ![]u8 {
+    var aw: std.Io.Writer.Allocating = .init(allocator);
+    defer aw.deinit();
+    const w = &aw.writer;
+
+    try w.writeAll(
+        \\SELECT ?ticker ?security ?securityLabel ?industryLabel ?countryCode ?inception ?cik ?instance WHERE {
+        \\  VALUES ?ticker {
+    );
+    for (symbols) |s| {
+        try w.writeAll(" \"");
+        try writeSparqlEscaped(w, s);
+        try w.writeByte('"');
+    }
+    try w.writeAll(" }\n");
+    try w.writeAll("  VALUES ?exchange {");
+    for (us_exchanges) |x| {
+        try w.print(" {s}", .{x});
+    }
+    try w.writeAll(" }\n");
+    try w.writeAll(
+        \\  ?security p:P414 ?exchstmt .
+        \\  ?exchstmt ps:P414 ?exchange .
+        \\  ?exchstmt pq:P249 ?ticker .
+        \\  OPTIONAL { ?security wdt:P452 ?industry . }
+        \\  OPTIONAL { ?security wdt:P17 ?country . ?country wdt:P297 ?countryCode . }
+        \\  OPTIONAL { ?security wdt:P571 ?inception . }
+        \\  OPTIONAL { ?security wdt:P5531 ?cik . }
+        \\  OPTIONAL { ?security wdt:P31 ?instance . }
+        \\  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
+        \\}
+    );
+    return aw.toOwnedSlice();
+}
+
+/// Write `text` as the inside of a double-quoted SPARQL string
+/// literal: quote, backslash, newline, carriage return and tab are
+/// backslash-escaped; every other byte is copied through unchanged.
+fn writeSparqlEscaped(w: *std.Io.Writer, text: []const u8) std.Io.Writer.Error!void {
+    for (text) |byte| {
+        switch (byte) {
+            '"' => try w.writeAll("\\\""),
+            '\\' => try w.writeAll("\\\\"),
+            '\n' => try w.writeAll("\\n"),
+            '\r' => try w.writeAll("\\r"),
+            '\t' => try w.writeAll("\\t"),
+            else => try w.writeByte(byte),
+        }
+    }
+}
+
+/// Parse a SPARQL JSON response into `ClassificationRecord` values,
+/// one per matched ticker.
+///
+/// Rows whose `ticker` is not in `expected_symbols` (ASCII
+/// case-insensitive comparison) are skipped. Multiple bindings for
+/// the same ticker (e.g. multiple `instance of` values) are merged
+/// into one record — for each field the first non-null value wins.
+///
+/// Malformed JSON, or JSON without a `results.bindings` array, yields
+/// an empty slice; `error.OutOfMemory` is always propagated. Every
+/// string in the result is allocated from `allocator`; the caller
+/// owns the slice and each record (call `deinit` on every record,
+/// then free the slice).
+fn parse(
+    io: std.Io,
+    allocator: std.mem.Allocator,
+    json_bytes: []const u8,
+    expected_symbols: []const []const u8,
+) ![]ClassificationRecord {
+    // `as_of` records the day this provider ran.
+    const today = fmt.todayDate(io);
+    var as_of_buf: [10]u8 = undefined;
+    const as_of = try std.fmt.bufPrint(&as_of_buf, "{f}", .{today});
+
+    // A response we cannot decode is treated as "no data", but running
+    // out of memory is a real failure and must not be hidden.
+    const parsed = std.json.parseFromSlice(std.json.Value, allocator, json_bytes, .{}) catch |err| switch (err) {
+        error.OutOfMemory => return err,
+        else => return &.{},
+    };
+    defer parsed.deinit();
+
+    const root = switch (parsed.value) {
+        .object => |o| o,
+        else => return &.{},
+    };
+    const results = switch (root.get("results") orelse return &.{}) {
+        .object => |o| o,
+        else => return &.{},
+    };
+    const bindings = switch (results.get("bindings") orelse return &.{}) {
+        .array => |a| a.items,
+        else => return &.{},
+    };
+
+    // Map symbol → record; merge multiple bindings. Each map key
+    // aliases its record's `symbol`, so records are the sole owners of
+    // memory and this cleanup is complete on every error path. On
+    // success the map is emptied before returning, making it a no-op.
+    var by_symbol: std.StringHashMap(ClassificationRecord) = .init(allocator);
+    defer {
+        var it = by_symbol.valueIterator();
+        while (it.next()) |r| r.deinit(allocator);
+        by_symbol.deinit();
+    }
+
+    for (bindings) |b| {
+        const obj = switch (b) {
+            .object => |o| o,
+            else => continue,
+        };
+        const ticker = sparqlValue(obj, "ticker") orelse continue;
+
+        // Verify ticker is one we asked for. Wikidata can return
+        // surprising matches (foreign exchanges); skip those.
+        if (!wasRequested(expected_symbols, ticker)) continue;
+
+        // Fully build a new record before inserting it, so the map
+        // never holds a half-initialized entry if an allocation fails.
+        const rec = by_symbol.getPtr(ticker) orelse fresh: {
+            const symbol = try allocator.dupe(u8, ticker);
+            errdefer allocator.free(symbol);
+            const stamp = try allocator.dupe(u8, as_of);
+            errdefer allocator.free(stamp);
+            try by_symbol.putNoClobber(symbol, .{
+                .symbol = symbol,
+                .as_of = stamp,
+                .source = "wikidata",
+            });
+            break :fresh by_symbol.getPtr(symbol).?;
+        };
+
+        if (rec.name == null) {
+            if (sparqlValue(obj, "securityLabel")) |label| {
+                rec.name = try allocator.dupe(u8, label);
+            }
+        }
+        if (rec.industry == null) {
+            if (sparqlValue(obj, "industryLabel")) |ind| {
+                // The industry label is stored as the sector as well.
+                rec.industry = try allocator.dupe(u8, ind);
+                rec.sector = try allocator.dupe(u8, ind);
+            }
+        }
+        if (rec.country == null) {
+            if (sparqlValue(obj, "countryCode")) |c| {
+                rec.country = try allocator.dupe(u8, c);
+            }
+        }
+        if (rec.inception_date == null) {
+            if (sparqlValue(obj, "inception")) |d| {
+                // Keep only the leading YYYY-MM-DD of the timestamp.
+                if (d.len >= 10) {
+                    rec.inception_date = try allocator.dupe(u8, d[0..10]);
+                }
+            }
+        }
+        if (rec.cik == null) {
+            if (sparqlValue(obj, "cik")) |c| {
+                rec.cik = try allocator.dupe(u8, c);
+            }
+        }
+        if (sparqlValue(obj, "instance")) |inst_iri| {
+            // The "instance" value is a Q-ID URI like
+            // "http://www.wikidata.org/entity/Q40244". Extract the
+            // Q-ID suffix and test against our known sets.
+            const q_id = if (std.mem.lastIndexOfScalar(u8, inst_iri, '/')) |i|
+                inst_iri[i + 1 ..]
+            else
+                inst_iri;
+
+            if (isOneOf(q_id, &etf_q_ids)) {
+                rec.is_etf = true;
+                if (rec.asset_class == null) {
+                    rec.asset_class = try allocator.dupe(u8, "ETF (uncategorized)");
+                }
+            }
+            if (isOneOf(q_id, &mutual_fund_q_ids)) {
+                // A mutual fund is not an ETF: only the asset class is
+                // recorded here, `is_etf` is left as it was.
+                if (rec.asset_class == null) {
+                    rec.asset_class = try allocator.dupe(u8, "Mutual Fund (uncategorized)");
+                }
+            }
+        }
+    }
+
+    // Move the records into an owned slice. Emptying the map afterwards
+    // hands ownership to the caller: the deferred cleanup above then
+    // finds no values to free and only releases the table itself.
+    const out = try allocator.alloc(ClassificationRecord, by_symbol.count());
+    var idx: usize = 0;
+    var values = by_symbol.valueIterator();
+    while (values.next()) |r| : (idx += 1) {
+        out[idx] = r.*;
+    }
+    by_symbol.clearRetainingCapacity();
+    return out;
+}
+
+/// True when `ticker` equals one of `requested`, ignoring ASCII case.
+fn wasRequested(requested: []const []const u8, ticker: []const u8) bool {
+    for (requested) |s| {
+        if (std.ascii.eqlIgnoreCase(s, ticker)) return true;
+    }
+    return false;
+}
+
+/// True when `needle` is byte-for-byte equal to an entry of `set`.
+fn isOneOf(needle: []const u8, set: []const []const u8) bool {
+    for (set) |candidate| {
+        if (std.mem.eql(u8, needle, candidate)) return true;
+    }
+    return false;
+}
+
+/// Read the string stored under `"value"` inside the binding named
+/// `field` of a SPARQL JSON row. Yields null when the binding is
+/// missing, is not an object, has no `"value"` member, or that member
+/// is not a string. The returned slice points into the JSON tree that
+/// `obj` belongs to.
+fn sparqlValue(obj: std.json.ObjectMap, field: []const u8) ?[]const u8 {
+    const binding = obj.get(field) orelse return null;
+    if (binding != .object) return null;
+
+    const inner = binding.object.get("value") orelse return null;
+    return if (inner == .string) inner.string else null;
+}
+
+// ── Tests ────────────────────────────────────────────────────────
+
+test "buildQuery includes all symbols and required SELECT vars" {
+    const gpa = std.testing.allocator;
+    const query = try buildQuery(gpa, &[_][]const u8{ "AAPL", "VTI" });
+    defer gpa.free(query);
+
+    // Every fragment below must appear somewhere in the query text.
+    const required_fragments = [_][]const u8{
+        "\"AAPL\"",
+        "\"VTI\"",
+        "p:P414",
+        "pq:P249",
+        "wdt:P452",
+        "wdt:P17",
+        // US-exchange filter must be present — without it, US tickers
+        // collide with foreign exchanges (MRK→Merck KGaA, PG→People's
+        // Garment, etc.). See `us_exchanges` doc-block.
+        "wd:Q13677", // NYSE
+        "wd:Q82059", // Nasdaq
+        "ps:P414 ?exchange",
+    };
+    for (required_fragments) |fragment| {
+        try std.testing.expect(std.mem.indexOf(u8, query, fragment) != null);
+    }
+}
+
+test "parse: AAPL fixture round-trips name + industry + country" {
+    const fixture =
+        \\{
+        \\  "head": {"vars": ["ticker", "security", "securityLabel", "industryLabel", "countryCode", "inception", "cik", "instance"]},
+        \\  "results": {
+        \\    "bindings": [
+        \\      {
+        \\        "ticker": {"type": "literal", "value": "AAPL"},
+        \\        "security": {"type": "uri", "value": "http://www.wikidata.org/entity/Q312"},
+        \\        "securityLabel": {"type": "literal", "value": "Apple Inc."},
+        \\        "industryLabel": {"type": "literal", "value": "consumer electronics"},
+        \\        "countryCode": {"type": "literal", "value": "US"},
+        \\        "instance": {"type": "uri", "value": "http://www.wikidata.org/entity/Q4830453"}
+        \\      }
+        \\    ]
+        \\  }
+        \\}
+    ;
+
+    const allocator = std.testing.allocator;
+    const expected = [_][]const u8{"AAPL"};
+    const recs = try parse(std.testing.io, allocator, fixture, &expected);
+    defer {
+        for (recs) |*r| {
+            var m = r.*;
+            m.deinit(allocator);
+        }
+        allocator.free(recs);
+    }
+
+    try std.testing.expectEqual(@as(usize, 1), recs.len);
+    try std.testing.expectEqualStrings("AAPL", recs[0].symbol);
+    try std.testing.expectEqualStrings("Apple Inc.", recs[0].name.?);
+    try std.testing.expectEqualStrings("consumer electronics", recs[0].industry.?);
+    try std.testing.expectEqualStrings("consumer electronics", recs[0].sector.?);
+    try std.testing.expectEqualStrings("US", recs[0].country.?);
+    try std.testing.expect(!recs[0].is_etf);
+}
+
+test "parse: ETF fixture sets is_etf=true and asset_class" {
+    const fixture =
+        \\{
+        \\  "head": {"vars": ["ticker", "security", "securityLabel", "instance"]},
+        \\  "results": {
+        \\    "bindings": [
+        \\      {
+        \\        "ticker": {"type": "literal", "value": "VTI"},
+        \\        "security": {"type": "uri", "value": "http://www.wikidata.org/entity/Q1809462"},
+        \\        "securityLabel": {"type": "literal", "value": "Vanguard Total Stock Market ETF"},
+        \\        "instance": {"type": "uri", "value": "http://www.wikidata.org/entity/Q40244"}
+        \\      }
+        \\    ]
+        \\  }
+        \\}
+    ;
+
+    const allocator = std.testing.allocator;
+    const expected = [_][]const u8{"VTI"};
+    const recs = try parse(std.testing.io, allocator, fixture, &expected);
+    defer {
+        for (recs) |*r| {
+            var m = r.*;
+            m.deinit(allocator);
+        }
+        allocator.free(recs);
+    }
+
+    try std.testing.expectEqual(@as(usize, 1), recs.len);
+    try std.testing.expect(recs[0].is_etf);
+    try std.testing.expectEqualStrings("ETF (uncategorized)", recs[0].asset_class.?);
+}
+
+test "parse: bindings for symbols not requested are dropped" {
+    const fixture =
+        \\{
+        \\  "head": {"vars": ["ticker", "security", "securityLabel"]},
+        \\  "results": {
+        \\    "bindings": [
+        \\      {"ticker": {"type": "literal", "value": "WRONG"},
+        \\       "security": {"type": "uri", "value": "http://example/Q1"},
+        \\       "securityLabel": {"type": "literal", "value": "Wrong Co"}}
+        \\    ]
+        \\  }
+        \\}
+    ;
+
+    const allocator = std.testing.allocator;
+    const expected = [_][]const u8{"AAPL"};
+    const recs = try parse(std.testing.io, allocator, fixture, &expected);
+    defer allocator.free(recs);
+
+    try std.testing.expectEqual(@as(usize, 0), recs.len);
+}
+
+test "geoFor maps known ISO-3166 codes to bucket" {
+    const Case = struct { code: []const u8, bucket: []const u8 };
+    const cases = [_]Case{
+        .{ .code = "US", .bucket = geo.us },
+        .{ .code = "USA", .bucket = geo.us },
+        .{ .code = "GB", .bucket = geo.developed },
+        .{ .code = "DE", .bucket = geo.developed },
+        .{ .code = "CA", .bucket = geo.developed },
+        .{ .code = "IL", .bucket = geo.developed },
+        .{ .code = "CN", .bucket = geo.emerging },
+        .{ .code = "TW", .bucket = geo.emerging },
+        .{ .code = "KR", .bucket = geo.emerging },
+    };
+    for (cases) |case| {
+        try std.testing.expectEqualStrings(case.bucket, geoFor(case.code));
+    }
+}
+
+test "geoFor returns Unknown for null/empty/unmapped" {
+    // null, empty, an unassigned ISO-2 code ("ZZ"), and another
+    // code absent from the table ("XX").
+    const unmapped_inputs = [_]?[]const u8{ null, "", "ZZ", "XX" };
+    for (unmapped_inputs) |input| {
+        try std.testing.expectEqualStrings(geo.unknown, geoFor(input));
+    }
+}
--- a/src/providers/xml.zig
+++ b/src/providers/xml.zig
@ -0,0 +1,713 @@
+// VENDORED - see README.md.
+// File sourced from:
+// https://github.com/Snektron/vulkan-zig/blob/797ae8af88e84753af9640266de61a985b76b580/generator/xml.zig
+// via ~/shared/aws-zig/src/xml.zig
+const std = @import("std");
+const mem = std.mem;
+const testing = std.testing;
+const Allocator = mem.Allocator;
+const ArenaAllocator = std.heap.ArenaAllocator;
+const ArrayList = std.ArrayList;
+
+/// A single `name="value"` attribute attached to an element.
+pub const Attribute = struct {
+    /// Attribute name as written in the source.
+    name: []const u8,
+    /// Attribute value with the surrounding quotes removed and the
+    /// predefined entities (`&lt;`, `&amp;`, ...) already unescaped.
+    value: []const u8,
+};
+
+/// One item of an element's body, in document order.
+pub const Content = union(enum) {
+    /// Text between tags, with predefined entities unescaped.
+    CharData: []const u8,
+    /// Text found between `<!--` and `-->`, copied verbatim.
+    Comment: []const u8,
+    /// A nested child element.
+    Element: *Element,
+};
+
+/// A parsed XML element: its tag name, attributes, and ordered body
+/// content, plus helpers for walking and searching the children.
+pub const Element = struct {
+    pub const AttributeList = ArrayList(*Attribute);
+    pub const ContentList = ArrayList(Content);
+
+    tag: []const u8,
+    attributes: AttributeList,
+    children: ContentList,
+    /// Link to a following sibling element, filled in by the parser;
+    /// null when no such link was recorded.
+    next_sibling: ?*Element = null,
+    /// Allocator used when appending to `attributes` and `children`.
+    allocator: std.mem.Allocator,
+
+    /// Builds an element with the given tag and no attributes or children.
+    fn init(tag: []const u8, alloc: Allocator) Element {
+        const fresh: Element = .{
+            .tag = tag,
+            .attributes = .empty,
+            .children = .empty,
+            .allocator = alloc,
+        };
+        return fresh;
+    }
+
+    /// Returns the value of the first attribute named `attrib_name`,
+    /// or null when the element has no such attribute.
+    pub fn getAttribute(self: *Element, attrib_name: []const u8) ?[]const u8 {
+        for (self.attributes.items) |attr| {
+            if (mem.eql(u8, attr.name, attrib_name)) return attr.value;
+        }
+        return null;
+    }
+
+    /// Returns the text of the first child element tagged `child_tag`,
+    /// provided that child's body is exactly one piece of character data.
+    /// Any other shape (missing child, empty body, mixed content, or a
+    /// failed lookup) yields null.
+    pub fn getCharData(self: *Element, child_tag: []const u8) ?[]const u8 {
+        const found = self.findChildByTag(child_tag) catch return null;
+        const child = found orelse return null;
+
+        const body = child.children.items;
+        if (body.len != 1) return null;
+
+        return switch (body[0]) {
+            .CharData => |text| text,
+            else => null,
+        };
+    }
+
+    /// Iterates over every content item (text, comments, elements).
+    pub fn iterator(self: *Element) ChildIterator {
+        return .{ .items = self.children.items, .i = 0 };
+    }
+
+    /// Iterates over child elements only, skipping text and comments.
+    pub fn elements(self: *Element) ChildElementIterator {
+        return .{ .inner = self.iterator() };
+    }
+
+    /// Returns the first child element whose tag matches `tag`, if any.
+    pub fn findChildByTag(self: *Element, tag: []const u8) !?*Element {
+        var matches = self.findChildrenByTag(tag);
+        return matches.next();
+    }
+
+    /// Iterates over the child elements whose tag matches `tag`.
+    pub fn findChildrenByTag(self: *Element, tag: []const u8) FindChildrenByTagIterator {
+        return .{ .inner = self.elements(), .tag = tag };
+    }
+
+    /// Cursor over a slice of content items.
+    pub const ChildIterator = struct {
+        items: []Content,
+        i: usize,
+
+        /// Returns a pointer to the next item, or null once exhausted.
+        pub fn next(self: *ChildIterator) ?*Content {
+            if (self.i >= self.items.len) return null;
+
+            // The return operand is evaluated before the cursor advances.
+            defer self.i += 1;
+            return &self.items[self.i];
+        }
+    };
+
+    /// Cursor that yields only the `.Element` items of a content slice.
+    pub const ChildElementIterator = struct {
+        inner: ChildIterator,
+
+        /// Returns the next child element, or null once exhausted.
+        pub fn next(self: *ChildElementIterator) ?*Element {
+            while (self.inner.next()) |item| {
+                switch (item.*) {
+                    .Element => |elem| return elem,
+                    else => {},
+                }
+            }
+            return null;
+        }
+    };
+
+    /// Default tag predicate: exact byte-wise equality.
+    fn strictEqual(a: []const u8, b: []const u8, _: PredicateOptions) !bool {
+        return mem.eql(u8, a, b);
+    }
+
+    /// Cursor over child elements whose tag satisfies `predicate`
+    /// when compared against `tag`.
+    pub const FindChildrenByTagIterator = struct {
+        inner: ChildElementIterator,
+        tag: []const u8,
+        predicate: *const fn (a: []const u8, b: []const u8, options: PredicateOptions) anyerror!bool = strictEqual,
+        predicate_options: PredicateOptions = .{},
+
+        /// Returns the next matching child element, or null once
+        /// exhausted. Errors raised by the predicate are propagated.
+        pub fn next(self: *FindChildrenByTagIterator) !?*Element {
+            while (self.inner.next()) |candidate| {
+                if (try self.predicate(candidate.tag, self.tag, self.predicate_options)) {
+                    return candidate;
+                }
+            }
+            return null;
+        }
+    };
+};
+
+/// Extra state handed to a tag-matching predicate
+/// (see `Element.FindChildrenByTagIterator.predicate`).
+pub const PredicateOptions = struct {
+    /// Optional allocator made available to the predicate; the default
+    /// exact-match predicate ignores it.
+    allocator: ?std.mem.Allocator = null,
+};
+/// Contents of the `<?xml ... ?>` declaration at the top of a document.
+pub const XmlDecl = struct {
+    /// Value of the mandatory `version` attribute.
+    version: []const u8,
+    /// Value of the `encoding` attribute, or null when it is absent.
+    encoding: ?[]const u8,
+    /// `true` for `standalone="yes"`, `false` for `"no"`, null when absent.
+    standalone: ?bool,
+};
+
+/// A parsed document together with the arena that backs its nodes.
+pub const Document = struct {
+    arena: ArenaAllocator,
+    /// The XML declaration, or null when the source had none.
+    xml_decl: ?*XmlDecl,
+    root: *Element,
+
+    /// Releases the arena, and with it everything allocated from it.
+    pub fn deinit(self: Document) void {
+        // `self` arrives by value and is immutable, so deinit runs on a
+        // mutable stack copy of the arena state.
+        var owned_arena = self.arena;
+        owned_arena.deinit();
+    }
+};
+
+/// Cursor over the source text being parsed. Tracks the byte offset
+/// along with a zero-based line and column that are updated as bytes
+/// are consumed.
+const ParseContext = struct {
+    source: []const u8,
+    offset: usize,
+    line: usize,
+    column: usize,
+
+    /// Starts a cursor at the beginning of `source`.
+    fn init(source: []const u8) ParseContext {
+        return .{ .source = source, .offset = 0, .line = 0, .column = 0 };
+    }
+
+    /// Returns the next byte without consuming it, or null at end of input.
+    fn peek(self: *ParseContext) ?u8 {
+        if (self.offset >= self.source.len) return null;
+        return self.source[self.offset];
+    }
+
+    /// Consumes and returns the next byte; `error.UnexpectedEof` at end of input.
+    fn consume(self: *ParseContext) !u8 {
+        if (self.offset >= self.source.len) return error.UnexpectedEof;
+        return self.consumeNoEof();
+    }
+
+    /// Consumes and returns the next byte. The caller must already know
+    /// that input remains (asserted).
+    fn consumeNoEof(self: *ParseContext) u8 {
+        std.debug.assert(self.offset < self.source.len);
+        const byte = self.source[self.offset];
+        self.offset += 1;
+
+        switch (byte) {
+            '\n' => {
+                self.line += 1;
+                self.column = 0;
+            },
+            else => self.column += 1,
+        }
+
+        return byte;
+    }
+
+    /// Consumes `char` if it is next; reports whether it did.
+    fn eat(self: *ParseContext, char: u8) bool {
+        self.expect(char) catch return false;
+        return true;
+    }
+
+    /// Consumes `expected` or fails without consuming anything.
+    fn expect(self: *ParseContext, expected: u8) !void {
+        const actual = self.peek() orelse return error.UnexpectedEof;
+        if (actual != expected) return error.UnexpectedCharacter;
+        _ = self.consumeNoEof();
+    }
+
+    /// Consumes `text` if the input continues with it; reports whether it did.
+    fn eatStr(self: *ParseContext, text: []const u8) bool {
+        self.expectStr(text) catch return false;
+        return true;
+    }
+
+    /// Consumes `text` or fails without consuming anything.
+    /// Too little remaining input is `error.UnexpectedEof`; a mismatch is
+    /// `error.UnexpectedCharacter`.
+    fn expectStr(self: *ParseContext, text: []const u8) !void {
+        if (self.source.len < self.offset + text.len) return error.UnexpectedEof;
+        if (!std.mem.startsWith(u8, self.source[self.offset..], text)) {
+            return error.UnexpectedCharacter;
+        }
+
+        // Go through consumeNoEof so line/column stay in sync.
+        for (text) |_| {
+            _ = self.consumeNoEof();
+        }
+    }
+
+    /// Skips spaces, tabs, and line breaks; reports whether any were skipped.
+    fn eatWs(self: *ParseContext) bool {
+        const start = self.offset;
+
+        while (self.peek()) |ch| {
+            switch (ch) {
+                ' ', '\t', '\n', '\r' => _ = self.consumeNoEof(),
+                else => break,
+            }
+        }
+
+        return self.offset != start;
+    }
+
+    /// Like `eatWs`, but at least one whitespace byte is required.
+    fn expectWs(self: *ParseContext) !void {
+        if (!self.eatWs()) return error.UnexpectedCharacter;
+    }
+
+    /// Returns the full text of the line containing the cursor,
+    /// without its terminating newline.
+    fn currentLine(self: ParseContext) []const u8 {
+        const consumed = self.source[0..self.offset];
+        const begin: usize = if (mem.lastIndexOfScalar(u8, consumed, '\n')) |prev_nl|
+            prev_nl + 1
+        else
+            0;
+
+        const end = mem.indexOfScalarPos(u8, self.source, self.offset, '\n') orelse self.source.len;
+        return self.source[begin..end];
+    }
+};
+
+test "ParseContext" {
+    // Walks a cursor over a fixed string, checking after every operation
+    // that a failed eat/expect leaves the cursor where it was.
+    {
+        var ctx = ParseContext.init("I like pythons");
+        try testing.expectEqual(@as(?u8, 'I'), ctx.peek());
+        try testing.expectEqual(@as(u8, 'I'), ctx.consumeNoEof());
+        try testing.expectEqual(@as(?u8, ' '), ctx.peek());
+        try testing.expectEqual(@as(u8, ' '), try ctx.consume());
+
+        try testing.expect(ctx.eat('l'));
+        try testing.expectEqual(@as(?u8, 'i'), ctx.peek());
+        try testing.expectEqual(false, ctx.eat('a'));
+        try testing.expectEqual(@as(?u8, 'i'), ctx.peek());
+
+        try ctx.expect('i');
+        try testing.expectEqual(@as(?u8, 'k'), ctx.peek());
+        try testing.expectError(error.UnexpectedCharacter, ctx.expect('a'));
+        try testing.expectEqual(@as(?u8, 'k'), ctx.peek());
+
+        try testing.expect(ctx.eatStr("ke"));
+        try testing.expectEqual(@as(?u8, ' '), ctx.peek());
+
+        try testing.expect(ctx.eatWs());
+        try testing.expectEqual(@as(?u8, 'p'), ctx.peek());
+        try testing.expectEqual(false, ctx.eatWs());
+        try testing.expectEqual(@as(?u8, 'p'), ctx.peek());
+
+        try testing.expectEqual(false, ctx.eatStr("aaaaaaaaa"));
+        try testing.expectEqual(@as(?u8, 'p'), ctx.peek());
+
+        try testing.expectError(error.UnexpectedEof, ctx.expectStr("aaaaaaaaa"));
+        try testing.expectEqual(@as(?u8, 'p'), ctx.peek());
+        try testing.expectError(error.UnexpectedCharacter, ctx.expectStr("pytn"));
+        try testing.expectEqual(@as(?u8, 'p'), ctx.peek());
+        try ctx.expectStr("python");
+        try testing.expectEqual(@as(?u8, 's'), ctx.peek());
+    }
+
+    // Empty input: every operation reports end of input.
+    {
+        var ctx = ParseContext.init("");
+        // Expected value goes first so failure output labels it correctly.
+        try testing.expectEqual(@as(?u8, null), ctx.peek());
+        try testing.expectError(error.UnexpectedEof, ctx.consume());
+        try testing.expectEqual(false, ctx.eat('p'));
+        try testing.expectError(error.UnexpectedEof, ctx.expect('p'));
+    }
+}
+
+/// Explicit error set used as the return error set of `parseContent`.
+/// `parseContent` and `tryParseElement` call each other, and the latter
+/// has an inferred error set, so one of the two needs a spelled-out set.
+pub const ParseError = error{
+    IllegalCharacter,
+    UnexpectedEof,
+    UnexpectedCharacter,
+    UnclosedValue,
+    UnclosedComment,
+    InvalidName,
+    InvalidEntity,
+    InvalidStandaloneValue,
+    NonMatchingClosingTag,
+    InvalidDocument,
+    OutOfMemory,
+};
+
+/// Parses `source` as a complete XML document.
+///
+/// All nodes live in an arena created on top of `backing_allocator`;
+/// call `Document.deinit` on the result to release them.
+pub fn parse(backing_allocator: Allocator, source: []const u8) !Document {
+    var cursor = ParseContext.init(source);
+    const doc = try parseDocument(&cursor, backing_allocator);
+    return doc;
+}
+
+/// Parses a whole document: optional leading comments, an optional XML
+/// declaration, exactly one root element, and optional trailing
+/// comments. Any other trailing input is `error.InvalidDocument`.
+/// On failure the arena is released before the error is returned.
+fn parseDocument(ctx: *ParseContext, backing_allocator: Allocator) !Document {
+    var doc: Document = .{
+        .arena = ArenaAllocator.init(backing_allocator),
+        .xml_decl = null,
+        // SAFETY: assigned below from the result of `tryParseElement`
+        // before `doc` is returned to the caller. If the parse fails,
+        // we propagate the error and the caller sees an error, not
+        // a half-initialized doc.
+        .root = undefined,
+    };
+    errdefer doc.deinit();
+
+    const arena_alloc = doc.arena.allocator();
+
+    // Prologue: comments, then the optional declaration, then more comments.
+    try trySkipComments(ctx, arena_alloc);
+    doc.xml_decl = try tryParseProlog(ctx, arena_alloc);
+    _ = ctx.eatWs();
+    try trySkipComments(ctx, arena_alloc);
+
+    // The root element is mandatory.
+    const maybe_root = try tryParseElement(ctx, arena_alloc, null);
+    doc.root = maybe_root orelse return error.InvalidDocument;
+
+    // Epilogue: only whitespace and comments may follow the root.
+    _ = ctx.eatWs();
+    try trySkipComments(ctx, arena_alloc);
+    if (ctx.peek()) |_| return error.InvalidDocument;
+
+    return doc;
+}
+
+/// Parses a quoted attribute value (single or double quotes) and
+/// returns a newly allocated copy with predefined entities unescaped.
+/// A missing closing quote is `error.UnclosedValue`.
+fn parseAttrValue(ctx: *ParseContext, alloc: Allocator) ![]const u8 {
+    const quote = try ctx.consume();
+    switch (quote) {
+        '"', '\'' => {},
+        else => return error.UnexpectedCharacter,
+    }
+
+    const begin = ctx.offset;
+
+    // Scan up to and including the matching closing quote.
+    while ((ctx.consume() catch return error.UnclosedValue) != quote) {}
+
+    // The cursor sits just past the closing quote; exclude it.
+    const raw = ctx.source[begin .. ctx.offset - 1];
+    return try dupeAndUnescape(alloc, raw);
+}
+
+/// Parses `= "value"` (whitespace allowed on either side of the `=`)
+/// and returns the unescaped value.
+fn parseEqAttrValue(ctx: *ParseContext, alloc: Allocator) ![]const u8 {
+    _ = ctx.eatWs();
+    try ctx.expect('=');
+    _ = ctx.eatWs();
+
+    const value = try parseAttrValue(ctx, alloc);
+    return value;
+}
+
+/// Consumes a name and returns it as a slice of the source (no copy).
+///
+/// The XML grammar for names is long; this parser simply accepts every
+/// byte up to the next whitespace or markup-significant character.
+/// An empty name is `error.InvalidName`, and nothing is consumed.
+fn parseNameNoDupe(ctx: *ParseContext) ![]const u8 {
+    const begin = ctx.offset;
+
+    while (ctx.peek()) |ch| {
+        const ends_name = switch (ch) {
+            ' ', '\t', '\n', '\r' => true,
+            '&', '"', '\'', '<', '>', '?', '=', '/' => true,
+            else => false,
+        };
+        if (ends_name) break;
+        _ = ctx.consumeNoEof();
+    }
+
+    if (ctx.offset == begin) return error.InvalidName;
+    return ctx.source[begin..ctx.offset];
+}
+
+/// Consumes text up to the next `<` (or end of input) and returns an
+/// unescaped copy, or null when the cursor is already at a `<` or at
+/// end of input.
+fn tryParseCharData(ctx: *ParseContext, alloc: Allocator) !?[]const u8 {
+    const begin = ctx.offset;
+
+    while (ctx.peek()) |ch| {
+        if (ch == '<') break;
+        _ = ctx.consumeNoEof();
+    }
+
+    if (ctx.offset == begin) return null;
+
+    const raw = ctx.source[begin..ctx.offset];
+    return try dupeAndUnescape(alloc, raw);
+}
+
+/// Parses one content item at the cursor, trying character data first,
+/// then a comment, then a nested element. If none applies the input is
+/// malformed and `error.UnexpectedCharacter` is returned.
+fn parseContent(ctx: *ParseContext, alloc: Allocator, parent: ?*Element) ParseError!Content {
+    if (try tryParseCharData(ctx, alloc)) |text| {
+        return .{ .CharData = text };
+    }
+    if (try tryParseComment(ctx, alloc)) |comment_text| {
+        return .{ .Comment = comment_text };
+    }
+    if (try tryParseElement(ctx, alloc, parent)) |child| {
+        return .{ .Element = child };
+    }
+    return error.UnexpectedCharacter;
+}
+
+/// Parses one `name = "value"` attribute. Returns null, consuming
+/// nothing, when no name starts at the cursor; once a name has been
+/// read, a malformed remainder is an error.
+fn tryParseAttr(ctx: *ParseContext, alloc: Allocator) !?*Attribute {
+    const raw_name = parseNameNoDupe(ctx) catch return null;
+    const value = try parseEqAttrValue(ctx, alloc);
+
+    const attr = try alloc.create(Attribute);
+    attr.* = .{
+        // The name is a slice of the source; keep an owned copy.
+        .name = try alloc.dupe(u8, raw_name),
+        .value = value,
+    };
+    return attr;
+}
+
+/// Parses one element at the cursor, including its attributes and,
+/// unless it is self-closing, its body and matching closing tag.
+///
+/// Returns null, with the cursor fully restored, when the input does
+/// not start with `<` followed by a name. After that point malformed
+/// input is an error: `error.UnexpectedEof` if the body is cut short,
+/// `error.NonMatchingClosingTag` if the closing tag differs.
+///
+/// When `parent` is given, the most recent element already appended to
+/// it gets its `next_sibling` pointed at the new element. This happens
+/// for self-closing elements (`<a/>`) as well as for elements with an
+/// explicit closing tag.
+fn tryParseElement(ctx: *ParseContext, alloc: Allocator, parent: ?*Element) !?*Element {
+    // Snapshot the whole cursor so backtracking restores line/column
+    // as well as the offset.
+    const saved = ctx.*;
+    if (!ctx.eat('<')) return null;
+    const tag = parseNameNoDupe(ctx) catch {
+        ctx.* = saved;
+        return null;
+    };
+
+    const element = try alloc.create(Element);
+    element.* = Element.init(try alloc.dupe(u8, tag), alloc);
+
+    // Attributes must each be preceded by whitespace.
+    while (ctx.eatWs()) {
+        const attr = (try tryParseAttr(ctx, alloc)) orelse break;
+        try element.attributes.append(element.allocator, attr);
+    }
+
+    if (ctx.eatStr("/>")) {
+        linkToPreviousSibling(parent, element);
+        return element;
+    }
+
+    try ctx.expect('>');
+
+    while (true) {
+        if (ctx.peek() == null) {
+            return error.UnexpectedEof;
+        } else if (ctx.eatStr("</")) {
+            break;
+        }
+
+        const content = try parseContent(ctx, alloc, element);
+        try element.children.append(element.allocator, content);
+    }
+
+    const closing_tag = try parseNameNoDupe(ctx);
+    if (!std.mem.eql(u8, tag, closing_tag)) {
+        return error.NonMatchingClosingTag;
+    }
+
+    _ = ctx.eatWs();
+    try ctx.expect('>');
+
+    linkToPreviousSibling(parent, element);
+    return element;
+}
+
+/// Points the `next_sibling` of the last element child currently in
+/// `parent` at `element`. `element` itself has not been appended to the
+/// parent yet, so that last element child is its preceding sibling.
+/// Does nothing when there is no parent or no earlier element child.
+fn linkToPreviousSibling(parent: ?*Element, element: *Element) void {
+    const p = parent orelse return;
+
+    var i = p.children.items.len;
+    while (i > 0) {
+        i -= 1;
+        switch (p.children.items[i]) {
+            .Element => |previous| {
+                previous.next_sibling = element;
+                return;
+            },
+            else => {},
+        }
+    }
+}
+
+test "tryParseElement" {
+    // Arguments to expectEqualSlices are given as (expected, actual) so
+    // that failure output labels the two sides correctly.
+    var arena = std.heap.ArenaAllocator.init(testing.allocator);
+    defer arena.deinit();
+    const alloc = arena.allocator();
+
+    // Not an element: `<` is not followed by a name, so the cursor must
+    // be back at the `<`.
+    {
+        var ctx = ParseContext.init("<= a='b'/>");
+        try testing.expectEqual(@as(?*Element, null), try tryParseElement(&ctx, alloc, null));
+        try testing.expectEqual(@as(?u8, '<'), ctx.peek());
+    }
+
+    // Self-closing element with attributes in both quote styles.
+    {
+        var ctx = ParseContext.init("<python size='15' color = \"green\"/>");
+        const elem = try tryParseElement(&ctx, alloc, null);
+        try testing.expectEqualSlices(u8, "python", elem.?.tag);
+
+        const size_attr = elem.?.attributes.items[0];
+        try testing.expectEqualSlices(u8, "size", size_attr.name);
+        try testing.expectEqualSlices(u8, "15", size_attr.value);
+
+        const color_attr = elem.?.attributes.items[1];
+        try testing.expectEqualSlices(u8, "color", color_attr.name);
+        try testing.expectEqualSlices(u8, "green", color_attr.value);
+    }
+
+    // Element with a text body.
+    {
+        var ctx = ParseContext.init("<python>test</python>");
+        const elem = try tryParseElement(&ctx, alloc, null);
+        try testing.expectEqualSlices(u8, "python", elem.?.tag);
+        try testing.expectEqualSlices(u8, "test", elem.?.children.items[0].CharData);
+    }
+
+    // Mixed content keeps document order: text, elements, comment.
+    {
+        var ctx = ParseContext.init("<a>b<c/>d<e/>f<!--g--></a>");
+        const elem = try tryParseElement(&ctx, alloc, null);
+        try testing.expectEqualSlices(u8, "a", elem.?.tag);
+        try testing.expectEqualSlices(u8, "b", elem.?.children.items[0].CharData);
+        try testing.expectEqualSlices(u8, "c", elem.?.children.items[1].Element.tag);
+        try testing.expectEqualSlices(u8, "d", elem.?.children.items[2].CharData);
+        try testing.expectEqualSlices(u8, "e", elem.?.children.items[3].Element.tag);
+        try testing.expectEqualSlices(u8, "f", elem.?.children.items[4].CharData);
+        try testing.expectEqualSlices(u8, "g", elem.?.children.items[5].Comment);
+    }
+}
+
+/// Parses the `<?xml version=... [encoding=...] [standalone=...]?>`
+/// declaration.
+///
+/// Returns null, with the cursor fully restored, when the input does
+/// not start with `<?xml` as a complete name. That includes `<?`
+/// followed by something that is not a name at all. Once `<?xml` has
+/// been recognised, a malformed remainder is an error;
+/// `error.InvalidStandaloneValue` is returned when `standalone` is
+/// neither "yes" nor "no".
+fn tryParseProlog(ctx: *ParseContext, alloc: Allocator) !?*XmlDecl {
+    // Snapshot the whole cursor so backtracking restores line/column
+    // as well as the offset.
+    const saved = ctx.*;
+    const is_xml_decl = blk: {
+        if (!ctx.eatStr("<?")) break :blk false;
+        // A missing name means "not a declaration", not a hard error.
+        const target = parseNameNoDupe(ctx) catch break :blk false;
+        break :blk mem.eql(u8, target, "xml");
+    };
+    if (!is_xml_decl) {
+        ctx.* = saved;
+        return null;
+    }
+
+    const decl = try alloc.create(XmlDecl);
+    decl.encoding = null;
+    decl.standalone = null;
+
+    // Version info is mandatory
+    try ctx.expectWs();
+    try ctx.expectStr("version");
+    decl.version = try parseEqAttrValue(ctx, alloc);
+
+    if (ctx.eatWs()) {
+        // Optional encoding and standalone info
+        var require_ws = false;
+
+        if (ctx.eatStr("encoding")) {
+            decl.encoding = try parseEqAttrValue(ctx, alloc);
+            require_ws = true;
+        }
+
+        // After `encoding`, whitespace must separate it from
+        // `standalone`. Without `encoding`, the whitespace was already
+        // consumed above, so none may remain here.
+        if (require_ws == ctx.eatWs() and ctx.eatStr("standalone")) {
+            const standalone = try parseEqAttrValue(ctx, alloc);
+            if (std.mem.eql(u8, standalone, "yes")) {
+                decl.standalone = true;
+            } else if (std.mem.eql(u8, standalone, "no")) {
+                decl.standalone = false;
+            } else {
+                return error.InvalidStandaloneValue;
+            }
+        }
+
+        _ = ctx.eatWs();
+    }
+
+    try ctx.expectStr("?>");
+    return decl;
+}
+
+test "tryParseProlog" {
+    var arena = std.heap.ArenaAllocator.init(testing.allocator);
+    defer arena.deinit();
+    const alloc = arena.allocator();
+
+    // "xmla" is not "xml": no declaration, cursor back at the `<`.
+    {
+        var ctx = ParseContext.init("<?xmla version='aa'?>");
+        const result = try tryParseProlog(&ctx, alloc);
+        try testing.expectEqual(@as(?*XmlDecl, null), result);
+        try testing.expectEqual(@as(?u8, '<'), ctx.peek());
+    }
+
+    // Version only: the optional fields stay null.
+    {
+        var ctx = ParseContext.init("<?xml version='aa'?>");
+        const decl = (try tryParseProlog(&ctx, alloc)).?;
+        try testing.expectEqualSlices(u8, "aa", decl.version);
+        try testing.expectEqual(@as(?[]const u8, null), decl.encoding);
+        try testing.expectEqual(@as(?bool, null), decl.standalone);
+    }
+
+    // All three fields, with assorted whitespace around the `=` signs.
+    {
+        var ctx = ParseContext.init("<?xml version=\"aa\" encoding = 'bbb' standalone   \t =   'yes'?>");
+        const decl = (try tryParseProlog(&ctx, alloc)).?;
+        try testing.expectEqualSlices(u8, "aa", decl.version);
+        try testing.expectEqualSlices(u8, "bbb", decl.encoding.?);
+        try testing.expectEqual(@as(?bool, true), decl.standalone.?);
+    }
+}
+
+/// Consumes any run of comments at the cursor, together with the
+/// whitespace that follows each one. The comment text is discarded.
+fn trySkipComments(ctx: *ParseContext, alloc: Allocator) !void {
+    while (true) {
+        const comment = try tryParseComment(ctx, alloc);
+        if (comment == null) return;
+        _ = ctx.eatWs();
+    }
+}
+
+/// Parses `<!-- ... -->` and returns a copy of the text between the
+/// delimiters, or null when the cursor is not at `<!--`. Reaching end
+/// of input before `-->` is `error.UnclosedComment`.
+fn tryParseComment(ctx: *ParseContext, alloc: Allocator) !?[]const u8 {
+    const terminator = "-->";
+
+    if (!ctx.eatStr("<!--")) return null;
+    const begin = ctx.offset;
+
+    while (true) {
+        if (ctx.eatStr(terminator)) break;
+        _ = ctx.consume() catch return error.UnclosedComment;
+    }
+
+    // The cursor sits just past the terminator; exclude it.
+    const body = ctx.source[begin .. ctx.offset - terminator.len];
+    return try alloc.dupe(u8, body);
+}
+
+/// Maps one of the five predefined XML entities, written in full
+/// (leading `&` through trailing `;`), to the byte it stands for.
+/// Anything else is `error.InvalidEntity`.
+fn unescapeEntity(text: []const u8) !u8 {
+    // Parallel tables: entity spelling and the byte it decodes to.
+    const spellings = [_][]const u8{ "&lt;", "&gt;", "&amp;", "&apos;", "&quot;" };
+    const decoded = [_]u8{ '<', '>', '&', '\'', '"' };
+
+    for (spellings, decoded) |spelling, byte| {
+        if (std.mem.eql(u8, text, spelling)) return byte;
+    }
+
+    return error.InvalidEntity;
+}
+
+/// Returns a newly allocated copy of `text` with each predefined entity
+/// (`&lt;`, `&gt;`, `&amp;`, `&apos;`, `&quot;`) replaced by the byte
+/// it stands for. The caller owns the result.
+///
+/// Errors: `error.InvalidEntity` for an `&` with no terminating `;` or
+/// an unrecognised entity name; `error.OutOfMemory` on allocation
+/// failure. Nothing stays allocated on any error path.
+fn dupeAndUnescape(alloc: Allocator, text: []const u8) ![]const u8 {
+    // Unescaping never lengthens the text, so `text.len` is an upper bound.
+    const str = try alloc.alloc(u8, text.len);
+    // Release the scratch buffer if an entity turns out to be invalid
+    // or the fallback copy below fails.
+    errdefer alloc.free(str);
+
+    var j: usize = 0;
+    var i: usize = 0;
+    while (i < text.len) : (j += 1) {
+        if (text[i] == '&') {
+            const entity_end = 1 + (mem.indexOfScalarPos(u8, text, i, ';') orelse return error.InvalidEntity);
+            str[j] = try unescapeEntity(text[i..entity_end]);
+            i = entity_end;
+        } else {
+            str[j] = text[i];
+            i += 1;
+        }
+    }
+
+    // Prefer shrinking in place so the returned slice has the exact
+    // length the caller will later free.
+    if (alloc.resize(str, j)) return str[0..j];
+
+    // The allocator could not shrink in place: hand back an exact-size
+    // copy. On failure the errdefer frees `str`; on success it is freed
+    // here exactly once.
+    const exact = try alloc.dupe(u8, str[0..j]);
+    alloc.free(str);
+    return exact;
+}
+
+test "dupeAndUnescape" {
+    // The error cases below allocate from this arena, so any buffer the
+    // function obtained before failing is reclaimed when the arena goes.
+    var arena = std.heap.ArenaAllocator.init(testing.allocator);
+    defer arena.deinit();
+    const alloc = arena.allocator();
+
+    // Success cases use the testing allocator directly and free the
+    // returned slice themselves.
+    const duped = try dupeAndUnescape(testing.allocator, "test");
+    defer testing.allocator.free(duped);
+    try testing.expectEqualSlices(u8, "test", duped);
+    const duped2 = try dupeAndUnescape(testing.allocator, "a&lt;b&amp;c&gt;d&quot;e&apos;f&lt;");
+    defer testing.allocator.free(duped2);
+    try testing.expectEqualSlices(u8, "a<b&c>d\"e'f<", duped2);
+    // Unterminated `&`, unknown entity names, and a missing `;`.
+    try testing.expectError(error.InvalidEntity, dupeAndUnescape(alloc, "python&"));
+    try testing.expectError(error.InvalidEntity, dupeAndUnescape(alloc, "python&&"));
+    try testing.expectError(error.InvalidEntity, dupeAndUnescape(alloc, "python&test;"));
+    try testing.expectError(error.InvalidEntity, dupeAndUnescape(alloc, "python&boa"));
+}
+
+test "Top level comments" {
+    // The document's own arena sits on top of this one, so tearing this
+    // arena down releases everything without calling `doc.deinit()`.
+    var backing = std.heap.ArenaAllocator.init(testing.allocator);
+    defer backing.deinit();
+
+    // Comments appear both before and after the root element.
+    const source = "<?xml version='aa'?><!--comment--><python color='green'/><!--another comment-->";
+    const doc = try parse(backing.allocator(), source);
+
+    try testing.expectEqualSlices(u8, "python", doc.root.tag);
+}
--- a/src/service.zig
+++ b/src/service.zig
@ -1798,12 +1798,16 @@ pub const DataService = struct {
            .splits => "/splits",
            .etf_profile => return false, // not served
            .meta => return false,
-            // New variants wired into the endpoint mapping by
-            // Milestone 1 chunk 3 (DataService methods). For now
-            // they're not yet served; clients fall through to live
-            // provider fetch via getClassification / getEntityFacts /
-            // getEtfMetrics, which don't exist yet.
+            // Endpoint mapping for these will be wired when the
+            // corresponding `getClassification` / `getEntityFacts` /
+            // `getEtfMetrics` service methods land. Until then,
+            // server sync is a no-op for them.
            .classification, .etf_metrics, .entity_facts => return false,
+            // Provider-internal cache files (ticker-map indexes)
+            // are not served — clients fetch them directly from
+            // the SEC. The DataService caches the JSON via
+            // `Store` after fetching; the server has no role.
+            .tickers_funds, .tickers_companies => return false,
        };

        const full_url = std.fmt.allocPrint(self.allocator, "{s}/{s}{s}", .{ server_url, symbol, endpoint }) catch return false;