//! Wikidata SPARQL classification provider.
//!
//! ## What this provider does
//!
//! Given a stock symbol, Wikidata can answer:
//!
//!   * "What kind of entity is this?" — name, industry, sector,
//!     country of incorporation, inception date, instance-of
//!     classification (operating company / mutual fund / ETF / …).
//!   * "Does this match the SEC's CIK?" — Wikidata's P5531 already
//!     stores the 10-digit zero-padded CIK matching SEC's convention.
//!
//! ## Workflow
//!
//! `fetch(symbols)` runs ONE batched SPARQL query that returns
//! per-ticker rows. The query is keyed on the US-listing (NYSE /
//! Nasdaq / NYSE Arca / OTC Markets) of each ticker — without that
//! filter, common US tickers silently resolve to whichever
//! foreign-exchange company happens to share the symbol (`MRK` →
//! Merck KGaA on Frankfurt; `PG` → People's Garment on SET; etc.).
//!
//! The provider is stateless. Caching belongs to the data service,
//! which writes per-symbol `classification.srf` files after this
//! provider returns and reads them back on subsequent calls.
//!
//! ## Glossary
//!
//!   SPARQL     Query language for RDF-shaped data. Wikidata's
//!              primary read API.
//!   P-number   Property identifier in Wikidata (P249 = ticker symbol,
//!              P414 = stock exchange, P31 = instance of, ...).
//!   Q-number   Entity identifier in Wikidata (Q40244 = ETF as a
//!              concept, Q13677 = NYSE the entity, Q312 = Apple Inc.
//!              the entity).
//!   wdt:Pxxx   Truthy/direct property statement — the simple shape.
//!   p:Pxxx     Reified property statement — lets a statement carry
//!              qualifiers (e.g. ticker symbol AS A QUALIFIER on the
//!              stock-exchange statement, rather than as a direct
//!              property of the company).
//!   ps:Pxxx    "Statement value" predicate — within a reified
//!              statement, points to the statement's main value.
//!   pq:Pxxx    "Qualifier" predicate — within a reified statement,
//!              points to a qualifier on that statement.
//!
//! Why the reified statement matters here: Wikidata stores tickers
//! as P249 qualifiers on a P414 (stock exchange) statement, NOT as
//! a direct `wdt:P249` property. Querying naively returns zero rows
//! for nearly every US-listed equity.

const std = @import("std");
const http = @import("../net/http.zig");
const fmt = @import("../format.zig");

const sparql_endpoint = "https://query.wikidata.org/sparql";

/// Per-symbol classification record produced by parsing a Wikidata
/// SPARQL response. Fields are nullable when Wikidata has no value
/// for that property; the `source` field always emits per the
/// project's source-pure invariant.
///
/// Every field marked "owned" is freed by `deinit`. `source` is NOT
/// freed by `deinit`; `parse` sets it to a string literal.
pub const ClassificationRecord = struct {
    symbol: []const u8, // owned
    name: ?[]const u8 = null, // owned
    sector: ?[]const u8 = null, // owned
    industry: ?[]const u8 = null, // owned
    /// ISO-3166 alpha-2 country code (e.g. "US", "GB", "DE").
    country: ?[]const u8 = null, // owned
    asset_class: ?[]const u8 = null, // owned
    is_etf: bool = false,
    /// YYYY-MM-DD; trimmed from Wikidata's ISO-8601 date.
    inception_date: ?[]const u8 = null, // owned
    /// Wikidata's P5531 — the SEC CIK as a digit string. Wikidata
    /// already zero-pads to 10 digits, matching the project-wide
    /// CIK normalization convention.
    cik: ?[]const u8 = null, // owned
    /// YYYY-MM-DD when this provider ran, NOT when Wikidata last
    /// updated the underlying entity.
    as_of: []const u8, // owned
    source: []const u8, // no default — provenance always emitted

    /// Free every owned string with `allocator`, which must be the
    /// allocator the strings were created with.
    pub fn deinit(self: *ClassificationRecord, allocator: std.mem.Allocator) void {
        allocator.free(self.symbol);
        if (self.name) |s| allocator.free(s);
        if (self.sector) |s| allocator.free(s);
        if (self.industry) |s| allocator.free(s);
        if (self.country) |s| allocator.free(s);
        if (self.asset_class) |s| allocator.free(s);
        if (self.inception_date) |s| allocator.free(s);
        if (self.cik) |s| allocator.free(s);
        allocator.free(self.as_of);
    }
};

/// Geo-bucket constants used by the country → geo lookup. Kept as
/// named constants (rather than inline string literals in the map)
/// so callers can reference them without typo risk and the
/// taxonomy is tweakable in one place.
pub const geo = struct {
    pub const us = "US";
    pub const developed = "International Developed";
    pub const emerging = "Emerging Markets";
    pub const unknown = "Unknown";
};

/// Wikidata Q-IDs we test against `instance of` (P31) to classify
/// fund-shaped securities. Curated, not exhaustive.
const etf_q_ids = [_][]const u8{
    "Q40244", // exchange-traded fund
    "Q4118901", // exchange-traded bond fund
    "Q104638128", // ETF tracking specific index
};

const mutual_fund_q_ids = [_][]const u8{
    "Q1752230", // mutual fund
    "Q11644608", // open-end fund
};

/// US stock exchanges accepted by the SPARQL exchange filter.
/// Without this filter, ticker collisions across global exchanges
/// silently return the wrong company.
///
/// Q-IDs:
///   Q13677    New York Stock Exchange (NYSE)
///   Q82059    Nasdaq
///   Q4527260  NYSE Arca
///   Q1666011  OTC Markets Group / Pink Sheets
const us_exchanges = [_][]const u8{
    "wd:Q13677",
    "wd:Q82059",
    "wd:Q4527260",
    "wd:Q1666011",
};

/// Country-code-to-geo-bucket lookup. Wikidata returns ISO-3166
/// alpha-2 codes via P17 → P297; we map them to the geo taxonomy
/// (`geo.us` / `geo.developed` / `geo.emerging` / `geo.unknown`).
///
/// MSCI conventions used as the developed/emerging split. Taiwan
/// and South Korea are MSCI-emerging despite FTSE classifying them
/// developed. Israel is MSCI-developed (upgraded 2010). Canada is
/// folded into International Developed (some users prefer separate
/// Canada bucket; override in `metadata.srf` if so).
const country_to_geo = std.StaticStringMap([]const u8).initComptime(.{
    // United States
    .{ "US", geo.us },
    // Alpha-3 fallback for entries that use the longer form.
    .{ "USA", geo.us },

    // International Developed — Europe ex-CIS
    .{ "GB", geo.developed },
    .{ "DE", geo.developed },
    .{ "FR", geo.developed },
    .{ "NL", geo.developed },
    .{ "CH", geo.developed },
    .{ "SE", geo.developed },
    .{ "DK", geo.developed },
    .{ "NO", geo.developed },
    .{ "FI", geo.developed },
    .{ "IT", geo.developed },
    .{ "ES", geo.developed },
    .{ "BE", geo.developed },
    .{ "AT", geo.developed },
    .{ "IE", geo.developed },
    .{ "LU", geo.developed },
    .{ "PT", geo.developed },
    .{ "GR", geo.developed },
    .{ "IS", geo.developed },

    // International Developed — Asia-Pacific + Israel + Canada
    .{ "JP", geo.developed },
    .{ "AU", geo.developed },
    .{ "NZ", geo.developed },
    .{ "SG", geo.developed },
    .{ "HK", geo.developed },
    .{ "IL", geo.developed },
    .{ "CA", geo.developed },

    // Emerging Markets (MSCI)
    .{ "CN", geo.emerging },
    .{ "TW", geo.emerging },
    .{ "KR", geo.emerging },
    .{ "IN", geo.emerging },
    .{ "BR", geo.emerging },
    .{ "MX", geo.emerging },
    .{ "RU", geo.emerging },
    .{ "TR", geo.emerging },
    .{ "ZA", geo.emerging },
    .{ "TH", geo.emerging },
    .{ "MY", geo.emerging },
    .{ "ID", geo.emerging },
    .{ "PH", geo.emerging },
    .{ "VN", geo.emerging },
    .{ "AR", geo.emerging },
    .{ "CL", geo.emerging },
    .{ "CO", geo.emerging },
    .{ "PE", geo.emerging },
    .{ "EG", geo.emerging },
});

/// Map an ISO-3166 alpha-2 country code to one of the geo buckets.
/// Null/empty input or an unknown code returns `geo.unknown` so the
/// user can override in `metadata.srf`. The lookup is an exact,
/// case-sensitive match against the keys of `country_to_geo`.
pub fn geoFor(iso2: ?[]const u8) []const u8 {
    const code = iso2 orelse return geo.unknown;
    if (code.len == 0) return geo.unknown;
    return country_to_geo.get(code) orelse geo.unknown;
}

// ── Wikidata provider state (file-as-struct) ─────────────────────
//
// Callers do `const wikidata = @import("providers/Wikidata.zig");`
// followed by `var wd = wikidata.init(...);` and `wd.fetch(...)`.

client: http.Client,
allocator: std.mem.Allocator,
io: std.Io,
/// Contact email for User-Agent / From headers, sourced from
/// `Config.user_email`. Required; callers must surface a clear
/// missing-config error before constructing this provider.
user_email: []const u8,

const Wikidata = @This();

/// Construct a provider. `user_email` is stored as given (not
/// copied), so it must outlive the provider.
pub fn init(
    io: std.Io,
    allocator: std.mem.Allocator,
    user_email: []const u8,
) Wikidata {
    return .{
        .client = http.Client.init(io, allocator),
        .allocator = allocator,
        .io = io,
        .user_email = user_email,
    };
}

/// Release the HTTP client. `user_email` is not freed here.
pub fn deinit(self: *Wikidata) void {
    self.client.deinit();
}

/// Fetch and parse Wikidata classifications for `symbols`.
/// Runs a single batched SPARQL query and parses the response.
/// Caller owns the returned slice and each record; both are
/// allocated with `result_allocator`. Scratch buffers (query text,
/// response body) use the provider's own allocator and are freed
/// before returning. An empty `symbols` slice returns an empty
/// result without issuing a request.
pub fn fetch(
    self: *Wikidata,
    result_allocator: std.mem.Allocator,
    symbols: []const []const u8,
) ![]ClassificationRecord {
    if (symbols.len == 0) return &.{};

    const query = try buildQuery(self.allocator, symbols);
    defer self.allocator.free(query);

    const json = try self.postSparql(query);
    defer self.allocator.free(json);

    return parse(self.io, result_allocator, json, symbols);
}

/// POST a SPARQL query. Sets the User-Agent + From headers from
/// `user_email` for politeness; Wikidata explicitly recommends
/// descriptive User-Agent strings.
///
/// Returns a copy of the response body allocated with
/// `self.allocator`; caller frees. Fails with
/// `error.UserEmailTooLong` when the formatted User-Agent does not
/// fit the 256-byte stack buffer.
fn postSparql(self: *Wikidata, query: []const u8) ![]u8 {
    var form_buf: std.Io.Writer.Allocating = .init(self.allocator);
    defer form_buf.deinit();
    try form_buf.writer.writeAll("query=");
    // `Component.formatEscaped` percent-encodes everything outside
    // RFC 3986's unreserved set — exactly the contract for the
    // `application/x-www-form-urlencoded` body we're building.
    try (std.Uri.Component{ .raw = query }).formatEscaped(&form_buf.writer);

    var ua_buf: [256]u8 = undefined;
    const ua = std.fmt.bufPrint(&ua_buf, "zfin/0.1 ({s})", .{self.user_email}) catch
        return error.UserEmailTooLong;

    const headers = [_]std.http.Header{
        .{ .name = "User-Agent", .value = ua },
        .{ .name = "Accept", .value = "application/sparql-results+json" },
        .{ .name = "Content-Type", .value = "application/x-www-form-urlencoded" },
        .{ .name = "From", .value = self.user_email },
    };

    var resp = try self.client.request(.POST, sparql_endpoint, form_buf.written(), &headers);
    defer resp.deinit();

    // Copy the body out before `resp.deinit()` runs.
    return self.allocator.dupe(u8, resp.body);
}

/// Write `raw` as a double-quoted SPARQL string literal, escaping
/// the characters the SPARQL grammar forbids inside `"..."` (double
/// quote, backslash, CR, LF) plus tab. Without this, a symbol that
/// contains a quote would terminate the literal early and let the
/// rest of the symbol be parsed as query text.
fn writeSparqlLiteral(w: *std.Io.Writer, raw: []const u8) !void {
    try w.writeByte('"');
    for (raw) |c| {
        switch (c) {
            '"' => try w.writeAll("\\\""),
            '\\' => try w.writeAll("\\\\"),
            '\n' => try w.writeAll("\\n"),
            '\r' => try w.writeAll("\\r"),
            '\t' => try w.writeAll("\\t"),
            else => try w.writeByte(c),
        }
    }
    try w.writeByte('"');
}

/// Build the batched SPARQL query for a slice of ticker symbols.
/// Caller owns the returned bytes. Symbols interpolated via
/// `VALUES ?ticker { "AAPL" "MSFT" ... }`, each one escaped as a
/// SPARQL string literal.
///
/// Wikidata's ticker storage is non-obvious: tickers are stored as
/// `P249` qualifiers on a `P414` (stock exchange) statement. Naive
/// `?security wdt:P249 ?ticker` returns zero rows for nearly every
/// US-listed equity. The query reaches them via:
///
///     ?security p:P414 ?stmt .
///     ?stmt ps:P414 ?exchange .
///     ?stmt pq:P249 ?ticker .
///
/// `?exchange` is filtered to a small set of US exchanges to avoid
/// ticker collisions with foreign listings.
fn buildQuery(allocator: std.mem.Allocator, symbols: []const []const u8) ![]u8 {
    var aw: std.Io.Writer.Allocating = .init(allocator);
    defer aw.deinit();

    try aw.writer.writeAll(
        \\SELECT ?ticker ?security ?securityLabel ?industryLabel ?countryCode ?inception ?cik ?instance WHERE {
        \\  VALUES ?ticker {
    );
    for (symbols) |s| {
        try aw.writer.writeByte(' ');
        try writeSparqlLiteral(&aw.writer, s);
    }
    try aw.writer.writeAll(" }\n");

    try aw.writer.writeAll("  VALUES ?exchange {");
    for (us_exchanges) |x| {
        try aw.writer.print(" {s}", .{x});
    }
    try aw.writer.writeAll(" }\n");

    try aw.writer.writeAll(
        \\  ?security p:P414 ?exchstmt .
        \\  ?exchstmt ps:P414 ?exchange .
        \\  ?exchstmt pq:P249 ?ticker .
        \\  OPTIONAL { ?security wdt:P452 ?industry . }
        \\  OPTIONAL { ?security wdt:P17 ?country . ?country wdt:P297 ?countryCode . }
        \\  OPTIONAL { ?security wdt:P571 ?inception . }
        \\  OPTIONAL { ?security wdt:P5531 ?cik . }
        \\  OPTIONAL { ?security wdt:P31 ?instance . }
        \\  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
        \\}
    );

    return aw.toOwnedSlice();
}

/// True when `needle` is byte-equal to one of `candidates`.
fn isOneOf(needle: []const u8, candidates: []const []const u8) bool {
    for (candidates) |candidate| {
        if (std.mem.eql(u8, needle, candidate)) return true;
    }
    return false;
}

/// True when `ticker` equals one of the requested symbols, ignoring
/// ASCII case.
fn wasRequested(ticker: []const u8, expected_symbols: []const []const u8) bool {
    for (expected_symbols) |s| {
        if (std.ascii.eqlIgnoreCase(s, ticker)) return true;
    }
    return false;
}

/// True when the first ten bytes of `s` have the shape `DDDD-DD-DD`.
/// Only the shape is checked, not whether the date is a real
/// calendar date.
fn hasIsoDatePrefix(s: []const u8) bool {
    if (s.len < 10) return false;
    for (s[0..10], 0..) |c, i| {
        const ok = if (i == 4 or i == 7) c == '-' else std.ascii.isDigit(c);
        if (!ok) return false;
    }
    return true;
}

/// Parse the SPARQL JSON response into `ClassificationRecord` values.
/// Multiple bindings for the same ticker (e.g. multiple `instance of`
/// values) get merged into one record — first-non-null wins.
///
/// Returns an empty slice when the JSON is malformed or lacks the
/// `results.bindings` array; allocation failure is propagated.
/// Bindings whose ticker was not requested are skipped. The order
/// of the returned records follows hash-map iteration, not the
/// order of `expected_symbols`. Caller owns the slice and every
/// record in it, all allocated with `allocator`.
fn parse(
    io: std.Io,
    allocator: std.mem.Allocator,
    json_bytes: []const u8,
    expected_symbols: []const []const u8,
) ![]ClassificationRecord {
    const today = fmt.todayDate(io);
    var as_of_buf: [10]u8 = undefined;
    const as_of = try std.fmt.bufPrint(&as_of_buf, "{f}", .{today});

    // Malformed JSON is treated as "no data", but running out of
    // memory is not a data problem and must reach the caller.
    const parsed = std.json.parseFromSlice(std.json.Value, allocator, json_bytes, .{}) catch |err| switch (err) {
        error.OutOfMemory => return err,
        else => return &.{},
    };
    defer parsed.deinit();

    const root = switch (parsed.value) {
        .object => |o| o,
        else => return &.{},
    };
    const results = switch (root.get("results") orelse return &.{}) {
        .object => |o| o,
        else => return &.{},
    };
    const bindings = switch (results.get("bindings") orelse return &.{}) {
        .array => |a| a.items,
        else => return &.{},
    };

    // Map symbol → record; merge multiple bindings. Each key aliases
    // its record's `symbol`, so freeing the records frees the keys
    // too and no exit path has to free keys separately.
    var by_symbol: std.StringHashMap(ClassificationRecord) = .init(allocator);
    defer {
        var it = by_symbol.valueIterator();
        while (it.next()) |r| r.deinit(allocator);
        by_symbol.deinit();
    }

    for (bindings) |b| {
        const obj = switch (b) {
            .object => |o| o,
            else => continue,
        };

        const ticker = sparqlValue(obj, "ticker") orelse continue;

        // Verify ticker is one we asked for. Wikidata can return
        // surprising matches (foreign exchanges); skip those.
        if (!wasRequested(ticker, expected_symbols)) continue;

        // Build the record completely before inserting it, so the
        // map never holds a half-initialized value that the cleanup
        // `defer` above would try to free.
        const rec = by_symbol.getPtr(ticker) orelse inserted: {
            const symbol = try allocator.dupe(u8, ticker);
            errdefer allocator.free(symbol);
            const as_of_owned = try allocator.dupe(u8, as_of);
            errdefer allocator.free(as_of_owned);
            try by_symbol.putNoClobber(symbol, .{
                .symbol = symbol,
                .as_of = as_of_owned,
                .source = "wikidata",
            });
            break :inserted by_symbol.getPtr(symbol).?;
        };

        if (rec.name == null) {
            if (sparqlValue(obj, "securityLabel")) |label| {
                rec.name = try allocator.dupe(u8, label);
            }
        }
        if (rec.industry == null) {
            if (sparqlValue(obj, "industryLabel")) |ind| {
                // The query has no separate sector variable, so the
                // industry label is copied into both fields.
                rec.industry = try allocator.dupe(u8, ind);
                rec.sector = try allocator.dupe(u8, ind);
            }
        }
        if (rec.country == null) {
            if (sparqlValue(obj, "countryCode")) |c| {
                rec.country = try allocator.dupe(u8, c);
            }
        }
        if (rec.inception_date == null) {
            if (sparqlValue(obj, "inception")) |d| {
                // Only accept values that start with YYYY-MM-DD.
                // Anything else (for example an IRI standing in for
                // an "unknown value" statement) would otherwise be
                // truncated into a bogus ten-character date.
                if (hasIsoDatePrefix(d)) {
                    rec.inception_date = try allocator.dupe(u8, d[0..10]);
                }
            }
        }
        if (rec.cik == null) {
            if (sparqlValue(obj, "cik")) |c| {
                rec.cik = try allocator.dupe(u8, c);
            }
        }
        if (sparqlValue(obj, "instance")) |inst_iri| {
            // The "instance" value is a Q-ID URI like
            // "http://www.wikidata.org/entity/Q40244". Extract the
            // Q-ID suffix and test against our known sets.
            const last_slash = std.mem.lastIndexOfScalar(u8, inst_iri, '/');
            const q_id = if (last_slash) |i| inst_iri[i + 1 ..] else inst_iri;

            const fund_label: ?[]const u8 = if (isOneOf(q_id, &etf_q_ids))
                "ETF (uncategorized)"
            else if (isOneOf(q_id, &mutual_fund_q_ids))
                "Mutual Fund (uncategorized)"
            else
                null;

            if (fund_label) |label| {
                // NOTE(review): mutual funds also set `is_etf`. This
                // mirrors the existing behavior; confirm whether
                // callers rely on it before narrowing it to ETFs.
                rec.is_etf = true;
                if (rec.asset_class == null) {
                    rec.asset_class = try allocator.dupe(u8, label);
                }
            }
        }
    }

    // Drain map into owned slice. Caller takes ownership; our defer
    // above calls deinit on values, so clear the map before returning
    // to avoid double-free. If this allocation fails the map is still
    // full and the defer releases every record.
    const out = try allocator.alloc(ClassificationRecord, by_symbol.count());
    var idx: usize = 0;
    var value_it = by_symbol.valueIterator();
    while (value_it.next()) |r| {
        out[idx] = r.*;
        idx += 1;
    }
    by_symbol.clearRetainingCapacity();
    return out;
}

/// Pull the `.value` string out of a SPARQL JSON binding object's
/// named field. Returns null if absent or non-string. The returned
/// slice is the one stored in the JSON tree, not a copy.
fn sparqlValue(obj: std.json.ObjectMap, field: []const u8) ?[]const u8 {
    const slot = obj.get(field) orelse return null;
    const slot_obj = switch (slot) {
        .object => |o| o,
        else => return null,
    };
    const val = slot_obj.get("value") orelse return null;
    return switch (val) {
        .string => |s| s,
        else => null,
    };
}

// ── Tests ────────────────────────────────────────────────────────

test "buildQuery includes all symbols and required SELECT vars" {
    const allocator = std.testing.allocator;
    const syms = [_][]const u8{ "AAPL", "VTI" };
    const q = try buildQuery(allocator, &syms);
    defer allocator.free(q);

    try std.testing.expect(std.mem.indexOf(u8, q, "\"AAPL\"") != null);
    try std.testing.expect(std.mem.indexOf(u8, q, "\"VTI\"") != null);
    try std.testing.expect(std.mem.indexOf(u8, q, "p:P414") != null);
    try std.testing.expect(std.mem.indexOf(u8, q, "pq:P249") != null);
    try std.testing.expect(std.mem.indexOf(u8, q, "wdt:P452") != null);
    try std.testing.expect(std.mem.indexOf(u8, q, "wdt:P17") != null);
    // US-exchange filter must be present — without it, US tickers
    // collide with foreign exchanges (MRK→Merck KGaA, PG→People's
    // Garment, etc.). See `us_exchanges` doc-block.
    try std.testing.expect(std.mem.indexOf(u8, q, "wd:Q13677") != null); // NYSE
    try std.testing.expect(std.mem.indexOf(u8, q, "wd:Q82059") != null); // Nasdaq
    try std.testing.expect(std.mem.indexOf(u8, q, "ps:P414 ?exchange") != null);
}

test "buildQuery escapes quotes and backslashes in symbols" {
    const allocator = std.testing.allocator;
    const syms = [_][]const u8{ "A\"B", "C\\D" };
    const q = try buildQuery(allocator, &syms);
    defer allocator.free(q);

    try std.testing.expect(std.mem.indexOf(u8, q, "\"A\\\"B\"") != null);
    try std.testing.expect(std.mem.indexOf(u8, q, "\"C\\\\D\"") != null);
}

test "parse: AAPL fixture round-trips name + industry + country" {
    const fixture =
        \\{
        \\  "head": {"vars": ["ticker", "security", "securityLabel", "industryLabel", "countryCode", "inception", "cik", "instance"]},
        \\  "results": {
        \\    "bindings": [
        \\      {
        \\        "ticker": {"type": "literal", "value": "AAPL"},
        \\        "security": {"type": "uri", "value": "http://www.wikidata.org/entity/Q312"},
        \\        "securityLabel": {"type": "literal", "value": "Apple Inc."},
        \\        "industryLabel": {"type": "literal", "value": "consumer electronics"},
        \\        "countryCode": {"type": "literal", "value": "US"},
        \\        "instance": {"type": "uri", "value": "http://www.wikidata.org/entity/Q4830453"}
        \\      }
        \\    ]
        \\  }
        \\}
    ;
    const allocator = std.testing.allocator;
    const expected = [_][]const u8{"AAPL"};
    const recs = try parse(std.testing.io, allocator, fixture, &expected);
    defer {
        for (recs) |*r| r.deinit(allocator);
        allocator.free(recs);
    }

    try std.testing.expectEqual(@as(usize, 1), recs.len);
    try std.testing.expectEqualStrings("AAPL", recs[0].symbol);
    try std.testing.expectEqualStrings("Apple Inc.", recs[0].name.?);
    try std.testing.expectEqualStrings("consumer electronics", recs[0].industry.?);
    try std.testing.expectEqualStrings("consumer electronics", recs[0].sector.?);
    try std.testing.expectEqualStrings("US", recs[0].country.?);
    try std.testing.expect(!recs[0].is_etf);
}

test "parse: ETF fixture sets is_etf=true and asset_class" {
    const fixture =
        \\{
        \\  "head": {"vars": ["ticker", "security", "securityLabel", "instance"]},
        \\  "results": {
        \\    "bindings": [
        \\      {
        \\        "ticker": {"type": "literal", "value": "VTI"},
        \\        "security": {"type": "uri", "value": "http://www.wikidata.org/entity/Q1809462"},
        \\        "securityLabel": {"type": "literal", "value": "Vanguard Total Stock Market ETF"},
        \\        "instance": {"type": "uri", "value": "http://www.wikidata.org/entity/Q40244"}
        \\      }
        \\    ]
        \\  }
        \\}
    ;
    const allocator = std.testing.allocator;
    const expected = [_][]const u8{"VTI"};
    const recs = try parse(std.testing.io, allocator, fixture, &expected);
    defer {
        for (recs) |*r| r.deinit(allocator);
        allocator.free(recs);
    }

    try std.testing.expectEqual(@as(usize, 1), recs.len);
    try std.testing.expect(recs[0].is_etf);
    try std.testing.expectEqualStrings("ETF (uncategorized)", recs[0].asset_class.?);
}

test "parse: inception keeps ISO dates and rejects non-date values" {
    const fixture =
        \\{
        \\  "results": {
        \\    "bindings": [
        \\      {
        \\        "ticker": {"type": "literal", "value": "AAPL"},
        \\        "inception": {"type": "uri", "value": "http://www.wikidata.org/.well-known/genid/abc"}
        \\      },
        \\      {
        \\        "ticker": {"type": "literal", "value": "AAPL"},
        \\        "inception": {"type": "literal", "value": "1976-04-01T00:00:00Z"}
        \\      }
        \\    ]
        \\  }
        \\}
    ;
    const allocator = std.testing.allocator;
    const expected = [_][]const u8{"AAPL"};
    const recs = try parse(std.testing.io, allocator, fixture, &expected);
    defer {
        for (recs) |*r| r.deinit(allocator);
        allocator.free(recs);
    }

    try std.testing.expectEqual(@as(usize, 1), recs.len);
    try std.testing.expectEqualStrings("1976-04-01", recs[0].inception_date.?);
}

test "parse: bindings for symbols not requested are dropped" {
    const fixture =
        \\{
        \\  "head": {"vars": ["ticker", "security", "securityLabel"]},
        \\  "results": {
        \\    "bindings": [
        \\      {"ticker": {"type": "literal", "value": "WRONG"},
        \\       "security": {"type": "uri", "value": "http://example/Q1"},
        \\       "securityLabel": {"type": "literal", "value": "Wrong Co"}}
        \\    ]
        \\  }
        \\}
    ;
    const allocator = std.testing.allocator;
    const expected = [_][]const u8{"AAPL"};
    const recs = try parse(std.testing.io, allocator, fixture, &expected);
    defer allocator.free(recs);
    try std.testing.expectEqual(@as(usize, 0), recs.len);
}

test "geoFor maps known ISO-3166 codes to bucket" {
    try std.testing.expectEqualStrings(geo.us, geoFor("US"));
    try std.testing.expectEqualStrings(geo.us, geoFor("USA"));
    try std.testing.expectEqualStrings(geo.developed, geoFor("GB"));
    try std.testing.expectEqualStrings(geo.developed, geoFor("DE"));
    try std.testing.expectEqualStrings(geo.developed, geoFor("CA"));
    try std.testing.expectEqualStrings(geo.developed, geoFor("IL"));
    try std.testing.expectEqualStrings(geo.emerging, geoFor("CN"));
    try std.testing.expectEqualStrings(geo.emerging, geoFor("TW"));
    try std.testing.expectEqualStrings(geo.emerging, geoFor("KR"));
}

test "geoFor returns Unknown for null/empty/unmapped" {
    try std.testing.expectEqualStrings(geo.unknown, geoFor(null));
    try std.testing.expectEqualStrings(geo.unknown, geoFor(""));
    try std.testing.expectEqualStrings(geo.unknown, geoFor("ZZ")); // unassigned ISO-2
    try std.testing.expectEqualStrings(geo.unknown, geoFor("XX"));
}