add Edgar/Wikidata providers that will replace AlphaVantage
This commit is contained in:
parent
dfd64bf511
commit
cc2087fd07
9 changed files with 3223 additions and 7 deletions
14
README.md
14
README.md
|
|
@ -922,6 +922,20 @@ zig build run -- <args> # build and run
|
|||
|
||||
The compiled binary is at `zig-out/bin/zfin`.
|
||||
|
||||
## Vendored code
|
||||
|
||||
A small amount of third-party source is vendored directly into the
|
||||
tree (rather than added as a Zig package dependency) where the
|
||||
upstream is small, stable, and not packaged for `build.zig.zon`:
|
||||
|
||||
| File | Source | Purpose |
|
||||
|-------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------|
|
||||
| `src/providers/xml.zig` | [Snektron/vulkan-zig](https://github.com/Snektron/vulkan-zig/blob/797ae8af88e84753af9640266de61a985b76b580/generator/xml.zig), via [aws-zig](https://github.com/elerch/aws-sdk-for-zig) | XML DOM parser used by the EDGAR provider for NPORT-P primary documents. |
|
||||
|
||||
Each vendored file carries a `// VENDORED - see README.md` header
|
||||
identifying its upstream source. When updating, copy the new
|
||||
upstream verbatim and re-add the header.
|
||||
|
||||
## License
|
||||
|
||||
MIT
|
||||
|
|
|
|||
|
|
@ -13,8 +13,8 @@
|
|||
.hash = "z2d-0.11.0-j5P_HtLzDwBGyQt49DrT0v4BuVqI_SRs6CXsuj7eBVhR",
|
||||
},
|
||||
.srf = .{
|
||||
.url = "git+https://git.lerch.org/lobo/srf.git?ref=master#512eab0db082f1679af4de77b1f1713409766fcf",
|
||||
.hash = "srf-0.0.0-qZj57-7CAQBdAFgdiSB2bE5Socq8QNId8PFzynVQbSUN",
|
||||
.url = "git+https://git.lerch.org/lobo/srf#12b755660e96ed65c645975110214fcc9c66ca4d",
|
||||
.hash = "srf-0.0.0-qZj5743KAQAykeIHzFJdRDwgAA-Yy1RLaj0Lw4W5Rphx",
|
||||
},
|
||||
},
|
||||
.paths = .{
|
||||
|
|
|
|||
|
|
@ -38,6 +38,11 @@ fmp_key: ?[]const u8 = null,
|
|||
alphavantage_key: ?[]const u8 = null,
|
||||
tiingo_key: ?[]const u8 = null,
|
||||
openfigi_key: ?[]const u8 = null,
|
||||
/// User contact email used as the User-Agent / From header for
|
||||
/// open-data providers that require politeness identification
|
||||
/// (Wikidata SPARQL, EDGAR). No API-key authentication semantics —
|
||||
/// just identifies the operator. Sourced from `ZFIN_USER_EMAIL`.
|
||||
user_email: ?[]const u8 = null,
|
||||
/// URL of a zfin-server instance for lazy cache sync (e.g. "https://zfin.lerch.org")
|
||||
server_url: ?[]const u8 = null,
|
||||
cache_dir: []const u8,
|
||||
|
|
@ -92,6 +97,7 @@ pub fn fromEnv(io: std.Io, allocator: std.mem.Allocator, environ_map: *const std
|
|||
self.alphavantage_key = self.resolve("ALPHAVANTAGE_API_KEY");
|
||||
self.tiingo_key = self.resolve("TIINGO_API_KEY");
|
||||
self.openfigi_key = self.resolve("OPENFIGI_API_KEY");
|
||||
self.user_email = self.resolve("ZFIN_USER_EMAIL");
|
||||
self.server_url = self.resolve("ZFIN_SERVER");
|
||||
|
||||
const env_cache = self.resolve("ZFIN_CACHE_DIR");
|
||||
|
|
|
|||
29
src/cache/store.zig
vendored
29
src/cache/store.zig
vendored
|
|
@ -63,6 +63,15 @@ pub const Ttl = struct {
|
|||
/// Refreshes on quarterly filing cadence; 30-day TTL gives a
|
||||
/// fortnightly margin around each fiscal-quarter boundary.
|
||||
pub const entity_facts: i64 = 30 * s_per_day;
|
||||
|
||||
/// EDGAR ticker-map indexes (`company_tickers.json` and the MF
|
||||
/// equivalent). SEC updates these daily upstream, but the
|
||||
/// ticker→CIK mapping is extremely stable (changes are rare
|
||||
/// rename events). 30-day TTL with jitter keeps the load
|
||||
/// reasonable while still picking up new listings within a
|
||||
/// month.
|
||||
pub const tickers_funds: i64 = 30 * s_per_day;
|
||||
pub const tickers_companies: i64 = 30 * s_per_day;
|
||||
};
|
||||
|
||||
/// Cache TTL specification with optional per-key expiration jitter.
|
||||
|
|
@ -175,6 +184,15 @@ pub const DataType = enum {
|
|||
/// symbol-keyed, so a single dual-class issuer (BRK.A / BRK.B)
|
||||
/// has one shared facts file.
|
||||
entity_facts,
|
||||
/// EDGAR's `company_tickers_mf.json` index, cached at
|
||||
/// `<cache_dir>/_edgar/tickers_funds.srf`. Single-record file
|
||||
/// (one MutualFundTickerMapBlob) under a synthetic `_edgar` key.
|
||||
/// Updated daily upstream; refreshes monthly with jitter.
|
||||
tickers_funds,
|
||||
/// EDGAR's `company_tickers.json` index, cached at
|
||||
/// `<cache_dir>/_edgar/tickers_companies.srf`. Same shape as
|
||||
/// `tickers_funds`.
|
||||
tickers_companies,
|
||||
|
||||
pub fn fileName(self: DataType) []const u8 {
|
||||
return switch (self) {
|
||||
|
|
@ -189,6 +207,8 @@ pub const DataType = enum {
|
|||
.classification => "classification.srf",
|
||||
.etf_metrics => "etf_metrics.srf",
|
||||
.entity_facts => "entity_facts.srf",
|
||||
.tickers_funds => "tickers_funds.srf",
|
||||
.tickers_companies => "tickers_companies.srf",
|
||||
};
|
||||
}
|
||||
|
||||
|
|
@ -202,6 +222,8 @@ pub const DataType = enum {
|
|||
.classification => Ttl.classification,
|
||||
.etf_metrics => Ttl.etf_metrics,
|
||||
.entity_facts => Ttl.entity_facts,
|
||||
.tickers_funds => Ttl.tickers_funds,
|
||||
.tickers_companies => Ttl.tickers_companies,
|
||||
.candles_daily, .candles_meta, .meta => 0,
|
||||
};
|
||||
}
|
||||
|
|
@ -2331,6 +2353,9 @@ test "TTL constants are reasonable" {
|
|||
try std.testing.expectEqual(@as(i64, 90 * std.time.s_per_day), Ttl.classification);
|
||||
try std.testing.expectEqual(@as(i64, 90 * std.time.s_per_day), Ttl.etf_metrics);
|
||||
try std.testing.expectEqual(@as(i64, 30 * std.time.s_per_day), Ttl.entity_facts);
|
||||
// EDGAR ticker-map indexes refresh monthly with jitter.
|
||||
try std.testing.expectEqual(@as(i64, 30 * std.time.s_per_day), Ttl.tickers_funds);
|
||||
try std.testing.expectEqual(@as(i64, 30 * std.time.s_per_day), Ttl.tickers_companies);
|
||||
}
|
||||
|
||||
test "DataType.ttl returns correct values" {
|
||||
|
|
@ -2342,6 +2367,8 @@ test "DataType.ttl returns correct values" {
|
|||
try std.testing.expectEqual(Ttl.classification, DataType.classification.ttl());
|
||||
try std.testing.expectEqual(Ttl.etf_metrics, DataType.etf_metrics.ttl());
|
||||
try std.testing.expectEqual(Ttl.entity_facts, DataType.entity_facts.ttl());
|
||||
try std.testing.expectEqual(Ttl.tickers_funds, DataType.tickers_funds.ttl());
|
||||
try std.testing.expectEqual(Ttl.tickers_companies, DataType.tickers_companies.ttl());
|
||||
|
||||
// These types have no TTL (0 = managed elsewhere)
|
||||
try std.testing.expectEqual(@as(i64, 0), DataType.candles_daily.ttl());
|
||||
|
|
@ -2361,6 +2388,8 @@ test "DataType.fileName returns correct file names" {
|
|||
try std.testing.expectEqualStrings("classification.srf", DataType.classification.fileName());
|
||||
try std.testing.expectEqualStrings("etf_metrics.srf", DataType.etf_metrics.fileName());
|
||||
try std.testing.expectEqualStrings("entity_facts.srf", DataType.entity_facts.fileName());
|
||||
try std.testing.expectEqualStrings("tickers_funds.srf", DataType.tickers_funds.fileName());
|
||||
try std.testing.expectEqualStrings("tickers_companies.srf", DataType.tickers_companies.fileName());
|
||||
}
|
||||
|
||||
test "negative_cache_content format" {
|
||||
|
|
|
|||
|
|
@ -721,4 +721,10 @@ test "looksLikeUnquotedGlob: empty arg returns false" {
|
|||
|
||||
test {
|
||||
std.testing.refAllDecls(@This());
|
||||
// Wikidata and EDGAR providers aren't yet imported via
|
||||
// `service.zig`; pull them in here for test discovery in the
|
||||
// meantime. Drop these once the providers are wired through
|
||||
// the data service.
|
||||
_ = @import("providers/Wikidata.zig");
|
||||
_ = @import("providers/Edgar.zig");
|
||||
}
|
||||
|
|
|
|||
1826
src/providers/Edgar.zig
Normal file
1826
src/providers/Edgar.zig
Normal file
File diff suppressed because it is too large
Load diff
618
src/providers/Wikidata.zig
Normal file
618
src/providers/Wikidata.zig
Normal file
|
|
@ -0,0 +1,618 @@
|
|||
//! Wikidata SPARQL classification provider.
|
||||
//!
|
||||
//! ## What this provider does
|
||||
//!
|
||||
//! Given a stock symbol, Wikidata can answer:
|
||||
//!
|
||||
//! * "What kind of entity is this?" — name, industry, sector,
|
||||
//! country of incorporation, inception date, instance-of
|
||||
//! classification (operating company / mutual fund / ETF / …).
|
||||
//! * "Does this match the SEC's CIK?" — Wikidata's P5531 already
|
||||
//! stores the 10-digit zero-padded CIK matching SEC's convention.
|
||||
//!
|
||||
//! ## Workflow
|
||||
//!
|
||||
//! `fetch(symbols)` runs ONE batched SPARQL query that returns
|
||||
//! per-ticker rows. The query is keyed on the US-listing (NYSE /
|
||||
//! Nasdaq / NYSE Arca / OTC Markets) of each ticker — without that
|
||||
//! filter, common US tickers silently resolve to whichever
|
||||
//! foreign-exchange company happens to share the symbol (`MRK` →
|
||||
//! Merck KGaA on Frankfurt; `PG` → People's Garment on SET; etc.).
|
||||
//!
|
||||
//! The provider is stateless. Caching belongs to the data service,
|
||||
//! which writes per-symbol `classification.srf` files after this
|
||||
//! provider returns and reads them back on subsequent calls.
|
||||
//!
|
||||
//! ## Glossary
|
||||
//!
|
||||
//! SPARQL Query language for RDF-shaped data. Wikidata's
|
||||
//! primary read API.
|
||||
//! P-number Property identifier in Wikidata (P249 = ticker symbol,
|
||||
//! P414 = stock exchange, P31 = instance of, ...).
|
||||
//! Q-number Entity identifier in Wikidata (Q40244 = ETF as a
|
||||
//! concept, Q13677 = NYSE the entity, Q312 = Apple Inc.
|
||||
//! the entity).
|
||||
//! wdt:Pxxx Truthy/direct property statement — the simple shape.
|
||||
//! p:Pxxx Reified property statement — lets a statement carry
|
||||
//! qualifiers (e.g. ticker symbol AS A QUALIFIER on the
|
||||
//! stock-exchange statement, rather than as a direct
|
||||
//! property of the company).
|
||||
//! ps:Pxxx "Statement value" predicate — within a reified
|
||||
//! statement, points to the statement's main value.
|
||||
//! pq:Pxxx "Qualifier" predicate — within a reified statement,
|
||||
//! points to a qualifier on that statement.
|
||||
//!
|
||||
//! Why the reified statement matters here: Wikidata stores tickers
|
||||
//! as P249 qualifiers on a P414 (stock exchange) statement, NOT as
|
||||
//! a direct `wdt:P249` property. Querying naively returns zero rows
|
||||
//! for nearly every US-listed equity.
|
||||
|
||||
const std = @import("std");
|
||||
const http = @import("../net/http.zig");
|
||||
const fmt = @import("../format.zig");
|
||||
|
||||
const sparql_endpoint = "https://query.wikidata.org/sparql";
|
||||
|
||||
/// One symbol's classification, as extracted from a Wikidata SPARQL
/// response.
///
/// Optional fields stay `null` when the response carried no value
/// for the corresponding property. Every field tagged "owned" is
/// released by `deinit`; `source` is never freed there.
pub const ClassificationRecord = struct {
    symbol: []const u8, // owned
    name: ?[]const u8 = null, // owned
    sector: ?[]const u8 = null, // owned
    industry: ?[]const u8 = null, // owned
    /// ISO-3166 alpha-2 country code (e.g. "US", "GB", "DE").
    country: ?[]const u8 = null, // owned
    asset_class: ?[]const u8 = null, // owned
    is_etf: bool = false,
    /// YYYY-MM-DD, cut from the front of Wikidata's ISO-8601 date.
    inception_date: ?[]const u8 = null, // owned
    /// SEC CIK digit string from Wikidata property P5531. Wikidata
    /// already zero-pads it to 10 digits, in line with the
    /// project-wide CIK normalization convention.
    cik: ?[]const u8 = null, // owned
    /// YYYY-MM-DD of the provider run — not the date Wikidata last
    /// touched the underlying entity.
    as_of: []const u8, // owned
    /// Provenance tag. Deliberately has no default so every record
    /// has to state where it came from.
    source: []const u8,

    /// Free every owned string using `allocator`, which must be the
    /// allocator the strings were created with. `source` is left
    /// untouched.
    pub fn deinit(self: *ClassificationRecord, allocator: std.mem.Allocator) void {
        allocator.free(self.symbol);

        const optional_strings = [_]?[]const u8{
            self.name,
            self.sector,
            self.industry,
            self.country,
            self.asset_class,
            self.inception_date,
            self.cik,
        };
        for (optional_strings) |maybe_bytes| {
            if (maybe_bytes) |bytes| allocator.free(bytes);
        }

        allocator.free(self.as_of);
    }
};
|
||||
|
||||
/// Geo-bucket constants used by the country → geo lookup. Kept as
|
||||
/// named constants (rather than inline string literals in the map)
|
||||
/// so callers can reference them without typo risk and the
|
||||
/// taxonomy is tweakable in one place.
|
||||
pub const geo = struct {
|
||||
pub const us = "US";
|
||||
pub const developed = "International Developed";
|
||||
pub const emerging = "Emerging Markets";
|
||||
pub const unknown = "Unknown";
|
||||
};
|
||||
|
||||
/// Wikidata Q-IDs we test against `instance of` (P31) to classify
|
||||
/// fund-shaped securities. Curated, not exhaustive.
|
||||
const etf_q_ids = [_][]const u8{
|
||||
"Q40244", // exchange-traded fund
|
||||
"Q4118901", // exchange-traded bond fund
|
||||
"Q104638128", // ETF tracking specific index
|
||||
};
|
||||
const mutual_fund_q_ids = [_][]const u8{
|
||||
"Q1752230", // mutual fund
|
||||
"Q11644608", // open-end fund
|
||||
};
|
||||
|
||||
/// US stock exchanges accepted by the SPARQL exchange filter.
|
||||
/// Without this filter, ticker collisions across global exchanges
|
||||
/// silently return the wrong company.
|
||||
///
|
||||
/// Q-IDs:
|
||||
/// Q13677 New York Stock Exchange (NYSE)
|
||||
/// Q82059 Nasdaq
|
||||
/// Q4527260 NYSE Arca
|
||||
/// Q1666011 OTC Markets Group / Pink Sheets
|
||||
const us_exchanges = [_][]const u8{
|
||||
"wd:Q13677",
|
||||
"wd:Q82059",
|
||||
"wd:Q4527260",
|
||||
"wd:Q1666011",
|
||||
};
|
||||
|
||||
/// Country-code-to-geo-bucket lookup. Wikidata returns ISO-3166
|
||||
/// alpha-2 codes via P17 → P297; we map them to the geo taxonomy
|
||||
/// (`geo.us` / `geo.developed` / `geo.emerging` / `geo.unknown`).
|
||||
///
|
||||
/// MSCI conventions used as the developed/emerging split. Taiwan
|
||||
/// and South Korea are MSCI-emerging despite FTSE classifying them
|
||||
/// developed. Israel is MSCI-developed (upgraded 2010). Canada is
|
||||
/// folded into International Developed (some users prefer separate
|
||||
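/// Canada bucket; override in `metadata.srf` if so).
///
/// NOTE(review): a few entries may not follow MSCI's current market
/// classification (e.g. GR, IS, RU, AR, VN) — confirm against the
/// latest MSCI market classification review, or soften the "MSCI
/// conventions" wording above.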
|
||||
const country_to_geo = std.StaticStringMap([]const u8).initComptime(.{
|
||||
// United States
|
||||
.{ "US", geo.us },
|
||||
// Alpha-3 fallback for entries that use the longer form.
|
||||
.{ "USA", geo.us },
|
||||
|
||||
// International Developed — Europe ex-CIS
|
||||
.{ "GB", geo.developed },
|
||||
.{ "DE", geo.developed },
|
||||
.{ "FR", geo.developed },
|
||||
.{ "NL", geo.developed },
|
||||
.{ "CH", geo.developed },
|
||||
.{ "SE", geo.developed },
|
||||
.{ "DK", geo.developed },
|
||||
.{ "NO", geo.developed },
|
||||
.{ "FI", geo.developed },
|
||||
.{ "IT", geo.developed },
|
||||
.{ "ES", geo.developed },
|
||||
.{ "BE", geo.developed },
|
||||
.{ "AT", geo.developed },
|
||||
.{ "IE", geo.developed },
|
||||
.{ "LU", geo.developed },
|
||||
.{ "PT", geo.developed },
|
||||
.{ "GR", geo.developed },
|
||||
.{ "IS", geo.developed },
|
||||
|
||||
// International Developed — Asia-Pacific + Israel + Canada
|
||||
.{ "JP", geo.developed },
|
||||
.{ "AU", geo.developed },
|
||||
.{ "NZ", geo.developed },
|
||||
.{ "SG", geo.developed },
|
||||
.{ "HK", geo.developed },
|
||||
.{ "IL", geo.developed },
|
||||
.{ "CA", geo.developed },
|
||||
|
||||
// Emerging Markets (MSCI)
|
||||
.{ "CN", geo.emerging },
|
||||
.{ "TW", geo.emerging },
|
||||
.{ "KR", geo.emerging },
|
||||
.{ "IN", geo.emerging },
|
||||
.{ "BR", geo.emerging },
|
||||
.{ "MX", geo.emerging },
|
||||
.{ "RU", geo.emerging },
|
||||
.{ "TR", geo.emerging },
|
||||
.{ "ZA", geo.emerging },
|
||||
.{ "TH", geo.emerging },
|
||||
.{ "MY", geo.emerging },
|
||||
.{ "ID", geo.emerging },
|
||||
.{ "PH", geo.emerging },
|
||||
.{ "VN", geo.emerging },
|
||||
.{ "AR", geo.emerging },
|
||||
.{ "CL", geo.emerging },
|
||||
.{ "CO", geo.emerging },
|
||||
.{ "PE", geo.emerging },
|
||||
.{ "EG", geo.emerging },
|
||||
});
|
||||
|
||||
/// Resolve a country code to its geo bucket.
///
/// Looks `iso2` up in `country_to_geo`. A null argument, an empty
/// string, or a code that is absent from the table all yield
/// `geo.unknown`, leaving the user free to override the bucket in
/// `metadata.srf`.
pub fn geoFor(iso2: ?[]const u8) []const u8 {
    if (iso2) |code| {
        if (code.len != 0) {
            if (country_to_geo.get(code)) |bucket| return bucket;
        }
    }
    return geo.unknown;
}
|
||||
|
||||
// ── Wikidata provider state (file-as-struct) ─────────────────────
|
||||
//
|
||||
// Callers do `const wikidata = @import("providers/Wikidata.zig");`
|
||||
// followed by `var wd = wikidata.init(...);` and `wd.fetch(...)`.
|
||||
|
||||
client: http.Client,
|
||||
allocator: std.mem.Allocator,
|
||||
io: std.Io,
|
||||
/// Contact email for User-Agent / From headers, sourced from
|
||||
/// `Config.user_email`. Required; callers must surface a clear
|
||||
/// missing-config error before constructing this provider.
|
||||
user_email: []const u8,
|
||||
|
||||
const Wikidata = @This();
|
||||
|
||||
/// Build a provider with its own `http.Client`.
///
/// `user_email` is stored as passed in (no copy is made), so the
/// caller keeps ownership of those bytes. Pair with `deinit`.
pub fn init(
    io: std.Io,
    allocator: std.mem.Allocator,
    user_email: []const u8,
) Wikidata {
    const client = http.Client.init(io, allocator);
    return .{
        .client = client,
        .allocator = allocator,
        .io = io,
        .user_email = user_email,
    };
}
|
||||
|
||||
/// Tear down the HTTP client created by `init`. Nothing else is
/// released here; `user_email` is not freed.
pub fn deinit(self: *Wikidata) void {
    self.client.deinit();
}
|
||||
|
||||
/// Look up Wikidata classifications for `symbols` with one batched
/// SPARQL request.
///
/// An empty `symbols` slice short-circuits to an empty result before
/// any query is built or sent. The query text and the raw response
/// are temporaries allocated from the provider's own allocator; the
/// returned slice and every record in it come from
/// `result_allocator` and belong to the caller.
pub fn fetch(
    self: *Wikidata,
    result_allocator: std.mem.Allocator,
    symbols: []const []const u8,
) ![]ClassificationRecord {
    if (symbols.len == 0) return &.{};

    const scratch = self.allocator;

    const sparql = try buildQuery(scratch, symbols);
    defer scratch.free(sparql);

    const response_body = try self.postSparql(sparql);
    defer scratch.free(response_body);

    const records = try parse(self.io, result_allocator, response_body, symbols);
    return records;
}
|
||||
|
||||
/// Send `query` to the SPARQL endpoint as a form-encoded POST and
/// return a copy of the response body.
///
/// The request identifies the operator through `User-Agent` and
/// `From` headers derived from `user_email`. The returned bytes are
/// allocated from `self.allocator` and must be freed by the caller.
/// Fails with `error.UserEmailTooLong` when the User-Agent string
/// does not fit its 256-byte buffer.
fn postSparql(self: *Wikidata, query: []const u8) ![]u8 {
    var body: std.Io.Writer.Allocating = .init(self.allocator);
    defer body.deinit();
    const out = &body.writer;

    // Form body is `query=<percent-encoded SPARQL>`; the escaping is
    // delegated to `std.Uri.Component.formatEscaped`.
    try out.writeAll("query=");
    const raw_query: std.Uri.Component = .{ .raw = query };
    try raw_query.formatEscaped(out);

    var agent_storage: [256]u8 = undefined;
    const user_agent = std.fmt.bufPrint(&agent_storage, "zfin/0.1 ({s})", .{self.user_email}) catch {
        return error.UserEmailTooLong;
    };

    const request_headers = [_]std.http.Header{
        .{ .name = "User-Agent", .value = user_agent },
        .{ .name = "Accept", .value = "application/sparql-results+json" },
        .{ .name = "Content-Type", .value = "application/x-www-form-urlencoded" },
        .{ .name = "From", .value = self.user_email },
    };

    var response = try self.client.request(.POST, sparql_endpoint, body.written(), &request_headers);
    defer response.deinit();

    // Copy the body out before the deferred `response.deinit()` runs.
    const copy = try self.allocator.dupe(u8, response.body);
    return copy;
}
|
||||
|
||||
/// Build the batched SPARQL query for a slice of ticker symbols.
/// Caller owns the returned bytes (allocated from `allocator`).
///
/// Each symbol becomes one string literal inside
/// `VALUES ?ticker { "AAPL" "MSFT" ... }`. Symbols are escaped before
/// interpolation, so a symbol containing `"`, `\` or a line break can
/// neither terminate the literal early nor splice extra SPARQL into
/// the query.
///
/// Wikidata's ticker storage is non-obvious: tickers are stored as
/// `P249` qualifiers on a `P414` (stock exchange) statement. Naive
/// `?security wdt:P249 ?ticker` returns zero rows for nearly every
/// US-listed equity. The query reaches them via:
///
///   ?security p:P414 ?stmt .
///   ?stmt ps:P414 ?exchange .
///   ?stmt pq:P249 ?ticker .
///
/// `?exchange` is restricted to the entries of `us_exchanges` to
/// avoid ticker collisions with foreign listings.
fn buildQuery(allocator: std.mem.Allocator, symbols: []const []const u8) ![]u8 {
    var aw: std.Io.Writer.Allocating = .init(allocator);
    defer aw.deinit();
    const w = &aw.writer;

    try w.writeAll(
        \\SELECT ?ticker ?security ?securityLabel ?industryLabel ?countryCode ?inception ?cik ?instance WHERE {
        \\ VALUES ?ticker {
    );
    for (symbols) |symbol| {
        try w.writeByte(' ');
        try writeSparqlStringLiteral(w, symbol);
    }
    try w.writeAll(" }\n");

    try w.writeAll(" VALUES ?exchange {");
    for (us_exchanges) |exchange| {
        try w.print(" {s}", .{exchange});
    }
    try w.writeAll(" }\n");

    try w.writeAll(
        \\ ?security p:P414 ?exchstmt .
        \\ ?exchstmt ps:P414 ?exchange .
        \\ ?exchstmt pq:P249 ?ticker .
        \\ OPTIONAL { ?security wdt:P452 ?industry . }
        \\ OPTIONAL { ?security wdt:P17 ?country . ?country wdt:P297 ?countryCode . }
        \\ OPTIONAL { ?security wdt:P571 ?inception . }
        \\ OPTIONAL { ?security wdt:P5531 ?cik . }
        \\ OPTIONAL { ?security wdt:P31 ?instance . }
        \\ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
        \\}
    );
    return aw.toOwnedSlice();
}

/// Write `text` as a double-quoted SPARQL string literal, backslash-
/// escaping the characters that may not appear raw inside one
/// (`"`, `\`, LF, CR) plus TAB. All other bytes pass through as-is.
fn writeSparqlStringLiteral(w: *std.Io.Writer, text: []const u8) !void {
    try w.writeByte('"');
    for (text) |byte| {
        switch (byte) {
            '"' => try w.writeAll("\\\""),
            '\\' => try w.writeAll("\\\\"),
            '\n' => try w.writeAll("\\n"),
            '\r' => try w.writeAll("\\r"),
            '\t' => try w.writeAll("\\t"),
            else => try w.writeByte(byte),
        }
    }
    try w.writeByte('"');
}
|
||||
|
||||
/// Parse a SPARQL JSON response into `ClassificationRecord` values.
///
/// Rows ("bindings") are grouped by their `ticker` value. Rows whose
/// ticker does not match any entry of `expected_symbols`
/// (ASCII case-insensitive) are dropped. When several rows share a
/// ticker (e.g. multiple `instance of` values) they are merged into
/// one record — for each field the first non-null value wins.
///
/// Returns an empty slice when `json_bytes` is not valid JSON or does
/// not have the `results.bindings` array shape. Allocation failure is
/// always propagated as an error, including while decoding the JSON.
///
/// The returned slice and every string inside its records are
/// allocated from `allocator` and owned by the caller. On error
/// nothing is leaked and no partially-built record is touched.
fn parse(
    io: std.Io,
    allocator: std.mem.Allocator,
    json_bytes: []const u8,
    expected_symbols: []const []const u8,
) ![]ClassificationRecord {
    const today = fmt.todayDate(io);
    var as_of_buf: [10]u8 = undefined;
    const as_of = try std.fmt.bufPrint(&as_of_buf, "{f}", .{today});

    // A malformed document is treated as "no results", but running
    // out of memory is not a property of the document — report it.
    const parsed = std.json.parseFromSlice(std.json.Value, allocator, json_bytes, .{}) catch |err| switch (err) {
        error.OutOfMemory => return error.OutOfMemory,
        else => return &.{},
    };
    defer parsed.deinit();

    const root = switch (parsed.value) {
        .object => |o| o,
        else => return &.{},
    };
    const results = switch (root.get("results") orelse return &.{}) {
        .object => |o| o,
        else => return &.{},
    };
    const bindings = switch (results.get("bindings") orelse return &.{}) {
        .array => |a| a.items,
        else => return &.{},
    };

    // Map symbol → record; merge multiple bindings. Each map key is
    // the very same allocation as its record's `symbol`, so releasing
    // the records also releases the keys.
    var by_symbol: std.StringHashMap(ClassificationRecord) = .init(allocator);
    defer {
        // Only reached with live entries on an error path; the
        // success path empties the map before returning.
        var pending = by_symbol.valueIterator();
        while (pending.next()) |rec| rec.deinit(allocator);
        by_symbol.deinit();
    }

    for (bindings) |binding| {
        const obj = switch (binding) {
            .object => |o| o,
            else => continue,
        };
        const ticker = sparqlValue(obj, "ticker") orelse continue;

        // Only keep tickers that were actually requested.
        const requested = for (expected_symbols) |s| {
            if (std.ascii.eqlIgnoreCase(s, ticker)) break true;
        } else false;
        if (!requested) continue;

        const slot = try by_symbol.getOrPut(ticker);
        if (!slot.found_existing) {
            // Until the record is fully built the slot holds a
            // borrowed key and an uninitialised value; take it back
            // out if anything below fails so cleanup never sees it.
            errdefer _ = by_symbol.remove(ticker);
            const owned_symbol = try allocator.dupe(u8, ticker);
            errdefer allocator.free(owned_symbol);
            const owned_as_of = try allocator.dupe(u8, as_of);

            slot.key_ptr.* = owned_symbol;
            slot.value_ptr.* = .{
                .symbol = owned_symbol,
                .as_of = owned_as_of,
                .source = "wikidata",
            };
        }
        const rec = slot.value_ptr;

        if (rec.name == null) {
            if (sparqlValue(obj, "securityLabel")) |label| {
                rec.name = try allocator.dupe(u8, label);
            }
        }
        if (rec.industry == null) {
            if (sparqlValue(obj, "industryLabel")) |ind| {
                // The industry label is used for both fields.
                rec.industry = try allocator.dupe(u8, ind);
                rec.sector = try allocator.dupe(u8, ind);
            }
        }
        if (rec.country == null) {
            if (sparqlValue(obj, "countryCode")) |c| {
                rec.country = try allocator.dupe(u8, c);
            }
        }
        if (rec.inception_date == null) {
            if (sparqlValue(obj, "inception")) |d| {
                // Keep only the leading YYYY-MM-DD of the timestamp.
                if (d.len >= 10) {
                    rec.inception_date = try allocator.dupe(u8, d[0..10]);
                }
            }
        }
        if (rec.cik == null) {
            if (sparqlValue(obj, "cik")) |c| {
                rec.cik = try allocator.dupe(u8, c);
            }
        }
        if (sparqlValue(obj, "instance")) |inst_iri| {
            // The "instance" value is a Q-ID URI like
            // "http://www.wikidata.org/entity/Q40244". Extract the
            // Q-ID suffix and test against our known sets.
            const last_slash = std.mem.lastIndexOfScalar(u8, inst_iri, '/');
            const q_id = if (last_slash) |i| inst_iri[i + 1 ..] else inst_iri;

            if (matchesAnyQId(&etf_q_ids, q_id)) {
                rec.is_etf = true;
                if (rec.asset_class == null) {
                    rec.asset_class = try allocator.dupe(u8, "ETF (uncategorized)");
                }
            }
            if (matchesAnyQId(&mutual_fund_q_ids, q_id)) {
                // NOTE(review): mutual funds also set `is_etf` —
                // confirm this is intended ("fund-shaped") and not a
                // copy-paste of the ETF branch.
                rec.is_etf = true;
                if (rec.asset_class == null) {
                    rec.asset_class = try allocator.dupe(u8, "Mutual Fund (uncategorized)");
                }
            }
        }
    }

    // Hand the records over to the caller. Emptying the map afterwards
    // keeps the deferred cleanup from freeing what we just returned.
    const out = try allocator.alloc(ClassificationRecord, by_symbol.count());
    var finished = by_symbol.valueIterator();
    var idx: usize = 0;
    while (finished.next()) |rec| : (idx += 1) {
        out[idx] = rec.*;
    }
    by_symbol.clearRetainingCapacity();
    return out;
}

/// Report whether `q_id` equals one of the Q-IDs in `candidates`.
fn matchesAnyQId(candidates: []const []const u8, q_id: []const u8) bool {
    for (candidates) |candidate| {
        if (std.mem.eql(u8, candidate, q_id)) return true;
    }
    return false;
}
|
||||
|
||||
/// Read the string stored under `"value"` in the binding named
/// `field`.
///
/// Yields null when `field` is missing from `obj`, when the entry is
/// not a JSON object, when that object has no `"value"` member, or
/// when the member is not a string. The returned slice is the one
/// held by the JSON value; nothing is copied.
fn sparqlValue(obj: std.json.ObjectMap, field: []const u8) ?[]const u8 {
    const binding = obj.get(field) orelse return null;
    if (binding != .object) return null;

    const inner = binding.object.get("value") orelse return null;
    return if (inner == .string) inner.string else null;
}
|
||||
|
||||
// ── Tests ────────────────────────────────────────────────────────
|
||||
|
||||
test "buildQuery includes all symbols and required SELECT vars" {
    const gpa = std.testing.allocator;
    const requested = [_][]const u8{ "AAPL", "VTI" };
    const query = try buildQuery(gpa, &requested);
    defer gpa.free(query);

    // Every fragment below has to appear somewhere in the query text.
    const required_fragments = [_][]const u8{
        // Requested tickers, as quoted literals.
        "\"AAPL\"",
        "\"VTI\"",
        // Reified exchange statement carrying the ticker qualifier.
        "p:P414",
        "pq:P249",
        // Optional industry / country lookups.
        "wdt:P452",
        "wdt:P17",
        // US-exchange filter — without it, US tickers collide with
        // foreign exchanges (MRK→Merck KGaA, PG→People's Garment,
        // etc.). See the `us_exchanges` doc-block.
        "wd:Q13677", // NYSE
        "wd:Q82059", // Nasdaq
        "ps:P414 ?exchange",
    };
    for (required_fragments) |fragment| {
        try std.testing.expect(std.mem.indexOf(u8, query, fragment) != null);
    }
}
|
||||
|
||||
test "parse: AAPL fixture round-trips name + industry + country" {
    const gpa = std.testing.allocator;
    const requested = [_][]const u8{"AAPL"};

    // One binding for an operating company; its `instance` Q-ID is
    // in neither fund list, so the record must not be flagged a fund.
    const sparql_json =
        \\{
        \\ "head": {"vars": ["ticker", "security", "securityLabel", "industryLabel", "countryCode", "inception", "cik", "instance"]},
        \\ "results": {
        \\ "bindings": [
        \\ {
        \\ "ticker": {"type": "literal", "value": "AAPL"},
        \\ "security": {"type": "uri", "value": "http://www.wikidata.org/entity/Q312"},
        \\ "securityLabel": {"type": "literal", "value": "Apple Inc."},
        \\ "industryLabel": {"type": "literal", "value": "consumer electronics"},
        \\ "countryCode": {"type": "literal", "value": "US"},
        \\ "instance": {"type": "uri", "value": "http://www.wikidata.org/entity/Q4830453"}
        \\ }
        \\ ]
        \\ }
        \\}
    ;

    const records = try parse(std.testing.io, gpa, sparql_json, &requested);
    defer {
        for (records) |*rec| rec.deinit(gpa);
        gpa.free(records);
    }

    try std.testing.expectEqual(@as(usize, 1), records.len);
    const apple = records[0];
    try std.testing.expectEqualStrings("AAPL", apple.symbol);
    try std.testing.expectEqualStrings("Apple Inc.", apple.name.?);
    try std.testing.expectEqualStrings("consumer electronics", apple.industry.?);
    try std.testing.expectEqualStrings("consumer electronics", apple.sector.?);
    try std.testing.expectEqualStrings("US", apple.country.?);
    try std.testing.expect(!apple.is_etf);
}
|
||||
|
||||
test "parse: ETF fixture sets is_etf=true and asset_class" {
    const gpa = std.testing.allocator;
    const requested = [_][]const u8{"VTI"};

    // The `instance` value ends in Q40244, which is listed in
    // `etf_q_ids`.
    const sparql_json =
        \\{
        \\ "head": {"vars": ["ticker", "security", "securityLabel", "instance"]},
        \\ "results": {
        \\ "bindings": [
        \\ {
        \\ "ticker": {"type": "literal", "value": "VTI"},
        \\ "security": {"type": "uri", "value": "http://www.wikidata.org/entity/Q1809462"},
        \\ "securityLabel": {"type": "literal", "value": "Vanguard Total Stock Market ETF"},
        \\ "instance": {"type": "uri", "value": "http://www.wikidata.org/entity/Q40244"}
        \\ }
        \\ ]
        \\ }
        \\}
    ;

    const records = try parse(std.testing.io, gpa, sparql_json, &requested);
    defer {
        for (records) |*rec| rec.deinit(gpa);
        gpa.free(records);
    }

    try std.testing.expectEqual(@as(usize, 1), records.len);
    const fund = records[0];
    try std.testing.expect(fund.is_etf);
    try std.testing.expectEqualStrings("ETF (uncategorized)", fund.asset_class.?);
}
|
||||
|
||||
test "parse: bindings for symbols not requested are dropped" {
    const gpa = std.testing.allocator;
    const requested = [_][]const u8{"AAPL"};

    // The only binding carries a ticker that was never asked for.
    const sparql_json =
        \\{
        \\ "head": {"vars": ["ticker", "security", "securityLabel"]},
        \\ "results": {
        \\ "bindings": [
        \\ {"ticker": {"type": "literal", "value": "WRONG"},
        \\ "security": {"type": "uri", "value": "http://example/Q1"},
        \\ "securityLabel": {"type": "literal", "value": "Wrong Co"}}
        \\ ]
        \\ }
        \\}
    ;

    const records = try parse(std.testing.io, gpa, sparql_json, &requested);
    defer gpa.free(records);

    try std.testing.expectEqual(@as(usize, 0), records.len);
}
|
||||
|
||||
test "geoFor maps known ISO-3166 codes to bucket" {
    const Case = struct { code: []const u8, bucket: []const u8 };
    const cases = [_]Case{
        .{ .code = "US", .bucket = geo.us },
        .{ .code = "USA", .bucket = geo.us },
        .{ .code = "GB", .bucket = geo.developed },
        .{ .code = "DE", .bucket = geo.developed },
        .{ .code = "CA", .bucket = geo.developed },
        .{ .code = "IL", .bucket = geo.developed },
        .{ .code = "CN", .bucket = geo.emerging },
        .{ .code = "TW", .bucket = geo.emerging },
        .{ .code = "KR", .bucket = geo.emerging },
    };

    for (cases) |case| {
        try std.testing.expectEqualStrings(case.bucket, geoFor(case.code));
    }
}
|
||||
|
||||
test "geoFor returns Unknown for null/empty/unmapped" {
    const inputs = [_]?[]const u8{
        null,
        "",
        "ZZ", // unassigned ISO-2
        "XX",
    };

    for (inputs) |input| {
        try std.testing.expectEqualStrings(geo.unknown, geoFor(input));
    }
}
|
||||
713
src/providers/xml.zig
Normal file
713
src/providers/xml.zig
Normal file
|
|
@ -0,0 +1,713 @@
|
|||
// VENDORED - see README.md.
|
||||
// File sourced from:
|
||||
// https://github.com/Snektron/vulkan-zig/blob/797ae8af88e84753af9640266de61a985b76b580/generator/xml.zig
|
||||
// via ~/shared/aws-zig/src/xml.zig
|
||||
const std = @import("std");
|
||||
const mem = std.mem;
|
||||
const testing = std.testing;
|
||||
const Allocator = mem.Allocator;
|
||||
const ArenaAllocator = std.heap.ArenaAllocator;
|
||||
const ArrayList = std.ArrayList;
|
||||
|
||||
/// One `name="value"` pair parsed from an element's start tag.
pub const Attribute = struct {
    /// Attribute name as written in the source.
    name: []const u8,
    /// Attribute value with the surrounding quotes removed.
    value: []const u8,
};
|
||||
|
||||
/// A single child item of an element.
pub const Content = union(enum) {
    /// Text found between tags.
    CharData: []const u8,
    /// Body of a `<!-- ... -->` comment, without the delimiters.
    Comment: []const u8,
    /// A nested element.
    Element: *Element,
};
|
||||
|
||||
/// A parsed XML element: its tag name, attributes, and ordered child content.
pub const Element = struct {
    pub const AttributeList = ArrayList(*Attribute);
    pub const ContentList = ArrayList(Content);

    /// Tag name of this element.
    tag: []const u8,
    attributes: AttributeList,
    /// Child content (character data, comments, elements) in document order.
    children: ContentList,
    /// Link to a following sibling element; stays null unless the parser sets it.
    next_sibling: ?*Element = null,
    /// Allocator passed to `init`; the parser uses it when appending to the lists.
    allocator: std.mem.Allocator,

    /// Creates an element with the given tag and no attributes or children.
    fn init(tag: []const u8, alloc: Allocator) Element {
        return .{
            .tag = tag,
            .attributes = .empty,
            .children = .empty,
            .allocator = alloc,
        };
    }

    /// Returns the value of the first attribute named `attrib_name`, or null.
    pub fn getAttribute(self: *Element, attrib_name: []const u8) ?[]const u8 {
        for (self.attributes.items) |attr| {
            if (!mem.eql(u8, attr.name, attrib_name)) continue;
            return attr.value;
        }
        return null;
    }

    /// Returns the text of the first child element tagged `child_tag`.
    /// Yields null when there is no such child, when the child does not hold
    /// exactly one content item, or when that item is not character data.
    pub fn getCharData(self: *Element, child_tag: []const u8) ?[]const u8 {
        const found = self.findChildByTag(child_tag) catch return null;
        const child = found orelse return null;

        const contents = child.children.items;
        if (contents.len != 1) return null;

        return switch (contents[0]) {
            .CharData => |text| text,
            else => null,
        };
    }

    /// Iterates over every child content item.
    pub fn iterator(self: *Element) ChildIterator {
        return .{ .items = self.children.items, .i = 0 };
    }

    /// Iterates over child elements only, skipping text and comments.
    pub fn elements(self: *Element) ChildElementIterator {
        return .{ .inner = self.iterator() };
    }

    /// Returns the first child element whose tag matches `tag`, if any.
    pub fn findChildByTag(self: *Element, tag: []const u8) !?*Element {
        var matches = self.findChildrenByTag(tag);
        return matches.next();
    }

    /// Iterates over the child elements whose tag matches `tag`.
    pub fn findChildrenByTag(self: *Element, tag: []const u8) FindChildrenByTagIterator {
        return .{ .inner = self.elements(), .tag = tag };
    }

    pub const ChildIterator = struct {
        items: []Content,
        i: usize,

        /// Returns a pointer to the next content item, or null at the end.
        pub fn next(self: *ChildIterator) ?*Content {
            if (self.i >= self.items.len) return null;

            const current = &self.items[self.i];
            self.i += 1;
            return current;
        }
    };

    pub const ChildElementIterator = struct {
        inner: ChildIterator,

        /// Returns the next child that is an element, or null at the end.
        pub fn next(self: *ChildElementIterator) ?*Element {
            while (self.inner.next()) |content| {
                switch (content.*) {
                    .Element => |elem| return elem,
                    else => {},
                }
            }
            return null;
        }
    };

    /// Default tag predicate: exact byte-for-byte comparison.
    fn strictEqual(a: []const u8, b: []const u8, _: PredicateOptions) !bool {
        return mem.eql(u8, a, b);
    }

    pub const FindChildrenByTagIterator = struct {
        inner: ChildElementIterator,
        tag: []const u8,
        /// Called as `predicate(child.tag, self.tag, predicate_options)`.
        predicate: *const fn (a: []const u8, b: []const u8, options: PredicateOptions) anyerror!bool = strictEqual,
        predicate_options: PredicateOptions = .{},

        /// Returns the next child element accepted by `predicate`, or null at
        /// the end. Errors raised by the predicate are propagated.
        pub fn next(self: *FindChildrenByTagIterator) !?*Element {
            while (self.inner.next()) |candidate| {
                if (try self.predicate(candidate.tag, self.tag, self.predicate_options)) {
                    return candidate;
                }
            }
            return null;
        }
    };
};
|
||||
|
||||
/// Extra options forwarded to the tag predicate of `FindChildrenByTagIterator`.
pub const PredicateOptions = struct {
    /// Optional allocator made available to the predicate; null by default.
    allocator: ?std.mem.Allocator = null,
};
|
||||
/// Contents of a leading `<?xml ... ?>` declaration.
pub const XmlDecl = struct {
    /// Value of the mandatory `version` attribute.
    version: []const u8,
    /// Value of the `encoding` attribute, when present.
    encoding: ?[]const u8,
    /// `standalone` attribute: true for "yes", false for "no", null when absent.
    standalone: ?bool,
};
|
||||
|
||||
/// Result of `parse`: the element tree together with the arena it was
/// allocated from.
pub const Document = struct {
    arena: ArenaAllocator,
    /// The `<?xml ... ?>` declaration, if the source had one.
    xml_decl: ?*XmlDecl,
    root: *Element,

    /// Releases the arena held by this document.
    pub fn deinit(self: Document) void {
        // `self` is an immutable by-value parameter; deinitialize a mutable copy.
        var owned_arena = self.arena;
        owned_arena.deinit();
    }
};
|
||||
|
||||
/// Cursor over the XML source that also tracks a zero-based line and column.
const ParseContext = struct {
    source: []const u8,
    /// Index of the next unread byte in `source`.
    offset: usize,
    line: usize,
    column: usize,

    /// Starts a cursor at the beginning of `source`.
    fn init(source: []const u8) ParseContext {
        return .{ .source = source, .offset = 0, .line = 0, .column = 0 };
    }

    /// Returns the next byte without consuming it, or null at end of input.
    fn peek(self: *ParseContext) ?u8 {
        if (self.offset >= self.source.len) return null;
        return self.source[self.offset];
    }

    /// Consumes and returns the next byte, or fails with `UnexpectedEof`.
    fn consume(self: *ParseContext) !u8 {
        if (self.offset >= self.source.len) return error.UnexpectedEof;
        return self.consumeNoEof();
    }

    /// Consumes the next byte; the caller guarantees input remains.
    fn consumeNoEof(self: *ParseContext) u8 {
        std.debug.assert(self.offset < self.source.len);
        const byte = self.source[self.offset];
        self.offset += 1;

        switch (byte) {
            '\n' => {
                self.line += 1;
                self.column = 0;
            },
            else => self.column += 1,
        }

        return byte;
    }

    /// Consumes `char` if it is next; reports whether it did.
    fn eat(self: *ParseContext, char: u8) bool {
        self.expect(char) catch return false;
        return true;
    }

    /// Consumes `expected`, or fails without consuming anything.
    fn expect(self: *ParseContext, expected: u8) !void {
        const actual = self.peek() orelse return error.UnexpectedEof;
        if (actual != expected) return error.UnexpectedCharacter;
        _ = self.consumeNoEof();
    }

    /// Consumes `text` if the remaining input starts with it; reports whether it did.
    fn eatStr(self: *ParseContext, text: []const u8) bool {
        self.expectStr(text) catch return false;
        return true;
    }

    /// Consumes `text`, or fails without consuming anything.
    /// Too little remaining input yields `UnexpectedEof`; a mismatch yields
    /// `UnexpectedCharacter`.
    fn expectStr(self: *ParseContext, text: []const u8) !void {
        if (self.source.len < self.offset + text.len) return error.UnexpectedEof;
        if (!mem.startsWith(u8, self.source[self.offset..], text)) return error.UnexpectedCharacter;

        // Go through consumeNoEof so line/column stay in step with offset.
        for (text) |_| {
            _ = self.consumeNoEof();
        }
    }

    /// Skips spaces, tabs, and line breaks; reports whether any were skipped.
    fn eatWs(self: *ParseContext) bool {
        const before = self.offset;

        while (self.peek()) |ch| {
            if (ch != ' ' and ch != '\t' and ch != '\n' and ch != '\r') break;
            _ = self.consumeNoEof();
        }

        return self.offset != before;
    }

    /// Requires at least one whitespace byte at the cursor.
    fn expectWs(self: *ParseContext) !void {
        if (!self.eatWs()) return error.UnexpectedCharacter;
    }

    /// Returns the full source line containing the cursor, without its newline.
    fn currentLine(self: ParseContext) []const u8 {
        const consumed = self.source[0..self.offset];
        const begin = if (mem.lastIndexOfScalar(u8, consumed, '\n')) |prev_nl| prev_nl + 1 else 0;
        const end = mem.indexOfScalarPos(u8, self.source, self.offset, '\n') orelse self.source.len;
        return self.source[begin..end];
    }
};
|
||||
|
||||
test "ParseContext" {
    // Walk a non-empty input, checking that failed operations leave the cursor alone.
    {
        var cursor = ParseContext.init("I like pythons");
        try testing.expectEqual(@as(?u8, 'I'), cursor.peek());
        try testing.expectEqual(@as(u8, 'I'), cursor.consumeNoEof());
        try testing.expectEqual(@as(?u8, ' '), cursor.peek());
        try testing.expectEqual(@as(u8, ' '), try cursor.consume());

        // eat
        try testing.expect(cursor.eat('l'));
        try testing.expectEqual(@as(?u8, 'i'), cursor.peek());
        try testing.expect(!cursor.eat('a'));
        try testing.expectEqual(@as(?u8, 'i'), cursor.peek());

        // expect
        try cursor.expect('i');
        try testing.expectEqual(@as(?u8, 'k'), cursor.peek());
        try testing.expectError(error.UnexpectedCharacter, cursor.expect('a'));
        try testing.expectEqual(@as(?u8, 'k'), cursor.peek());

        // eatStr
        try testing.expect(cursor.eatStr("ke"));
        try testing.expectEqual(@as(?u8, ' '), cursor.peek());

        // eatWs
        try testing.expect(cursor.eatWs());
        try testing.expectEqual(@as(?u8, 'p'), cursor.peek());
        try testing.expect(!cursor.eatWs());
        try testing.expectEqual(@as(?u8, 'p'), cursor.peek());

        // Text longer than the remaining input.
        try testing.expect(!cursor.eatStr("aaaaaaaaa"));
        try testing.expectEqual(@as(?u8, 'p'), cursor.peek());

        // expectStr
        try testing.expectError(error.UnexpectedEof, cursor.expectStr("aaaaaaaaa"));
        try testing.expectEqual(@as(?u8, 'p'), cursor.peek());
        try testing.expectError(error.UnexpectedCharacter, cursor.expectStr("pytn"));
        try testing.expectEqual(@as(?u8, 'p'), cursor.peek());
        try cursor.expectStr("python");
        try testing.expectEqual(@as(?u8, 's'), cursor.peek());
    }

    // Empty input: everything reports end of input.
    {
        var cursor = ParseContext.init("");
        try testing.expectEqual(@as(?u8, null), cursor.peek());
        try testing.expectError(error.UnexpectedEof, cursor.consume());
        try testing.expect(!cursor.eat('p'));
        try testing.expectError(error.UnexpectedEof, cursor.expect('p'));
    }
}
|
||||
|
||||
/// Explicit error set declared as the return error set of `parseContent`.
pub const ParseError = error{
    // Unexpected input at the cursor.
    UnexpectedEof,
    UnexpectedCharacter,
    IllegalCharacter,

    // Constructs that were opened but never terminated.
    UnclosedValue,
    UnclosedComment,

    // Malformed names, entities, and declarations.
    InvalidName,
    InvalidEntity,
    InvalidStandaloneValue,

    // Structural problems.
    NonMatchingClosingTag,
    InvalidDocument,

    // Allocation failure.
    OutOfMemory,
};
|
||||
|
||||
/// Parses `source` as one XML document.
/// The returned `Document` carries an arena created on `backing_allocator`;
/// call `Document.deinit` to release it.
pub fn parse(backing_allocator: Allocator, source: []const u8) !Document {
    var cursor: ParseContext = .init(source);
    return parseDocument(&cursor, backing_allocator);
}
|
||||
|
||||
/// Parses optional leading comments, an optional `<?xml ?>` declaration, exactly
/// one root element, and optional trailing comments. Anything left after that,
/// or a missing root element, is `error.InvalidDocument`.
fn parseDocument(ctx: *ParseContext, backing_allocator: Allocator) !Document {
    var arena = ArenaAllocator.init(backing_allocator);
    // On any failure below, drop everything allocated so far.
    errdefer arena.deinit();
    const allocator = arena.allocator();

    try trySkipComments(ctx, allocator);

    const xml_decl = try tryParseProlog(ctx, allocator);
    _ = ctx.eatWs();
    try trySkipComments(ctx, allocator);

    const root = (try tryParseElement(ctx, allocator, null)) orelse return error.InvalidDocument;
    _ = ctx.eatWs();
    try trySkipComments(ctx, allocator);

    if (ctx.peek() != null) return error.InvalidDocument;

    return .{
        .arena = arena,
        .xml_decl = xml_decl,
        .root = root,
    };
}
|
||||
|
||||
/// Parses a quoted attribute value (single or double quotes) and returns an
/// unescaped copy allocated with `alloc`.
/// A missing closing quote is reported as `error.UnclosedValue`.
fn parseAttrValue(ctx: *ParseContext, alloc: Allocator) ![]const u8 {
    const quote = try ctx.consume();
    switch (quote) {
        '"', '\'' => {},
        else => return error.UnexpectedCharacter,
    }

    const begin = ctx.offset;

    // Consume up to and including the matching closing quote.
    while ((ctx.consume() catch return error.UnclosedValue) != quote) {}

    // The cursor sits just past the closing quote; exclude it from the value.
    const raw = ctx.source[begin .. ctx.offset - 1];
    return dupeAndUnescape(alloc, raw);
}
|
||||
|
||||
/// Parses `= "value"`, allowing whitespace on either side of the `=`, and
/// returns the unescaped value.
fn parseEqAttrValue(ctx: *ParseContext, alloc: Allocator) ![]const u8 {
    _ = ctx.eatWs();
    try ctx.expect('=');
    _ = ctx.eatWs();
    return parseAttrValue(ctx, alloc);
}
|
||||
|
||||
/// Reads a name and returns it as a slice of the source (no copy is made).
/// Rather than implementing the XML name grammar, a name is taken to be the
/// longest run of bytes that are neither whitespace nor markup punctuation.
/// An empty run is `error.InvalidName`, and nothing is consumed in that case.
fn parseNameNoDupe(ctx: *ParseContext) ![]const u8 {
    const terminators = " \t\n\r&\"'<>?=/";
    const begin = ctx.offset;

    while (ctx.peek()) |ch| {
        if (mem.indexOfScalar(u8, terminators, ch) != null) break;
        _ = ctx.consumeNoEof();
    }

    if (ctx.offset == begin) return error.InvalidName;
    return ctx.source[begin..ctx.offset];
}
|
||||
|
||||
/// Reads character data up to the next `<` (or end of input) and returns an
/// unescaped copy, or null when the cursor is already at `<` or at the end.
fn tryParseCharData(ctx: *ParseContext, alloc: Allocator) !?[]const u8 {
    const begin = ctx.offset;

    while (ctx.peek()) |ch| {
        if (ch == '<') break;
        _ = ctx.consumeNoEof();
    }

    const raw = ctx.source[begin..ctx.offset];
    if (raw.len == 0) return null;

    return try dupeAndUnescape(alloc, raw);
}
|
||||
|
||||
/// Parses the next content item inside an element, trying character data,
/// then a comment, then a nested element. If none applies the input is
/// rejected with `error.UnexpectedCharacter`.
fn parseContent(ctx: *ParseContext, alloc: Allocator, parent: ?*Element) ParseError!Content {
    if (try tryParseCharData(ctx, alloc)) |text| return .{ .CharData = text };
    if (try tryParseComment(ctx, alloc)) |comment| return .{ .Comment = comment };
    if (try tryParseElement(ctx, alloc, parent)) |elem| return .{ .Element = elem };
    return error.UnexpectedCharacter;
}
|
||||
|
||||
/// Parses one `name = "value"` attribute.
/// Returns null (consuming nothing) when no name starts at the cursor; once a
/// name has been read, a missing `=` or malformed value is an error.
fn tryParseAttr(ctx: *ParseContext, alloc: Allocator) !?*Attribute {
    const name = parseNameNoDupe(ctx) catch return null;
    const value = try parseEqAttrValue(ctx, alloc);

    const attr = try alloc.create(Attribute);
    attr.* = .{
        // `name` is a slice of the source, so store a copy.
        .name = try alloc.dupe(u8, name),
        .value = value,
    };
    return attr;
}
|
||||
|
||||
/// Sets `next_sibling` of the last element child already stored in `parent`
/// to `element`. Must be called before `element` itself is appended to
/// `parent.children`. Does nothing when `parent` has no element children yet.
fn linkToPreviousSibling(parent: *Element, element: *Element) void {
    var i = parent.children.items.len;
    while (i > 0) {
        i -= 1;
        switch (parent.children.items[i]) {
            .Element => |previous| {
                previous.next_sibling = element;
                return;
            },
            else => {},
        }
    }
}

/// Parses one element, including its attributes and, unless it is
/// self-closing, all of its content and its closing tag.
///
/// Returns null, with the cursor fully restored, when the input at the cursor
/// is not `<` followed by a name. `parent` is the element currently being
/// filled, if any; it is used only to link the previous sibling element to
/// the new one.
///
/// Errors: `UnexpectedEof` when input ends inside the element,
/// `NonMatchingClosingTag` when the closing tag name differs from the opening
/// one, plus whatever the attribute and content parsers return.
fn tryParseElement(ctx: *ParseContext, alloc: Allocator, parent: ?*Element) !?*Element {
    // Snapshot the whole cursor so backtracking also restores line/column,
    // not just the byte offset.
    const checkpoint = ctx.*;
    if (!ctx.eat('<')) return null;
    const tag = parseNameNoDupe(ctx) catch {
        ctx.* = checkpoint;
        return null;
    };

    const element = try alloc.create(Element);
    element.* = Element.init(try alloc.dupe(u8, tag), alloc);

    // Attributes must each be preceded by whitespace.
    while (ctx.eatWs()) {
        const attr = (try tryParseAttr(ctx, alloc)) orelse break;
        try element.attributes.append(element.allocator, attr);
    }

    if (ctx.eatStr("/>")) {
        // Self-closing elements take part in the sibling chain as well.
        if (parent) |p| linkToPreviousSibling(p, element);
        return element;
    }

    try ctx.expect('>');

    while (true) {
        if (ctx.peek() == null) return error.UnexpectedEof;
        if (ctx.eatStr("</")) break;

        const content = try parseContent(ctx, alloc, element);
        try element.children.append(element.allocator, content);
    }

    const closing_tag = try parseNameNoDupe(ctx);
    if (!mem.eql(u8, tag, closing_tag)) return error.NonMatchingClosingTag;

    _ = ctx.eatWs();
    try ctx.expect('>');

    if (parent) |p| linkToPreviousSibling(p, element);
    return element;
}
|
||||
|
||||
test "tryParseElement" {
    var arena = std.heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const alloc = arena.allocator();

    // '<' not followed by a name: no element, and the cursor is back at '<'.
    {
        var ctx = ParseContext.init("<= a='b'/>");
        const parsed = try tryParseElement(&ctx, alloc, null);
        try testing.expect(parsed == null);
        try testing.expectEqual(@as(?u8, '<'), ctx.peek());
    }

    // Self-closing element with single- and double-quoted attributes.
    {
        var ctx = ParseContext.init("<python size='15' color = \"green\"/>");
        const elem = (try tryParseElement(&ctx, alloc, null)).?;
        try testing.expectEqualStrings("python", elem.tag);

        const attrs = elem.attributes.items;
        try testing.expectEqualStrings("size", attrs[0].name);
        try testing.expectEqualStrings("15", attrs[0].value);
        try testing.expectEqualStrings("color", attrs[1].name);
        try testing.expectEqualStrings("green", attrs[1].value);
    }

    // Element with text content.
    {
        var ctx = ParseContext.init("<python>test</python>");
        const elem = (try tryParseElement(&ctx, alloc, null)).?;
        try testing.expectEqualStrings("python", elem.tag);
        try testing.expectEqualStrings("test", elem.children.items[0].CharData);
    }

    // Mixed text, child elements, and a comment, in document order.
    {
        var ctx = ParseContext.init("<a>b<c/>d<e/>f<!--g--></a>");
        const elem = (try tryParseElement(&ctx, alloc, null)).?;
        try testing.expectEqualStrings("a", elem.tag);

        const kids = elem.children.items;
        try testing.expectEqualStrings("b", kids[0].CharData);
        try testing.expectEqualStrings("c", kids[1].Element.tag);
        try testing.expectEqualStrings("d", kids[2].CharData);
        try testing.expectEqualStrings("e", kids[3].Element.tag);
        try testing.expectEqualStrings("f", kids[4].CharData);
        try testing.expectEqualStrings("g", kids[5].Comment);
    }
}
|
||||
|
||||
/// Parses an `<?xml version=... [encoding=...] [standalone=...]?>` declaration.
///
/// Returns null, with the cursor fully restored, when the input does not start
/// with `<?` followed by the name `xml`. Once that prefix has matched, any
/// malformed remainder is an error; `standalone` values other than "yes" and
/// "no" yield `error.InvalidStandaloneValue`.
fn tryParseProlog(ctx: *ParseContext, alloc: Allocator) !?*XmlDecl {
    // Snapshot the whole cursor so backtracking also restores line/column,
    // not just the byte offset.
    const checkpoint = ctx.*;
    if (!ctx.eatStr("<?") or !mem.eql(u8, try parseNameNoDupe(ctx), "xml")) {
        ctx.* = checkpoint;
        return null;
    }

    const decl = try alloc.create(XmlDecl);
    decl.encoding = null;
    decl.standalone = null;

    // Version info is mandatory
    try ctx.expectWs();
    try ctx.expectStr("version");
    decl.version = try parseEqAttrValue(ctx, alloc);

    if (ctx.eatWs()) {
        // Optional encoding and standalone info
        var require_ws = false;

        if (ctx.eatStr("encoding")) {
            decl.encoding = try parseEqAttrValue(ctx, alloc);
            require_ws = true;
        }

        // After `encoding`, whitespace must separate it from `standalone`
        // (eatWs must return true). Without `encoding`, the whitespace was
        // already consumed above, so eatWs is expected to return false.
        if (require_ws == ctx.eatWs() and ctx.eatStr("standalone")) {
            const standalone = try parseEqAttrValue(ctx, alloc);
            if (mem.eql(u8, standalone, "yes")) {
                decl.standalone = true;
            } else if (mem.eql(u8, standalone, "no")) {
                decl.standalone = false;
            } else {
                return error.InvalidStandaloneValue;
            }
        }

        _ = ctx.eatWs();
    }

    try ctx.expectStr("?>");
    return decl;
}
|
||||
|
||||
test "tryParseProlog" {
    var arena = std.heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const alloc = arena.allocator();

    // "<?xmla" is not an XML declaration; the cursor must be back at '<'.
    {
        var ctx = ParseContext.init("<?xmla version='aa'?>");
        const parsed = try tryParseProlog(&ctx, alloc);
        try testing.expect(parsed == null);
        try testing.expectEqual(@as(?u8, '<'), ctx.peek());
    }

    // Version only.
    {
        var ctx = ParseContext.init("<?xml version='aa'?>");
        const decl = (try tryParseProlog(&ctx, alloc)).?;
        try testing.expectEqualStrings("aa", decl.version);
        try testing.expect(decl.encoding == null);
        try testing.expect(decl.standalone == null);
    }

    // Version, encoding, and standalone with irregular whitespace around '='.
    {
        var ctx = ParseContext.init("<?xml version=\"aa\" encoding = 'bbb' standalone \t = 'yes'?>");
        const decl = (try tryParseProlog(&ctx, alloc)).?;
        try testing.expectEqualStrings("aa", decl.version);
        try testing.expectEqualStrings("bbb", decl.encoding.?);
        try testing.expectEqual(true, decl.standalone.?);
    }
}
|
||||
|
||||
/// Consumes any run of comments at the cursor, along with the whitespace that
/// follows each one. The comment text is parsed and then discarded.
fn trySkipComments(ctx: *ParseContext, alloc: Allocator) !void {
    while (true) {
        const comment = try tryParseComment(ctx, alloc);
        if (comment == null) return;
        _ = ctx.eatWs();
    }
}
|
||||
|
||||
/// Parses a `<!-- ... -->` comment and returns a copy of its body (without the
/// delimiters), or null when the cursor is not at `<!--`.
/// Reaching end of input before `-->` is `error.UnclosedComment`.
fn tryParseComment(ctx: *ParseContext, alloc: Allocator) !?[]const u8 {
    const terminator = "-->";
    if (!ctx.eatStr("<!--")) return null;

    const begin = ctx.offset;
    while (true) {
        if (ctx.eatStr(terminator)) break;
        _ = ctx.consume() catch return error.UnclosedComment;
    }

    // The cursor is just past the terminator; exclude it from the body.
    const body = ctx.source[begin .. ctx.offset - terminator.len];
    return try alloc.dupe(u8, body);
}
|
||||
|
||||
/// Maps one complete entity reference, including the leading `&` and the
/// trailing `;`, to the character it stands for.
/// Only the five predefined XML entities are recognised; anything else is
/// `error.InvalidEntity`.
fn unescapeEntity(text: []const u8) !u8 {
    const EntitySubstitution = struct { text: []const u8, replacement: u8 };

    const entities = [_]EntitySubstitution{
        .{ .text = "&lt;", .replacement = '<' },
        .{ .text = "&gt;", .replacement = '>' },
        .{ .text = "&amp;", .replacement = '&' },
        .{ .text = "&apos;", .replacement = '\'' },
        .{ .text = "&quot;", .replacement = '"' },
    };

    for (entities) |entity| {
        if (std.mem.eql(u8, text, entity.text)) return entity.replacement;
    }

    return error.InvalidEntity;
}

/// Returns a copy of `text`, allocated with `alloc`, in which every entity
/// reference is replaced by its character. The caller owns the result.
///
/// Errors: `InvalidEntity` for an `&` with no following `;` or an unknown
/// entity name, `OutOfMemory` on allocation failure. The working buffer is
/// released on every error path.
fn dupeAndUnescape(alloc: Allocator, text: []const u8) ![]const u8 {
    // Unescaping never lengthens the text, so `text.len` is an upper bound.
    const str = try alloc.alloc(u8, text.len);
    errdefer alloc.free(str);

    var j: usize = 0;
    var i: usize = 0;
    while (i < text.len) : (j += 1) {
        if (text[i] == '&') {
            const entity_end = 1 + (mem.indexOfScalarPos(u8, text, i, ';') orelse return error.InvalidEntity);
            str[j] = try unescapeEntity(text[i..entity_end]);
            i = entity_end;
        } else {
            str[j] = text[i];
            i += 1;
        }
    }

    // Shrink in place when the allocator allows it.
    if (alloc.resize(str, j)) return str[0..j];

    // Otherwise hand back an exact-size copy. If the copy fails, the errdefer
    // above releases `str`; on success it is released here.
    const exact = try alloc.dupe(u8, str[0..j]);
    alloc.free(str);
    return exact;
}
|
||||
|
||||
test "dupeAndUnescape" {
    // The failing calls below go through an arena so that any buffer left
    // behind on an error path is reclaimed when the arena is torn down.
    var arena = std.heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const alloc = arena.allocator();

    // Text without entities is copied unchanged.
    const duped = try dupeAndUnescape(testing.allocator, "test");
    defer testing.allocator.free(duped);
    try testing.expectEqualSlices(u8, "test", duped);

    // Each of the five predefined entities is replaced by its character.
    const duped2 = try dupeAndUnescape(testing.allocator, "a&lt;b&amp;c&gt;d&quot;e&apos;f&lt;");
    defer testing.allocator.free(duped2);
    try testing.expectEqualSlices(u8, "a<b&c>d\"e'f<", duped2);

    // Unterminated or unknown entities are rejected.
    try testing.expectError(error.InvalidEntity, dupeAndUnescape(alloc, "python&"));
    try testing.expectError(error.InvalidEntity, dupeAndUnescape(alloc, "python&&"));
    try testing.expectError(error.InvalidEntity, dupeAndUnescape(alloc, "python&test;"));
    try testing.expectError(error.InvalidEntity, dupeAndUnescape(alloc, "python&boa"));
}
|
||||
|
||||
test "Top level comments" {
    // The document's own arena is backed by this one, so tearing this arena
    // down also reclaims the document without an explicit `doc.deinit()`.
    var arena = std.heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();

    const source = "<?xml version='aa'?><!--comment--><python color='green'/><!--another comment-->";
    const doc = try parse(arena.allocator(), source);

    try testing.expectEqualStrings("python", doc.root.tag);
}
|
||||
|
|
@ -1798,12 +1798,16 @@ pub const DataService = struct {
|
|||
.splits => "/splits",
|
||||
.etf_profile => return false, // not served
|
||||
.meta => return false,
|
||||
// New variants wired into the endpoint mapping by
|
||||
// Milestone 1 chunk 3 (DataService methods). For now
|
||||
// they're not yet served; clients fall through to live
|
||||
// provider fetch via getClassification / getEntityFacts /
|
||||
// getEtfMetrics, which don't exist yet.
|
||||
// Endpoint mapping for these will be wired when the
|
||||
// corresponding `getClassification` / `getEntityFacts` /
|
||||
// `getEtfMetrics` service methods land. Until then,
|
||||
// server sync is a no-op for them.
|
||||
.classification, .etf_metrics, .entity_facts => return false,
|
||||
// Provider-internal cache files (ticker-map indexes)
|
||||
// are not served — clients fetch them directly from
|
||||
// the SEC. The DataService caches the JSON via
|
||||
// `Store` after fetching; the server has no role.
|
||||
.tickers_funds, .tickers_companies => return false,
|
||||
};
|
||||
|
||||
const full_url = std.fmt.allocPrint(self.allocator, "{s}/{s}{s}", .{ server_url, symbol, endpoint }) catch return false;
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue