From e17e76905f415fadf5391718a8f977ab972f3095 Mon Sep 17 00:00:00 2001 From: Emil Lerch Date: Wed, 20 May 2026 11:06:48 -0700 Subject: [PATCH] better reporting on http related issues --- src/net/http.zig | 194 +++++++++++++++++++++++++++++++++++++++-------- src/service.zig | 30 +++++++- 2 files changed, 191 insertions(+), 33 deletions(-) diff --git a/src/net/http.zig b/src/net/http.zig index 97c89c4..886d248 100644 --- a/src/net/http.zig +++ b/src/net/http.zig @@ -1,19 +1,55 @@ const std = @import("std"); -pub const HttpError = error{ - RequestFailed, - RateLimited, - Unauthorized, - NotFound, - /// HTTP 402 Payment Required — used by FMP to mark symbols (mainly ETFs, - /// mutual funds, CUSIPs, and some dual-class shares) that aren't covered - /// by the caller's current plan. Providers should translate this into - /// "no data" rather than a hard failure. - PaymentRequired, - ServerError, - InvalidResponse, - OutOfMemory, -}; +const log = std.log.scoped(.http); + +/// Public error set for the HTTP client wrapper. Merges the stdlib +/// transport error sets directly so that callers see meaningful Zig +/// error names (`NoAddressReturned`, `ConnectionRefused`, +/// `EndOfStream`, `TlsInitializationFailed`, ...) at the boundary +/// instead of a single opaque `RequestFailed`. +/// +/// Why this shape, not `anyerror` and not a hand-rolled flat set: +/// - Callers get exhaustive-switch checking from the compiler when +/// they need to discriminate between, say, DNS failures and +/// connection failures. +/// - The error names propagate through `@errorName(err)` to log +/// lines without per-stage translation tables. An operator +/// reading "DnsLookupFailed" or "ConnectFailed" learns less +/// than one reading "NoAddressReturned" — the stdlib variant +/// is the truth, ours would be a lossy summary. +/// - The set is the same shape as stdlib's own +/// `http.Client.ConnectError = ConnectTcpError || RequestError` +/// pattern. Zig dedupes shared variants automatically. +/// +/// HTTP-status classifications (`RateLimited`, `NotFound`, etc.) live +/// alongside the transport variants because callers consume both via +/// the same `try` sites and the same error switches. +pub const HttpError = std.Uri.ParseError || + std.http.Client.RequestError || + std.http.Client.Request.ReceiveHeadError || + std.Io.Writer.Error || + std.Io.Reader.Error || + std.mem.Allocator.Error || + error{ + /// Last-resort transport failure. Used only when we cannot + /// recover the underlying stdlib error (e.g., across the + /// retry boundary in `request()` after multiple distinct + /// failures collapsed into a single retry exhaustion). + /// Adding new uses of this variant is a smell — prefer + /// preserving the real error. + RequestFailed, + // ── HTTP status classifications (no stdlib equivalent) ── + RateLimited, + Unauthorized, + NotFound, + /// HTTP 402 Payment Required — used by FMP to mark symbols (mainly ETFs, + /// mutual funds, CUSIPs, and some dual-class shares) that aren't covered + /// by the caller's current plan. Providers should translate this into + /// "no data" rather than a hard failure. + PaymentRequired, + ServerError, + InvalidResponse, + }; pub const Response = struct { status: std.http.Status, @@ -127,15 +163,26 @@ pub const Client = struct { } pub fn request(self: *Client, method: std.http.Method, url: []const u8, body: ?[]const u8, extra_headers: []const std.http.Header) HttpError!Response { + // Preserves the last underlying error across retry attempts. + // Without this, the retry loop used to collapse every attempt's + // distinct failure into a single `HttpError.RequestFailed`, + // which is exactly the kind of opaque error that hides root + // causes. A DNS-truncation bug surfaced for hours as bare + // "RequestFailed" because the per-attempt error was discarded + // by `catch {}`. Now the caller's `@errorName(err)` reports + // the real cause (e.g., `NoAddressReturned`). var attempt: u8 = 0; + var last_err: HttpError = HttpError.RequestFailed; while (true) : (attempt += 1) { - const response = self.doRequest(method, url, body, extra_headers) catch { - if (attempt >= self.max_retries) return HttpError.RequestFailed; + const response = self.doRequest(method, url, body, extra_headers) catch |err| { + last_err = err; + if (attempt >= self.max_retries) return last_err; self.backoffSleep(attempt); continue; }; return classifyResponse(response) catch |err| { if (err == HttpError.ServerError and attempt < self.max_retries) { + last_err = err; self.backoffSleep(attempt); continue; } @@ -150,28 +197,97 @@ pub const Client = struct { } fn doRequest(self: *Client, method: std.http.Method, url: []const u8, body: ?[]const u8, extra_headers: []const std.http.Header) HttpError!Response { - const uri = std.Uri.parse(url) catch return HttpError.RequestFailed; + // Per-stage timing for diagnosing where transport-level stalls + // occur. Zig 0.16's `std.http.Client` exposes no per-request + // receive timeout (only `connectTcpOptions.timeout`), so a + // request can hang forever in `receiveHead` or `streamRemaining` + // with no upstream visibility. The per-stage logging lets the + // operator pinpoint exactly which network operation wedged + // once they Ctrl-C, even though no `catch` will fire on a + // truly infinite stall. + // + // Each stage that may fail logs the underlying stdlib error + // name verbatim before propagating it through `HttpError`. + // The error returned to the caller IS the underlying stdlib + // error (`NoAddressReturned`, `ConnectionRefused`, + // `EndOfStream`, `TlsInitializationFailed`, ...) — `HttpError` + // is a merged superset of the relevant stdlib error sets, so + // `try` here propagates the original error verbatim. Earlier + // versions of this function caught every error and rethrew as + // a single `HttpError.RequestFailed`, which collapsed six + // distinct failure modes into one and made operator + // diagnosis impossible (this cost hours of debugging at + // least once when an intermittent DNS-truncation bug + // surfaced as bare "RequestFailed"). + // + // wall-clock required: per-stage transport timing. Uses + // `.awake` (monotonic) so a system clock jump mid-request + // doesn't produce nonsense elapsed values. + const t_start = std.Io.Timestamp.now(self.io, .awake).nanoseconds; + var t_stage = t_start; + const stageElapsedMs = struct { + fn f(prev: *i96, io: std.Io) i64 { + const now = std.Io.Timestamp.now(io, .awake).nanoseconds; + const delta_ns = now - prev.*; + prev.* = now; + return @intCast(@divTrunc(delta_ns, std.time.ns_per_ms)); + } + }.f; + + const uri = std.Uri.parse(url) catch |err| { + log.warn("http {s}: stage=uri_parse err={s} url={s}", .{ @tagName(method), @errorName(err), url }); + return err; + }; + const ms_uri_parse = stageElapsedMs(&t_stage, self.io); var req = self.http_client.request(method, uri, .{ .redirect_behavior = @enumFromInt(3), .extra_headers = extra_headers, - }) catch return HttpError.RequestFailed; + }) catch |err| { + // The connect stage covers DNS lookup, TCP connect, and + // TLS handshake. Logging at warn level (rather than debug) + // because DNS / connectivity failures are exactly what + // operators need to see immediately. + log.warn("http {s}: stage=connect err={s} elapsed_ms={d} url={s}", .{ @tagName(method), @errorName(err), stageElapsedMs(&t_stage, self.io), url }); + return err; + }; defer req.deinit(); + const ms_connect = stageElapsedMs(&t_stage, self.io); if (body) |payload| { var send_buf: [4096]u8 = undefined; req.transfer_encoding = .{ .content_length = payload.len }; - var bw = req.sendBodyUnflushed(&send_buf) catch return HttpError.RequestFailed; - bw.writer.writeAll(payload) catch return HttpError.RequestFailed; - bw.end() catch return HttpError.RequestFailed; - req.connection.?.flush() catch return HttpError.RequestFailed; + var bw = req.sendBodyUnflushed(&send_buf) catch |err| { + log.warn("http {s}: stage=send_body_open err={s} elapsed_ms={d} url={s}", .{ @tagName(method), @errorName(err), stageElapsedMs(&t_stage, self.io), url }); + return err; + }; + bw.writer.writeAll(payload) catch |err| { + log.warn("http {s}: stage=send_body_write err={s} elapsed_ms={d} url={s}", .{ @tagName(method), @errorName(err), stageElapsedMs(&t_stage, self.io), url }); + return err; + }; + bw.end() catch |err| { + log.warn("http {s}: stage=send_body_end err={s} elapsed_ms={d} url={s}", .{ @tagName(method), @errorName(err), stageElapsedMs(&t_stage, self.io), url }); + return err; + }; + req.connection.?.flush() catch |err| { + log.warn("http {s}: stage=send_body_flush err={s} elapsed_ms={d} url={s}", .{ @tagName(method), @errorName(err), stageElapsedMs(&t_stage, self.io), url }); + return err; + }; } else { - req.sendBodiless() catch return HttpError.RequestFailed; + req.sendBodiless() catch |err| { + log.warn("http {s}: stage=send_bodiless err={s} elapsed_ms={d} url={s}", .{ @tagName(method), @errorName(err), stageElapsedMs(&t_stage, self.io), url }); + return err; + }; } + const ms_send = stageElapsedMs(&t_stage, self.io); // Matches the default redirect capacity in std.http.Client.fetch. var redirect_buffer: [8 * 1024]u8 = undefined; - var response = req.receiveHead(&redirect_buffer) catch return HttpError.RequestFailed; + var response = req.receiveHead(&redirect_buffer) catch |err| { + log.warn("http {s}: stage=receive_head err={s} elapsed_ms={d} url={s}", .{ @tagName(method), @errorName(err), stageElapsedMs(&t_stage, self.io), url }); + return err; + }; + const ms_receive_head = stageElapsedMs(&t_stage, self.io); // Capture the ETag (if any) from the response head BEFORE // draining the body. `Response.reader()` invalidates @@ -181,8 +297,7 @@ pub const Client = struct { var it = response.head.iterateHeaders(); while (it.next()) |h| { if (std.ascii.eqlIgnoreCase(h.name, "etag")) { - const dup = self.allocator.dupe(u8, h.value) catch - return HttpError.OutOfMemory; + const dup = self.allocator.dupe(u8, h.value) catch |err| return err; break :blk dup; } } @@ -205,13 +320,30 @@ pub const Client = struct { var decompress: std.http.Decompress = undefined; var decompress_buffer: [64 * 1024]u8 = undefined; const reader = response.readerDecompressing(&transfer_buffer, &decompress, &decompress_buffer); - _ = reader.streamRemaining(&aw.writer) catch { - return HttpError.RequestFailed; + _ = reader.streamRemaining(&aw.writer) catch |err| { + log.warn("http {s}: stage=stream_body err={s} elapsed_ms={d} url={s}", .{ @tagName(method), @errorName(err), stageElapsedMs(&t_stage, self.io), url }); + return err; }; + const ms_body = stageElapsedMs(&t_stage, self.io); - const resp_body = aw.toOwnedSlice() catch { - return HttpError.OutOfMemory; - }; + const resp_body = aw.toOwnedSlice() catch |err| return err; + + const total_ms = @divTrunc(std.Io.Timestamp.now(self.io, .awake).nanoseconds - t_start, std.time.ns_per_ms); + log.debug( + "http {s}: ok status={d} bytes={d} total_ms={d} (uri_parse={d} connect={d} send={d} receive_head={d} body={d}) url={s}", + .{ + @tagName(method), + @intFromEnum(response.head.status), + resp_body.len, + total_ms, + ms_uri_parse, + ms_connect, + ms_send, + ms_receive_head, + ms_body, + url, + }, + ); return .{ .status = response.head.status, diff --git a/src/service.zig b/src/service.zig index 64c886f..ad1fc81 100644 --- a/src/service.zig +++ b/src/service.zig @@ -1489,13 +1489,35 @@ pub const DataService = struct { /// One attempt at syncing a file from the server. Archives a torn /// body when detected but does NOT retry — the caller decides that. fn tryOneSync(self: *DataService, symbol: []const u8, data_type: cache.DataType, full_url: []const u8) SyncAttempt { - log.debug("{s}: syncing {s} from server", .{ symbol, @tagName(data_type) }); + // Per-attempt start/finish trace. The "started" line emits + // before any blocking call; the "finished" line emits on every + // exit path. If a sync wedges in `client.get`, you'll see the + // started line with no matching finished line — the missing + // finished entries identify which symbols are stuck. Pair this + // with the per-stage `http: stage=...` lines from `net/http.zig` + // to pinpoint which transport stage stalled. + // + // wall-clock required: per-attempt elapsed for diagnosing + // partial-success/stall patterns under parallel fan-out. + // `.awake` (monotonic) avoids spurious negatives on clock skew. + const t_start = std.Io.Timestamp.now(self.io, .awake).nanoseconds; + log.debug("{s}: tryOneSync started ({s})", .{ symbol, @tagName(data_type) }); var client = http.Client.init(self.io, self.allocator); defer client.deinit(); var response = client.get(full_url) catch |err| { - log.debug("{s}: server sync failed for {s}: {s}", .{ symbol, @tagName(data_type), @errorName(err) }); + const elapsed_ms = @divTrunc(std.Io.Timestamp.now(self.io, .awake).nanoseconds - t_start, std.time.ns_per_ms); + // Operator-visible: surfaces meaningful failures + // (`NoAddressReturned`, `ConnectionRefused`, + // `TlsInitializationFailed`, etc.) instead of swallowing + // them. Network-shaped errors are exactly what the user + // needs to see when sync stops working — keeping this at + // debug level meant a DNS-truncation bug was visible only + // to anyone running with debug logging on, which cost + // hours of diagnosis time. + log.warn("{s}: server sync failed for {s}: {s} (elapsed_ms={d})", .{ symbol, @tagName(data_type), @errorName(err), elapsed_ms }); + log.debug("{s}: tryOneSync finished ({s}) result=net_err elapsed_ms={d}", .{ symbol, @tagName(data_type), elapsed_ms }); return .net_err; }; defer response.deinit(); @@ -1535,6 +1557,7 @@ pub const DataService = struct { "{s}: {s} server response failed integrity check ({d} bytes, expected sha256={s}, actual={s}) — archived under _torn/, not writing to cache", .{ symbol, @tagName(data_type), response.body.len, m.expected_hex, m.actual_hex }, ); + log.debug("{s}: tryOneSync finished ({s}) result=torn elapsed_ms={d}", .{ symbol, @tagName(data_type), @divTrunc(std.Io.Timestamp.now(self.io, .awake).nanoseconds - t_start, std.time.ns_per_ms) }); return .torn; }, .ok, .not_applicable => {}, @@ -1578,6 +1601,7 @@ pub const DataService = struct { "{s}: rejecting torn {s} server response ({d} bytes) — archived under _torn/, not writing to cache", .{ symbol, @tagName(data_type), response.body.len }, ); + log.debug("{s}: tryOneSync finished ({s}) result=torn elapsed_ms={d}", .{ symbol, @tagName(data_type), @divTrunc(std.Io.Timestamp.now(self.io, .awake).nanoseconds - t_start, std.time.ns_per_ms) }); return .torn; } @@ -1585,9 +1609,11 @@ pub const DataService = struct { var s = self.store(); s.writeRaw(symbol, data_type, response.body) catch |err| { log.debug("{s}: failed to write synced {s} to cache: {s}", .{ symbol, @tagName(data_type), @errorName(err) }); + log.debug("{s}: tryOneSync finished ({s}) result=net_err elapsed_ms={d}", .{ symbol, @tagName(data_type), @divTrunc(std.Io.Timestamp.now(self.io, .awake).nanoseconds - t_start, std.time.ns_per_ms) }); return .net_err; }; log.debug("{s}: synced {s} from server ({d} bytes)", .{ symbol, @tagName(data_type), response.body.len }); + log.debug("{s}: tryOneSync finished ({s}) result=ok elapsed_ms={d}", .{ symbol, @tagName(data_type), @divTrunc(std.Io.Timestamp.now(self.io, .awake).nanoseconds - t_start, std.time.ns_per_ms) }); return .ok; }