better reporting on http related issues

This commit is contained in:
Emil Lerch 2026-05-20 11:06:48 -07:00
parent d3b1c04a3a
commit e17e76905f
Signed by: lobo
GPG key ID: A7B62D657EF764F8
2 changed files with 191 additions and 33 deletions

View file

@ -1,19 +1,55 @@
const std = @import("std");
pub const HttpError = error{
RequestFailed,
RateLimited,
Unauthorized,
NotFound,
/// HTTP 402 Payment Required used by FMP to mark symbols (mainly ETFs,
/// mutual funds, CUSIPs, and some dual-class shares) that aren't covered
/// by the caller's current plan. Providers should translate this into
/// "no data" rather than a hard failure.
PaymentRequired,
ServerError,
InvalidResponse,
OutOfMemory,
};
const log = std.log.scoped(.http);
/// Public error set for the HTTP client wrapper. Merges the stdlib
/// transport error sets directly so that callers see meaningful Zig
/// error names (`NoAddressReturned`, `ConnectionRefused`,
/// `EndOfStream`, `TlsInitializationFailed`, ...) at the boundary
/// instead of a single opaque `RequestFailed`.
///
/// Why this shape, not `anyerror` and not a hand-rolled flat set:
/// - Callers get exhaustive-switch checking from the compiler when
/// they need to discriminate between, say, DNS failures and
/// connection failures.
/// - The error names propagate through `@errorName(err)` to log
/// lines without per-stage translation tables. An operator
/// reading "DnsLookupFailed" or "ConnectFailed" learns less
/// than one reading "NoAddressReturned" the stdlib variant
/// is the truth, ours would be a lossy summary.
/// - The set is the same shape as stdlib's own
/// `http.Client.ConnectError = ConnectTcpError || RequestError`
/// pattern. Zig dedupes shared variants automatically.
///
/// HTTP-status classifications (`RateLimited`, `NotFound`, etc.) live
/// alongside the transport variants because callers consume both via
/// the same `try` sites and the same error switches.
pub const HttpError = std.Uri.ParseError ||
std.http.Client.RequestError ||
std.http.Client.Request.ReceiveHeadError ||
std.Io.Writer.Error ||
std.Io.Reader.Error ||
std.mem.Allocator.Error ||
error{
/// Last-resort transport failure. Used only when we cannot
/// recover the underlying stdlib error (e.g., across the
/// retry boundary in `request()` after multiple distinct
/// failures collapsed into a single retry exhaustion).
/// Adding new uses of this variant is a smell prefer
/// preserving the real error.
RequestFailed,
// HTTP status classifications (no stdlib equivalent)
RateLimited,
Unauthorized,
NotFound,
/// HTTP 402 Payment Required used by FMP to mark symbols (mainly ETFs,
/// mutual funds, CUSIPs, and some dual-class shares) that aren't covered
/// by the caller's current plan. Providers should translate this into
/// "no data" rather than a hard failure.
PaymentRequired,
ServerError,
InvalidResponse,
};
pub const Response = struct {
status: std.http.Status,
@ -127,15 +163,26 @@ pub const Client = struct {
}
pub fn request(self: *Client, method: std.http.Method, url: []const u8, body: ?[]const u8, extra_headers: []const std.http.Header) HttpError!Response {
// Preserves the last underlying error across retry attempts.
// Without this, the retry loop used to collapse every attempt's
// distinct failure into a single `HttpError.RequestFailed`,
// which is exactly the kind of opaque error that hides root
// causes. A DNS-truncation bug surfaced for hours as bare
// "RequestFailed" because the per-attempt error was discarded
// by `catch {}`. Now the caller's `@errorName(err)` reports
// the real cause (e.g., `NoAddressReturned`).
var attempt: u8 = 0;
var last_err: HttpError = HttpError.RequestFailed;
while (true) : (attempt += 1) {
const response = self.doRequest(method, url, body, extra_headers) catch {
if (attempt >= self.max_retries) return HttpError.RequestFailed;
const response = self.doRequest(method, url, body, extra_headers) catch |err| {
last_err = err;
if (attempt >= self.max_retries) return last_err;
self.backoffSleep(attempt);
continue;
};
return classifyResponse(response) catch |err| {
if (err == HttpError.ServerError and attempt < self.max_retries) {
last_err = err;
self.backoffSleep(attempt);
continue;
}
@ -150,28 +197,97 @@ pub const Client = struct {
}
fn doRequest(self: *Client, method: std.http.Method, url: []const u8, body: ?[]const u8, extra_headers: []const std.http.Header) HttpError!Response {
const uri = std.Uri.parse(url) catch return HttpError.RequestFailed;
// Per-stage timing for diagnosing where transport-level stalls
// occur. Zig 0.16's `std.http.Client` exposes no per-request
// receive timeout (only `connectTcpOptions.timeout`), so a
// request can hang forever in `receiveHead` or `streamRemaining`
// with no upstream visibility. The per-stage logging lets the
// operator pinpoint exactly which network operation wedged
// once they Ctrl-C, even though no `catch` will fire on a
// truly infinite stall.
//
// Each stage that may fail logs the underlying stdlib error
// name verbatim before propagating it through `HttpError`.
// The error returned to the caller IS the underlying stdlib
// error (`NoAddressReturned`, `ConnectionRefused`,
// `EndOfStream`, `TlsInitializationFailed`, ...) `HttpError`
// is a merged superset of the relevant stdlib error sets, so
// `try` here propagates the original error verbatim. Earlier
// versions of this function caught every error and rethrew as
// a single `HttpError.RequestFailed`, which collapsed six
// distinct failure modes into one and made operator
// diagnosis impossible (this cost hours of debugging at
// least once when an intermittent DNS-truncation bug
// surfaced as bare "RequestFailed").
//
// wall-clock required: per-stage transport timing. Uses
// `.awake` (monotonic) so a system clock jump mid-request
// doesn't produce nonsense elapsed values.
const t_start = std.Io.Timestamp.now(self.io, .awake).nanoseconds;
var t_stage = t_start;
const stageElapsedMs = struct {
fn f(prev: *i96, io: std.Io) i64 {
const now = std.Io.Timestamp.now(io, .awake).nanoseconds;
const delta_ns = now - prev.*;
prev.* = now;
return @intCast(@divTrunc(delta_ns, std.time.ns_per_ms));
}
}.f;
const uri = std.Uri.parse(url) catch |err| {
log.warn("http {s}: stage=uri_parse err={s} url={s}", .{ @tagName(method), @errorName(err), url });
return err;
};
const ms_uri_parse = stageElapsedMs(&t_stage, self.io);
var req = self.http_client.request(method, uri, .{
.redirect_behavior = @enumFromInt(3),
.extra_headers = extra_headers,
}) catch return HttpError.RequestFailed;
}) catch |err| {
// The connect stage covers DNS lookup, TCP connect, and
// TLS handshake. Logging at warn level (rather than debug)
// because DNS / connectivity failures are exactly what
// operators need to see immediately.
log.warn("http {s}: stage=connect err={s} elapsed_ms={d} url={s}", .{ @tagName(method), @errorName(err), stageElapsedMs(&t_stage, self.io), url });
return err;
};
defer req.deinit();
const ms_connect = stageElapsedMs(&t_stage, self.io);
if (body) |payload| {
var send_buf: [4096]u8 = undefined;
req.transfer_encoding = .{ .content_length = payload.len };
var bw = req.sendBodyUnflushed(&send_buf) catch return HttpError.RequestFailed;
bw.writer.writeAll(payload) catch return HttpError.RequestFailed;
bw.end() catch return HttpError.RequestFailed;
req.connection.?.flush() catch return HttpError.RequestFailed;
var bw = req.sendBodyUnflushed(&send_buf) catch |err| {
log.warn("http {s}: stage=send_body_open err={s} elapsed_ms={d} url={s}", .{ @tagName(method), @errorName(err), stageElapsedMs(&t_stage, self.io), url });
return err;
};
bw.writer.writeAll(payload) catch |err| {
log.warn("http {s}: stage=send_body_write err={s} elapsed_ms={d} url={s}", .{ @tagName(method), @errorName(err), stageElapsedMs(&t_stage, self.io), url });
return err;
};
bw.end() catch |err| {
log.warn("http {s}: stage=send_body_end err={s} elapsed_ms={d} url={s}", .{ @tagName(method), @errorName(err), stageElapsedMs(&t_stage, self.io), url });
return err;
};
req.connection.?.flush() catch |err| {
log.warn("http {s}: stage=send_body_flush err={s} elapsed_ms={d} url={s}", .{ @tagName(method), @errorName(err), stageElapsedMs(&t_stage, self.io), url });
return err;
};
} else {
req.sendBodiless() catch return HttpError.RequestFailed;
req.sendBodiless() catch |err| {
log.warn("http {s}: stage=send_bodiless err={s} elapsed_ms={d} url={s}", .{ @tagName(method), @errorName(err), stageElapsedMs(&t_stage, self.io), url });
return err;
};
}
const ms_send = stageElapsedMs(&t_stage, self.io);
// Matches the default redirect capacity in std.http.Client.fetch.
var redirect_buffer: [8 * 1024]u8 = undefined;
var response = req.receiveHead(&redirect_buffer) catch return HttpError.RequestFailed;
var response = req.receiveHead(&redirect_buffer) catch |err| {
log.warn("http {s}: stage=receive_head err={s} elapsed_ms={d} url={s}", .{ @tagName(method), @errorName(err), stageElapsedMs(&t_stage, self.io), url });
return err;
};
const ms_receive_head = stageElapsedMs(&t_stage, self.io);
// Capture the ETag (if any) from the response head BEFORE
// draining the body. `Response.reader()` invalidates
@ -181,8 +297,7 @@ pub const Client = struct {
var it = response.head.iterateHeaders();
while (it.next()) |h| {
if (std.ascii.eqlIgnoreCase(h.name, "etag")) {
const dup = self.allocator.dupe(u8, h.value) catch
return HttpError.OutOfMemory;
const dup = self.allocator.dupe(u8, h.value) catch |err| return err;
break :blk dup;
}
}
@ -205,13 +320,30 @@ pub const Client = struct {
var decompress: std.http.Decompress = undefined;
var decompress_buffer: [64 * 1024]u8 = undefined;
const reader = response.readerDecompressing(&transfer_buffer, &decompress, &decompress_buffer);
_ = reader.streamRemaining(&aw.writer) catch {
return HttpError.RequestFailed;
_ = reader.streamRemaining(&aw.writer) catch |err| {
log.warn("http {s}: stage=stream_body err={s} elapsed_ms={d} url={s}", .{ @tagName(method), @errorName(err), stageElapsedMs(&t_stage, self.io), url });
return err;
};
const ms_body = stageElapsedMs(&t_stage, self.io);
const resp_body = aw.toOwnedSlice() catch {
return HttpError.OutOfMemory;
};
const resp_body = aw.toOwnedSlice() catch |err| return err;
const total_ms = @divTrunc(std.Io.Timestamp.now(self.io, .awake).nanoseconds - t_start, std.time.ns_per_ms);
log.debug(
"http {s}: ok status={d} bytes={d} total_ms={d} (uri_parse={d} connect={d} send={d} receive_head={d} body={d}) url={s}",
.{
@tagName(method),
@intFromEnum(response.head.status),
resp_body.len,
total_ms,
ms_uri_parse,
ms_connect,
ms_send,
ms_receive_head,
ms_body,
url,
},
);
return .{
.status = response.head.status,

View file

@ -1489,13 +1489,35 @@ pub const DataService = struct {
/// One attempt at syncing a file from the server. Archives a torn
/// body when detected but does NOT retry the caller decides that.
fn tryOneSync(self: *DataService, symbol: []const u8, data_type: cache.DataType, full_url: []const u8) SyncAttempt {
log.debug("{s}: syncing {s} from server", .{ symbol, @tagName(data_type) });
// Per-attempt start/finish trace. The "started" line emits
// before any blocking call; the "finished" line emits on every
// exit path. If a sync wedges in `client.get`, you'll see the
// started line with no matching finished line the missing
// finished entries identify which symbols are stuck. Pair this
// with the per-stage `http: stage=...` lines from `net/http.zig`
// to pinpoint which transport stage stalled.
//
// wall-clock required: per-attempt elapsed for diagnosing
// partial-success/stall patterns under parallel fan-out.
// `.awake` (monotonic) avoids spurious negatives on clock skew.
const t_start = std.Io.Timestamp.now(self.io, .awake).nanoseconds;
log.debug("{s}: tryOneSync started ({s})", .{ symbol, @tagName(data_type) });
var client = http.Client.init(self.io, self.allocator);
defer client.deinit();
var response = client.get(full_url) catch |err| {
log.debug("{s}: server sync failed for {s}: {s}", .{ symbol, @tagName(data_type), @errorName(err) });
const elapsed_ms = @divTrunc(std.Io.Timestamp.now(self.io, .awake).nanoseconds - t_start, std.time.ns_per_ms);
// Operator-visible: surfaces meaningful failures
// (`NoAddressReturned`, `ConnectionRefused`,
// `TlsInitializationFailed`, etc.) instead of swallowing
// them. Network-shaped errors are exactly what the user
// needs to see when sync stops working keeping this at
// debug level meant a DNS-truncation bug was visible only
// to anyone running with debug logging on, which cost
// hours of diagnosis time.
log.warn("{s}: server sync failed for {s}: {s} (elapsed_ms={d})", .{ symbol, @tagName(data_type), @errorName(err), elapsed_ms });
log.debug("{s}: tryOneSync finished ({s}) result=net_err elapsed_ms={d}", .{ symbol, @tagName(data_type), elapsed_ms });
return .net_err;
};
defer response.deinit();
@ -1535,6 +1557,7 @@ pub const DataService = struct {
"{s}: {s} server response failed integrity check ({d} bytes, expected sha256={s}, actual={s}) — archived under _torn/, not writing to cache",
.{ symbol, @tagName(data_type), response.body.len, m.expected_hex, m.actual_hex },
);
log.debug("{s}: tryOneSync finished ({s}) result=torn elapsed_ms={d}", .{ symbol, @tagName(data_type), @divTrunc(std.Io.Timestamp.now(self.io, .awake).nanoseconds - t_start, std.time.ns_per_ms) });
return .torn;
},
.ok, .not_applicable => {},
@ -1578,6 +1601,7 @@ pub const DataService = struct {
"{s}: rejecting torn {s} server response ({d} bytes) — archived under _torn/, not writing to cache",
.{ symbol, @tagName(data_type), response.body.len },
);
log.debug("{s}: tryOneSync finished ({s}) result=torn elapsed_ms={d}", .{ symbol, @tagName(data_type), @divTrunc(std.Io.Timestamp.now(self.io, .awake).nanoseconds - t_start, std.time.ns_per_ms) });
return .torn;
}
@ -1585,9 +1609,11 @@ pub const DataService = struct {
var s = self.store();
s.writeRaw(symbol, data_type, response.body) catch |err| {
log.debug("{s}: failed to write synced {s} to cache: {s}", .{ symbol, @tagName(data_type), @errorName(err) });
log.debug("{s}: tryOneSync finished ({s}) result=net_err elapsed_ms={d}", .{ symbol, @tagName(data_type), @divTrunc(std.Io.Timestamp.now(self.io, .awake).nanoseconds - t_start, std.time.ns_per_ms) });
return .net_err;
};
log.debug("{s}: synced {s} from server ({d} bytes)", .{ symbol, @tagName(data_type), response.body.len });
log.debug("{s}: tryOneSync finished ({s}) result=ok elapsed_ms={d}", .{ symbol, @tagName(data_type), @divTrunc(std.Io.Timestamp.now(self.io, .awake).nanoseconds - t_start, std.time.ns_per_ms) });
return .ok;
}