diff --git a/build.zig b/build.zig
index 72d5034..e6543e5 100644
--- a/build.zig
+++ b/build.zig
@@ -153,4 +153,185 @@ pub fn build(b: *std.Build) void {
     //
     // Lastly, the Zig build system is relatively simple and self-contained,
     // and reading its source code will allow you to master it.
+
+    // Benchmark step: generates cached test data, then times the parser on
+    // each format with hyperfine. Debug builds are bumped to ReleaseSafe so
+    // the timings are meaningful.
+    const benchmark_step = b.step("benchmark", "Run benchmarks with hyperfine");
+    const benchmark_optimize = if (optimize == .Debug) .ReleaseSafe else optimize;
+    const benchmark_record_count = 100_000;
+    const include_jsonl = b.option(bool, "benchmark-jsonl", "Include JSONL in benchmarks (slow)") orelse false;
+
+    // Check for hyperfine. The check has no dependencies of its own, so a
+    // missing tool is reported without waiting for the benchmark builds.
+    const check_hyperfine = b.addSystemCommand(&.{ "sh", "-c", "command -v hyperfine >/dev/null 2>&1 || (echo 'Error: hyperfine not found. Install it with: cargo install hyperfine' >&2 && exit 1)" });
+
+    // Build test data generator
+    const gen_exe = b.addExecutable(.{
+        .name = "generate_test_data",
+        .root_module = b.createModule(.{
+            .root_source_file = b.path("src/generate_test_data.zig"),
+            .target = target,
+            .optimize = benchmark_optimize,
+        }),
+    });
+    const install_gen = b.addInstallArtifact(gen_exe, .{});
+
+    // Rebuild main executable with benchmark optimization
+    const benchmark_exe = b.addExecutable(.{
+        .name = "srf",
+        .root_module = b.createModule(.{
+            .root_source_file = b.path("src/main.zig"),
+            .target = target,
+            .optimize = benchmark_optimize,
+            .imports = &.{
+                .{ .name = "srf", .module = mod },
+            },
+        }),
+    });
+    const install_benchmark_exe = b.addInstallArtifact(benchmark_exe, .{});
+
+    const run_benchmark = BenchmarkStep.create(b, .{
+        .gen_exe = gen_exe,
+        .srf_exe = benchmark_exe,
+        .record_count = benchmark_record_count,
+        .include_jsonl = include_jsonl,
+    });
+    // The runner invokes the installed binaries by path, so it depends on the
+    // install steps directly rather than through the hyperfine check.
+    run_benchmark.step.dependOn(&check_hyperfine.step);
+    run_benchmark.step.dependOn(&install_gen.step);
+    run_benchmark.step.dependOn(&install_benchmark_exe.step);
+    benchmark_step.dependOn(&run_benchmark.step);
 }
+
+/// Custom build step that generates (and caches) the benchmark input files
+/// and then runs hyperfine against the installed `srf` executable.
+const BenchmarkStep = struct {
+    step: std.Build.Step,
+    gen_exe: *std.Build.Step.Compile,
+    srf_exe: *std.Build.Step.Compile,
+    record_count: usize,
+    include_jsonl: bool,
+
+    /// One benchmark case: the generator format name, the extension of the
+    /// generated data file, and the parser argument handed to `srf`.
+    const Format = struct {
+        name: []const u8,
+        ext: []const u8,
+        parser: []const u8,
+        /// When set, the case only runs if JSONL benchmarking was requested.
+        jsonl_only: bool = false,
+    };
+
+    /// Listed in the order the commands are handed to hyperfine.
+    const formats = [_]Format{
+        .{ .name = "srf-compact", .ext = "srf", .parser = "srf" },
+        .{ .name = "srf-long", .ext = "srf", .parser = "srf" },
+        .{ .name = "json", .ext = "json", .parser = "json" },
+        .{ .name = "jsonl", .ext = "jsonl", .parser = "jsonl", .jsonl_only = true },
+    };
+
+    /// Upper bound on the generator output buffered in memory.
+    const max_generated_bytes = 100 * 1024 * 1024;
+
+    /// Allocates the step from the build allocator; panics on OOM.
+    pub fn create(owner: *std.Build, options: struct {
+        gen_exe: *std.Build.Step.Compile,
+        srf_exe: *std.Build.Step.Compile,
+        record_count: usize,
+        include_jsonl: bool,
+    }) *BenchmarkStep {
+        const self = owner.allocator.create(BenchmarkStep) catch @panic("OOM");
+        self.* = .{
+            .step = std.Build.Step.init(.{
+                .id = .custom,
+                .name = "run benchmark",
+                .owner = owner,
+                .makeFn = make,
+            }),
+            .gen_exe = options.gen_exe,
+            .srf_exe = options.srf_exe,
+            .record_count = options.record_count,
+            .include_jsonl = options.include_jsonl,
+        };
+        return self;
+    }
+
+    /// Step entry point: ensures the data files exist, then runs hyperfine.
+    fn make(step: *std.Build.Step, _: std.Build.Step.MakeOptions) !void {
+        const b = step.owner;
+        const self: *BenchmarkStep = @fieldParentPtr("step", step);
+
+        const gen_path = b.getInstallPath(.bin, self.gen_exe.name);
+        const exe_path = b.getInstallPath(.bin, self.srf_exe.name);
+        const count_str = b.fmt("{d}", .{self.record_count});
+
+        var argv: std.ArrayList([]const u8) = .empty;
+        defer argv.deinit(b.allocator);
+        try argv.appendSlice(b.allocator, &.{ "hyperfine", "-w", "2" });
+
+        for (formats) |format| {
+            if (format.jsonl_only and !self.include_jsonl) continue;
+            const data_path = try ensureTestFile(b, gen_path, format, count_str);
+            // hyperfine runs each command through a shell, so quote both paths;
+            // an install prefix or cache root containing spaces then still works.
+            try argv.append(b.allocator, b.fmt("'{s}' {s} <'{s}'", .{ exe_path, format.parser, data_path }));
+        }
+
+        var child = std.process.Child.init(argv.items, b.allocator);
+
+        // We need to lock stderr so hyperfine can output progress in place
+        std.debug.lockStdErr();
+        defer std.debug.unlockStdErr();
+
+        try child.spawn();
+        const term = try child.wait();
+
+        if (term != .Exited or term.Exited != 0)
+            return error.BenchmarkFailed;
+    }
+
+    /// Returns the path of the cached data file for `format`, running the
+    /// installed generator first when the file does not exist yet.
+    fn ensureTestFile(
+        b: *std.Build,
+        gen_path: []const u8,
+        format: Format,
+        count_str: []const u8,
+    ) ![]const u8 {
+        // Key the cache directory on the format name and record count.
+        var hasher = std.hash.Wyhash.init(0);
+        hasher.update(format.name);
+        hasher.update(count_str);
+        const hash_str = b.fmt("{x}", .{hasher.final()});
+
+        const cache_dir = try b.cache_root.join(b.allocator, &.{ "o", hash_str });
+        try std.fs.cwd().makePath(cache_dir);
+
+        const filename = b.fmt("test-{s}.{s}", .{ format.name, format.ext });
+        const filepath = b.pathJoin(&.{ cache_dir, filename });
+
+        // Reuse a previously generated file.
+        if (std.fs.cwd().access(filepath, .{})) {
+            return filepath;
+        } else |_| {}
+
+        var child = std.process.Child.init(&.{ gen_path, format.name, count_str }, b.allocator);
+        child.stdout_behavior = .Pipe;
+        try child.spawn();
+
+        const output = try child.stdout.?.readToEndAlloc(b.allocator, max_generated_bytes);
+        defer b.allocator.free(output);
+
+        const term = try child.wait();
+        if (term != .Exited or term.Exited != 0) return error.GenerationFailed;
+
+        // Write under a temporary name and rename, so an interrupted run never
+        // leaves a truncated file that later runs would treat as cached.
+        const tmp_path = b.fmt("{s}.tmp", .{filepath});
+        try std.fs.cwd().writeFile(.{ .sub_path = tmp_path, .data = output });
+        try std.fs.cwd().rename(tmp_path, filepath);
+        return filepath;
+    }
+};
diff --git a/src/generate_test_data.zig b/src/generate_test_data.zig
new file mode 100644
index 0000000..19af81e
--- /dev/null
+++ b/src/generate_test_data.zig
@@ -0,0 +1,68 @@
+//! Writes synthetic benchmark records to stdout in the requested format.
+
+const std = @import("std");
+
+/// Number of records written when no count is given on the command line.
+const record_count = 100_000;
+
+/// One JSON object per record, shared by the `jsonl` and `json` outputs so
+/// both formats carry identical records.
+const json_record_fmt = "{{\"id\":{d},\"name\":\"User {d}\",\"email\":\"user{d}@example.com\",\"active\":true,\"score\":{d}.5,\"bio\":\"A \\\"complex\\\" string with\\nnewlines and \\\\backslashes\",\"status\":\"active\"}}";
+
+pub fn main() !void {
+    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
+    defer _ = gpa.deinit();
+    const allocator = gpa.allocator();
+
+    const args = try std.process.argsAlloc(allocator);
+    defer std.process.argsFree(allocator, args);
+
+    if (args.len < 2) {
+        std.debug.print("Usage: {s} <srf-compact|srf-long|jsonl|json> [record_count]\n", .{args[0]});
+        std.process.exit(1);
+    }
+
+    const format = args[1];
+    const count = if (args.len >= 3)
+        try std.fmt.parseInt(usize, args[2], 10)
+    else
+        record_count;
+
+    var stdout_buffer: [1024]u8 = undefined;
+    var stdout_writer = std.fs.File.stdout().writer(&stdout_buffer);
+    const stdout = &stdout_writer.interface;
+
+    if (std.mem.eql(u8, format, "srf-compact")) {
+        try stdout.writeAll("#!srfv1\n");
+        for (0..count) |i| {
+            try stdout.print("id:num:{d},name::User {d},email::user{d}@example.com,active:bool:true,score:num:{d}.5,bio:49:A \"complex\" string with\nnewlines and \\backslashes,status::active\n", .{ i, i, i, i });
+        }
+    } else if (std.mem.eql(u8, format, "srf-long")) {
+        try stdout.writeAll("#!srfv1\n#!long\n");
+        for (0..count) |i| {
+            try stdout.print("id:num:{d}\n", .{i});
+            try stdout.print("name::User {d}\n", .{i});
+            try stdout.print("email::user{d}@example.com\n", .{i});
+            try stdout.writeAll("active:bool:true\n");
+            try stdout.print("score:num:{d}.5\n", .{i});
+            try stdout.writeAll("bio:49:A \"complex\" string with\nnewlines and \\backslashes\n");
+            try stdout.writeAll("status::active\n\n");
+        }
+    } else if (std.mem.eql(u8, format, "jsonl")) {
+        for (0..count) |i| {
+            try stdout.print(json_record_fmt ++ "\n", .{ i, i, i, i });
+        }
+    } else if (std.mem.eql(u8, format, "json")) {
+        try stdout.writeAll("[\n");
+        for (0..count) |i| {
+            if (i > 0) try stdout.writeAll(",\n");
+            try stdout.print(json_record_fmt, .{ i, i, i, i });
+        }
+        try stdout.writeAll("\n]\n");
+    } else {
+        std.debug.print("Unknown format: {s}\n", .{format});
+        std.process.exit(1);
+    }
+
+    try stdout.flush();
+}
diff --git a/src/main.zig b/src/main.zig
index 13ab026..2bc582d 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -1,3 +1,144 @@
 const std = @import("std");
+const srf = @import("srf.zig");
 
-pub fn main() !void {}
+/// Allocator wrapper that counts the allocations and frees made through it
+/// and prints the size of the first few allocations to stderr.
+const CountingAllocator = struct {
+    child_allocator: std.mem.Allocator,
+    alloc_count: usize = 0,
+    free_count: usize = 0,
+    bytes_allocated: usize = 0,
+
+    /// Number of leading allocations whose size is printed.
+    const logged_allocs = 25;
+
+    /// Counts the request (even if the child then fails it) and forwards it.
+    fn alloc(ctx: *anyopaque, len: usize, ptr_align: std.mem.Alignment, ret_addr: usize) ?[*]u8 {
+        const self: *CountingAllocator = @ptrCast(@alignCast(ctx));
+        self.alloc_count += 1;
+        self.bytes_allocated += len;
+        if (self.alloc_count <= logged_allocs) {
+            std.debug.print("Alloc #{}: {} bytes\n", .{ self.alloc_count, len });
+        }
+        return self.child_allocator.rawAlloc(len, ptr_align, ret_addr);
+    }
+
+    /// Forwarded unchanged; growth through resize is not added to the totals.
+    fn resize(ctx: *anyopaque, buf: []u8, buf_align: std.mem.Alignment, new_len: usize, ret_addr: usize) bool {
+        const self: *CountingAllocator = @ptrCast(@alignCast(ctx));
+        return self.child_allocator.rawResize(buf, buf_align, new_len, ret_addr);
+    }
+
+    /// Counts the free and forwards it.
+    fn free(ctx: *anyopaque, buf: []u8, buf_align: std.mem.Alignment, ret_addr: usize) void {
+        const self: *CountingAllocator = @ptrCast(@alignCast(ctx));
+        self.free_count += 1;
+        return self.child_allocator.rawFree(buf, buf_align, ret_addr);
+    }
+
+    /// Forwarded unchanged; growth through remap is not added to the totals.
+    fn remap(ctx: *anyopaque, buf: []u8, buf_align: std.mem.Alignment, new_len: usize, ret_addr: usize) ?[*]u8 {
+        const self: *CountingAllocator = @ptrCast(@alignCast(ctx));
+        return self.child_allocator.rawRemap(buf, buf_align, new_len, ret_addr);
+    }
+
+    /// Returns the `std.mem.Allocator` interface backed by this counter.
+    fn allocator(self: *CountingAllocator) std.mem.Allocator {
+        return .{
+            .ptr = self,
+            .vtable = &.{
+                .alloc = alloc,
+                .resize = resize,
+                .free = free,
+                .remap = remap,
+            },
+        };
+    }
+};
+
+/// Reads the whole of stdin, then parses it with the parser named by the
+/// first argument (`srf`, `jsonl` or `json`).
+pub fn main() !void {
+    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
+    defer _ = gpa.deinit();
+    const base_allocator = gpa.allocator();
+
+    const args = try std.process.argsAlloc(base_allocator);
+    defer std.process.argsFree(base_allocator, args);
+
+    if (args.len < 2) {
+        std.debug.print("Usage: {s} <srf|jsonl|json>\n", .{args[0]});
+        std.process.exit(1);
+    }
+
+    const format = args[1];
+
+    // Only the JSON branches allocate through `allocator`; the SRF branch uses
+    // its own fixed buffer, so its allocations are not counted.
+    const debug_allocs = std.process.hasEnvVarConstant("DEBUG_ALLOCATIONS");
+
+    var counting = CountingAllocator{ .child_allocator = base_allocator };
+    const allocator = if (debug_allocs) counting.allocator() else base_allocator;
+
+    var stdin_buffer: [1024]u8 = undefined;
+    var stdin_reader = std.fs.File.stdin().reader(&stdin_buffer);
+    const stdin = &stdin_reader.interface;
+
+    // Load all data into memory first for fair comparison
+    var data: std.ArrayList(u8) = .empty;
+    defer data.deinit(base_allocator);
+    try stdin.appendRemaining(base_allocator, &data, @enumFromInt(100 * 1024 * 1024));
+
+    if (std.mem.eql(u8, format, "srf")) {
+        // TODO: Remove this code. SRF should be using an Arena allocator instead
+        const buffer = try base_allocator.alloc(u8, 200 * 1024 * 1024);
+        defer base_allocator.free(buffer);
+        var fba = std.heap.FixedBufferAllocator.init(buffer);
+        const srf_allocator = fba.allocator();
+        // remove ^^
+
+        var reader = std.Io.Reader.fixed(data.items);
+        const records = try srf.parse(&reader, srf_allocator, .{});
+        defer {
+            for (records.items) |r| r.deinit(srf_allocator);
+            srf_allocator.free(records.items);
+        }
+    } else if (std.mem.eql(u8, format, "jsonl")) {
+        var lines = std.mem.splitScalar(u8, data.items, '\n');
+        while (lines.next()) |line| {
+            if (line.len == 0) continue;
+            const parsed = try std.json.parseFromSlice(std.json.Value, allocator, line, .{});
+            defer parsed.deinit();
+        }
+    } else if (std.mem.eql(u8, format, "json")) {
+        const parsed = try std.json.parseFromSlice(std.json.Value, allocator, data.items, .{});
+        defer parsed.deinit();
+        // Reject unexpected shapes instead of reading an inactive union field.
+        const items = switch (parsed.value) {
+            .array => |array| array.items,
+            else => return error.ExpectedJsonArray,
+        };
+        var count: usize = 0;
+        for (items) |item| {
+            if (item != .object) return error.ExpectedJsonObject;
+            _ = item.object.get("id");
+            _ = item.object.get("name");
+            _ = item.object.get("email");
+            _ = item.object.get("active");
+            _ = item.object.get("score");
+            _ = item.object.get("bio");
+            _ = item.object.get("status");
+            count += 1;
+        }
+        std.mem.doNotOptimizeAway(&count);
+    } else {
+        std.debug.print("Unknown format: {s}\n", .{format});
+        std.process.exit(1);
+    }
+
+    if (debug_allocs) {
+        std.debug.print("Allocations: {}\n", .{counting.alloc_count});
+        std.debug.print("Frees: {}\n", .{counting.free_count});
+        std.debug.print("Bytes allocated: {}\n", .{counting.bytes_allocated});
+    }
+}