From 2f36f82363faae90feacc0b70843d9b9d917271e Mon Sep 17 00:00:00 2001 From: Emil Lerch Date: Fri, 25 Aug 2023 14:43:40 -0700 Subject: [PATCH] factor out hashing so we can use it in codegen --- Package.zig | 195 ++--------------------------------------- codegen/src/Hasher.zig | 186 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 193 insertions(+), 188 deletions(-) create mode 100644 codegen/src/Hasher.zig diff --git a/Package.zig b/Package.zig index 02a8b9d..f051a89 100644 --- a/Package.zig +++ b/Package.zig @@ -1,56 +1,11 @@ const builtin = @import("builtin"); const std = @import("std"); const testing = std.testing; +const Hasher = @import("codegen/src/Hasher.zig"); /// This is 128 bits - Even with 2^54 cache entries, the probably of a collision would be under 10^-6 const bin_digest_len = 16; const hex_digest_len = bin_digest_len * 2; -const hex_multihash_len = 2 * multihash_len; -const MultiHashHexDigest = [hex_multihash_len]u8; -const hex_charset = "0123456789abcdef"; -const Hash = std.crypto.hash.sha2.Sha256; -const multihash_len = 1 + 1 + Hash.digest_length; -const MultihashFunction = enum(u16) { - identity = 0x00, - sha1 = 0x11, - @"sha2-256" = 0x12, - @"sha2-512" = 0x13, - @"sha3-512" = 0x14, - @"sha3-384" = 0x15, - @"sha3-256" = 0x16, - @"sha3-224" = 0x17, - @"sha2-384" = 0x20, - @"sha2-256-trunc254-padded" = 0x1012, - @"sha2-224" = 0x1013, - @"sha2-512-224" = 0x1014, - @"sha2-512-256" = 0x1015, - @"blake2b-256" = 0xb220, - _, -}; -const HashedFile = struct { - fs_path: []const u8, - normalized_path: []const u8, - hash: [Hash.digest_length]u8, - failure: Error!void, - - const Error = std.fs.File.OpenError || std.fs.File.ReadError || std.fs.File.StatError; - - fn lessThan(context: void, lhs: *const HashedFile, rhs: *const HashedFile) bool { - _ = context; - return std.mem.lessThan(u8, lhs.normalized_path, rhs.normalized_path); - } -}; - -const multihash_function: MultihashFunction = switch (Hash) { - std.crypto.hash.sha2.Sha256 => .@"sha2-256", - else => @compileError("unreachable"), -}; -comptime { - // We avoid unnecessary uleb128 code in hexDigest by asserting here the - // values are small enough to be contained in the one-byte encoding. - std.debug.assert(@intFromEnum(multihash_function) < 127); - std.debug.assert(Hash.digest_length < 127); -} const Package = @This(); @@ -112,7 +67,7 @@ pub fn fetchAndUnpack( // Check if the expected_hash is already present in the global package // cache, and thereby avoid both fetching and unpacking. if (dep.hash) |h| cached: { - const hex_digest = h[0..hex_multihash_len]; + const hex_digest = h[0..Hasher.hex_multihash_len]; const pkg_dir_sub_path = "p" ++ s ++ hex_digest; const build_root = try global_cache_directory.join(gpa, &.{pkg_dir_sub_path}); @@ -131,7 +86,7 @@ pub fn fetchAndUnpack( ptr.* = .{ .root_src_directory = .{ - .path = build_root, + .path = build_root, // TODO: This leaks memory somehow (should be cleaned in deinit() .handle = pkg_dir, }, .root_src_directory_owned = true, @@ -149,7 +104,7 @@ pub fn fetchAndUnpack( const uri = try std.Uri.parse(dep.url); const rand_int = std.crypto.random.int(u64); - const tmp_dir_sub_path = "tmp" ++ s ++ hex64(rand_int); + const tmp_dir_sub_path = "tmp" ++ s ++ Hasher.hex64(rand_int); const actual_hash = a: { var tmp_directory: std.Build.Cache.Directory = d: { @@ -247,13 +202,13 @@ pub fn fetchAndUnpack( // Of course, if the ignore rules above omit the file from the package, then everything // is fine and no error should be raised. - break :a try computePackageHash(thread_pool, .{ .dir = tmp_directory.handle }); + break :a try Hasher.computeDirectoryHash(thread_pool, .{ .dir = tmp_directory.handle }); }; - const pkg_dir_sub_path = "p" ++ s ++ hexDigest(actual_hash); + const pkg_dir_sub_path = "p" ++ s ++ Hasher.hexDigest(actual_hash); try renameTmpIntoCache(global_cache_directory.handle, tmp_dir_sub_path, pkg_dir_sub_path); - const actual_hex = hexDigest(actual_hash); + const actual_hex = Hasher.hexDigest(actual_hash); if (dep.hash) |h| { if (!std.mem.eql(u8, h, &actual_hex)) { std.log.err("hash mismatch: expected: {s}, found: {s}", .{ @@ -272,16 +227,6 @@ pub fn fetchAndUnpack( const mod = try createWithDir(gpa, global_cache_directory, pkg_dir_sub_path); return mod; } -fn hex64(x: u64) [16]u8 { - var result: [16]u8 = undefined; - var i: usize = 0; - while (i < 8) : (i += 1) { - const byte = @as(u8, @truncate(x >> @as(u6, @intCast(8 * i)))); - result[i * 2 + 0] = hex_charset[byte >> 4]; - result[i * 2 + 1] = hex_charset[byte & 15]; - } - return result; -} fn ProgressReader(comptime ReaderType: type) type { return struct { child_reader: ReaderType, @@ -340,81 +285,6 @@ fn isTarAttachment(content_disposition: []const u8) bool { } return std.ascii.endsWithIgnoreCase(content_disposition[value_start..value_end], ".tar.gz"); } -fn computePackageHash( - thread_pool: *std.Thread.Pool, - pkg_dir: std.fs.IterableDir, -) ![Hash.digest_length]u8 { - const gpa = thread_pool.allocator; - - // We'll use an arena allocator for the path name strings since they all - // need to be in memory for sorting. - var arena_instance = std.heap.ArenaAllocator.init(gpa); - defer arena_instance.deinit(); - const arena = arena_instance.allocator(); - - // Collect all files, recursively, then sort. - var all_files = std.ArrayList(*HashedFile).init(gpa); - defer all_files.deinit(); - - var walker = try pkg_dir.walk(gpa); - defer walker.deinit(); - - { - // The final hash will be a hash of each file hashed independently. This - // allows hashing in parallel. - var wait_group: std.Thread.WaitGroup = .{}; - defer wait_group.wait(); - - while (try walker.next()) |entry| { - switch (entry.kind) { - .directory => continue, - .file => {}, - else => return error.IllegalFileTypeInPackage, - } - const hashed_file = try arena.create(HashedFile); - const fs_path = try arena.dupe(u8, entry.path); - hashed_file.* = .{ - .fs_path = fs_path, - .normalized_path = try normalizePath(arena, fs_path), - .hash = undefined, // to be populated by the worker - .failure = undefined, // to be populated by the worker - }; - wait_group.start(); - try thread_pool.spawn(workerHashFile, .{ pkg_dir.dir, hashed_file, &wait_group }); - - try all_files.append(hashed_file); - } - } - - std.mem.sort(*HashedFile, all_files.items, {}, HashedFile.lessThan); - - var hasher = Hash.init(.{}); - var any_failures = false; - for (all_files.items) |hashed_file| { - hashed_file.failure catch |err| { - any_failures = true; - std.log.err("unable to hash '{s}': {s}", .{ hashed_file.fs_path, @errorName(err) }); - }; - hasher.update(&hashed_file.hash); - } - if (any_failures) return error.PackageHashUnavailable; - return hasher.finalResult(); -} -fn hexDigest(digest: [Hash.digest_length]u8) [multihash_len * 2]u8 { - var result: [multihash_len * 2]u8 = undefined; - - result[0] = hex_charset[@intFromEnum(multihash_function) >> 4]; - result[1] = hex_charset[@intFromEnum(multihash_function) & 15]; - - result[2] = hex_charset[Hash.digest_length >> 4]; - result[3] = hex_charset[Hash.digest_length & 15]; - - for (digest, 0..) |byte, i| { - result[4 + i * 2] = hex_charset[byte >> 4]; - result[5 + i * 2] = hex_charset[byte & 15]; - } - return result; -} fn renameTmpIntoCache( cache_dir: std.fs.Dir, tmp_dir_sub_path: []const u8, @@ -475,57 +345,6 @@ fn createWithDir( } return ptr; } -/// Make a file system path identical independently of operating system path inconsistencies. -/// This converts backslashes into forward slashes. -fn normalizePath(arena: std.mem.Allocator, fs_path: []const u8) ![]const u8 { - const canonical_sep = '/'; - - if (std.fs.path.sep == canonical_sep) - return fs_path; - - const normalized = try arena.dupe(u8, fs_path); - for (normalized) |*byte| { - switch (byte.*) { - std.fs.path.sep => byte.* = canonical_sep, - else => continue, - } - } - return normalized; -} - -fn workerHashFile(dir: std.fs.Dir, hashed_file: *HashedFile, wg: *std.Thread.WaitGroup) void { - defer wg.finish(); - hashed_file.failure = hashFileFallible(dir, hashed_file); -} - -fn hashFileFallible(dir: std.fs.Dir, hashed_file: *HashedFile) HashedFile.Error!void { - var buf: [8000]u8 = undefined; - var file = try dir.openFile(hashed_file.fs_path, .{}); - defer file.close(); - var hasher = Hash.init(.{}); - hasher.update(hashed_file.normalized_path); - hasher.update(&.{ 0, @intFromBool(try isExecutable(file)) }); - while (true) { - const bytes_read = try file.read(&buf); - if (bytes_read == 0) break; - hasher.update(buf[0..bytes_read]); - } - hasher.final(&hashed_file.hash); -} - -fn isExecutable(file: std.fs.File) !bool { - if (builtin.os.tag == .windows) { - // TODO check the ACL on Windows. - // Until this is implemented, this could be a false negative on - // Windows, which is why we do not yet set executable_bit_only above - // when unpacking the tarball. - return false; - } else { - const stat = try file.stat(); - return (stat.mode & std.os.S.IXUSR) != 0; - } -} - // Create/Write a file, close it, then grab its stat.mtime timestamp. fn testGetCurrentFileTimestamp(dir: std.fs.Dir) !i128 { const test_out_file = "test-filetimestamp.tmp"; diff --git a/codegen/src/Hasher.zig b/codegen/src/Hasher.zig new file mode 100644 index 0000000..dcfd4ae --- /dev/null +++ b/codegen/src/Hasher.zig @@ -0,0 +1,186 @@ +const builtin = @import("builtin"); +const std = @import("std"); +const Hash = std.crypto.hash.sha2.Sha256; +const HashedFile = struct { + fs_path: []const u8, + normalized_path: []const u8, + hash: [Hash.digest_length]u8, + failure: Error!void, + + const Error = std.fs.File.OpenError || std.fs.File.ReadError || std.fs.File.StatError; + + fn lessThan(context: void, lhs: *const HashedFile, rhs: *const HashedFile) bool { + _ = context; + return std.mem.lessThan(u8, lhs.normalized_path, rhs.normalized_path); + } +}; + +const multihash_len = 1 + 1 + Hash.digest_length; +pub const hex_multihash_len = 2 * multihash_len; + +const MultiHashHexDigest = [hex_multihash_len]u8; +const MultihashFunction = enum(u16) { + identity = 0x00, + sha1 = 0x11, + @"sha2-256" = 0x12, + @"sha2-512" = 0x13, + @"sha3-512" = 0x14, + @"sha3-384" = 0x15, + @"sha3-256" = 0x16, + @"sha3-224" = 0x17, + @"sha2-384" = 0x20, + @"sha2-256-trunc254-padded" = 0x1012, + @"sha2-224" = 0x1013, + @"sha2-512-224" = 0x1014, + @"sha2-512-256" = 0x1015, + @"blake2b-256" = 0xb220, + _, +}; + +const multihash_function: MultihashFunction = switch (Hash) { + std.crypto.hash.sha2.Sha256 => .@"sha2-256", + else => @compileError("unreachable"), +}; +comptime { + // We avoid unnecessary uleb128 code in hexDigest by asserting here the + // values are small enough to be contained in the one-byte encoding. + std.debug.assert(@intFromEnum(multihash_function) < 127); + std.debug.assert(Hash.digest_length < 127); +} +const hex_charset = "0123456789abcdef"; + +pub fn hexDigest(digest: [Hash.digest_length]u8) [multihash_len * 2]u8 { + var result: [multihash_len * 2]u8 = undefined; + + result[0] = hex_charset[@intFromEnum(multihash_function) >> 4]; + result[1] = hex_charset[@intFromEnum(multihash_function) & 15]; + + result[2] = hex_charset[Hash.digest_length >> 4]; + result[3] = hex_charset[Hash.digest_length & 15]; + + for (digest, 0..) |byte, i| { + result[4 + i * 2] = hex_charset[byte >> 4]; + result[5 + i * 2] = hex_charset[byte & 15]; + } + return result; +} +pub fn hex64(x: u64) [16]u8 { + var result: [16]u8 = undefined; + var i: usize = 0; + while (i < 8) : (i += 1) { + const byte = @as(u8, @truncate(x >> @as(u6, @intCast(8 * i)))); + result[i * 2 + 0] = hex_charset[byte >> 4]; + result[i * 2 + 1] = hex_charset[byte & 15]; + } + return result; +} +pub fn computeDirectoryHash( + thread_pool: *std.Thread.Pool, + dir: std.fs.IterableDir, +) ![Hash.digest_length]u8 { + const gpa = thread_pool.allocator; + + // We'll use an arena allocator for the path name strings since they all + // need to be in memory for sorting. + var arena_instance = std.heap.ArenaAllocator.init(gpa); + defer arena_instance.deinit(); + const arena = arena_instance.allocator(); + + // Collect all files, recursively, then sort. + var all_files = std.ArrayList(*HashedFile).init(gpa); + defer all_files.deinit(); + + var walker = try dir.walk(gpa); + defer walker.deinit(); + + { + // The final hash will be a hash of each file hashed independently. This + // allows hashing in parallel. + var wait_group: std.Thread.WaitGroup = .{}; + defer wait_group.wait(); + + while (try walker.next()) |entry| { + switch (entry.kind) { + .directory => continue, + .file => {}, + else => return error.IllegalFileTypeInPackage, + } + const hashed_file = try arena.create(HashedFile); + const fs_path = try arena.dupe(u8, entry.path); + hashed_file.* = .{ + .fs_path = fs_path, + .normalized_path = try normalizePath(arena, fs_path), + .hash = undefined, // to be populated by the worker + .failure = undefined, // to be populated by the worker + }; + wait_group.start(); + try thread_pool.spawn(workerHashFile, .{ dir.dir, hashed_file, &wait_group }); + + try all_files.append(hashed_file); + } + } + + std.mem.sort(*HashedFile, all_files.items, {}, HashedFile.lessThan); + + var hasher = Hash.init(.{}); + var any_failures = false; + for (all_files.items) |hashed_file| { + hashed_file.failure catch |err| { + any_failures = true; + std.log.err("unable to hash '{s}': {s}", .{ hashed_file.fs_path, @errorName(err) }); + }; + hasher.update(&hashed_file.hash); + } + if (any_failures) return error.DirectoryHashUnavailable; + return hasher.finalResult(); +} +fn workerHashFile(dir: std.fs.Dir, hashed_file: *HashedFile, wg: *std.Thread.WaitGroup) void { + defer wg.finish(); + hashed_file.failure = hashFileFallible(dir, hashed_file); +} + +fn hashFileFallible(dir: std.fs.Dir, hashed_file: *HashedFile) HashedFile.Error!void { + var buf: [8000]u8 = undefined; + var file = try dir.openFile(hashed_file.fs_path, .{}); + defer file.close(); + var hasher = Hash.init(.{}); + hasher.update(hashed_file.normalized_path); + hasher.update(&.{ 0, @intFromBool(try isExecutable(file)) }); + while (true) { + const bytes_read = try file.read(&buf); + if (bytes_read == 0) break; + hasher.update(buf[0..bytes_read]); + } + hasher.final(&hashed_file.hash); +} + +/// Make a file system path identical independently of operating system path inconsistencies. +/// This converts backslashes into forward slashes. +fn normalizePath(arena: std.mem.Allocator, fs_path: []const u8) ![]const u8 { + const canonical_sep = '/'; + + if (std.fs.path.sep == canonical_sep) + return fs_path; + + const normalized = try arena.dupe(u8, fs_path); + for (normalized) |*byte| { + switch (byte.*) { + std.fs.path.sep => byte.* = canonical_sep, + else => continue, + } + } + return normalized; +} + +fn isExecutable(file: std.fs.File) !bool { + if (builtin.os.tag == .windows) { + // TODO check the ACL on Windows. + // Until this is implemented, this could be a false negative on + // Windows, which is why we do not yet set executable_bit_only above + // when unpacking the tarball. + return false; + } else { + const stat = try file.stat(); + return (stat.mode & std.os.S.IXUSR) != 0; + } +}