factor out hashing so we can use it in codegen

This commit is contained in:
Emil Lerch 2023-08-25 14:43:40 -07:00
parent db4037111f
commit 2f36f82363
Signed by: lobo
GPG Key ID: A7B62D657EF764F8
2 changed files with 193 additions and 188 deletions

View File

@ -1,56 +1,11 @@
const builtin = @import("builtin");
const std = @import("std");
const testing = std.testing;
const Hasher = @import("codegen/src/Hasher.zig");
/// This is 128 bits - Even with 2^54 cache entries, the probably of a collision would be under 10^-6
const bin_digest_len = 16;
const hex_digest_len = bin_digest_len * 2;
const hex_multihash_len = 2 * multihash_len;
const MultiHashHexDigest = [hex_multihash_len]u8;
const hex_charset = "0123456789abcdef";
const Hash = std.crypto.hash.sha2.Sha256;
const multihash_len = 1 + 1 + Hash.digest_length;
const MultihashFunction = enum(u16) {
identity = 0x00,
sha1 = 0x11,
@"sha2-256" = 0x12,
@"sha2-512" = 0x13,
@"sha3-512" = 0x14,
@"sha3-384" = 0x15,
@"sha3-256" = 0x16,
@"sha3-224" = 0x17,
@"sha2-384" = 0x20,
@"sha2-256-trunc254-padded" = 0x1012,
@"sha2-224" = 0x1013,
@"sha2-512-224" = 0x1014,
@"sha2-512-256" = 0x1015,
@"blake2b-256" = 0xb220,
_,
};
const HashedFile = struct {
fs_path: []const u8,
normalized_path: []const u8,
hash: [Hash.digest_length]u8,
failure: Error!void,
const Error = std.fs.File.OpenError || std.fs.File.ReadError || std.fs.File.StatError;
fn lessThan(context: void, lhs: *const HashedFile, rhs: *const HashedFile) bool {
_ = context;
return std.mem.lessThan(u8, lhs.normalized_path, rhs.normalized_path);
}
};
const multihash_function: MultihashFunction = switch (Hash) {
std.crypto.hash.sha2.Sha256 => .@"sha2-256",
else => @compileError("unreachable"),
};
comptime {
// We avoid unnecessary uleb128 code in hexDigest by asserting here the
// values are small enough to be contained in the one-byte encoding.
std.debug.assert(@intFromEnum(multihash_function) < 127);
std.debug.assert(Hash.digest_length < 127);
}
const Package = @This();
@ -112,7 +67,7 @@ pub fn fetchAndUnpack(
// Check if the expected_hash is already present in the global package
// cache, and thereby avoid both fetching and unpacking.
if (dep.hash) |h| cached: {
const hex_digest = h[0..hex_multihash_len];
const hex_digest = h[0..Hasher.hex_multihash_len];
const pkg_dir_sub_path = "p" ++ s ++ hex_digest;
const build_root = try global_cache_directory.join(gpa, &.{pkg_dir_sub_path});
@ -131,7 +86,7 @@ pub fn fetchAndUnpack(
ptr.* = .{
.root_src_directory = .{
.path = build_root,
.path = build_root, // TODO: This leaks memory somehow (should be cleaned in deinit()
.handle = pkg_dir,
},
.root_src_directory_owned = true,
@ -149,7 +104,7 @@ pub fn fetchAndUnpack(
const uri = try std.Uri.parse(dep.url);
const rand_int = std.crypto.random.int(u64);
const tmp_dir_sub_path = "tmp" ++ s ++ hex64(rand_int);
const tmp_dir_sub_path = "tmp" ++ s ++ Hasher.hex64(rand_int);
const actual_hash = a: {
var tmp_directory: std.Build.Cache.Directory = d: {
@ -247,13 +202,13 @@ pub fn fetchAndUnpack(
// Of course, if the ignore rules above omit the file from the package, then everything
// is fine and no error should be raised.
break :a try computePackageHash(thread_pool, .{ .dir = tmp_directory.handle });
break :a try Hasher.computeDirectoryHash(thread_pool, .{ .dir = tmp_directory.handle });
};
const pkg_dir_sub_path = "p" ++ s ++ hexDigest(actual_hash);
const pkg_dir_sub_path = "p" ++ s ++ Hasher.hexDigest(actual_hash);
try renameTmpIntoCache(global_cache_directory.handle, tmp_dir_sub_path, pkg_dir_sub_path);
const actual_hex = hexDigest(actual_hash);
const actual_hex = Hasher.hexDigest(actual_hash);
if (dep.hash) |h| {
if (!std.mem.eql(u8, h, &actual_hex)) {
std.log.err("hash mismatch: expected: {s}, found: {s}", .{
@ -272,16 +227,6 @@ pub fn fetchAndUnpack(
const mod = try createWithDir(gpa, global_cache_directory, pkg_dir_sub_path);
return mod;
}
fn hex64(x: u64) [16]u8 {
var result: [16]u8 = undefined;
var i: usize = 0;
while (i < 8) : (i += 1) {
const byte = @as(u8, @truncate(x >> @as(u6, @intCast(8 * i))));
result[i * 2 + 0] = hex_charset[byte >> 4];
result[i * 2 + 1] = hex_charset[byte & 15];
}
return result;
}
fn ProgressReader(comptime ReaderType: type) type {
return struct {
child_reader: ReaderType,
@ -340,81 +285,6 @@ fn isTarAttachment(content_disposition: []const u8) bool {
}
return std.ascii.endsWithIgnoreCase(content_disposition[value_start..value_end], ".tar.gz");
}
fn computePackageHash(
thread_pool: *std.Thread.Pool,
pkg_dir: std.fs.IterableDir,
) ![Hash.digest_length]u8 {
const gpa = thread_pool.allocator;
// We'll use an arena allocator for the path name strings since they all
// need to be in memory for sorting.
var arena_instance = std.heap.ArenaAllocator.init(gpa);
defer arena_instance.deinit();
const arena = arena_instance.allocator();
// Collect all files, recursively, then sort.
var all_files = std.ArrayList(*HashedFile).init(gpa);
defer all_files.deinit();
var walker = try pkg_dir.walk(gpa);
defer walker.deinit();
{
// The final hash will be a hash of each file hashed independently. This
// allows hashing in parallel.
var wait_group: std.Thread.WaitGroup = .{};
defer wait_group.wait();
while (try walker.next()) |entry| {
switch (entry.kind) {
.directory => continue,
.file => {},
else => return error.IllegalFileTypeInPackage,
}
const hashed_file = try arena.create(HashedFile);
const fs_path = try arena.dupe(u8, entry.path);
hashed_file.* = .{
.fs_path = fs_path,
.normalized_path = try normalizePath(arena, fs_path),
.hash = undefined, // to be populated by the worker
.failure = undefined, // to be populated by the worker
};
wait_group.start();
try thread_pool.spawn(workerHashFile, .{ pkg_dir.dir, hashed_file, &wait_group });
try all_files.append(hashed_file);
}
}
std.mem.sort(*HashedFile, all_files.items, {}, HashedFile.lessThan);
var hasher = Hash.init(.{});
var any_failures = false;
for (all_files.items) |hashed_file| {
hashed_file.failure catch |err| {
any_failures = true;
std.log.err("unable to hash '{s}': {s}", .{ hashed_file.fs_path, @errorName(err) });
};
hasher.update(&hashed_file.hash);
}
if (any_failures) return error.PackageHashUnavailable;
return hasher.finalResult();
}
fn hexDigest(digest: [Hash.digest_length]u8) [multihash_len * 2]u8 {
var result: [multihash_len * 2]u8 = undefined;
result[0] = hex_charset[@intFromEnum(multihash_function) >> 4];
result[1] = hex_charset[@intFromEnum(multihash_function) & 15];
result[2] = hex_charset[Hash.digest_length >> 4];
result[3] = hex_charset[Hash.digest_length & 15];
for (digest, 0..) |byte, i| {
result[4 + i * 2] = hex_charset[byte >> 4];
result[5 + i * 2] = hex_charset[byte & 15];
}
return result;
}
fn renameTmpIntoCache(
cache_dir: std.fs.Dir,
tmp_dir_sub_path: []const u8,
@ -475,57 +345,6 @@ fn createWithDir(
}
return ptr;
}
/// Make a file system path identical independently of operating system path inconsistencies.
/// This converts backslashes into forward slashes.
fn normalizePath(arena: std.mem.Allocator, fs_path: []const u8) ![]const u8 {
const canonical_sep = '/';
if (std.fs.path.sep == canonical_sep)
return fs_path;
const normalized = try arena.dupe(u8, fs_path);
for (normalized) |*byte| {
switch (byte.*) {
std.fs.path.sep => byte.* = canonical_sep,
else => continue,
}
}
return normalized;
}
fn workerHashFile(dir: std.fs.Dir, hashed_file: *HashedFile, wg: *std.Thread.WaitGroup) void {
defer wg.finish();
hashed_file.failure = hashFileFallible(dir, hashed_file);
}
fn hashFileFallible(dir: std.fs.Dir, hashed_file: *HashedFile) HashedFile.Error!void {
var buf: [8000]u8 = undefined;
var file = try dir.openFile(hashed_file.fs_path, .{});
defer file.close();
var hasher = Hash.init(.{});
hasher.update(hashed_file.normalized_path);
hasher.update(&.{ 0, @intFromBool(try isExecutable(file)) });
while (true) {
const bytes_read = try file.read(&buf);
if (bytes_read == 0) break;
hasher.update(buf[0..bytes_read]);
}
hasher.final(&hashed_file.hash);
}
fn isExecutable(file: std.fs.File) !bool {
if (builtin.os.tag == .windows) {
// TODO check the ACL on Windows.
// Until this is implemented, this could be a false negative on
// Windows, which is why we do not yet set executable_bit_only above
// when unpacking the tarball.
return false;
} else {
const stat = try file.stat();
return (stat.mode & std.os.S.IXUSR) != 0;
}
}
// Create/Write a file, close it, then grab its stat.mtime timestamp.
fn testGetCurrentFileTimestamp(dir: std.fs.Dir) !i128 {
const test_out_file = "test-filetimestamp.tmp";

186
codegen/src/Hasher.zig Normal file
View File

@ -0,0 +1,186 @@
const builtin = @import("builtin");
const std = @import("std");
const Hash = std.crypto.hash.sha2.Sha256;
const HashedFile = struct {
fs_path: []const u8,
normalized_path: []const u8,
hash: [Hash.digest_length]u8,
failure: Error!void,
const Error = std.fs.File.OpenError || std.fs.File.ReadError || std.fs.File.StatError;
fn lessThan(context: void, lhs: *const HashedFile, rhs: *const HashedFile) bool {
_ = context;
return std.mem.lessThan(u8, lhs.normalized_path, rhs.normalized_path);
}
};
const multihash_len = 1 + 1 + Hash.digest_length;
pub const hex_multihash_len = 2 * multihash_len;
const MultiHashHexDigest = [hex_multihash_len]u8;
const MultihashFunction = enum(u16) {
identity = 0x00,
sha1 = 0x11,
@"sha2-256" = 0x12,
@"sha2-512" = 0x13,
@"sha3-512" = 0x14,
@"sha3-384" = 0x15,
@"sha3-256" = 0x16,
@"sha3-224" = 0x17,
@"sha2-384" = 0x20,
@"sha2-256-trunc254-padded" = 0x1012,
@"sha2-224" = 0x1013,
@"sha2-512-224" = 0x1014,
@"sha2-512-256" = 0x1015,
@"blake2b-256" = 0xb220,
_,
};
const multihash_function: MultihashFunction = switch (Hash) {
std.crypto.hash.sha2.Sha256 => .@"sha2-256",
else => @compileError("unreachable"),
};
comptime {
// We avoid unnecessary uleb128 code in hexDigest by asserting here the
// values are small enough to be contained in the one-byte encoding.
std.debug.assert(@intFromEnum(multihash_function) < 127);
std.debug.assert(Hash.digest_length < 127);
}
const hex_charset = "0123456789abcdef";
pub fn hexDigest(digest: [Hash.digest_length]u8) [multihash_len * 2]u8 {
var result: [multihash_len * 2]u8 = undefined;
result[0] = hex_charset[@intFromEnum(multihash_function) >> 4];
result[1] = hex_charset[@intFromEnum(multihash_function) & 15];
result[2] = hex_charset[Hash.digest_length >> 4];
result[3] = hex_charset[Hash.digest_length & 15];
for (digest, 0..) |byte, i| {
result[4 + i * 2] = hex_charset[byte >> 4];
result[5 + i * 2] = hex_charset[byte & 15];
}
return result;
}
pub fn hex64(x: u64) [16]u8 {
var result: [16]u8 = undefined;
var i: usize = 0;
while (i < 8) : (i += 1) {
const byte = @as(u8, @truncate(x >> @as(u6, @intCast(8 * i))));
result[i * 2 + 0] = hex_charset[byte >> 4];
result[i * 2 + 1] = hex_charset[byte & 15];
}
return result;
}
pub fn computeDirectoryHash(
thread_pool: *std.Thread.Pool,
dir: std.fs.IterableDir,
) ![Hash.digest_length]u8 {
const gpa = thread_pool.allocator;
// We'll use an arena allocator for the path name strings since they all
// need to be in memory for sorting.
var arena_instance = std.heap.ArenaAllocator.init(gpa);
defer arena_instance.deinit();
const arena = arena_instance.allocator();
// Collect all files, recursively, then sort.
var all_files = std.ArrayList(*HashedFile).init(gpa);
defer all_files.deinit();
var walker = try dir.walk(gpa);
defer walker.deinit();
{
// The final hash will be a hash of each file hashed independently. This
// allows hashing in parallel.
var wait_group: std.Thread.WaitGroup = .{};
defer wait_group.wait();
while (try walker.next()) |entry| {
switch (entry.kind) {
.directory => continue,
.file => {},
else => return error.IllegalFileTypeInPackage,
}
const hashed_file = try arena.create(HashedFile);
const fs_path = try arena.dupe(u8, entry.path);
hashed_file.* = .{
.fs_path = fs_path,
.normalized_path = try normalizePath(arena, fs_path),
.hash = undefined, // to be populated by the worker
.failure = undefined, // to be populated by the worker
};
wait_group.start();
try thread_pool.spawn(workerHashFile, .{ dir.dir, hashed_file, &wait_group });
try all_files.append(hashed_file);
}
}
std.mem.sort(*HashedFile, all_files.items, {}, HashedFile.lessThan);
var hasher = Hash.init(.{});
var any_failures = false;
for (all_files.items) |hashed_file| {
hashed_file.failure catch |err| {
any_failures = true;
std.log.err("unable to hash '{s}': {s}", .{ hashed_file.fs_path, @errorName(err) });
};
hasher.update(&hashed_file.hash);
}
if (any_failures) return error.DirectoryHashUnavailable;
return hasher.finalResult();
}
fn workerHashFile(dir: std.fs.Dir, hashed_file: *HashedFile, wg: *std.Thread.WaitGroup) void {
defer wg.finish();
hashed_file.failure = hashFileFallible(dir, hashed_file);
}
fn hashFileFallible(dir: std.fs.Dir, hashed_file: *HashedFile) HashedFile.Error!void {
var buf: [8000]u8 = undefined;
var file = try dir.openFile(hashed_file.fs_path, .{});
defer file.close();
var hasher = Hash.init(.{});
hasher.update(hashed_file.normalized_path);
hasher.update(&.{ 0, @intFromBool(try isExecutable(file)) });
while (true) {
const bytes_read = try file.read(&buf);
if (bytes_read == 0) break;
hasher.update(buf[0..bytes_read]);
}
hasher.final(&hashed_file.hash);
}
/// Make a file system path identical independently of operating system path inconsistencies.
/// This converts backslashes into forward slashes.
fn normalizePath(arena: std.mem.Allocator, fs_path: []const u8) ![]const u8 {
const canonical_sep = '/';
if (std.fs.path.sep == canonical_sep)
return fs_path;
const normalized = try arena.dupe(u8, fs_path);
for (normalized) |*byte| {
switch (byte.*) {
std.fs.path.sep => byte.* = canonical_sep,
else => continue,
}
}
return normalized;
}
fn isExecutable(file: std.fs.File) !bool {
if (builtin.os.tag == .windows) {
// TODO check the ACL on Windows.
// Until this is implemented, this could be a false negative on
// Windows, which is why we do not yet set executable_bit_only above
// when unpacking the tarball.
return false;
} else {
const stat = try file.stat();
return (stat.mode & std.os.S.IXUSR) != 0;
}
}