// pos/build.zig

const std = @import("std");

// Although this function looks imperative, it does not perform the build
// directly and instead it mutates the build graph (`b`) that will be then
// executed by an external runner. The functions in `std.Build` implement a DSL
// for defining build steps and express dependencies between them, allowing the
// build runner to parallelize the build automatically (and the cache system to
// know when a step doesn't need to be re-run).
/// Populates the build graph for the `pos` package.
///
/// Declared here:
/// - a custom step that downloads and unpacks the link-4.1b sources,
/// - a `sed-lite` run step that patches `src/utilities.c` in the unpacked tree,
/// - the static C library `link` compiled from those sources,
/// - the public Zig module `pos` and the `pos` executable,
/// - installation of the upstream `data` directory to `share/link`,
/// - the `run` and `test` top-level steps.
///
/// The only fallible operation is the allocation made while joining the path
/// to `utilities.c`.
pub fn build(b: *std.Build) !void {
    // Standard target options allow the person running `zig build` to choose
    // what target to build for. Here we do not override the defaults, which
    // means any target is allowed, and the default is native. Other options
    // for restricting supported target set are available.
    const target = b.standardTargetOptions(.{});
    // Standard optimization options allow the person running `zig build` to select
    // between Debug, ReleaseSafe, ReleaseFast, and ReleaseSmall. Here we do not
    // set a preferred release mode, allowing the user to decide how to optimize.
    const optimize = b.standardOptimizeOption(.{});

    // Custom flags that toggle optional features of this build script are
    // defined with `b.option()`. All defined flags (including target and
    // optimize options) are listed when running `zig build --help` in this
    // directory.
    const long_tests = b.option(bool, "long-tests", "Run long-running tests") orelse false;

    // We need to use curl for this as the domain doesn't work with zig TLS
    const download_link_step = DownloadStep(
        "link-4.1b",
        "https://www.link.cs.cmu.edu/link/ftp-site/link-grammar/link-4.1b/unix/link-4.1b.tar.gz",
    ).create(b);

    const upstream = download_link_step.dependency(b, .{});

    // mod tests fail because the library has unconditional output to stdout,
    // which messes with the test runner. So we patch utilities.c, replacing
    // every
    //     printf("   Opening ...
    // with
    //     if ( verbosity > 0 ) printf("   Opening ...
    //
    // Each entry below is one exact search/replace pair; the variants differ
    // only in leading whitespace and in whether a trailing space is present.
    // (A fifth candidate pattern was identical to the third and is omitted.)
    const Substitutions = struct { orig: []const u8, new: []const u8 };
    const patterns: [4]Substitutions = .{
        .{
            .orig = "\tprintf(\"   Opening %s\\n\", filename); ",
            .new = "\tif ( verbosity > 0 ) printf(\"   Opening %s\\n\", filename); ",
        },
        .{
            .orig = "\tprintf(\"   Opening %s\\n\", filename);",
            .new = "\tif ( verbosity > 0 ) printf(\"   Opening %s\\n\", filename);",
        },
        .{
            .orig = "\t    printf(\"   Opening %s\\n\", completename); ",
            .new = "\t    if ( verbosity > 0 ) printf(\"   Opening %s\\n\", completename); ",
        },
        .{
            .orig = "    printf(\"   Opening %s\\n\", completename); ",
            .new = "    if ( verbosity > 0 ) printf(\"   Opening %s\\n\", completename); ",
        },
    };

    // Create sed-lite run step to patch utilities.c
    const sed_lite = b.dependency("sed_lite", .{});
    const sed_lite_exe = sed_lite.artifact("sed-lite");

    const patch_cmd = b.addRunArtifact(sed_lite_exe);
    patch_cmd.addArg("-sL");
    for (patterns) |s| {
        patch_cmd.addArg(s.orig);
        patch_cmd.addArg(s.new);
    }
    const util_src = try std.fs.path.join(b.allocator, &.{
        "src",
        "utilities.c",
    });
    patch_cmd.addFileArg(upstream.path(util_src));
    // The file to patch only exists once the archive has been unpacked.
    patch_cmd.step.dependOn(&download_link_step.step);

    const lib = b.addLibrary(.{
        .name = "link",
        .linkage = .static,
        .root_module = b.createModule(.{
            .target = target,
            .optimize = optimize,
            .link_libc = true,
        }),
    });

    // Compile only after utilities.c has been patched.
    lib.step.dependOn(&patch_cmd.step);
    lib.addIncludePath(upstream.path("include"));

    lib.addCSourceFiles(.{
        .root = upstream.path("src"),
        .files = &.{
            "analyze-linkage.c",
            "and.c",
            "api.c",
            "build-disjuncts.c",
            "command-line.c",
            "constituents.c",
            "count.c",
            "error.c",
            "extract-links.c",
            "fast-match.c",
            "idiom.c",
            "linkset.c",
            "massage.c",
            "post-process.c",
            "pp_knowledge.c",
            "pp_lexer.c",
            "pp_linkset.c",
            "preparation.c",
            "print-util.c",
            "print.c",
            "prune.c",
            "read-dict.c",
            "resources.c",
            "string-set.c",
            "tokenize.c",
            "utilities.c",
            "word-file.c",
        },
        .flags = &.{
            "-O2",
            "-fwrapv",
        },
    });

    // This creates a module, which represents a collection of source files alongside
    // some compilation options, such as optimization mode and linked system libraries.
    // Zig modules are the preferred way of making Zig code available to consumers.
    // addModule defines a module that we intend to make available for importing
    // to our consumers. We must give it a name because a Zig package can expose
    // multiple modules and consumers will need to be able to specify which
    // module they want to access.
    const mod = b.addModule("pos", .{
        // The root source file is the "entry point" of this module. Users of
        // this module will only be able to access public declarations contained
        // in this file, which means that if you have declarations that you
        // intend to expose to consumers that were defined in other files part
        // of this module, you will have to make sure to re-export them from
        // the root file.
        .root_source_file = b.path("src/root.zig"),
        // Later on we'll use this module as the root module of a test executable
        // which requires us to specify a target.
        .target = target,
    });
    const options = b.addOptions();
    options.addOption(bool, "long_tests", long_tests);

    // `b.run` returns the command's stdout verbatim, including the trailing
    // newline that git prints; strip surrounding whitespace so the embedded
    // version string is just the description itself.
    const git_describe = std.mem.trim(
        u8,
        b.run(&.{ "git", "describe", "--always", "--dirty" }),
        " \t\r\n",
    );
    options.addOption([]const u8, "version", git_describe);

    const options_module = options.createModule();
    mod.addImport("build_options", options_module);
    mod.linkLibrary(lib);
    mod.addIncludePath(upstream.path("include"));

    // Here we define an executable. An executable needs to have a root module
    // which needs to expose a `main` function. While we could add a main function
    // to the module defined above, it's sometimes preferable to split the
    // business logic and the CLI into two separate modules.
    //
    // If your goal is to create a Zig library for others to use, consider if
    // it might benefit from also exposing a CLI tool. A parser library for a
    // data serialization format could also bundle a CLI syntax checker, for example.
    //
    // If instead your goal is to create an executable, consider if users might
    // be interested in also being able to embed the core functionality of your
    // program in their own executable in order to avoid the overhead involved in
    // subprocessing your CLI tool.
    //
    // If neither case applies to you, feel free to delete the declaration you
    // don't need and to put everything under a single module.
    const exe = b.addExecutable(.{
        .name = "pos",
        .root_module = b.createModule(.{
            // b.createModule defines a new module just like b.addModule but,
            // unlike b.addModule, it does not expose the module to consumers of
            // this package, which is why in this case we don't have to give it a name.
            .root_source_file = b.path("src/main.zig"),
            // Target and optimization levels must be explicitly wired in when
            // defining an executable or library (in the root module), and you
            // can also hardcode a specific target for an executable or library
            // definition if desirable (e.g. firmware for embedded devices).
            .target = target,
            .optimize = optimize,
            // List of modules available for import in source files part of the
            // root module.
            .imports = &.{
                // Here "pos" is the name you will use in your source code to
                // import this module (e.g. `@import("pos")`). The name is
                // repeated because you are allowed to rename your imports, which
                // can be extremely useful in case of collisions (which can happen
                // importing modules from different packages).
                .{ .name = "pos", .module = mod },
                .{ .name = "build_options", .module = options_module },
            },
        }),
    });

    // Copy data files to install directory
    const install_data = b.addInstallDirectory(.{
        .source_dir = upstream.path("data"),
        .install_dir = .{ .custom = "share" },
        .install_subdir = "link",
    });
    // The data directory only exists once the archive has been unpacked.
    install_data.step.dependOn(&download_link_step.step);

    // This declares intent for the executable to be installed into the
    // install prefix when running `zig build` (i.e. when executing the default
    // step). By default the install prefix is `zig-out/` but can be overridden
    // by passing `--prefix` or `-p`.
    b.installArtifact(exe);
    b.getInstallStep().dependOn(&install_data.step);

    // This creates a top level step. Top level steps have a name and can be
    // invoked by name when running `zig build` (e.g. `zig build run`).
    // This will evaluate the `run` step rather than the default step.
    // For a top level step to actually do something, it must depend on other
    // steps (e.g. a Run step, as we will see in a moment).
    const run_step = b.step("run", "Run the app");

    // This creates a RunArtifact step in the build graph. A RunArtifact step
    // invokes an executable compiled by Zig. Steps will only be executed by the
    // runner if invoked directly by the user (in the case of top level steps)
    // or if another step depends on it, so it's up to you to define when and
    // how this Run step will be executed. In our case we want to run it when
    // the user runs `zig build run`, so we create a dependency link.
    const run_cmd = b.addRunArtifact(exe);
    run_step.dependOn(&run_cmd.step);

    // By making the run step depend on the default step, it will be run from the
    // installation directory rather than directly from within the cache directory.
    run_cmd.step.dependOn(b.getInstallStep());

    // This allows the user to pass arguments to the application in the build
    // command itself, like this: `zig build run -- arg1 arg2 etc`
    if (b.args) |args| {
        run_cmd.addArgs(args);
    }

    // Creates an executable that will run `test` blocks from the provided module.
    // Here `mod` needs to define a target, which is why earlier we made sure to
    // set the relevant field.
    const mod_tests = b.addTest(.{
        .root_module = mod,
    });

    // A run step that will run the test executable. Its working directory is
    // the install `bin` directory, so it depends on the whole install step:
    // that installs the executable (which creates `bin`) as well as the data
    // files. Depending on the data install alone would leave the working
    // directory missing on a clean tree.
    const run_mod_tests = b.addRunArtifact(mod_tests);
    run_mod_tests.setCwd(.{ .cwd_relative = b.getInstallPath(.bin, "") });
    run_mod_tests.step.dependOn(b.getInstallStep());

    // Creates an executable that will run `test` blocks from the executable's
    // root module. Note that test executables only test one module at a time,
    // hence why we have to create two separate ones.
    const exe_tests = b.addTest(.{
        .root_module = exe.root_module,
    });

    // A run step that will run the second test executable, with the same
    // working directory and install dependency as the first.
    const run_exe_tests = b.addRunArtifact(exe_tests);
    run_exe_tests.setCwd(.{ .cwd_relative = b.getInstallPath(.bin, "") });
    run_exe_tests.step.dependOn(b.getInstallStep());

    // A top level step for running all tests. dependOn can be called multiple
    // times and since the two run steps do not depend on one another, this will
    // make the two of them run in parallel.
    const test_step = b.step("test", "Run tests");

    test_step.dependOn(&run_mod_tests.step);
    test_step.dependOn(&run_exe_tests.step);

    // Just like flags, top level steps are also listed in the `--help` menu.
    //
    // The Zig build system is entirely implemented in userland, which means
    // that it cannot hook into private compiler APIs. All compilation work
    // orchestrated by the build system will result in other Zig compiler
    // subcommands being invoked with the right flags defined. You can observe
    // these invocations when one fails (or you pass a flag to increase
    // verbosity) to validate assumptions and diagnose problems.
    //
    // Lastly, the Zig build system is relatively simple and self-contained,
    // and reading its source code will allow you to master it.
}

/// Returns a custom build-step type that downloads the archive at `link` with
/// `curl` and unpacks it below the build cache.
///
/// `name` is used only for the step's display name (`"download " ++ name`).
/// `link` must parse as a URI and end in `.tar.gz` or `.zip`; anything else is
/// rejected with a compile error.
///
/// The archive is unpacked into `<cache_root>/o/<hex sha256 of link>/`. The
/// step treats the existence of a directory named after the archive file
/// without its extension (e.g. `link-4.1b` for `link-4.1b.tar.gz`) inside that
/// location as the marker that the download is already cached.
fn DownloadStep(comptime name: []const u8, comptime link: []const u8) type {
    return struct {
        step: std.Build.Step,
        builder: *std.Build,

        const download_link = link;
        const download_uri = std.Uri.parse(link) catch @compileError("download link is not a valid Uri");
        // Both suffix checks include the dot so they agree with the suffix
        // that `fileNameNoExtension` strips.
        const file_type: enum {
            targz,
            zip,
        } = if (std.mem.endsWith(u8, link, ".tar.gz"))
            .targz
        else if (std.mem.endsWith(u8, link, ".zip"))
            .zip
        else
            @compileError("can only download tar.gz or zip files");

        const Self = @This();

        /// Handle for addressing files inside the unpacked archive.
        const Dependency = struct {
            /// Output directory as computed by `getOutputPath` when the handle was created.
            build_root: []const u8,
            /// Builder that owns the paths produced by `path`.
            build: *std.Build,
            /// Step that produces the unpacked directory.
            download: *Self,

            /// Returns a lazy path to `sub_path` inside the unpacked archive
            /// directory. Panics on allocation failure.
            ///
            /// NOTE(review): the result is a `src_path` owned by `build`, so it
            /// presumably relies on the cache root being expressible relative
            /// to that builder's root — confirm.
            pub fn path(self: Dependency, sub_path: []const u8) std.Build.LazyPath {
                const cache_path = (self.download.getOutputPath() catch @panic("OOM")).path;
                const full_path = std.fs.path.join(self.build.allocator, &.{ cache_path, sub_path }) catch @panic("OOM");
                return .{
                    .src_path = .{
                        .owner = self.build,
                        .sub_path = full_path,
                    },
                };
            }
        };

        /// Returns the last `/`-separated segment of the URI path, i.e. the
        /// archive's file name.
        fn fileName(uri: std.Uri) []const u8 {
            const path = switch (uri.path) {
                .raw => |r| r,
                .percent_encoded => |p| p,
            };
            // URI paths are always separated by '/', independent of the host
            // platform's file-system separator.
            var it = std.mem.splitBackwardsScalar(u8, path, '/');
            return it.first();
        }

        /// Returns the archive file name with its `.tar.gz` / `.zip` suffix removed.
        fn fileNameNoExtension() []const u8 {
            const file_name = fileName(download_uri);
            return switch (file_type) {
                .targz => file_name[0..std.mem.lastIndexOf(u8, file_name, ".tar.gz").?],
                .zip => file_name[0..std.mem.lastIndexOf(u8, file_name, ".zip").?],
            };
        }

        /// Allocates a new download step owned by `builder`. Panics on
        /// allocation failure.
        pub fn create(builder: *std.Build) *Self {
            const self = builder.allocator.create(Self) catch @panic("OOM");
            self.* = .{
                .step = std.Build.Step.init(.{
                    .id = .custom,
                    .name = "download " ++ name,
                    .owner = builder,
                    .makeFn = make,
                }),
                .builder = builder,
            };
            return self;
        }

        const Algo = std.crypto.hash.sha2.Sha256;

        /// Returns the SHA-256 digest of the download link; it keys the cache directory.
        fn linkDigest() [Algo.digest_length]u8 {
            var hasher = Algo.init(.{});
            hasher.update(download_link);
            var digest: [Algo.digest_length]u8 = undefined;
            hasher.final(&digest);
            return digest;
        }

        /// Returns `<cache_root>/o/<hex digest>`, the directory the archive is
        /// downloaded to and unpacked in. A null cache root path means the
        /// current working directory, so `"."` is used in that case.
        fn hashDir(self: *Self, digest: [Algo.digest_length]u8) ![]const u8 {
            const hex = std.fmt.bytesToHex(digest, .lower);
            return std.fs.path.join(self.builder.allocator, &[_][]const u8{
                self.builder.cache_root.path orelse ".",
                "o",
                &hex,
            });
        }

        /// Returns the directory the archive is expected to unpack into
        /// (`<cache_root>/o/<hex digest>/<archive name without extension>`)
        /// together with the digest of the download link. Fails only on
        /// allocation failure.
        pub fn getOutputPath(self: *Self) !struct { path: []const u8, hash: [Algo.digest_length]u8 } {
            const cache_hash = linkDigest();
            const hash_dir = try self.hashDir(cache_hash);
            const cache_dir = try std.fs.path.join(
                self.builder.allocator,
                &[_][]const u8{ hash_dir, fileNameNoExtension() },
            );

            return .{
                .path = cache_dir,
                .hash = cache_hash,
            };
        }

        /// Step implementation: downloads the archive with `curl` and unpacks
        /// it, unless the unpacked directory already exists.
        ///
        /// Errors: `error.DownloadFailed` when curl cannot be spawned or does
        /// not exit with status 0, `error.UnzipFailed` / `error.ExtractFailed`
        /// when unpacking fails, plus allocation and directory-creation errors.
        fn make(step: *std.Build.Step, options: std.Build.Step.MakeOptions) anyerror!void {
            _ = options;
            const self: *Self = @fieldParentPtr("step", step);
            const allocator = self.builder.allocator;

            const cache_dir = try self.hashDir(linkDigest());

            const cached_model_dir = try std.fs.path.join(
                allocator,
                &[_][]const u8{ cache_dir, fileNameNoExtension() },
            );
            defer allocator.free(cached_model_dir);

            // Check if already cached. Any failure to access the directory is
            // deliberately treated as "not cached".
            if (std.fs.cwd().access(cached_model_dir, .{})) |_| {
                step.result_cached = true;
                return;
            } else |_| {}

            // Not cached, need to download
            try std.fs.cwd().makePath(cache_dir);

            const archive = try std.fs.path.join(
                allocator,
                &[_][]const u8{ cache_dir, fileName(download_uri) },
            );
            defer allocator.free(archive);

            // Download. `--fail` makes curl exit non-zero on HTTP errors instead
            // of saving the error page as the archive; `--location` follows
            // redirects.
            const download_result = std.process.Child.run(.{
                .allocator = allocator,
                .argv = &.{ "curl", "--fail", "--location", "-s", "-o", archive, link },
            }) catch return error.DownloadFailed;
            defer allocator.free(download_result.stdout);
            defer allocator.free(download_result.stderr);
            // The process may also have been killed by a signal, so inspect the
            // termination kind before reading the exit code.
            switch (download_result.term) {
                .Exited => |code| if (code != 0) return error.DownloadFailed,
                else => return error.DownloadFailed,
            }

            // A partially unpacked directory would be mistaken for a cache hit
            // on the next run, so remove it if unpacking fails. This is
            // best-effort cleanup: a failure to delete must not mask the
            // original error.
            errdefer std.fs.cwd().deleteTree(cached_model_dir) catch {};

            switch (file_type) {
                .zip => {
                    // Extract to cache using stdlib
                    var zip_file = std.fs.cwd().openFile(archive, .{}) catch return error.UnzipFailed;
                    defer zip_file.close();

                    var cache_dir_handle = std.fs.cwd().openDir(cache_dir, .{}) catch return error.UnzipFailed;
                    defer cache_dir_handle.close();

                    var zip_file_buffer: [4096]u8 = undefined;
                    var zip_file_reader = zip_file.reader(&zip_file_buffer);

                    std.zip.extract(cache_dir_handle, &zip_file_reader, .{}) catch return error.UnzipFailed;
                },
                .targz => {
                    var archive_file = std.fs.cwd().openFile(archive, .{}) catch return error.ExtractFailed;
                    defer archive_file.close();

                    var buf: [4096]u8 = undefined;
                    var file_reader = archive_file.reader(&buf);
                    const reader = &file_reader.interface;

                    var cache_dir_handle = std.fs.cwd().openDir(cache_dir, .{}) catch return error.ExtractFailed;
                    defer cache_dir_handle.close();

                    // Gunzip on the fly and stream the tar entries to disk.
                    var gz_buf: [std.compress.flate.max_window_len]u8 = undefined;
                    var decompress = std.compress.flate.Decompress.init(reader, .gzip, &gz_buf);
                    std.tar.pipeToFileSystem(
                        cache_dir_handle,
                        &decompress.reader,
                        .{ .mode_mode = .ignore },
                    ) catch return error.ExtractFailed;
                },
            }

            step.result_cached = false;
        }

        /// Returns a `Dependency` handle for the unpacked archive, allocated
        /// with `b`'s allocator. `args` is accepted but ignored. Panics if the
        /// output path cannot be computed or on allocation failure.
        pub fn dependency(
            self: *Self,
            b: *std.Build,
            args: anytype,
        ) *Dependency {
            _ = args;
            const output = self.getOutputPath() catch @panic("cannot get output path");
            const dep = b.allocator.create(Dependency) catch @panic("OOM");

            dep.* = .{
                .download = self,
                .build = b,
                .build_root = output.path,
            };
            return dep;
        }

        // NOTE(review): nothing visible in this file references `UserValue`;
        // it appears to be unused — confirm before removing.
        const UserValue = union(enum) {
            flag: void,
            scalar: []const u8,
            list: std.array_list.Managed([]const u8),
            map: std.StringHashMap(*const UserValue),
            lazy_path: std.Build.LazyPath,
            lazy_path_list: std.array_list.Managed(std.Build.LazyPath),
        };
    };
}