From 236682a75620603b7eec087019a0c4ba66e31621 Mon Sep 17 00:00:00 2001
From: Emil Lerch
Date: Tue, 9 Sep 2025 19:02:07 -0700
Subject: [PATCH] working sample without nix needed

---
 .mise.toml    |  6 ----
 README.md     | 14 +++++-----
 build.zig     | 40 +++++++++++++++++----------
 build.zig.zon | 12 +++-----
 src/main.zig  | 76 +++++++++++++++++++++++++++++----------------------
 5 files changed, 81 insertions(+), 67 deletions(-)

diff --git a/.mise.toml b/.mise.toml
index 4ef9eaf..a42cce9 100644
--- a/.mise.toml
+++ b/.mise.toml
@@ -3,9 +3,3 @@ zig = "0.15.1"
 zls = "0.15.0"
 pre-commit = "latest"
 "ubi:DonIsaac/zlint" = "latest"
-
-[hooks]
-enter = 'echo use "nix develop" if you want to build'
-
-[settings]
-experimental = true
diff --git a/README.md b/README.md
index beb2633..fe604b5 100644
--- a/README.md
+++ b/README.md
@@ -2,11 +2,10 @@
 
 This project implements a minimal real-time speech-to-text application using Vosk and Zig.
 
-## Setup
 
 ### Prerequisites
 - Zig 0.15.1 (configured via mise)
-- Nix development environment with C compilation tools, ALSA, and audio libraries
+- Nix development environment configured for ALSA and audio libraries
 
 ### Vosk Model Download
 The application uses the Vosk small English model for speech recognition:
@@ -17,10 +16,8 @@ The application uses the Vosk small English model for speech recognition:
 
 ### Installation Steps
 1. Enter nix development environment: `nix develop`
-2. Download Vosk model: `wget https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip`
-3. Extract model: `unzip vosk-model-small-en-us-0.15.zip`
-4. Build application: `zig build`
-5. Run: `./zig-out/bin/stt`
+2. Build application: `zig build`
+3. Run: `zig build run`
 
 ## Usage
 The application will:
@@ -33,4 +30,7 @@ The application will:
 ## Dependencies
 - Vosk C API library
 - ALSA for audio capture
-- Standard C libraries for audio processing
+
+## Notes
+
+Vosk tends to recognize "light" as "lake" or "like".
diff --git a/build.zig b/build.zig
index 961a197..be661d2 100644
--- a/build.zig
+++ b/build.zig
@@ -4,16 +4,9 @@ pub fn build(b: *std.Build) void {
     const target = b.standardTargetOptions(.{});
     const optimize = b.standardOptimizeOption(.{});
 
-    const vosk_dep = b.dependency("vosk", .{});
+    const vosk_dep = b.dependency("vosk_linux_x86_64", .{});
+    const alsa_dep = b.dependency("alsa", .{});
 
-    const zlib_dep = b.dependency("zlib", .{
-        .target = target,
-        .optimize = .ReleaseFast,
-    });
-    const sdl_dep = b.dependency("SDL", .{
-        .target = target,
-        .optimize = .ReleaseFast,
-    });
 
     // We need to use curl for this as the domain doesn't work with zig TLS
     const model_step = ModelDownloadStep.create(b);
@@ -38,10 +31,13 @@ pub fn build(b: *std.Build) void {
     exe.linkLibC();
     exe.addIncludePath(vosk_dep.path(""));
     exe.addLibraryPath(vosk_dep.path(""));
-    exe.linkLibrary(zlib_dep.artifact("z"));
-    exe.linkLibrary(sdl_dep.artifact("SDL2"));
-    exe.linkSystemLibrary("vosk");
-    exe.linkSystemLibrary("asound");
+    exe.linkSystemLibrary("vosk"); // comes from the dependency, which is a binary blob from GitHub releases
+
+    const alsa_lib = alsa_dep.artifact("asound");
+    exe.linkLibrary(alsa_lib); // use our built alsa-lib
+
+    // Use the installed headers from the alsa dependency
+    exe.addIncludePath(alsa_dep.path("zig-out/include"));
 
     b.installArtifact(exe);
 
@@ -53,6 +49,22 @@ pub fn build(b: *std.Build) void {
     if (b.args) |args| {
         run_cmd.addArgs(args);
     }
+    // Creates a step for unit testing. This only builds the test executable
+    // but does not run it.
+    const exe_unit_tests = b.addTest(.{
+        .root_module = b.createModule(.{
+            .root_source_file = b.path("src/main.zig"),
+            .target = target,
+            .optimize = optimize,
+        }),
+    });
+    const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests);
+
+    // Similar to creating the run step earlier, this exposes a `test` step to
+    // the `zig build --help` menu, providing a way for the user to request
+    // running the unit tests.
+    const test_step = b.step("test", "Run unit tests");
+    test_step.dependOn(&run_exe_unit_tests.step);
 }
 
 const ModelDownloadStep = struct {
@@ -108,7 +120,7 @@ const ModelDownloadStep = struct {
         } else |_| {}
 
         // Not cached, need to download
-        std.fs.cwd().makePath(cache_dir) catch {};
+        std.fs.cwd().makePath(cache_dir) catch @panic("Could not create cache directory");
 
         const model_zip = std.fmt.allocPrint(self.builder.allocator, "{s}/model.zip", .{cache_dir}) catch @panic("OOM");
         defer self.builder.allocator.free(model_zip);
diff --git a/build.zig.zon b/build.zig.zon
index f4653f6..55a03a0 100644
--- a/build.zig.zon
+++ b/build.zig.zon
@@ -4,17 +4,13 @@
     .fingerprint = 0xc855826ba95b9540,
     .minimum_zig_version = "0.15.1",
     .dependencies = .{
-        .vosk = .{
+        .vosk_linux_x86_64 = .{
             .url = "https://github.com/alphacep/vosk-api/releases/download/v0.3.45/vosk-linux-x86_64-0.3.45.zip",
             .hash = "N-V-__8AAF22jAFTSU4AVxFCNWtotf7OD8gM33Y_ScIrCeu7",
         },
-        .zlib = .{
-            .url = "git+https://github.com/allyourcodebase/zlib#61e7df7e996ec5a5f13a653db3c419adb340d6ef",
-            .hash = "zlib-1.3.1-ZZQ7lbYMAAB1hTSOKSXAKAgHsfDcyWNH_37ojw5WSpgR",
-        },
-        .SDL = .{
-            .url = "git+https://github.com/allyourcodebase/SDL#d29847ebcb6da34dec466a06163431982500a092",
-            .hash = "SDL-2.32.6-JToi38aTEgECY2mV9iUG7dNouCbarfJ1mkzjjm53gC80",
+        .alsa = .{
+            .url = "git+https://git.lerch.org/lobo/aycb-alsa-lib/#41600d4ef511629e7c77cec3e08f7e8ca9021723",
+            .hash = "alsa-1.2.14-jevBFmKjAACg9m2jKphsZtsrjpWRjgM6A5Wrt7K85WBl",
         },
     },
     .paths = .{
diff --git a/src/main.zig b/src/main.zig
index 99a4f59..deee6b0 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -4,13 +4,16 @@ const c = @cImport({
     @cInclude("alsa/asoundlib.h");
 });
 
-const SAMPLE_RATE = 16000;
-const BUFFER_SIZE = 4000;
+const VOSK_SAMPLE_RATE = 16000;
+const BUFFER_SIZE = 256;
 
 pub fn main() !void {
     var gpa = std.heap.GeneralPurposeAllocator(.{}){};
     defer _ = gpa.deinit();
 
+    // Set ALSA config path to our local alsa.conf
+    _ = c.setenv("ALSA_CONFIG_PATH", "alsa.conf", 1);
+
     // Initialize Vosk
     c.vosk_set_log_level(-1);
     const model = c.vosk_model_new("zig-out/bin/vosk-model-small-en-us-0.15");
@@ -20,18 +23,18 @@ pub fn main() !void {
     }
     defer c.vosk_model_free(model);
 
-    const rec = c.vosk_recognizer_new(model, SAMPLE_RATE);
+    const rec = c.vosk_recognizer_new(model, VOSK_SAMPLE_RATE);
     if (rec == null) {
         std.debug.print("Failed to create recognizer\n", .{});
         return;
     }
     defer c.vosk_recognizer_free(rec);
 
-    // Try to open default capture device
+    // Try to open hardware capture device directly
     var handle: ?*c.snd_pcm_t = null;
-    var err = c.snd_pcm_open(&handle, "default", c.SND_PCM_STREAM_CAPTURE, c.SND_PCM_NONBLOCK);
+    var err = c.snd_pcm_open(&handle, "hw:3,0", c.SND_PCM_STREAM_CAPTURE, c.SND_PCM_NONBLOCK);
     if (err < 0) {
-        std.debug.print("Cannot open default audio device: {s}\n", .{c.snd_strerror(err)});
+        std.debug.print("Cannot open audio device: {s}\n", .{c.snd_strerror(err)});
         std.debug.print("Make sure no other applications are using the microphone\n", .{});
         return;
     }
@@ -44,48 +47,57 @@ pub fn main() !void {
         return;
     }
 
-    // Configure audio - try simple parameters first
-    err = c.snd_pcm_set_params(handle, c.SND_PCM_FORMAT_S16_LE, c.SND_PCM_ACCESS_RW_INTERLEAVED, 1, SAMPLE_RATE, 1, 100000);
+    // Configure audio parameters
+    err = c.snd_pcm_set_params(handle, c.SND_PCM_FORMAT_S16_LE, c.SND_PCM_ACCESS_RW_INTERLEAVED, 2, VOSK_SAMPLE_RATE, 1, 100000);
     if (err < 0) {
         std.debug.print("Cannot configure audio: {s}\n", .{c.snd_strerror(err)});
         return;
     }
 
+    // Prepare the PCM device
+    err = c.snd_pcm_prepare(handle);
+    if (err < 0) {
+        std.debug.print("Cannot prepare audio: {s}\n", .{c.snd_strerror(err)});
+        return;
+    }
+
+    // Start the PCM stream
+    err = c.snd_pcm_start(handle);
+    if (err < 0) {
+        std.debug.print("Cannot start audio: {s}\n", .{c.snd_strerror(err)});
+        return;
+    }
+
     std.debug.print("Audio configured successfully\n", .{});
     std.debug.print("Listening... (Ctrl+C to exit)\n", .{});
 
-    var buffer: [BUFFER_SIZE]i16 = undefined;
-    var frame_count: u32 = 0;
+    var buffer: [BUFFER_SIZE * 2]i16 = undefined; // stereo
+    var accumulator: [VOSK_SAMPLE_RATE]i16 = undefined; // 1 second buffer
+    var acc_pos: usize = 0;
 
     while (true) {
-        const frames = c.snd_pcm_readi(handle, &buffer, BUFFER_SIZE / 2);
-        if (frames < 0) {
-            std.debug.print("Audio read error: {s}\n", .{c.snd_strerror(@intCast(frames))});
-            err = c.snd_pcm_recover(handle, @intCast(frames), 0);
-            if (err < 0) {
-                std.debug.print("Cannot recover from error: {s}\n", .{c.snd_strerror(err)});
-                break;
-            }
+        const frames_read = c.snd_pcm_readi(handle, &buffer, BUFFER_SIZE);
+        if (frames_read < 0) {
+            _ = c.snd_pcm_recover(handle, @intCast(frames_read), 1);
             continue;
         }
 
-        frame_count += 1;
-        if (frame_count % 50 == 0) {
-            // Show we're getting audio data
-            var max_sample: u16 = 0;
-            for (0..@intCast(frames)) |i| {
-                const abs_sample = @abs(buffer[i]);
-                if (abs_sample > max_sample) {
-                    max_sample = abs_sample;
-                }
+        // Convert stereo to mono and accumulate
+        for (0..@intCast(frames_read)) |i| {
+            if (acc_pos < accumulator.len) {
+                accumulator[acc_pos] = buffer[i * 2]; // left channel
+                acc_pos += 1;
             }
-            std.debug.print("Audio: {} frames, max level: {}\n", .{ frames, max_sample });
         }
 
-        const result = c.vosk_recognizer_accept_waveform(rec, @ptrCast(&buffer), @intCast(frames * 2));
-        if (result != 0) {
-            const text = c.vosk_recognizer_result(rec);
-            std.debug.print("RECOGNIZED: {s}\n", .{text});
+        // Process when we have enough data (0.1 seconds)
+        if (acc_pos >= VOSK_SAMPLE_RATE / 10) {
+            const result = c.vosk_recognizer_accept_waveform(rec, @ptrCast(&accumulator), @intCast(acc_pos * 2));
+            if (result != 0) {
+                const text = c.vosk_recognizer_result(rec);
+                std.debug.print("{s}\n", .{text});
+            }
+            acc_pos = 0;
         }
     }
 }
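Note: this commit wires up a `zig build test` step, but src/main.zig as patched does not define any tests yet. Below is a minimal standalone sketch of what a first test could look like if the left-channel downmix were pulled out into a helper; the stereoToMono name and the factoring itself are illustrative assumptions, not code from this commit.

const std = @import("std");

/// Hypothetical helper: copy the left channel of interleaved stereo samples
/// into `out`, returning the number of mono samples written. Mirrors the
/// inline `buffer[i * 2]` extraction done in the capture loop of main.zig.
fn stereoToMono(stereo: []const i16, out: []i16) usize {
    const frames = @min(stereo.len / 2, out.len);
    for (0..frames) |i| {
        out[i] = stereo[i * 2]; // left channel only, as in the capture loop
    }
    return frames;
}

test "stereoToMono keeps the left channel" {
    const stereo = [_]i16{ 1, -1, 2, -2, 3, -3 };
    var mono: [3]i16 = undefined;
    const n = stereoToMono(&stereo, &mono);
    try std.testing.expectEqual(@as(usize, 3), n);
    try std.testing.expectEqualSlices(i16, &[_]i16{ 1, 2, 3 }, &mono);
}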