From 236682a75620603b7eec087019a0c4ba66e31621 Mon Sep 17 00:00:00 2001
From: Emil Lerch
Date: Tue, 9 Sep 2025 19:02:07 -0700
Subject: [PATCH] working sample without nix needed

---
 .mise.toml    |  6 ----
 README.md     | 14 +++++-----
 build.zig     | 40 +++++++++++++++++----------
 build.zig.zon | 12 +++-----
 src/main.zig  | 76 +++++++++++++++++++++++++++++----------------------
 5 files changed, 81 insertions(+), 67 deletions(-)

diff --git a/.mise.toml b/.mise.toml
index 4ef9eaf..a42cce9 100644
--- a/.mise.toml
+++ b/.mise.toml
@@ -3,9 +3,3 @@ zig = "0.15.1"
 zls = "0.15.0"
 pre-commit = "latest"
 "ubi:DonIsaac/zlint" = "latest"
-
-[hooks]
-enter = 'echo use "nix develop" if you want to build'
-
-[settings]
-experimental = true
diff --git a/README.md b/README.md
index beb2633..fe604b5 100644
--- a/README.md
+++ b/README.md
@@ -2,11 +2,10 @@
 
 This project implements a minimal real-time speech-to-text application using Vosk and Zig.
 
-## Setup
 
 ### Prerequisites
 - Zig 0.15.1 (configured via mise)
-- Nix development environment with C compilation tools, ALSA, and audio libraries
+- Nix development environment configured for ALSA and audio libraries
 
 ### Vosk Model Download
 The application uses the Vosk small English model for speech recognition:
@@ -17,10 +16,8 @@ The application uses the Vosk small English model for speech recognition:
 
 ### Installation Steps
 1. Enter nix development environment: `nix develop`
-2. Download Vosk model: `wget https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip`
-3. Extract model: `unzip vosk-model-small-en-us-0.15.zip`
-4. Build application: `zig build`
-5. Run: `./zig-out/bin/stt`
+2. Build application: `zig build`
+3. Run: `zig build run`
 
 ## Usage
 The application will:
@@ -33,4 +30,7 @@ The application will:
 ## Dependencies
 - Vosk C API library
 - ALSA for audio capture
-- Standard C libraries for audio processing
+
+## Notes
+
+Vosk tends to recognize "light" as "lake" or "like".
diff --git a/build.zig b/build.zig
index 961a197..be661d2 100644
--- a/build.zig
+++ b/build.zig
@@ -4,16 +4,9 @@ pub fn build(b: *std.Build) void {
     const target = b.standardTargetOptions(.{});
     const optimize = b.standardOptimizeOption(.{});
 
-    const vosk_dep = b.dependency("vosk", .{});
+    const vosk_dep = b.dependency("vosk_linux_x86_64", .{});
+    const alsa_dep = b.dependency("alsa", .{});
 
-    const zlib_dep = b.dependency("zlib", .{
-        .target = target,
-        .optimize = .ReleaseFast,
-    });
-    const sdl_dep = b.dependency("SDL", .{
-        .target = target,
-        .optimize = .ReleaseFast,
-    });
 
     // We need to use curl for this as the domain doesn't work with zig TLS
     const model_step = ModelDownloadStep.create(b);
@@ -38,10 +31,13 @@ pub fn build(b: *std.Build) void {
     exe.linkLibC();
     exe.addIncludePath(vosk_dep.path(""));
     exe.addLibraryPath(vosk_dep.path(""));
-    exe.linkLibrary(zlib_dep.artifact("z"));
-    exe.linkLibrary(sdl_dep.artifact("SDL2"));
-    exe.linkSystemLibrary("vosk");
-    exe.linkSystemLibrary("asound");
+    exe.linkSystemLibrary("vosk"); // comes from the dependency, which is a binary blob from GitHub releases
+
+    const alsa_lib = alsa_dep.artifact("asound");
+    exe.linkLibrary(alsa_lib); // use our built alsa-lib
+
+    // Use the installed headers from the alsa dependency
+    exe.addIncludePath(alsa_dep.path("zig-out/include"));
 
     b.installArtifact(exe);
 
@@ -53,6 +49,22 @@ pub fn build(b: *std.Build) void {
     if (b.args) |args| {
         run_cmd.addArgs(args);
     }
+    // Creates a step for unit testing. This only builds the test executable
+    // but does not run it.
+    const exe_unit_tests = b.addTest(.{
+        .root_module = b.createModule(.{
+            .root_source_file = b.path("src/main.zig"),
+            .target = target,
+            .optimize = optimize,
+        }),
+    });
+    const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests);
+
+    // Similar to creating the run step earlier, this exposes a `test` step to
+    // the `zig build --help` menu, providing a way for the user to request
+    // running the unit tests.
+    const test_step = b.step("test", "Run unit tests");
+    test_step.dependOn(&run_exe_unit_tests.step);
 }
 
 const ModelDownloadStep = struct {
@@ -108,7 +120,7 @@ const ModelDownloadStep = struct {
         } else |_| {}
 
         // Not cached, need to download
-        std.fs.cwd().makePath(cache_dir) catch {};
+        std.fs.cwd().makePath(cache_dir) catch @panic("Could not create cache directory");
 
         const model_zip = std.fmt.allocPrint(self.builder.allocator, "{s}/model.zip", .{cache_dir}) catch @panic("OOM");
         defer self.builder.allocator.free(model_zip);
diff --git a/build.zig.zon b/build.zig.zon
index f4653f6..55a03a0 100644
--- a/build.zig.zon
+++ b/build.zig.zon
@@ -4,17 +4,13 @@
     .fingerprint = 0xc855826ba95b9540,
     .minimum_zig_version = "0.15.1",
     .dependencies = .{
-        .vosk = .{
+        .vosk_linux_x86_64 = .{
             .url = "https://github.com/alphacep/vosk-api/releases/download/v0.3.45/vosk-linux-x86_64-0.3.45.zip",
             .hash = "N-V-__8AAF22jAFTSU4AVxFCNWtotf7OD8gM33Y_ScIrCeu7",
         },
-        .zlib = .{
-            .url = "git+https://github.com/allyourcodebase/zlib#61e7df7e996ec5a5f13a653db3c419adb340d6ef",
-            .hash = "zlib-1.3.1-ZZQ7lbYMAAB1hTSOKSXAKAgHsfDcyWNH_37ojw5WSpgR",
-        },
-        .SDL = .{
-            .url = "git+https://github.com/allyourcodebase/SDL#d29847ebcb6da34dec466a06163431982500a092",
-            .hash = "SDL-2.32.6-JToi38aTEgECY2mV9iUG7dNouCbarfJ1mkzjjm53gC80",
+        .alsa = .{
+            .url = "git+https://git.lerch.org/lobo/aycb-alsa-lib/#41600d4ef511629e7c77cec3e08f7e8ca9021723",
+            .hash = "alsa-1.2.14-jevBFmKjAACg9m2jKphsZtsrjpWRjgM6A5Wrt7K85WBl",
         },
     },
     .paths = .{
diff --git a/src/main.zig b/src/main.zig
index 99a4f59..deee6b0 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -4,13 +4,16 @@ const c = @cImport({
     @cInclude("alsa/asoundlib.h");
 });
 
-const SAMPLE_RATE = 16000;
-const BUFFER_SIZE = 4000;
+const VOSK_SAMPLE_RATE = 16000;
+const BUFFER_SIZE = 256;
 
 pub fn main() !void {
     var gpa = std.heap.GeneralPurposeAllocator(.{}){};
     defer _ = gpa.deinit();
 
+    // Set ALSA config path to our local alsa.conf
+    _ = c.setenv("ALSA_CONFIG_PATH", "alsa.conf", 1);
+
     // Initialize Vosk
     c.vosk_set_log_level(-1);
     const model = c.vosk_model_new("zig-out/bin/vosk-model-small-en-us-0.15");
@@ -20,18 +23,18 @@ pub fn main() !void {
     }
     defer c.vosk_model_free(model);
 
-    const rec = c.vosk_recognizer_new(model, SAMPLE_RATE);
+    const rec = c.vosk_recognizer_new(model, VOSK_SAMPLE_RATE);
     if (rec == null) {
         std.debug.print("Failed to create recognizer\n", .{});
         return;
     }
     defer c.vosk_recognizer_free(rec);
 
-    // Try to open default capture device
+    // Try to open hardware capture device directly
     var handle: ?*c.snd_pcm_t = null;
-    var err = c.snd_pcm_open(&handle, "default", c.SND_PCM_STREAM_CAPTURE, c.SND_PCM_NONBLOCK);
+    var err = c.snd_pcm_open(&handle, "hw:3,0", c.SND_PCM_STREAM_CAPTURE, c.SND_PCM_NONBLOCK);
     if (err < 0) {
-        std.debug.print("Cannot open default audio device: {s}\n", .{c.snd_strerror(err)});
+        std.debug.print("Cannot open audio device: {s}\n", .{c.snd_strerror(err)});
         std.debug.print("Make sure no other applications are using the microphone\n", .{});
         return;
     }
@@ -44,48 +47,57 @@ pub fn main() !void {
         return;
     }
 
-    // Configure audio - try simple parameters first
-    err = c.snd_pcm_set_params(handle, c.SND_PCM_FORMAT_S16_LE, c.SND_PCM_ACCESS_RW_INTERLEAVED, 1, SAMPLE_RATE, 1, 100000);
+    // Configure audio parameters
+    err = c.snd_pcm_set_params(handle, c.SND_PCM_FORMAT_S16_LE, c.SND_PCM_ACCESS_RW_INTERLEAVED, 2, VOSK_SAMPLE_RATE, 1, 100000);
     if (err < 0) {
         std.debug.print("Cannot configure audio: {s}\n", .{c.snd_strerror(err)});
         return;
     }
 
+    // Prepare the PCM device
+    err = c.snd_pcm_prepare(handle);
+    if (err < 0) {
+        std.debug.print("Cannot prepare audio: {s}\n", .{c.snd_strerror(err)});
+        return;
+    }
+
+    // Start the PCM stream
+    err = c.snd_pcm_start(handle);
+    if (err < 0) {
+        std.debug.print("Cannot start audio: {s}\n", .{c.snd_strerror(err)});
+        return;
+    }
+
     std.debug.print("Audio configured successfully\n", .{});
     std.debug.print("Listening... (Ctrl+C to exit)\n", .{});
 
-    var buffer: [BUFFER_SIZE]i16 = undefined;
-    var frame_count: u32 = 0;
+    var buffer: [BUFFER_SIZE * 2]i16 = undefined; // stereo
+    var accumulator: [VOSK_SAMPLE_RATE]i16 = undefined; // 1 second buffer
+    var acc_pos: usize = 0;
 
     while (true) {
-        const frames = c.snd_pcm_readi(handle, &buffer, BUFFER_SIZE / 2);
-        if (frames < 0) {
-            std.debug.print("Audio read error: {s}\n", .{c.snd_strerror(@intCast(frames))});
-            err = c.snd_pcm_recover(handle, @intCast(frames), 0);
-            if (err < 0) {
-                std.debug.print("Cannot recover from error: {s}\n", .{c.snd_strerror(err)});
-                break;
-            }
+        const frames_read = c.snd_pcm_readi(handle, &buffer, BUFFER_SIZE);
+        if (frames_read < 0) {
+            _ = c.snd_pcm_recover(handle, @intCast(frames_read), 1);
             continue;
         }
 
-        frame_count += 1;
-        if (frame_count % 50 == 0) {
-            // Show we're getting audio data
-            var max_sample: u16 = 0;
-            for (0..@intCast(frames)) |i| {
-                const abs_sample = @abs(buffer[i]);
-                if (abs_sample > max_sample) {
-                    max_sample = abs_sample;
-                }
+        // Convert stereo to mono and accumulate
+        for (0..@intCast(frames_read)) |i| {
+            if (acc_pos < accumulator.len) {
+                accumulator[acc_pos] = buffer[i * 2]; // left channel
+                acc_pos += 1;
             }
-            std.debug.print("Audio: {} frames, max level: {}\n", .{ frames, max_sample });
         }
 
-        const result = c.vosk_recognizer_accept_waveform(rec, @ptrCast(&buffer), @intCast(frames * 2));
-        if (result != 0) {
-            const text = c.vosk_recognizer_result(rec);
-            std.debug.print("RECOGNIZED: {s}\n", .{text});
+        // Process when we have enough data (0.1 seconds)
+        if (acc_pos >= VOSK_SAMPLE_RATE / 10) {
+            const result = c.vosk_recognizer_accept_waveform(rec, @ptrCast(&accumulator), @intCast(acc_pos * 2));
+            if (result != 0) {
+                const text = c.vosk_recognizer_result(rec);
+                std.debug.print("{s}\n", .{text});
+            }
+            acc_pos = 0;
         }
     }
 }
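Note: this commit wires up a `zig build test` step, but src/main.zig as patched does not define any tests yet. Below is a minimal standalone sketch of what a first test could look like if the left-channel downmix were pulled out into a helper; the stereoToMono name and the factoring itself are illustrative assumptions, not code from this commit.

const std = @import("std");

/// Hypothetical helper: copy the left channel of interleaved stereo samples
/// into `out`, returning the number of mono samples written. Mirrors the
/// inline `buffer[i * 2]` extraction done in the capture loop of main.zig.
fn stereoToMono(stereo: []const i16, out: []i16) usize {
    const frames = @min(stereo.len / 2, out.len);
    for (0..frames) |i| {
        out[i] = stereo[i * 2]; // left channel only, as in the capture loop
    }
    return frames;
}

test "stereoToMono keeps the left channel" {
    const stereo = [_]i16{ 1, -1, 2, -2, 3, -3 };
    var mono: [3]i16 = undefined;
    const n = stereoToMono(&stereo, &mono);
    try std.testing.expectEqual(@as(usize, 3), n);
    try std.testing.expectEqualSlices(i16, &[_]i16{ 1, 2, 3 }, &mono);
}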