diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3389c86 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.zig-cache/ +zig-out/ diff --git a/.mise.toml b/.mise.toml new file mode 100644 index 0000000..1deab06 --- /dev/null +++ b/.mise.toml @@ -0,0 +1,3 @@ +[tools] +zig = "0.15.1" +zls = "0.15.0" diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..506dadd --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2025 Emil Lerch + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..beb2633 --- /dev/null +++ b/README.md @@ -0,0 +1,36 @@ +# Real-time Speech Recognition with Vosk and Zig + +This project implements a minimal real-time speech-to-text application using Vosk and Zig. 
/// Declares the build graph for the `stt` executable.
///
/// The executable is built from `src/main.zig`, linked against libc, the
/// Vosk library found in the `vosk-linux-x86_64-0.3.45` directory under the
/// build root, and the system ALSA library (`asound`). A `run` step is also
/// registered; it installs the artifact first and forwards any arguments
/// given after `--` on the `zig build run` command line.
pub fn build(b: *std.Build) void {
    // Headers and the shared library both live in the same directory.
    const vosk_dir = "vosk-linux-x86_64-0.3.45";

    const root_module = b.createModule(.{
        .root_source_file = b.path("src/main.zig"),
        .target = b.standardTargetOptions(.{}),
        .optimize = b.standardOptimizeOption(.{}),
    });

    const exe = b.addExecutable(.{
        .name = "stt",
        .root_module = root_module,
    });

    // Native dependencies: libc, Vosk, then ALSA (same link order as before).
    exe.linkLibC();
    exe.addIncludePath(b.path(vosk_dir));
    exe.addLibraryPath(b.path(vosk_dir));
    exe.linkSystemLibrary("vosk");
    exe.linkSystemLibrary("asound");

    b.installArtifact(exe);

    // `zig build run [-- args...]`
    const run_cmd = b.addRunArtifact(exe);
    run_cmd.step.dependOn(b.getInstallStep());
    if (b.args) |args| run_cmd.addArgs(args);

    const run_step = b.step("run", "Run the app");
    run_step.dependOn(&run_cmd.step);
}
+ .dependencies = .{ + // See `zig fetch --save ` for a command-line interface for adding dependencies. + //.example = .{ + // // When updating this field to a new URL, be sure to delete the corresponding + // // `hash`, otherwise you are communicating that you expect to find the old hash at + // // the new URL. If the contents of a URL change this will result in a hash mismatch + // // which will prevent zig from using it. + // .url = "https://example.com/foo.tar.gz", + // + // // This is computed from the file contents of the directory of files that is + // // obtained after fetching `url` and applying the inclusion rules given by + // // `paths`. + // // + // // This field is the source of truth; packages do not come from a `url`; they + // // come from a `hash`. `url` is just one of many possible mirrors for how to + // // obtain a package matching this `hash`. + // // + // // Uses the [multihash](https://multiformats.io/multihash/) format. + // .hash = "...", + // + // // When this is provided, the package is found in a directory relative to the + // // build root. In this case the package's hash is irrelevant and therefore not + // // computed. This field and `url` are mutually exclusive. + // .path = "foo", + // + // // When this is set to `true`, a package is declared to be lazily + // // fetched. This makes the dependency only get fetched if it is + // // actually used. + // .lazy = false, + //}, + }, + // Specifies the set of files and directories that are included in this package. + // Only files and directories listed here are included in the `hash` that + // is computed for this package. Only files listed here will remain on disk + // when using the zig package manager. As a rule of thumb, one should list + // files required for compilation plus any license(s). + // Paths are relative to the build root. Use the empty string (`""`) to refer to + // the build root itself. 
+ // A directory listed here means that all files within, recursively, are included. + .paths = .{ + "build.zig", + "build.zig.zon", + "src", + // For example... + //"LICENSE", + //"README.md", + }, +} diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000..e5bbead --- /dev/null +++ b/flake.lock @@ -0,0 +1,61 @@ +{ + "nodes": { + "flake-utils": { + "inputs": { + "systems": "systems" + }, + "locked": { + "lastModified": 1731533236, + "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1757347588, + "narHash": "sha256-tLdkkC6XnsY9EOZW9TlpesTclELy8W7lL2ClL+nma8o=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "b599843bad24621dcaa5ab60dac98f9b0eb1cabe", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "flake-utils": "flake-utils", + "nixpkgs": "nixpkgs" + } + }, + "systems": { + "locked": { + "lastModified": 1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..55103fa --- /dev/null +++ b/flake.nix @@ -0,0 +1,32 @@ +{ + description = "Vosk speech recognition development environment"; + + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; + flake-utils.url = "github:numtide/flake-utils"; + }; + + outputs = { self, nixpkgs, flake-utils }: + flake-utils.lib.eachDefaultSystem (system: + let + pkgs = 
/// Captures microphone audio through ALSA and prints Vosk recognition results.
///
/// Steps:
///   1. Load the Vosk model from the `vosk-model-small-en-us-0.15` directory
///      (resolved relative to the current working directory) and create a
///      recognizer running at `SAMPLE_RATE`.
///   2. Open the ALSA `default` capture device and configure it for mono,
///      signed 16-bit little-endian, interleaved access at `SAMPLE_RATE`.
///   3. Loop forever: read audio, periodically print a level meter, and feed
///      the samples to the recognizer, printing each completed utterance.
///
/// All diagnostics go to stderr via `std.debug.print`.
///
/// Errors: returns `error.ModelLoadFailed`, `error.RecognizerCreateFailed`,
/// `error.AudioDeviceOpenFailed`, `error.AudioDeviceConfigFailed` or
/// `error.AudioReadFailed` after printing the cause, so that the process
/// exits with a non-zero status when it could not do its job.
pub fn main() !void {
    // Initialize Vosk (-1 silences the library's own logging).
    c.vosk_set_log_level(-1);
    const model = c.vosk_model_new("vosk-model-small-en-us-0.15");
    if (model == null) {
        std.debug.print("Failed to load model\n", .{});
        return error.ModelLoadFailed;
    }
    defer c.vosk_model_free(model);

    const rec = c.vosk_recognizer_new(model, SAMPLE_RATE);
    if (rec == null) {
        std.debug.print("Failed to create recognizer\n", .{});
        return error.RecognizerCreateFailed;
    }
    defer c.vosk_recognizer_free(rec);

    // Open the default capture device. It is opened non-blocking so that the
    // open call itself cannot hang if the device is busy, then switched to
    // blocking mode for the read loop below.
    var handle: ?*c.snd_pcm_t = null;
    var err = c.snd_pcm_open(&handle, "default", c.SND_PCM_STREAM_CAPTURE, c.SND_PCM_NONBLOCK);
    if (err < 0) {
        std.debug.print("Cannot open default audio device: {s}\n", .{c.snd_strerror(err)});
        std.debug.print("Make sure no other applications are using the microphone\n", .{});
        return error.AudioDeviceOpenFailed;
    }
    defer _ = c.snd_pcm_close(handle);

    // Set to blocking mode
    err = c.snd_pcm_nonblock(handle, 0);
    if (err < 0) {
        std.debug.print("Cannot set blocking mode: {s}\n", .{c.snd_strerror(err)});
        return error.AudioDeviceConfigFailed;
    }

    // Mono, S16_LE, interleaved, software resampling allowed, 100 ms latency.
    err = c.snd_pcm_set_params(handle, c.SND_PCM_FORMAT_S16_LE, c.SND_PCM_ACCESS_RW_INTERLEAVED, 1, SAMPLE_RATE, 1, 100000);
    if (err < 0) {
        std.debug.print("Cannot configure audio: {s}\n", .{c.snd_strerror(err)});
        return error.AudioDeviceConfigFailed;
    }

    std.debug.print("Audio configured successfully\n", .{});
    std.debug.print("Listening... (Ctrl+C to exit)\n", .{});

    var buffer: [BUFFER_SIZE]i16 = undefined;
    // Number of successful reads since the level meter was last printed.
    // Reset on every report, so it can never overflow.
    var reads_since_report: u32 = 0;

    while (true) {
        // The stream is mono, so one frame is one i16 sample. Only
        // BUFFER_SIZE / 2 frames are requested per call, i.e. at most the
        // first half of `buffer` is filled.
        const frames = c.snd_pcm_readi(handle, &buffer, BUFFER_SIZE / 2);
        if (frames < 0) {
            std.debug.print("Audio read error: {s}\n", .{c.snd_strerror(@intCast(frames))});
            err = c.snd_pcm_recover(handle, @intCast(frames), 0);
            if (err < 0) {
                std.debug.print("Cannot recover from error: {s}\n", .{c.snd_strerror(err)});
                return error.AudioReadFailed;
            }
            continue;
        }
        const frame_total: usize = @intCast(frames);

        reads_since_report += 1;
        if (reads_since_report == 50) {
            reads_since_report = 0;
            // Show we're getting audio data: peak absolute sample of this read.
            var max_sample: u16 = 0;
            for (buffer[0..frame_total]) |sample| {
                max_sample = @max(max_sample, @abs(sample));
            }
            std.debug.print("Audio: {} frames, max level: {}\n", .{ frames, max_sample });
        }

        // The recognizer takes a byte length, not a sample count.
        const byte_len = frame_total * @sizeOf(i16);
        const status = c.vosk_recognizer_accept_waveform(rec, @ptrCast(&buffer), @intCast(byte_len));
        if (status < 0) {
            // Per vosk_api.h a negative value signals an internal error, not a
            // finished utterance; skip this chunk instead of printing a result.
            std.debug.print("Recognizer failed to process audio chunk\n", .{});
            continue;
        }
        if (status > 0) {
            const text = c.vosk_recognizer_result(rec);
            std.debug.print("RECOGNIZED: {s}\n", .{text});
        }
    }
}
/// Returns the sum of `a` and `b`.
pub fn add(a: i32, b: i32) i32 {
    const sum = a + b;
    return sum;
}

test "basic add functionality" {
    try std.testing.expectEqual(@as(i32, 10), add(3, 7));
}