working sample without nix needed

parent 17020ec40a
commit 236682a756

5 changed files with 81 additions and 67 deletions
@@ -3,9 +3,3 @@ zig = "0.15.1"
 zls = "0.15.0"
 pre-commit = "latest"
 "ubi:DonIsaac/zlint" = "latest"
-
-[hooks]
-enter = 'echo use "nix develop" if you want to build'
-
-[settings]
-experimental = true
README.md (14 lines changed)

@@ -2,11 +2,10 @@

 This project implements a minimal real-time speech-to-text application using Vosk and Zig.

-## Setup

 ### Prerequisites
 - Zig 0.15.1 (configured via mise)
-- Nix development environment with C compilation tools, ALSA, and audio libraries
+- Nix development environment configured for ALSA and audio libraries

 ### Vosk Model Download
 The application uses the Vosk small English model for speech recognition:

@@ -17,10 +16,8 @@ The application uses the Vosk small English model for speech recognition:

 ### Installation Steps
 1. Enter nix development environment: `nix develop`
-2. Download Vosk model: `wget https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip`
-3. Extract model: `unzip vosk-model-small-en-us-0.15.zip`
-4. Build application: `zig build`
-5. Run: `./zig-out/bin/stt`
+2. Build application: `zig build`
+3. Run: `zig build run`

 ## Usage
 The application will:

@@ -33,4 +30,7 @@ The application will:
 ## Dependencies
 - Vosk C API library
 - ALSA for audio capture
-- Standard C libraries for audio processing
+
+## Notes
+
+Vosk tends to recognize "light" as "lake" or "like".
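The dropped wget/unzip steps line up with the build.zig change below, where a curl-based ModelDownloadStep fetches the model during the build. As a rough quick-start sketch (assuming the default build step is wired to the model download, which the diff only partially shows):

```
nix develop     # step 1 in the updated README; the commit title suggests this is now optional
zig build       # compile; the ModelDownloadStep in build.zig is expected to fetch the Vosk model via curl
zig build run   # build and run the recognizer
```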
build.zig (40 lines changed)

@@ -4,16 +4,9 @@ pub fn build(b: *std.Build) void {
     const target = b.standardTargetOptions(.{});
     const optimize = b.standardOptimizeOption(.{});

-    const vosk_dep = b.dependency("vosk", .{});
+    const vosk_dep = b.dependency("vosk_linux_x86_64", .{});
+    const alsa_dep = b.dependency("alsa", .{});

-    const zlib_dep = b.dependency("zlib", .{
-        .target = target,
-        .optimize = .ReleaseFast,
-    });
-    const sdl_dep = b.dependency("SDL", .{
-        .target = target,
-        .optimize = .ReleaseFast,
-    });
     // We need to use curl for this as the domain doesn't work with zig TLS
     const model_step = ModelDownloadStep.create(b);

@@ -38,10 +31,13 @@ pub fn build(b: *std.Build) void {
     exe.linkLibC();
     exe.addIncludePath(vosk_dep.path(""));
     exe.addLibraryPath(vosk_dep.path(""));
-    exe.linkLibrary(zlib_dep.artifact("z"));
-    exe.linkLibrary(sdl_dep.artifact("SDL2"));
-    exe.linkSystemLibrary("vosk");
-    exe.linkSystemLibrary("asound");
+    exe.linkSystemLibrary("vosk"); // comes from dependency, which is binary blob from gh releases
+    const alsa_lib = alsa_dep.artifact("asound");
+    exe.linkLibrary(alsa_lib); // use our built alsa-lib
+
+    // Use the installed headers from the alsa dependency
+    exe.addIncludePath(alsa_dep.path("zig-out/include"));

     b.installArtifact(exe);

@@ -53,6 +49,22 @@ pub fn build(b: *std.Build) void {
     if (b.args) |args| {
         run_cmd.addArgs(args);
     }
+    // Creates a step for unit testing. This only builds the test executable
+    // but does not run it.
+    const exe_unit_tests = b.addTest(.{
+        .root_module = b.createModule(.{
+            .root_source_file = b.path("src/main.zig"),
+            .target = target,
+            .optimize = optimize,
+        }),
+    });
+    const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests);
+
+    // Similar to creating the run step earlier, this exposes a `test` step to
+    // the `zig build --help` menu, providing a way for the user to request
+    // running the unit tests.
+    const test_step = b.step("test", "Run unit tests");
+    test_step.dependOn(&run_exe_unit_tests.step);
 }

 const ModelDownloadStep = struct {

@@ -108,7 +120,7 @@ const ModelDownloadStep = struct {
         } else |_| {}

         // Not cached, need to download
-        std.fs.cwd().makePath(cache_dir) catch {};
+        std.fs.cwd().makePath(cache_dir) catch @panic("Could not create cache directory");

         const model_zip = std.fmt.allocPrint(self.builder.allocator, "{s}/model.zip", .{cache_dir}) catch @panic("OOM");
         defer self.builder.allocator.free(model_zip);
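The ModelDownloadStep referenced here is only partially visible in this diff (its tail appears in the last build.zig hunk above). For orientation, below is a minimal sketch of a curl-based custom step, assuming Zig 0.15's `Step.init`/`Step.MakeOptions` API; the name `CurlDownloadStep`, the argument handling, and the "skip if present" shortcut are illustrative and not taken from the actual implementation.

```zig
const std = @import("std");

// Hypothetical minimal custom step: shells out to curl because, per the
// comment in build.zig, the model host does not work with zig's TLS stack.
const CurlDownloadStep = struct {
    step: std.Build.Step,
    url: []const u8,
    out_path: []const u8,

    fn create(b: *std.Build, url: []const u8, out_path: []const u8) *CurlDownloadStep {
        const self = b.allocator.create(CurlDownloadStep) catch @panic("OOM");
        self.* = .{
            .step = std.Build.Step.init(.{
                .id = .custom,
                .name = "download model",
                .owner = b,
                .makeFn = make,
            }),
            .url = url,
            .out_path = out_path,
        };
        return self;
    }

    fn make(step: *std.Build.Step, options: std.Build.Step.MakeOptions) !void {
        _ = options;
        const self: *CurlDownloadStep = @fieldParentPtr("step", step);
        const b = step.owner;

        // If a previous run already produced the file, there is nothing to do.
        if (std.fs.cwd().access(self.out_path, .{})) |_| {
            return;
        } else |_| {}

        // Shell out to curl and fail the step on a non-zero exit code.
        var child = std.process.Child.init(
            &.{ "curl", "-L", "-sS", "-o", self.out_path, self.url },
            b.allocator,
        );
        const term = try child.spawnAndWait();
        if (term != .Exited or term.Exited != 0) return error.DownloadFailed;
    }
};
```

The real step additionally builds a cache directory and writes the archive to `<cache_dir>/model.zip` before extracting it, as the final build.zig hunk suggests.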
build.zig.zon

@@ -4,17 +4,13 @@
     .fingerprint = 0xc855826ba95b9540,
     .minimum_zig_version = "0.15.1",
     .dependencies = .{
-        .vosk = .{
+        .vosk_linux_x86_64 = .{
             .url = "https://github.com/alphacep/vosk-api/releases/download/v0.3.45/vosk-linux-x86_64-0.3.45.zip",
             .hash = "N-V-__8AAF22jAFTSU4AVxFCNWtotf7OD8gM33Y_ScIrCeu7",
         },
-        .zlib = .{
-            .url = "git+https://github.com/allyourcodebase/zlib#61e7df7e996ec5a5f13a653db3c419adb340d6ef",
-            .hash = "zlib-1.3.1-ZZQ7lbYMAAB1hTSOKSXAKAgHsfDcyWNH_37ojw5WSpgR",
-        },
-        .SDL = .{
-            .url = "git+https://github.com/allyourcodebase/SDL#d29847ebcb6da34dec466a06163431982500a092",
-            .hash = "SDL-2.32.6-JToi38aTEgECY2mV9iUG7dNouCbarfJ1mkzjjm53gC80",
+        .alsa = .{
+            .url = "git+https://git.lerch.org/lobo/aycb-alsa-lib/#41600d4ef511629e7c77cec3e08f7e8ca9021723",
+            .hash = "alsa-1.2.14-jevBFmKjAACg9m2jKphsZtsrjpWRjgM6A5Wrt7K85WBl",
         },
     },
     .paths = .{
src/main.zig (76 lines changed)

@@ -4,13 +4,16 @@ const c = @cImport({
     @cInclude("alsa/asoundlib.h");
 });

-const SAMPLE_RATE = 16000;
-const BUFFER_SIZE = 4000;
+const VOSK_SAMPLE_RATE = 16000;
+const BUFFER_SIZE = 256;

 pub fn main() !void {
     var gpa = std.heap.GeneralPurposeAllocator(.{}){};
     defer _ = gpa.deinit();

+    // Set ALSA config path to our local alsa.conf
+    _ = c.setenv("ALSA_CONFIG_PATH", "alsa.conf", 1);
+
     // Initialize Vosk
     c.vosk_set_log_level(-1);
     const model = c.vosk_model_new("zig-out/bin/vosk-model-small-en-us-0.15");

@@ -20,18 +23,18 @@ pub fn main() !void {
     }
     defer c.vosk_model_free(model);

-    const rec = c.vosk_recognizer_new(model, SAMPLE_RATE);
+    const rec = c.vosk_recognizer_new(model, VOSK_SAMPLE_RATE);
     if (rec == null) {
         std.debug.print("Failed to create recognizer\n", .{});
         return;
     }
     defer c.vosk_recognizer_free(rec);

-    // Try to open default capture device
+    // Try to open hardware capture device directly
     var handle: ?*c.snd_pcm_t = null;
-    var err = c.snd_pcm_open(&handle, "default", c.SND_PCM_STREAM_CAPTURE, c.SND_PCM_NONBLOCK);
+    var err = c.snd_pcm_open(&handle, "hw:3,0", c.SND_PCM_STREAM_CAPTURE, c.SND_PCM_NONBLOCK);
     if (err < 0) {
-        std.debug.print("Cannot open default audio device: {s}\n", .{c.snd_strerror(err)});
+        std.debug.print("Cannot open audio device: {s}\n", .{c.snd_strerror(err)});
         std.debug.print("Make sure no other applications are using the microphone\n", .{});
         return;
     }

@@ -44,48 +47,57 @@ pub fn main() !void {
         return;
     }

-    // Configure audio - try simple parameters first
-    err = c.snd_pcm_set_params(handle, c.SND_PCM_FORMAT_S16_LE, c.SND_PCM_ACCESS_RW_INTERLEAVED, 1, SAMPLE_RATE, 1, 100000);
+    // Configure audio parameters
+    err = c.snd_pcm_set_params(handle, c.SND_PCM_FORMAT_S16_LE, c.SND_PCM_ACCESS_RW_INTERLEAVED, 2, VOSK_SAMPLE_RATE, 1, 100000);
     if (err < 0) {
         std.debug.print("Cannot configure audio: {s}\n", .{c.snd_strerror(err)});
         return;
     }

+    // Prepare the PCM device
+    err = c.snd_pcm_prepare(handle);
+    if (err < 0) {
+        std.debug.print("Cannot prepare audio: {s}\n", .{c.snd_strerror(err)});
+        return;
+    }
+
+    // Start the PCM stream
+    err = c.snd_pcm_start(handle);
+    if (err < 0) {
+        std.debug.print("Cannot start audio: {s}\n", .{c.snd_strerror(err)});
+        return;
+    }
+
     std.debug.print("Audio configured successfully\n", .{});
     std.debug.print("Listening... (Ctrl+C to exit)\n", .{});

-    var buffer: [BUFFER_SIZE]i16 = undefined;
-    var frame_count: u32 = 0;
+    var buffer: [BUFFER_SIZE * 2]i16 = undefined; // stereo
+    var accumulator: [VOSK_SAMPLE_RATE]i16 = undefined; // 1 second buffer
+    var acc_pos: usize = 0;

     while (true) {
-        const frames = c.snd_pcm_readi(handle, &buffer, BUFFER_SIZE / 2);
-        if (frames < 0) {
-            std.debug.print("Audio read error: {s}\n", .{c.snd_strerror(@intCast(frames))});
-            err = c.snd_pcm_recover(handle, @intCast(frames), 0);
-            if (err < 0) {
-                std.debug.print("Cannot recover from error: {s}\n", .{c.snd_strerror(err)});
-                break;
-            }
+        const frames_read = c.snd_pcm_readi(handle, &buffer, BUFFER_SIZE);
+        if (frames_read < 0) {
+            _ = c.snd_pcm_recover(handle, @intCast(frames_read), 1);
             continue;
         }

-        frame_count += 1;
-        if (frame_count % 50 == 0) {
-            // Show we're getting audio data
-            var max_sample: u16 = 0;
-            for (0..@intCast(frames)) |i| {
-                const abs_sample = @abs(buffer[i]);
-                if (abs_sample > max_sample) {
-                    max_sample = abs_sample;
-                }
-            }
-            std.debug.print("Audio: {} frames, max level: {}\n", .{ frames, max_sample });
-        }
-
-        const result = c.vosk_recognizer_accept_waveform(rec, @ptrCast(&buffer), @intCast(frames * 2));
-        if (result != 0) {
-            const text = c.vosk_recognizer_result(rec);
-            std.debug.print("RECOGNIZED: {s}\n", .{text});
+        // Convert stereo to mono and accumulate
+        for (0..@intCast(frames_read)) |i| {
+            if (acc_pos < accumulator.len) {
+                accumulator[acc_pos] = buffer[i * 2]; // left channel
+                acc_pos += 1;
+            }
+        }
+
+        // Process when we have enough data (0.1 seconds)
+        if (acc_pos >= VOSK_SAMPLE_RATE / 10) {
+            const result = c.vosk_recognizer_accept_waveform(rec, @ptrCast(&accumulator), @intCast(acc_pos * 2));
+            if (result != 0) {
+                const text = c.vosk_recognizer_result(rec);
+                std.debug.print("{s}\n", .{text});
+            }
+            acc_pos = 0;
         }
     }
 }
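One detail worth noting in the new loop: `vosk_recognizer_accept_waveform` takes its length argument in bytes, which is why `acc_pos * 2` is passed for 16-bit samples, and the device is now opened with 2 channels while only the left channel is copied into the mono accumulator. A small worked-numbers sketch of the chunking math (editorial illustration, not code from the commit):

```zig
const std = @import("std");

// Worked numbers for the new capture loop, using the constants from src/main.zig.
pub fn main() void {
    const vosk_sample_rate: usize = 16000; // VOSK_SAMPLE_RATE: mono samples per second
    const chunk_samples = vosk_sample_rate / 10; // accumulation threshold: 1600 samples, about 0.1 s
    const chunk_bytes = chunk_samples * @sizeOf(i16); // 3200 bytes per vosk_recognizer_accept_waveform call
    const frames_per_read: usize = 256; // BUFFER_SIZE frames requested from snd_pcm_readi
    const reads_per_chunk = (chunk_samples + frames_per_read - 1) / frames_per_read; // about 7 ALSA reads per Vosk call

    std.debug.print("{d} samples -> {d} bytes, ~{d} reads per recognizer call\n", .{
        chunk_samples, chunk_bytes, reads_per_chunk,
    });
}
```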