// stt/src/main.zig
//
// 91 lines
// 3 KiB
// Zig

const std = @import("std");
const c = @cImport({
@cInclude("vosk_api.h");
@cInclude("alsa/asoundlib.h");
});
/// Audio sample rate in Hz; passed both to the Vosk recognizer and to the
/// ALSA capture configuration in `main`.
const SAMPLE_RATE = 16000;
/// Length, in `i16` samples, of the capture buffer declared in `main`.
/// Each read in `main` requests `BUFFER_SIZE / 2` frames.
const BUFFER_SIZE = 4000;
/// Captures mono signed 16-bit audio at `SAMPLE_RATE` from the default ALSA
/// device and streams it into a Vosk recognizer, printing every completed
/// utterance (the recognizer's result string) to stderr.
///
/// Setup failures (model, recognizer, audio device) are reported with
/// `std.debug.print` and end the program through a plain `return`. The capture
/// loop runs until ALSA reports a read error that `snd_pcm_recover` cannot fix.
pub fn main() !void {
    // Initialize Vosk
    c.vosk_set_log_level(-1);
    const model = c.vosk_model_new("vosk-model-small-en-us-0.15");
    if (model == null) {
        std.debug.print("Failed to load model\n", .{});
        return;
    }
    defer c.vosk_model_free(model);

    const rec = c.vosk_recognizer_new(model, SAMPLE_RATE);
    if (rec == null) {
        std.debug.print("Failed to create recognizer\n", .{});
        return;
    }
    defer c.vosk_recognizer_free(rec);

    // Open the default capture device in non-blocking mode so the open call
    // itself is made with SND_PCM_NONBLOCK; blocking mode is restored below.
    var handle: ?*c.snd_pcm_t = null;
    var err = c.snd_pcm_open(&handle, "default", c.SND_PCM_STREAM_CAPTURE, c.SND_PCM_NONBLOCK);
    if (err < 0) {
        std.debug.print("Cannot open default audio device: {s}\n", .{c.snd_strerror(err)});
        std.debug.print("Make sure no other applications are using the microphone\n", .{});
        return;
    }
    defer _ = c.snd_pcm_close(handle);

    // Set to blocking mode
    err = c.snd_pcm_nonblock(handle, 0);
    if (err < 0) {
        std.debug.print("Cannot set blocking mode: {s}\n", .{c.snd_strerror(err)});
        return;
    }

    // Configure audio: S16_LE, interleaved, 1 channel, SAMPLE_RATE,
    // soft resampling allowed, 100000 us requested latency.
    err = c.snd_pcm_set_params(handle, c.SND_PCM_FORMAT_S16_LE, c.SND_PCM_ACCESS_RW_INTERLEAVED, 1, SAMPLE_RATE, 1, 100000);
    if (err < 0) {
        std.debug.print("Cannot configure audio: {s}\n", .{c.snd_strerror(err)});
        return;
    }
    std.debug.print("Audio configured successfully\n", .{});
    std.debug.print("Listening... (Ctrl+C to exit)\n", .{});

    var buffer: [BUFFER_SIZE]i16 = undefined;
    // Number of successful reads so far; only used to pace the level meter.
    var chunk_count: u32 = 0;

    while (true) {
        // The stream is mono S16, so one frame is one i16 sample; this request
        // fills at most the first half of `buffer`.
        const frames = c.snd_pcm_readi(handle, &buffer, BUFFER_SIZE / 2);
        if (frames < 0) {
            std.debug.print("Audio read error: {s}\n", .{c.snd_strerror(@intCast(frames))});
            err = c.snd_pcm_recover(handle, @intCast(frames), 0);
            if (err < 0) {
                std.debug.print("Cannot recover from error: {s}\n", .{c.snd_strerror(err)});
                break;
            }
            continue;
        }

        // `frames` is non-negative here, so the cast cannot fail.
        const sample_count: usize = @intCast(frames);
        const samples = buffer[0..sample_count];

        // Wrapping add: a very long session must not trap on counter overflow.
        chunk_count +%= 1;
        if (chunk_count % 50 == 0) {
            // Show we're getting audio data
            var max_sample: u16 = 0;
            for (samples) |sample| {
                max_sample = @max(max_sample, @abs(sample));
            }
            std.debug.print("Audio: {} frames, max level: {}\n", .{ frames, max_sample });
        }

        // The length argument is in bytes, hence samples * @sizeOf(i16).
        const result = c.vosk_recognizer_accept_waveform(rec, @ptrCast(&buffer), @intCast(samples.len * @sizeOf(i16)));
        if (result < 0) {
            // A negative return signals a recognizer failure, not an utterance;
            // report it instead of printing a bogus result.
            std.debug.print("Recognizer failed to process audio chunk\n", .{});
            continue;
        }
        if (result > 0) {
            // Positive return: an utterance has ended and its result is ready.
            const text = c.vosk_recognizer_result(rec);
            std.debug.print("RECOGNIZED: {s}\n", .{text});
        }
    }
}