diff --git a/src/stt.zig b/src/stt.zig index 354a4b0..09a871c 100644 --- a/src/stt.zig +++ b/src/stt.zig @@ -963,6 +963,8 @@ pub const Session = struct { /// must be either captured at 16kHz or resampled to 16kHz prior to calling /// the function fn processingThreadFn(self: *Session) void { + if (self.vosk_recognizer == null) @panic("Session not initialized"); + const rec = self.vosk_recognizer.?; const vosk_chunk_size = 4096; var vosk_buffer = self.allocator.alloc(i16, vosk_chunk_size) catch { const error_info = ErrorInfo.init(Error.OutOfMemory, "Failed to allocate Vosk processing buffer"); @@ -1037,7 +1039,7 @@ pub const Session = struct { speech_max_amplitude = 0; speech_min_amplitude = std.math.maxInt(u16); - // Copy pre-roll buffer to speech buffer + // Copy pre-roll buffer to speech buffer and feed to Vosk if (preroll_filled) { // Copy from preroll_pos to end const first_part_len = preroll_samples - preroll_pos; @@ -1045,10 +1047,18 @@ pub const Session = struct { // Copy from start to preroll_pos @memcpy(speech_buffer[first_part_len..preroll_samples], preroll_buffer[0..preroll_pos]); speech_pos = preroll_samples; + + // Feed preroll to Vosk + const preroll_bytes = std.mem.sliceAsBytes(speech_buffer[0..preroll_samples]); + _ = c.vosk_recognizer_accept_waveform(rec, preroll_bytes.ptr, @intCast(preroll_bytes.len)); } else if (preroll_pos > 0) { // Buffer not full yet, just copy what we have @memcpy(speech_buffer[0..preroll_pos], preroll_buffer[0..preroll_pos]); speech_pos = preroll_pos; + + // Feed preroll to Vosk + const preroll_bytes = std.mem.sliceAsBytes(speech_buffer[0..preroll_pos]); + _ = c.vosk_recognizer_accept_waveform(rec, preroll_bytes.ptr, @intCast(preroll_bytes.len)); } } if (max_amplitude > speech_max_amplitude) speech_max_amplitude = max_amplitude; @@ -1058,12 +1068,20 @@ pub const Session = struct { speech_pos += copy_len; speech_samples += samples_read; silence_samples = 0; + + // Feed audio to Vosk in real-time + const audio_bytes = std.mem.sliceAsBytes(vosk_buffer[0..samples_read]); + _ = c.vosk_recognizer_accept_waveform(rec, audio_bytes.ptr, @intCast(audio_bytes.len)); } else if (in_speech) { silence_samples += samples_read; const copy_len = @min(samples_read, speech_buffer.len - speech_pos); @memcpy(speech_buffer[speech_pos .. speech_pos + copy_len], vosk_buffer[0..copy_len]); speech_pos += copy_len; + // Continue feeding silence to Vosk + const audio_bytes = std.mem.sliceAsBytes(vosk_buffer[0..samples_read]); + _ = c.vosk_recognizer_accept_waveform(rec, audio_bytes.ptr, @intCast(audio_bytes.len)); + if (silence_samples >= silence_duration_ms * samples_per_ms) { if (speech_samples >= min_speech_duration_ms * samples_per_ms) { const event = SpeechEvent{ @@ -1073,7 +1091,7 @@ pub const Session = struct { .min_amplitude = speech_min_amplitude, .audio_data = speech_buffer[0..speech_pos], }; - self.processVoskAudio(event) catch |err| { + self.finalizeVoskAudio(event) catch |err| { std.log.err("Error processing speech: {}", .{err}); }; } @@ -1085,9 +1103,7 @@ pub const Session = struct { speech_max_amplitude = 0; speech_min_amplitude = std.math.maxInt(u16); - if (self.vosk_recognizer) |rec| { - c.vosk_recognizer_reset(rec); - } + c.vosk_recognizer_reset(rec); } } else { // Not in speech - update pre-roll buffer @@ -1103,14 +1119,11 @@ pub const Session = struct { } } - /// Process complete audio clip with Vosk and handle results - fn processVoskAudio(self: *Session, event: SpeechEvent) !void { + /// Finalize Vosk processing and get result (audio already fed incrementally) + fn finalizeVoskAudio(self: *Session, event: SpeechEvent) !void { if (self.vosk_recognizer == null) return Error.InvalidState; - const audio_bytes = std.mem.sliceAsBytes(event.audio_data); - _ = c.vosk_recognizer_accept_waveform(self.vosk_recognizer, audio_bytes.ptr, @intCast(audio_bytes.len)); - - // Get final result + // Get final result (audio was already fed incrementally) const result_cstr = c.vosk_recognizer_final_result(self.vosk_recognizer); if (result_cstr != null) { const result_str = std.mem.span(result_cstr);