diff --git a/src/stt.zig b/src/stt.zig index 3f6bfbb..354a4b0 100644 --- a/src/stt.zig +++ b/src/stt.zig @@ -976,6 +976,8 @@ pub const Session = struct { const silence_duration_ms: u64 = 300; const min_speech_duration_ms: u64 = 300; const samples_per_ms = 16; // This assumes 16kHz audio + const preroll_ms: u64 = 250; // Capture 250ms before speech starts + const preroll_samples = preroll_ms * samples_per_ms; std.log.info( "Listening with silence threshold {}. If wrong, use --measure-levels to find approriate val", @@ -988,6 +990,16 @@ pub const Session = struct { }; defer self.allocator.free(speech_buffer); + // Pre-roll ring buffer + var preroll_buffer = self.allocator.alloc(i16, preroll_samples) catch { + const error_info = ErrorInfo.init(Error.OutOfMemory, "Failed to allocate pre-roll buffer"); + self.options.event_handler.onDetailedError(error_info); + return; + }; + defer self.allocator.free(preroll_buffer); + var preroll_pos: usize = 0; + var preroll_filled: bool = false; + var in_speech = false; var silence_samples: usize = 0; var speech_samples: usize = 0; @@ -1024,6 +1036,20 @@ pub const Session = struct { speech_pos = 0; speech_max_amplitude = 0; speech_min_amplitude = std.math.maxInt(u16); + + // Copy pre-roll buffer to speech buffer + if (preroll_filled) { + // Copy from preroll_pos to end + const first_part_len = preroll_samples - preroll_pos; + @memcpy(speech_buffer[0..first_part_len], preroll_buffer[preroll_pos..]); + // Copy from start to preroll_pos + @memcpy(speech_buffer[first_part_len..preroll_samples], preroll_buffer[0..preroll_pos]); + speech_pos = preroll_samples; + } else if (preroll_pos > 0) { + // Buffer not full yet, just copy what we have + @memcpy(speech_buffer[0..preroll_pos], preroll_buffer[0..preroll_pos]); + speech_pos = preroll_pos; + } } if (max_amplitude > speech_max_amplitude) speech_max_amplitude = max_amplitude; if (max_amplitude < speech_min_amplitude) speech_min_amplitude = max_amplitude; @@ -1063,6 +1089,13 @@ pub const Session = struct { c.vosk_recognizer_reset(rec); } } + } else { + // Not in speech - update pre-roll buffer + for (vosk_buffer[0..samples_read]) |sample| { + preroll_buffer[preroll_pos] = sample; + preroll_pos = (preroll_pos + 1) % preroll_samples; + if (preroll_pos == 0) preroll_filled = true; + } } // Sleep to ease CPU pressure