From 506b877da2c9b37c4fa995bd46395e8ff12f1e06 Mon Sep 17 00:00:00 2001 From: Emil Lerch Date: Mon, 27 Oct 2025 13:46:14 -0700 Subject: [PATCH] switch from continuous speech to silence detection/batch process --- src/stt.zig | 268 ++++++++++++++++++---------------------------------- 1 file changed, 94 insertions(+), 174 deletions(-) diff --git a/src/stt.zig b/src/stt.zig index 28facd2..3e690c0 100644 --- a/src/stt.zig +++ b/src/stt.zig @@ -802,7 +802,9 @@ pub const Session = struct { retry_count = 0; // Audio capture loop with comprehensive error handling and recovery + var loop_count: u32 = 0; while (!self.should_stop.load(.acquire)) { + loop_count += 1; // Read audio data from ALSA with detailed error handling _ = capture.readAudio() catch |err| { consecutive_errors += 1; @@ -859,11 +861,20 @@ pub const Session = struct { retry_count = 0; consecutive_errors = 0; - // Transfer audio data to Vosk processing buffer with error handling - if (capture.availableSamples() >= 1024) { // Process in chunks of 1024 samples - const chunk_size = @min(1024, self.processing_buffer.len); + // Transfer audio data to Vosk processing buffer + const available = capture.availableSamples(); + // 50ms is a good threshold for speech. At 16kHz, that would be + // 800 samples. So we'll use 512 here + if (available >= 512) { + const chunk_size = @min(available, self.processing_buffer.len); const samples_read = capture.getAudioSamples(self.processing_buffer[0..chunk_size]); if (samples_read > 0) { + var max_before_resample: u16 = 0; + for (self.processing_buffer[0..samples_read]) |s| { + const abs_s = @abs(s); + if (abs_s > max_before_resample) max_before_resample = abs_s; + } + // Resample if needed, otherwise use samples directly const samples_to_write = if (self.resample_buffer) |resample_buf| blk: { const resampled_count = resample( @@ -897,15 +908,11 @@ pub const Session = struct { } } - /// Vosk processing thread function with comprehensive error handling + /// Vosk processing thread function. Audio processed by this function + /// must be either captured at 16kHz or resampled to 16kHz prior to calling + /// the function fn processingThreadFn(self: *Session) void { - // Processing buffer for Vosk (4096 samples = ~256ms at 16kHz) const vosk_chunk_size = 4096; - const min_chunk_size = 1024; // Minimum chunk size for processing - - const cpu_perf = getCpuPerformance() catch 100; - if (cpu_perf < 50) - std.log.debug("processing thread additional delay being added", .{}); var vosk_buffer = self.allocator.alloc(i16, vosk_chunk_size) catch { const error_info = ErrorInfo.init(Error.OutOfMemory, "Failed to allocate Vosk processing buffer"); self.options.event_handler.onDetailedError(error_info); @@ -913,186 +920,112 @@ pub const Session = struct { }; defer self.allocator.free(vosk_buffer); - var error_count: u32 = 0; - const max_errors = 10; - const error_reset_threshold = 100; // Reset error count after this many successful operations - var success_count: u32 = 0; - var consecutive_failures: u32 = 0; - const max_consecutive_failures = 5; + // Silence detection parameters + const silence_threshold: i16 = 500; + const silence_duration_ms: u64 = 500; + const min_speech_duration_ms: u64 = 300; + const samples_per_ms = 16; // This assumes 16kHz audio + + var speech_buffer = self.allocator.alloc(i16, 16000 * 10) catch { + const error_info = ErrorInfo.init(Error.OutOfMemory, "Failed to allocate speech buffer"); + self.options.event_handler.onDetailedError(error_info); + return; + }; + defer self.allocator.free(speech_buffer); + + var in_speech = false; + var silence_samples: usize = 0; + var speech_samples: usize = 0; + var speech_pos: usize = 0; while (!self.should_stop.load(.acquire)) { - // Check if we have enough audio data for processing - const available_samples = self.vosk_audio_buffer.available(); + const available = self.vosk_audio_buffer.available(); + if (available < 256) { + std.Thread.sleep(10 * std.time.ns_per_ms); + continue; + } - if (available_samples >= min_chunk_size) { - // Process in chunks, but don't exceed our buffer size - const chunk_size = @min(available_samples, vosk_chunk_size); - const samples_read = self.vosk_audio_buffer.read(vosk_buffer[0..chunk_size]); + const read_size = @min(available, vosk_chunk_size); + const samples_read = self.vosk_audio_buffer.read(vosk_buffer[0..read_size]); + if (samples_read == 0) continue; - if (samples_read > 0 and self.vosk_recognizer != null) { - // Time the Vosk processing to identify bottlenecks - const start_time = std.time.nanoTimestamp(); - const buffer_fill = (self.vosk_audio_buffer.available() * 100) / self.vosk_audio_buffer.buffer.len; + var is_silent = true; + var max_amplitude: u16 = 0; + for (vosk_buffer[0..samples_read]) |sample| { + const abs_sample = @abs(sample); + if (abs_sample > max_amplitude) max_amplitude = abs_sample; + if (abs_sample > silence_threshold) { + is_silent = false; + } + } - // Process audio with Vosk with comprehensive error handling - self.processVoskAudio(vosk_buffer[0..samples_read]) catch |err| { - error_count += 1; - consecutive_failures += 1; + if (!is_silent) { + if (!in_speech) { + in_speech = true; + speech_samples = 0; + silence_samples = 0; + speech_pos = 0; + } + const copy_len = @min(samples_read, speech_buffer.len - speech_pos); + @memcpy(speech_buffer[speech_pos .. speech_pos + copy_len], vosk_buffer[0..copy_len]); + speech_pos += copy_len; + speech_samples += samples_read; + silence_samples = 0; + } else if (in_speech) { + silence_samples += samples_read; + const copy_len = @min(samples_read, speech_buffer.len - speech_pos); + @memcpy(speech_buffer[speech_pos .. speech_pos + copy_len], vosk_buffer[0..copy_len]); + speech_pos += copy_len; - // Create detailed error information - const error_info = switch (err) { - Error.InvalidState => ErrorInfo.initRecoverable(err, "Vosk recognizer is in invalid state", "Recognizer will be reinitialized"), - Error.OutOfMemory => ErrorInfo.init(err, "Out of memory during speech processing"), - Error.CallbackError => ErrorInfo.initWithContext(err, "Error in speech detection callback", "Check callback implementation"), - else => ErrorInfo.init(err, "Unexpected error during speech processing"), + if (silence_samples >= silence_duration_ms * samples_per_ms) { + if (speech_samples >= min_speech_duration_ms * samples_per_ms) { + self.processVoskAudio(speech_buffer[0..speech_pos]) catch |err| { + std.log.err("Error processing speech: {}", .{err}); }; + } - self.options.event_handler.onDetailedError(error_info); + in_speech = false; + speech_pos = 0; + speech_samples = 0; + silence_samples = 0; - // Handle different error scenarios - if (error_count >= max_errors) { - const fatal_error = ErrorInfo.init(Error.CallbackError, "Too many Vosk processing errors, stopping processing thread"); - self.options.event_handler.onDetailedError(fatal_error); - break; - } - - if (consecutive_failures >= max_consecutive_failures) { - // Try to recover by reinitializing Vosk - const recovery_info = ErrorInfo.initRecoverable(Error.InternalError, "Multiple consecutive processing failures, attempting recovery", "Vosk recognizer will be reinitialized"); - self.options.event_handler.onDetailedError(recovery_info); - - self.reinitializeVosk() catch { - const recovery_failed = ErrorInfo.init(Error.ModelLoadError, "Failed to recover Vosk recognizer, stopping processing"); - self.options.event_handler.onDetailedError(recovery_failed); - break; - }; - - consecutive_failures = 0; - } - - // Add delay after error to prevent rapid error loops - std.Thread.sleep(50 * std.time.ns_per_ms * consecutive_failures); // Exponential backoff - continue; - }; - - // Log timing and buffer status for diagnostics - const end_time = std.time.nanoTimestamp(); - const processing_ms = @divTrunc(end_time - start_time, std.time.ns_per_ms); - const realtime_ms = (samples_read * 1000) / 16000; - if (processing_ms > realtime_ms and buffer_fill > 20) - std.log.warn("Vosk processing behind realtime. {d} samples in {d}ms (realtime: {d}ms), buffer {d}% full", .{ samples_read, processing_ms, realtime_ms, buffer_fill }); - - // Reset error counters after successful operations - success_count += 1; - consecutive_failures = 0; - if (success_count >= error_reset_threshold) { - error_count = 0; - success_count = 0; + if (self.vosk_recognizer) |rec| { + c.vosk_recognizer_reset(rec); } } } - // Adaptive delay based on buffer fill level and error state - const base_delay_ms: u64 = if (available_samples > vosk_chunk_size * 2) - 1 // Fast processing when buffer is full - else if (available_samples > min_chunk_size) - 5 // Normal processing - else - 10; // Slower when buffer is low - - // Increase delay if we're having errors - const error_multiplier: u64 = if (consecutive_failures > 0) consecutive_failures + 1 else 1; - var delay_ms = base_delay_ms * error_multiplier; - - // Add extra delay for slower hardware (Pi) to prevent buffer overruns - if (cpu_perf < 50) { - delay_ms += 100; // Extra 10ms delay for Pi-class hardware - } - - std.Thread.sleep(delay_ms * std.time.ns_per_ms); - } - - // Final processing of any remaining audio data - const remaining_samples = self.vosk_audio_buffer.available(); - if (remaining_samples > 0 and self.vosk_recognizer != null) { - const final_chunk_size = @min(remaining_samples, vosk_chunk_size); - const samples_read = self.vosk_audio_buffer.read(vosk_buffer[0..final_chunk_size]); - if (samples_read > 0) { - self.processVoskAudio(vosk_buffer[0..samples_read]) catch { - // Ignore errors during shutdown, but log them - const shutdown_error = ErrorInfo.init(Error.InternalError, "Error during final audio processing at shutdown"); - self.options.event_handler.onDetailedError(shutdown_error); - }; - } + // Sleep to ease CPU pressure + std.Thread.sleep(1 * std.time.ns_per_ms); } } - /// Process audio chunk with Vosk and handle results + /// Process complete audio clip with Vosk and handle results fn processVoskAudio(self: *Session, audio_data: []const i16) !void { - if (self.vosk_recognizer == null) { - return Error.InvalidState; + if (self.vosk_recognizer == null) return Error.InvalidState; + + const audio_bytes = std.mem.sliceAsBytes(audio_data); + _ = c.vosk_recognizer_accept_waveform(self.vosk_recognizer, audio_bytes.ptr, @intCast(audio_bytes.len)); + + // Get final result + const result_cstr = c.vosk_recognizer_final_result(self.vosk_recognizer); + if (result_cstr != null) { + const result_str = std.mem.span(result_cstr); + self.parseVoskResult(result_str) catch |err| { + self.options.event_handler.onError(err, "Failed to parse Vosk result"); + }; } - - // Use audio data directly without resampling - const final_audio = audio_data; - - // Convert i16 samples to bytes for Vosk - const audio_bytes = std.mem.sliceAsBytes(final_audio); - - // Feed audio to Vosk recognizer - const accept_result = c.vosk_recognizer_accept_waveform(self.vosk_recognizer, audio_bytes.ptr, @intCast(audio_bytes.len)); - - if (accept_result == 1) { - // Final result available - const result_cstr = c.vosk_recognizer_result(self.vosk_recognizer); - if (result_cstr != null) { - const result_str = std.mem.span(result_cstr); - - // Parse JSON result to extract text - self.parseVoskResult(result_str) catch |err| { - self.options.event_handler.onError(err, "Failed to parse Vosk result"); - }; - - // Reset recognizer after getting final result to clear internal buffers - c.vosk_recognizer_reset(self.vosk_recognizer); - } - } else if (accept_result == 0) { - // Partial result available (optional - for real-time feedback) - const partial_result_cstr = c.vosk_recognizer_partial_result(self.vosk_recognizer); - if (partial_result_cstr != null) { - const partial_str = std.mem.span(partial_result_cstr); - - // Parse partial result (could be used for real-time display) - self.parseVoskPartialResult(partial_str) catch |parse_err| { - // Log partial result parsing errors but continue processing - const parse_error_info = switch (parse_err) { - Error.CallbackError => ErrorInfo.init(Error.CallbackError, "Failed to parse partial speech result"), - else => ErrorInfo.init(Error.CallbackError, "Unexpected error parsing partial speech result"), - }; - self.options.event_handler.onDetailedError(parse_error_info); - }; - } - } - // accept_result == -1 means error, but we continue processing } /// Parse Vosk JSON result and extract recognized text fn parseVoskResult(self: *Session, json_str: []const u8) !void { - // Simple JSON parsing to extract "text" field - // Vosk returns JSON like: {"text": "hello world"} - if (json_str.len == 0) return; - // Find "text" field in JSON const text_key = "\"text\""; if (std.mem.indexOf(u8, json_str, text_key)) |text_start| { const value_start = text_start + text_key.len; - - // Find the colon and opening quote if (std.mem.indexOf(u8, json_str[value_start..], ":")) |colon_pos| { const after_colon = value_start + colon_pos + 1; - - // Skip whitespace and find opening quote var quote_start: ?usize = null; for (json_str[after_colon..], 0..) |char, i| { if (char == '"') { @@ -1100,13 +1033,9 @@ pub const Session = struct { break; } } - if (quote_start) |s| { - // Find closing quote if (std.mem.indexOf(u8, json_str[s..], "\"")) |quote_end| { const text = json_str[s .. s + quote_end]; - - // Only invoke callback if text is not empty if (text.len > 0 and !std.mem.eql(u8, text, " ")) { self.options.event_handler.onSpeech(text); } @@ -1116,15 +1045,6 @@ pub const Session = struct { } } - /// Parse Vosk partial result (for real-time feedback) - fn parseVoskPartialResult(self: *Session, json_str: []const u8) !void { - // Similar to parseVoskResult but for partial results - // For now, we don't use partial results, but this could be extended - // to provide real-time transcription feedback - _ = self; - _ = json_str; - } - /// Attempt to recover from audio device errors with detailed error reporting fn recoverAudioDevice(self: *Session) Error!void { if (self.alsa_capture) |*capture| {