switch from continuous speech to silence detection/batch process

parent 29e8df571f
commit 506b877da2

1 changed file with 94 additions and 174 deletions

src/stt.zig
@@ -802,7 +802,9 @@ pub const Session = struct {
        retry_count = 0;

        // Audio capture loop with comprehensive error handling and recovery
        var loop_count: u32 = 0;
        while (!self.should_stop.load(.acquire)) {
            loop_count += 1;
            // Read audio data from ALSA with detailed error handling
            _ = capture.readAudio() catch |err| {
                consecutive_errors += 1;
@@ -859,11 +861,20 @@ pub const Session = struct {
            retry_count = 0;
            consecutive_errors = 0;

            // Transfer audio data to Vosk processing buffer with error handling
            if (capture.availableSamples() >= 1024) { // Process in chunks of 1024 samples
                const chunk_size = @min(1024, self.processing_buffer.len);
            // Transfer audio data to Vosk processing buffer
            const available = capture.availableSamples();
            // 50ms is a good threshold for speech. At 16kHz, that would be
            // 800 samples. So we'll use 512 here
            if (available >= 512) {
                const chunk_size = @min(available, self.processing_buffer.len);
                const samples_read = capture.getAudioSamples(self.processing_buffer[0..chunk_size]);
                if (samples_read > 0) {
                    var max_before_resample: u16 = 0;
                    for (self.processing_buffer[0..samples_read]) |s| {
                        const abs_s = @abs(s);
                        if (abs_s > max_before_resample) max_before_resample = abs_s;
                    }

                    // Resample if needed, otherwise use samples directly
                    const samples_to_write = if (self.resample_buffer) |resample_buf| blk: {
                        const resampled_count = resample(
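The 512-sample gate above follows from the arithmetic in the comment: at 16kHz one millisecond is 16 samples, so 50ms of speech is 800 samples and 512 samples is 32ms. A minimal sketch of that conversion, assuming a fixed 16kHz capture rate (msToSamples is a hypothetical helper, not part of this commit):

    const std = @import("std");

    const sample_rate_hz: usize = 16_000;

    /// Samples needed to cover `ms` milliseconds at the fixed capture rate.
    fn msToSamples(ms: usize) usize {
        return ms * (sample_rate_hz / 1000); // 16 samples per millisecond
    }

    test "threshold arithmetic" {
        try std.testing.expectEqual(@as(usize, 800), msToSamples(50));
        try std.testing.expectEqual(@as(usize, 512), msToSamples(32));
    }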
@@ -897,15 +908,11 @@ pub const Session = struct {
                }
            }

    /// Vosk processing thread function with comprehensive error handling
    /// Vosk processing thread function. Audio handed to this function must be
    /// captured at 16kHz or resampled to 16kHz before the function is called.
    fn processingThreadFn(self: *Session) void {
        // Processing buffer for Vosk (4096 samples = ~256ms at 16kHz)
        const vosk_chunk_size = 4096;
        const min_chunk_size = 1024; // Minimum chunk size for processing

        const cpu_perf = getCpuPerformance() catch 100;
        if (cpu_perf < 50)
            std.log.debug("processing thread additional delay being added", .{});
        var vosk_buffer = self.allocator.alloc(i16, vosk_chunk_size) catch {
            const error_info = ErrorInfo.init(Error.OutOfMemory, "Failed to allocate Vosk processing buffer");
            self.options.event_handler.onDetailedError(error_info);
@@ -913,186 +920,112 @@ pub const Session = struct {
        };
        defer self.allocator.free(vosk_buffer);

        var error_count: u32 = 0;
        const max_errors = 10;
        const error_reset_threshold = 100; // Reset error count after this many successful operations
        var success_count: u32 = 0;
        var consecutive_failures: u32 = 0;
        const max_consecutive_failures = 5;
        // Silence detection parameters
        const silence_threshold: i16 = 500;
        const silence_duration_ms: u64 = 500;
        const min_speech_duration_ms: u64 = 300;
        const samples_per_ms = 16; // This assumes 16kHz audio
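        // Derived at 16kHz: the 500ms silence window is 8000 samples and the
        // 300ms speech minimum is 4800 samples.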

        var speech_buffer = self.allocator.alloc(i16, 16000 * 10) catch {
            const error_info = ErrorInfo.init(Error.OutOfMemory, "Failed to allocate speech buffer");
            self.options.event_handler.onDetailedError(error_info);
            return;
        };
        defer self.allocator.free(speech_buffer);
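        // speech_buffer holds 10 seconds of 16kHz audio; longer utterances are
        // truncated by the @min-bounded copies further down.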

        var in_speech = false;
        var silence_samples: usize = 0;
        var speech_samples: usize = 0;
        var speech_pos: usize = 0;

        while (!self.should_stop.load(.acquire)) {
            // Check if we have enough audio data for processing
            const available_samples = self.vosk_audio_buffer.available();

            if (available_samples >= min_chunk_size) {
                // Process in chunks, but don't exceed our buffer size
                const chunk_size = @min(available_samples, vosk_chunk_size);
                const samples_read = self.vosk_audio_buffer.read(vosk_buffer[0..chunk_size]);

                if (samples_read > 0 and self.vosk_recognizer != null) {
                    // Time the Vosk processing to identify bottlenecks
                    const start_time = std.time.nanoTimestamp();
                    const buffer_fill = (self.vosk_audio_buffer.available() * 100) / self.vosk_audio_buffer.buffer.len;

                    // Process audio with Vosk with comprehensive error handling
                    self.processVoskAudio(vosk_buffer[0..samples_read]) catch |err| {
                        error_count += 1;
                        consecutive_failures += 1;

                        // Create detailed error information
                        const error_info = switch (err) {
                            Error.InvalidState => ErrorInfo.initRecoverable(err, "Vosk recognizer is in invalid state", "Recognizer will be reinitialized"),
                            Error.OutOfMemory => ErrorInfo.init(err, "Out of memory during speech processing"),
                            Error.CallbackError => ErrorInfo.initWithContext(err, "Error in speech detection callback", "Check callback implementation"),
                            else => ErrorInfo.init(err, "Unexpected error during speech processing"),
                        };

                        self.options.event_handler.onDetailedError(error_info);

                        // Handle different error scenarios
                        if (error_count >= max_errors) {
                            const fatal_error = ErrorInfo.init(Error.CallbackError, "Too many Vosk processing errors, stopping processing thread");
                            self.options.event_handler.onDetailedError(fatal_error);
                            break;
                        }

                        if (consecutive_failures >= max_consecutive_failures) {
                            // Try to recover by reinitializing Vosk
                            const recovery_info = ErrorInfo.initRecoverable(Error.InternalError, "Multiple consecutive processing failures, attempting recovery", "Vosk recognizer will be reinitialized");
                            self.options.event_handler.onDetailedError(recovery_info);

                            self.reinitializeVosk() catch {
                                const recovery_failed = ErrorInfo.init(Error.ModelLoadError, "Failed to recover Vosk recognizer, stopping processing");
                                self.options.event_handler.onDetailedError(recovery_failed);
                                break;
                            };

                            consecutive_failures = 0;
                        }

                        // Add delay after error to prevent rapid error loops; the
                        // backoff grows linearly with consecutive failures
                        std.Thread.sleep(50 * std.time.ns_per_ms * consecutive_failures);
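                        // 256 samples is 16ms at 16kHz; below that, sleep briefly
                        // instead of spinning on a nearly empty buffer.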
                        const available = self.vosk_audio_buffer.available();
                        if (available < 256) {
                            std.Thread.sleep(10 * std.time.ns_per_ms);
                            continue;
                        };

                    // Log timing and buffer status for diagnostics
                    const end_time = std.time.nanoTimestamp();
                    const processing_ms = @divTrunc(end_time - start_time, std.time.ns_per_ms);
                    const realtime_ms = (samples_read * 1000) / 16000;
                    if (processing_ms > realtime_ms and buffer_fill > 20)
                        std.log.warn("Vosk processing behind realtime. {d} samples in {d}ms (realtime: {d}ms), buffer {d}% full", .{ samples_read, processing_ms, realtime_ms, buffer_fill });

                    // Reset error counters after successful operations
                    success_count += 1;
                    consecutive_failures = 0;
                    if (success_count >= error_reset_threshold) {
                        error_count = 0;
                        success_count = 0;
                    }
                }
            }

            // Adaptive delay based on buffer fill level and error state
            const base_delay_ms: u64 = if (available_samples > vosk_chunk_size * 2)
                1 // Fast processing when buffer is full
            else if (available_samples > min_chunk_size)
                5 // Normal processing
            else
                10; // Slower when buffer is low
            const read_size = @min(available, vosk_chunk_size);
            const samples_read = self.vosk_audio_buffer.read(vosk_buffer[0..read_size]);
            if (samples_read == 0) continue;

            // Increase delay if we're having errors
            const error_multiplier: u64 = if (consecutive_failures > 0) consecutive_failures + 1 else 1;
            var delay_ms = base_delay_ms * error_multiplier;

            // Add extra delay for slower hardware (Pi) to prevent buffer overruns
            if (cpu_perf < 50) {
                delay_ms += 100; // Extra 100ms delay for Pi-class hardware
            var is_silent = true;
            var max_amplitude: u16 = 0;
            for (vosk_buffer[0..samples_read]) |sample| {
                const abs_sample = @abs(sample);
                if (abs_sample > max_amplitude) max_amplitude = abs_sample;
                if (abs_sample > silence_threshold) {
                    is_silent = false;
                }
            }
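            // A silence_threshold of 500 is about 1.5% of the i16 full scale (32767).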

            std.Thread.sleep(delay_ms * std.time.ns_per_ms);
            if (!is_silent) {
                if (!in_speech) {
                    in_speech = true;
                    speech_samples = 0;
                    silence_samples = 0;
                    speech_pos = 0;
                }
                const copy_len = @min(samples_read, speech_buffer.len - speech_pos);
                @memcpy(speech_buffer[speech_pos .. speech_pos + copy_len], vosk_buffer[0..copy_len]);
                speech_pos += copy_len;
                speech_samples += samples_read;
                silence_samples = 0;
            } else if (in_speech) {
                silence_samples += samples_read;
                const copy_len = @min(samples_read, speech_buffer.len - speech_pos);
                @memcpy(speech_buffer[speech_pos .. speech_pos + copy_len], vosk_buffer[0..copy_len]);
                speech_pos += copy_len;

        // Final processing of any remaining audio data
        const remaining_samples = self.vosk_audio_buffer.available();
        if (remaining_samples > 0 and self.vosk_recognizer != null) {
            const final_chunk_size = @min(remaining_samples, vosk_chunk_size);
            const samples_read = self.vosk_audio_buffer.read(vosk_buffer[0..final_chunk_size]);
            if (samples_read > 0) {
                self.processVoskAudio(vosk_buffer[0..samples_read]) catch {
                    // Ignore errors during shutdown, but log them
                    const shutdown_error = ErrorInfo.init(Error.InternalError, "Error during final audio processing at shutdown");
                    self.options.event_handler.onDetailedError(shutdown_error);
                if (silence_samples >= silence_duration_ms * samples_per_ms) {
                    if (speech_samples >= min_speech_duration_ms * samples_per_ms) {
                        self.processVoskAudio(speech_buffer[0..speech_pos]) catch |err| {
                            std.log.err("Error processing speech: {}", .{err});
                        };
                    }

                    in_speech = false;
                    speech_pos = 0;
                    speech_samples = 0;
                    silence_samples = 0;

                    if (self.vosk_recognizer) |rec| {
                        c.vosk_recognizer_reset(rec);
                    }
                }
            }

    /// Process audio chunk with Vosk and handle results
            // Sleep to ease CPU pressure
            std.Thread.sleep(1 * std.time.ns_per_ms);
        }
    }
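The new loop reduces to a small accumulate-and-flush state machine: copy chunks into speech_buffer while any sample crosses silence_threshold, and once 500ms of silence follows at least 300ms of speech, hand the whole clip to Vosk and start over. A self-contained sketch of the same logic under this commit's thresholds (SilenceDetector and processClip are illustrative names, not part of the changeset):

    const SilenceDetector = struct {
        buf: []i16,
        pos: usize = 0,
        speech_samples: usize = 0,
        silence_samples: usize = 0,
        in_speech: bool = false,

        const silence_threshold: i16 = 500;
        const samples_per_ms: usize = 16; // 16kHz
        const silence_needed = 500 * samples_per_ms; // 500ms of trailing silence
        const speech_needed = 300 * samples_per_ms; // 300ms minimum utterance

        /// Feed one chunk; calls processClip with a complete utterance when one ends.
        fn feed(self: *SilenceDetector, chunk: []const i16, processClip: *const fn ([]const i16) void) void {
            var loud = false;
            for (chunk) |s| {
                if (@abs(s) > silence_threshold) loud = true;
            }
            if (loud) {
                if (!self.in_speech) self.restart();
                self.in_speech = true;
                self.append(chunk);
                self.speech_samples += chunk.len;
                self.silence_samples = 0;
            } else if (self.in_speech) {
                self.append(chunk); // keep trailing silence as context
                self.silence_samples += chunk.len;
                if (self.silence_samples >= silence_needed) {
                    if (self.speech_samples >= speech_needed) processClip(self.buf[0..self.pos]);
                    self.restart();
                    self.in_speech = false;
                }
            }
        }

        fn append(self: *SilenceDetector, chunk: []const i16) void {
            const n = @min(chunk.len, self.buf.len - self.pos); // truncate past buffer end
            @memcpy(self.buf[self.pos .. self.pos + n], chunk[0..n]);
            self.pos += n;
        }

        fn restart(self: *SilenceDetector) void {
            self.pos = 0;
            self.speech_samples = 0;
            self.silence_samples = 0;
        }
    };

In the session code the flush path additionally resets the Vosk recognizer, so no state carries into the next utterance.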

    /// Process complete audio clip with Vosk and handle results
    fn processVoskAudio(self: *Session, audio_data: []const i16) !void {
        if (self.vosk_recognizer == null) {
            return Error.InvalidState;
        }
        if (self.vosk_recognizer == null) return Error.InvalidState;

        // Use audio data directly without resampling
        const final_audio = audio_data;
        const audio_bytes = std.mem.sliceAsBytes(audio_data);
        _ = c.vosk_recognizer_accept_waveform(self.vosk_recognizer, audio_bytes.ptr, @intCast(audio_bytes.len));

        // Convert i16 samples to bytes for Vosk
        const audio_bytes = std.mem.sliceAsBytes(final_audio);

        // Feed audio to Vosk recognizer
        const accept_result = c.vosk_recognizer_accept_waveform(self.vosk_recognizer, audio_bytes.ptr, @intCast(audio_bytes.len));

        if (accept_result == 1) {
            // Final result available
            const result_cstr = c.vosk_recognizer_result(self.vosk_recognizer);
        // Get final result
        const result_cstr = c.vosk_recognizer_final_result(self.vosk_recognizer);
        if (result_cstr != null) {
            const result_str = std.mem.span(result_cstr);

            // Parse JSON result to extract text
            self.parseVoskResult(result_str) catch |err| {
                self.options.event_handler.onError(err, "Failed to parse Vosk result");
            };

            // Reset recognizer after getting final result to clear internal buffers
            c.vosk_recognizer_reset(self.vosk_recognizer);
        }
        } else if (accept_result == 0) {
            // Partial result available (optional - for real-time feedback)
            const partial_result_cstr = c.vosk_recognizer_partial_result(self.vosk_recognizer);
            if (partial_result_cstr != null) {
                const partial_str = std.mem.span(partial_result_cstr);

                // Parse partial result (could be used for real-time display)
                self.parseVoskPartialResult(partial_str) catch |parse_err| {
                    // Log partial result parsing errors but continue processing
                    const parse_error_info = switch (parse_err) {
                        Error.CallbackError => ErrorInfo.init(Error.CallbackError, "Failed to parse partial speech result"),
                        else => ErrorInfo.init(Error.CallbackError, "Unexpected error parsing partial speech result"),
                    };
                    self.options.event_handler.onDetailedError(parse_error_info);
                };
            }
        }
        // accept_result == -1 means error, but we continue processing
    }
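Batch mode collapses the old accept/partial/final branching into one shot per clip: feed all bytes, always pull the final result, then reset. A minimal sketch of that call pattern against the Vosk C API (recognizeClip is an illustrative standalone helper; error handling elided):

    const std = @import("std");
    const c = @cImport(@cInclude("vosk_api.h"));

    /// Run one complete utterance through the recognizer and return its JSON
    /// result, e.g. {"text": "hello world"}. The returned slice is owned by
    /// Vosk and only valid until the next call on this recognizer, so parse
    /// it before calling c.vosk_recognizer_reset(rec) for the next clip.
    fn recognizeClip(rec: ?*c.VoskRecognizer, clip: []const i16) ?[]const u8 {
        const bytes = std.mem.sliceAsBytes(clip);
        // The return value signals whether Vosk considers the utterance
        // complete; in batch mode we pull the final result regardless.
        _ = c.vosk_recognizer_accept_waveform(rec, bytes.ptr, @intCast(bytes.len));
        const result_cstr = c.vosk_recognizer_final_result(rec);
        if (result_cstr == null) return null;
        return std.mem.span(result_cstr);
    }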

    /// Parse Vosk JSON result and extract recognized text
    fn parseVoskResult(self: *Session, json_str: []const u8) !void {
        // Simple JSON parsing to extract "text" field
        // Vosk returns JSON like: {"text": "hello world"}

        if (json_str.len == 0) return;

        // Find "text" field in JSON
        const text_key = "\"text\"";
        if (std.mem.indexOf(u8, json_str, text_key)) |text_start| {
            const value_start = text_start + text_key.len;

            // Find the colon and opening quote
            if (std.mem.indexOf(u8, json_str[value_start..], ":")) |colon_pos| {
                const after_colon = value_start + colon_pos + 1;

                // Skip whitespace and find opening quote
                var quote_start: ?usize = null;
                for (json_str[after_colon..], 0..) |char, i| {
                    if (char == '"') {

@@ -1100,13 +1033,9 @@ pub const Session = struct {
                        break;
                    }
                }

                if (quote_start) |s| {
                    // Find closing quote
                    if (std.mem.indexOf(u8, json_str[s..], "\"")) |quote_end| {
                        const text = json_str[s .. s + quote_end];

                        // Only invoke callback if text is not empty
                        if (text.len > 0 and !std.mem.eql(u8, text, " ")) {
                            self.options.event_handler.onSpeech(text);
                        }

@@ -1116,15 +1045,6 @@ pub const Session = struct {
            }
        }

    /// Parse Vosk partial result (for real-time feedback)
    fn parseVoskPartialResult(self: *Session, json_str: []const u8) !void {
        // Similar to parseVoskResult but for partial results
        // For now, we don't use partial results, but this could be extended
        // to provide real-time transcription feedback
        _ = self;
        _ = json_str;
    }
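The hand-rolled scan above handles the flat {"text": "..."} payload but would stop early on an escaped quote inside the text. Since the result is plain JSON, std.json can do the extraction instead; a sketch assuming the Zig 0.11+ std.json.parseFromSlice API:

    const std = @import("std");

    const VoskResult = struct { text: []const u8 = "" };

    /// Extract the "text" field from a Vosk result JSON; null when empty.
    fn extractText(allocator: std.mem.Allocator, json_str: []const u8) !?[]u8 {
        const parsed = try std.json.parseFromSlice(VoskResult, allocator, json_str, .{
            .ignore_unknown_fields = true,
        });
        defer parsed.deinit();
        if (parsed.value.text.len == 0) return null;
        // Copy out: the parsed value is freed by deinit.
        return try allocator.dupe(u8, parsed.value.text);
    }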

    /// Attempt to recover from audio device errors with detailed error reporting
    fn recoverAudioDevice(self: *Session) Error!void {
        if (self.alsa_capture) |*capture| {