From 506b877da2c9b37c4fa995bd46395e8ff12f1e06 Mon Sep 17 00:00:00 2001
From: Emil Lerch <emil@lerch.org>
Date: Mon, 27 Oct 2025 13:46:14 -0700
Subject: [PATCH] switch from continuous speech to silence detection/batch
 process

---
 src/stt.zig | 268 ++++++++++++++++++----------------------------------
 1 file changed, 94 insertions(+), 174 deletions(-)

diff --git a/src/stt.zig b/src/stt.zig
index 28facd2..3e690c0 100644
--- a/src/stt.zig
+++ b/src/stt.zig
@@ -802,7 +802,9 @@ pub const Session = struct {
             retry_count = 0;
 
             // Audio capture loop with comprehensive error handling and recovery
+            var loop_count: u32 = 0;
             while (!self.should_stop.load(.acquire)) {
+                loop_count += 1;
                 // Read audio data from ALSA with detailed error handling
                 _ = capture.readAudio() catch |err| {
                     consecutive_errors += 1;
@@ -859,11 +861,20 @@ pub const Session = struct {
                 retry_count = 0;
                 consecutive_errors = 0;
 
-                // Transfer audio data to Vosk processing buffer with error handling
-                if (capture.availableSamples() >= 1024) { // Process in chunks of 1024 samples
-                    const chunk_size = @min(1024, self.processing_buffer.len);
+                // Transfer audio data to Vosk processing buffer
+                const available = capture.availableSamples();
+                // 50ms is a good threshold for speech. At 16kHz, that would be
+                // 800 samples. We use a smaller chunk of 512 samples (32ms at 16kHz)
+                if (available >= 512) {
+                    const chunk_size = @min(available, self.processing_buffer.len);
                     const samples_read = capture.getAudioSamples(self.processing_buffer[0..chunk_size]);
                     if (samples_read > 0) {
+                        var max_before_resample: u16 = 0;
+                        for (self.processing_buffer[0..samples_read]) |s| {
+                            const abs_s = @abs(s);
+                            if (abs_s > max_before_resample) max_before_resample = abs_s;
+                        }
+
                         // Resample if needed, otherwise use samples directly
                         const samples_to_write = if (self.resample_buffer) |resample_buf| blk: {
                             const resampled_count = resample(
@@ -897,15 +908,11 @@ pub const Session = struct {
         }
     }
 
-    /// Vosk processing thread function with comprehensive error handling
+    /// Vosk processing thread function. Buffers speech until a silence gap,
+    /// then sends the whole utterance to Vosk. Audio in vosk_audio_buffer must
+    /// be 16kHz (captured at 16kHz or resampled before it is written there).
     fn processingThreadFn(self: *Session) void {
-        // Processing buffer for Vosk (4096 samples = ~256ms at 16kHz)
         const vosk_chunk_size = 4096;
-        const min_chunk_size = 1024; // Minimum chunk size for processing
-
-        const cpu_perf = getCpuPerformance() catch 100;
-        if (cpu_perf < 50)
-            std.log.debug("processing thread additional delay being added", .{});
         var vosk_buffer = self.allocator.alloc(i16, vosk_chunk_size) catch {
             const error_info = ErrorInfo.init(Error.OutOfMemory, "Failed to allocate Vosk processing buffer");
             self.options.event_handler.onDetailedError(error_info);
@@ -913,186 +920,112 @@ pub const Session = struct {
         };
         defer self.allocator.free(vosk_buffer);
 
-        var error_count: u32 = 0;
-        const max_errors = 10;
-        const error_reset_threshold = 100; // Reset error count after this many successful operations
-        var success_count: u32 = 0;
-        var consecutive_failures: u32 = 0;
-        const max_consecutive_failures = 5;
+        // Silence detection parameters
+        const silence_threshold: i16 = 500;
+        const silence_duration_ms: u64 = 500;
+        const min_speech_duration_ms: u64 = 300;
+        const samples_per_ms = 16; // This assumes 16kHz audio
+
+        var speech_buffer = self.allocator.alloc(i16, 16000 * 10) catch {
+            const error_info = ErrorInfo.init(Error.OutOfMemory, "Failed to allocate speech buffer");
+            self.options.event_handler.onDetailedError(error_info);
+            return;
+        };
+        defer self.allocator.free(speech_buffer);
+
+        var in_speech = false;
+        var silence_samples: usize = 0;
+        var speech_samples: usize = 0;
+        var speech_pos: usize = 0;
 
         while (!self.should_stop.load(.acquire)) {
-            // Check if we have enough audio data for processing
-            const available_samples = self.vosk_audio_buffer.available();
+            const available = self.vosk_audio_buffer.available();
+            if (available < 256) {
+                std.Thread.sleep(10 * std.time.ns_per_ms);
+                continue;
+            }
 
-            if (available_samples >= min_chunk_size) {
-                // Process in chunks, but don't exceed our buffer size
-                const chunk_size = @min(available_samples, vosk_chunk_size);
-                const samples_read = self.vosk_audio_buffer.read(vosk_buffer[0..chunk_size]);
+            const read_size = @min(available, vosk_chunk_size);
+            const samples_read = self.vosk_audio_buffer.read(vosk_buffer[0..read_size]);
+            if (samples_read == 0) continue;
 
-                if (samples_read > 0 and self.vosk_recognizer != null) {
-                    // Time the Vosk processing to identify bottlenecks
-                    const start_time = std.time.nanoTimestamp();
-                    const buffer_fill = (self.vosk_audio_buffer.available() * 100) / self.vosk_audio_buffer.buffer.len;
+            var is_silent = true;
+            var max_amplitude: u16 = 0;
+            for (vosk_buffer[0..samples_read]) |sample| {
+                const abs_sample = @abs(sample);
+                if (abs_sample > max_amplitude) max_amplitude = abs_sample;
+                if (abs_sample > silence_threshold) {
+                    is_silent = false;
+                }
+            }
 
-                    // Process audio with Vosk with comprehensive error handling
-                    self.processVoskAudio(vosk_buffer[0..samples_read]) catch |err| {
-                        error_count += 1;
-                        consecutive_failures += 1;
+            if (!is_silent) {
+                if (!in_speech) {
+                    in_speech = true;
+                    speech_samples = 0;
+                    silence_samples = 0;
+                    speech_pos = 0;
+                }
+                const copy_len = @min(samples_read, speech_buffer.len - speech_pos);
+                @memcpy(speech_buffer[speech_pos .. speech_pos + copy_len], vosk_buffer[0..copy_len]);
+                speech_pos += copy_len;
+                speech_samples += samples_read;
+                silence_samples = 0;
+            } else if (in_speech) {
+                silence_samples += samples_read;
+                const copy_len = @min(samples_read, speech_buffer.len - speech_pos);
+                @memcpy(speech_buffer[speech_pos .. speech_pos + copy_len], vosk_buffer[0..copy_len]);
+                speech_pos += copy_len;
 
-                        // Create detailed error information
-                        const error_info = switch (err) {
-                            Error.InvalidState => ErrorInfo.initRecoverable(err, "Vosk recognizer is in invalid state", "Recognizer will be reinitialized"),
-                            Error.OutOfMemory => ErrorInfo.init(err, "Out of memory during speech processing"),
-                            Error.CallbackError => ErrorInfo.initWithContext(err, "Error in speech detection callback", "Check callback implementation"),
-                            else => ErrorInfo.init(err, "Unexpected error during speech processing"),
+                if (silence_samples >= silence_duration_ms * samples_per_ms) {
+                    if (speech_samples >= min_speech_duration_ms * samples_per_ms) {
+                        self.processVoskAudio(speech_buffer[0..speech_pos]) catch |err| {
+                            std.log.err("Error processing speech: {}", .{err});
                         };
+                    }
 
-                        self.options.event_handler.onDetailedError(error_info);
+                    in_speech = false;
+                    speech_pos = 0;
+                    speech_samples = 0;
+                    silence_samples = 0;
 
-                        // Handle different error scenarios
-                        if (error_count >= max_errors) {
-                            const fatal_error = ErrorInfo.init(Error.CallbackError, "Too many Vosk processing errors, stopping processing thread");
-                            self.options.event_handler.onDetailedError(fatal_error);
-                            break;
-                        }
-
-                        if (consecutive_failures >= max_consecutive_failures) {
-                            // Try to recover by reinitializing Vosk
-                            const recovery_info = ErrorInfo.initRecoverable(Error.InternalError, "Multiple consecutive processing failures, attempting recovery", "Vosk recognizer will be reinitialized");
-                            self.options.event_handler.onDetailedError(recovery_info);
-
-                            self.reinitializeVosk() catch {
-                                const recovery_failed = ErrorInfo.init(Error.ModelLoadError, "Failed to recover Vosk recognizer, stopping processing");
-                                self.options.event_handler.onDetailedError(recovery_failed);
-                                break;
-                            };
-
-                            consecutive_failures = 0;
-                        }
-
-                        // Add delay after error to prevent rapid error loops
-                        std.Thread.sleep(50 * std.time.ns_per_ms * consecutive_failures); // Exponential backoff
-                        continue;
-                    };
-
-                    // Log timing and buffer status for diagnostics
-                    const end_time = std.time.nanoTimestamp();
-                    const processing_ms = @divTrunc(end_time - start_time, std.time.ns_per_ms);
-                    const realtime_ms = (samples_read * 1000) / 16000;
-                    if (processing_ms > realtime_ms and buffer_fill > 20)
-                        std.log.warn("Vosk processing behind realtime. {d} samples in {d}ms (realtime: {d}ms), buffer {d}% full", .{ samples_read, processing_ms, realtime_ms, buffer_fill });
-
-                    // Reset error counters after successful operations
-                    success_count += 1;
-                    consecutive_failures = 0;
-                    if (success_count >= error_reset_threshold) {
-                        error_count = 0;
-                        success_count = 0;
+                    if (self.vosk_recognizer) |rec| {
+                        c.vosk_recognizer_reset(rec);
                     }
                 }
             }
 
-            // Adaptive delay based on buffer fill level and error state
-            const base_delay_ms: u64 = if (available_samples > vosk_chunk_size * 2)
-                1 // Fast processing when buffer is full
-            else if (available_samples > min_chunk_size)
-                5 // Normal processing
-            else
-                10; // Slower when buffer is low
-
-            // Increase delay if we're having errors
-            const error_multiplier: u64 = if (consecutive_failures > 0) consecutive_failures + 1 else 1;
-            var delay_ms = base_delay_ms * error_multiplier;
-
-            // Add extra delay for slower hardware (Pi) to prevent buffer overruns
-            if (cpu_perf < 50) {
-                delay_ms += 100; // Extra 10ms delay for Pi-class hardware
-            }
-
-            std.Thread.sleep(delay_ms * std.time.ns_per_ms);
-        }
-
-        // Final processing of any remaining audio data
-        const remaining_samples = self.vosk_audio_buffer.available();
-        if (remaining_samples > 0 and self.vosk_recognizer != null) {
-            const final_chunk_size = @min(remaining_samples, vosk_chunk_size);
-            const samples_read = self.vosk_audio_buffer.read(vosk_buffer[0..final_chunk_size]);
-            if (samples_read > 0) {
-                self.processVoskAudio(vosk_buffer[0..samples_read]) catch {
-                    // Ignore errors during shutdown, but log them
-                    const shutdown_error = ErrorInfo.init(Error.InternalError, "Error during final audio processing at shutdown");
-                    self.options.event_handler.onDetailedError(shutdown_error);
-                };
-            }
+            // Sleep to ease CPU pressure
+            std.Thread.sleep(1 * std.time.ns_per_ms);
         }
     }
 
-    /// Process audio chunk with Vosk and handle results
+    /// Process complete audio clip with Vosk and handle results
     fn processVoskAudio(self: *Session, audio_data: []const i16) !void {
-        if (self.vosk_recognizer == null) {
-            return Error.InvalidState;
+        if (self.vosk_recognizer == null) return Error.InvalidState;
+
+        const audio_bytes = std.mem.sliceAsBytes(audio_data);
+        _ = c.vosk_recognizer_accept_waveform(self.vosk_recognizer, audio_bytes.ptr, @intCast(audio_bytes.len));
+
+        // Get final result
+        const result_cstr = c.vosk_recognizer_final_result(self.vosk_recognizer);
+        if (result_cstr != null) {
+            const result_str = std.mem.span(result_cstr);
+            self.parseVoskResult(result_str) catch |err| {
+                self.options.event_handler.onError(err, "Failed to parse Vosk result");
+            };
         }
-
-        // Use audio data directly without resampling
-        const final_audio = audio_data;
-
-        // Convert i16 samples to bytes for Vosk
-        const audio_bytes = std.mem.sliceAsBytes(final_audio);
-
-        // Feed audio to Vosk recognizer
-        const accept_result = c.vosk_recognizer_accept_waveform(self.vosk_recognizer, audio_bytes.ptr, @intCast(audio_bytes.len));
-
-        if (accept_result == 1) {
-            // Final result available
-            const result_cstr = c.vosk_recognizer_result(self.vosk_recognizer);
-            if (result_cstr != null) {
-                const result_str = std.mem.span(result_cstr);
-
-                // Parse JSON result to extract text
-                self.parseVoskResult(result_str) catch |err| {
-                    self.options.event_handler.onError(err, "Failed to parse Vosk result");
-                };
-
-                // Reset recognizer after getting final result to clear internal buffers
-                c.vosk_recognizer_reset(self.vosk_recognizer);
-            }
-        } else if (accept_result == 0) {
-            // Partial result available (optional - for real-time feedback)
-            const partial_result_cstr = c.vosk_recognizer_partial_result(self.vosk_recognizer);
-            if (partial_result_cstr != null) {
-                const partial_str = std.mem.span(partial_result_cstr);
-
-                // Parse partial result (could be used for real-time display)
-                self.parseVoskPartialResult(partial_str) catch |parse_err| {
-                    // Log partial result parsing errors but continue processing
-                    const parse_error_info = switch (parse_err) {
-                        Error.CallbackError => ErrorInfo.init(Error.CallbackError, "Failed to parse partial speech result"),
-                        else => ErrorInfo.init(Error.CallbackError, "Unexpected error parsing partial speech result"),
-                    };
-                    self.options.event_handler.onDetailedError(parse_error_info);
-                };
-            }
-        }
-        // accept_result == -1 means error, but we continue processing
     }
 
     /// Parse Vosk JSON result and extract recognized text
     fn parseVoskResult(self: *Session, json_str: []const u8) !void {
-        // Simple JSON parsing to extract "text" field
-        // Vosk returns JSON like: {"text": "hello world"}
-
         if (json_str.len == 0) return;
 
-        // Find "text" field in JSON
         const text_key = "\"text\"";
         if (std.mem.indexOf(u8, json_str, text_key)) |text_start| {
             const value_start = text_start + text_key.len;
-
-            // Find the colon and opening quote
             if (std.mem.indexOf(u8, json_str[value_start..], ":")) |colon_pos| {
                 const after_colon = value_start + colon_pos + 1;
-
-                // Skip whitespace and find opening quote
                 var quote_start: ?usize = null;
                 for (json_str[after_colon..], 0..) |char, i| {
                     if (char == '"') {
@@ -1100,13 +1033,9 @@ pub const Session = struct {
                         break;
                     }
                 }
-
                 if (quote_start) |s| {
-                    // Find closing quote
                     if (std.mem.indexOf(u8, json_str[s..], "\"")) |quote_end| {
                         const text = json_str[s .. s + quote_end];
-
-                        // Only invoke callback if text is not empty
                         if (text.len > 0 and !std.mem.eql(u8, text, " ")) {
                             self.options.event_handler.onSpeech(text);
                         }
@@ -1116,15 +1045,6 @@ pub const Session = struct {
         }
     }
 
-    /// Parse Vosk partial result (for real-time feedback)
-    fn parseVoskPartialResult(self: *Session, json_str: []const u8) !void {
-        // Similar to parseVoskResult but for partial results
-        // For now, we don't use partial results, but this could be extended
-        // to provide real-time transcription feedback
-        _ = self;
-        _ = json_str;
-    }
-
     /// Attempt to recover from audio device errors with detailed error reporting
     fn recoverAudioDevice(self: *Session) Error!void {
         if (self.alsa_capture) |*capture| {