switch from continuous speech to silence detection/batch process

This commit is contained in:
Emil Lerch 2025-10-27 13:46:14 -07:00
parent 29e8df571f
commit 506b877da2
Signed by: lobo
GPG key ID: A7B62D657EF764F8

View file

@ -802,7 +802,9 @@ pub const Session = struct {
retry_count = 0;
// Audio capture loop with comprehensive error handling and recovery
var loop_count: u32 = 0;
while (!self.should_stop.load(.acquire)) {
loop_count += 1;
// Read audio data from ALSA with detailed error handling
_ = capture.readAudio() catch |err| {
consecutive_errors += 1;
@ -859,11 +861,20 @@ pub const Session = struct {
retry_count = 0;
consecutive_errors = 0;
// Transfer audio data to Vosk processing buffer with error handling
if (capture.availableSamples() >= 1024) { // Process in chunks of 1024 samples
const chunk_size = @min(1024, self.processing_buffer.len);
// Transfer audio data to Vosk processing buffer
const available = capture.availableSamples();
// 50ms is a good threshold for speech. At 16kHz, that would be
// 800 samples. We use 512 samples (32ms at 16kHz) here, which stays
// below that threshold
if (available >= 512) {
const chunk_size = @min(available, self.processing_buffer.len);
const samples_read = capture.getAudioSamples(self.processing_buffer[0..chunk_size]);
if (samples_read > 0) {
var max_before_resample: u16 = 0;
for (self.processing_buffer[0..samples_read]) |s| {
const abs_s = @abs(s);
if (abs_s > max_before_resample) max_before_resample = abs_s;
}
// Resample if needed, otherwise use samples directly
const samples_to_write = if (self.resample_buffer) |resample_buf| blk: {
const resampled_count = resample(
@ -897,15 +908,11 @@ pub const Session = struct {
}
}
/// Vosk processing thread function with comprehensive error handling
/// Vosk processing thread function. Audio processed by this function
/// must be either captured at 16kHz or resampled to 16kHz prior to calling
/// the function
fn processingThreadFn(self: *Session) void {
// Processing buffer for Vosk (4096 samples = ~256ms at 16kHz)
const vosk_chunk_size = 4096;
const min_chunk_size = 1024; // Minimum chunk size for processing
const cpu_perf = getCpuPerformance() catch 100;
if (cpu_perf < 50)
std.log.debug("processing thread additional delay being added", .{});
var vosk_buffer = self.allocator.alloc(i16, vosk_chunk_size) catch {
const error_info = ErrorInfo.init(Error.OutOfMemory, "Failed to allocate Vosk processing buffer");
self.options.event_handler.onDetailedError(error_info);
@ -913,186 +920,112 @@ pub const Session = struct {
};
defer self.allocator.free(vosk_buffer);
var error_count: u32 = 0;
const max_errors = 10;
const error_reset_threshold = 100; // Reset error count after this many successful operations
var success_count: u32 = 0;
var consecutive_failures: u32 = 0;
const max_consecutive_failures = 5;
// Silence detection parameters
const silence_threshold: i16 = 500;
const silence_duration_ms: u64 = 500;
const min_speech_duration_ms: u64 = 300;
const samples_per_ms = 16; // This assumes 16kHz audio
var speech_buffer = self.allocator.alloc(i16, 16000 * 10) catch {
const error_info = ErrorInfo.init(Error.OutOfMemory, "Failed to allocate speech buffer");
self.options.event_handler.onDetailedError(error_info);
return;
};
defer self.allocator.free(speech_buffer);
var in_speech = false;
var silence_samples: usize = 0;
var speech_samples: usize = 0;
var speech_pos: usize = 0;
while (!self.should_stop.load(.acquire)) {
// Check if we have enough audio data for processing
const available_samples = self.vosk_audio_buffer.available();
if (available_samples >= min_chunk_size) {
// Process in chunks, but don't exceed our buffer size
const chunk_size = @min(available_samples, vosk_chunk_size);
const samples_read = self.vosk_audio_buffer.read(vosk_buffer[0..chunk_size]);
if (samples_read > 0 and self.vosk_recognizer != null) {
// Time the Vosk processing to identify bottlenecks
const start_time = std.time.nanoTimestamp();
const buffer_fill = (self.vosk_audio_buffer.available() * 100) / self.vosk_audio_buffer.buffer.len;
// Process audio with Vosk with comprehensive error handling
self.processVoskAudio(vosk_buffer[0..samples_read]) catch |err| {
error_count += 1;
consecutive_failures += 1;
// Create detailed error information
const error_info = switch (err) {
Error.InvalidState => ErrorInfo.initRecoverable(err, "Vosk recognizer is in invalid state", "Recognizer will be reinitialized"),
Error.OutOfMemory => ErrorInfo.init(err, "Out of memory during speech processing"),
Error.CallbackError => ErrorInfo.initWithContext(err, "Error in speech detection callback", "Check callback implementation"),
else => ErrorInfo.init(err, "Unexpected error during speech processing"),
};
self.options.event_handler.onDetailedError(error_info);
// Handle different error scenarios
if (error_count >= max_errors) {
const fatal_error = ErrorInfo.init(Error.CallbackError, "Too many Vosk processing errors, stopping processing thread");
self.options.event_handler.onDetailedError(fatal_error);
break;
}
if (consecutive_failures >= max_consecutive_failures) {
// Try to recover by reinitializing Vosk
const recovery_info = ErrorInfo.initRecoverable(Error.InternalError, "Multiple consecutive processing failures, attempting recovery", "Vosk recognizer will be reinitialized");
self.options.event_handler.onDetailedError(recovery_info);
self.reinitializeVosk() catch {
const recovery_failed = ErrorInfo.init(Error.ModelLoadError, "Failed to recover Vosk recognizer, stopping processing");
self.options.event_handler.onDetailedError(recovery_failed);
break;
};
consecutive_failures = 0;
}
// Add delay after error to prevent rapid error loops
std.Thread.sleep(50 * std.time.ns_per_ms * consecutive_failures); // Exponential backoff
const available = self.vosk_audio_buffer.available();
if (available < 256) {
std.Thread.sleep(10 * std.time.ns_per_ms);
continue;
};
// Log timing and buffer status for diagnostics
const end_time = std.time.nanoTimestamp();
const processing_ms = @divTrunc(end_time - start_time, std.time.ns_per_ms);
const realtime_ms = (samples_read * 1000) / 16000;
if (processing_ms > realtime_ms and buffer_fill > 20)
std.log.warn("Vosk processing behind realtime. {d} samples in {d}ms (realtime: {d}ms), buffer {d}% full", .{ samples_read, processing_ms, realtime_ms, buffer_fill });
// Reset error counters after successful operations
success_count += 1;
consecutive_failures = 0;
if (success_count >= error_reset_threshold) {
error_count = 0;
success_count = 0;
}
}
}
// Adaptive delay based on buffer fill level and error state
const base_delay_ms: u64 = if (available_samples > vosk_chunk_size * 2)
1 // Fast processing when buffer is full
else if (available_samples > min_chunk_size)
5 // Normal processing
else
10; // Slower when buffer is low
const read_size = @min(available, vosk_chunk_size);
const samples_read = self.vosk_audio_buffer.read(vosk_buffer[0..read_size]);
if (samples_read == 0) continue;
// Increase delay if we're having errors
const error_multiplier: u64 = if (consecutive_failures > 0) consecutive_failures + 1 else 1;
var delay_ms = base_delay_ms * error_multiplier;
// Add extra delay for slower hardware (Pi) to prevent buffer overruns
if (cpu_perf < 50) {
delay_ms += 100; // Extra 10ms delay for Pi-class hardware
var is_silent = true;
var max_amplitude: u16 = 0;
for (vosk_buffer[0..samples_read]) |sample| {
const abs_sample = @abs(sample);
if (abs_sample > max_amplitude) max_amplitude = abs_sample;
if (abs_sample > silence_threshold) {
is_silent = false;
}
}
std.Thread.sleep(delay_ms * std.time.ns_per_ms);
if (!is_silent) {
if (!in_speech) {
in_speech = true;
speech_samples = 0;
silence_samples = 0;
speech_pos = 0;
}
const copy_len = @min(samples_read, speech_buffer.len - speech_pos);
@memcpy(speech_buffer[speech_pos .. speech_pos + copy_len], vosk_buffer[0..copy_len]);
speech_pos += copy_len;
speech_samples += samples_read;
silence_samples = 0;
} else if (in_speech) {
silence_samples += samples_read;
const copy_len = @min(samples_read, speech_buffer.len - speech_pos);
@memcpy(speech_buffer[speech_pos .. speech_pos + copy_len], vosk_buffer[0..copy_len]);
speech_pos += copy_len;
// Final processing of any remaining audio data
const remaining_samples = self.vosk_audio_buffer.available();
if (remaining_samples > 0 and self.vosk_recognizer != null) {
const final_chunk_size = @min(remaining_samples, vosk_chunk_size);
const samples_read = self.vosk_audio_buffer.read(vosk_buffer[0..final_chunk_size]);
if (samples_read > 0) {
self.processVoskAudio(vosk_buffer[0..samples_read]) catch {
// Ignore errors during shutdown, but log them
const shutdown_error = ErrorInfo.init(Error.InternalError, "Error during final audio processing at shutdown");
self.options.event_handler.onDetailedError(shutdown_error);
if (silence_samples >= silence_duration_ms * samples_per_ms) {
if (speech_samples >= min_speech_duration_ms * samples_per_ms) {
self.processVoskAudio(speech_buffer[0..speech_pos]) catch |err| {
std.log.err("Error processing speech: {}", .{err});
};
}
in_speech = false;
speech_pos = 0;
speech_samples = 0;
silence_samples = 0;
if (self.vosk_recognizer) |rec| {
c.vosk_recognizer_reset(rec);
}
}
}
/// Process audio chunk with Vosk and handle results
// Sleep to ease CPU pressure
std.Thread.sleep(1 * std.time.ns_per_ms);
}
}
/// Process complete audio clip with Vosk and handle results
fn processVoskAudio(self: *Session, audio_data: []const i16) !void {
if (self.vosk_recognizer == null) {
return Error.InvalidState;
}
if (self.vosk_recognizer == null) return Error.InvalidState;
// Use audio data directly without resampling
const final_audio = audio_data;
const audio_bytes = std.mem.sliceAsBytes(audio_data);
_ = c.vosk_recognizer_accept_waveform(self.vosk_recognizer, audio_bytes.ptr, @intCast(audio_bytes.len));
// Convert i16 samples to bytes for Vosk
const audio_bytes = std.mem.sliceAsBytes(final_audio);
// Feed audio to Vosk recognizer
const accept_result = c.vosk_recognizer_accept_waveform(self.vosk_recognizer, audio_bytes.ptr, @intCast(audio_bytes.len));
if (accept_result == 1) {
// Final result available
const result_cstr = c.vosk_recognizer_result(self.vosk_recognizer);
// Get final result
const result_cstr = c.vosk_recognizer_final_result(self.vosk_recognizer);
if (result_cstr != null) {
const result_str = std.mem.span(result_cstr);
// Parse JSON result to extract text
self.parseVoskResult(result_str) catch |err| {
self.options.event_handler.onError(err, "Failed to parse Vosk result");
};
// Reset recognizer after getting final result to clear internal buffers
c.vosk_recognizer_reset(self.vosk_recognizer);
}
} else if (accept_result == 0) {
// Partial result available (optional - for real-time feedback)
const partial_result_cstr = c.vosk_recognizer_partial_result(self.vosk_recognizer);
if (partial_result_cstr != null) {
const partial_str = std.mem.span(partial_result_cstr);
// Parse partial result (could be used for real-time display)
self.parseVoskPartialResult(partial_str) catch |parse_err| {
// Log partial result parsing errors but continue processing
const parse_error_info = switch (parse_err) {
Error.CallbackError => ErrorInfo.init(Error.CallbackError, "Failed to parse partial speech result"),
else => ErrorInfo.init(Error.CallbackError, "Unexpected error parsing partial speech result"),
};
self.options.event_handler.onDetailedError(parse_error_info);
};
}
}
// accept_result == -1 means error, but we continue processing
}
/// Parse Vosk JSON result and extract recognized text
fn parseVoskResult(self: *Session, json_str: []const u8) !void {
// Simple JSON parsing to extract "text" field
// Vosk returns JSON like: {"text": "hello world"}
if (json_str.len == 0) return;
// Find "text" field in JSON
const text_key = "\"text\"";
if (std.mem.indexOf(u8, json_str, text_key)) |text_start| {
const value_start = text_start + text_key.len;
// Find the colon and opening quote
if (std.mem.indexOf(u8, json_str[value_start..], ":")) |colon_pos| {
const after_colon = value_start + colon_pos + 1;
// Skip whitespace and find opening quote
var quote_start: ?usize = null;
for (json_str[after_colon..], 0..) |char, i| {
if (char == '"') {
@ -1100,13 +1033,9 @@ pub const Session = struct {
break;
}
}
if (quote_start) |s| {
// Find closing quote
if (std.mem.indexOf(u8, json_str[s..], "\"")) |quote_end| {
const text = json_str[s .. s + quote_end];
// Only invoke callback if text is not empty
if (text.len > 0 and !std.mem.eql(u8, text, " ")) {
self.options.event_handler.onSpeech(text);
}
@ -1116,15 +1045,6 @@ pub const Session = struct {
}
}
/// Handle a partial (in-progress) recognition result from Vosk.
///
/// Currently a no-op: both arguments are discarded and no callback is
/// invoked. It exists as a hook that could later be extended to provide
/// real-time transcription feedback, mirroring `parseVoskResult`.
fn parseVoskPartialResult(self: *Session, json_str: []const u8) !void {
    // Partial results are intentionally unused for now; discard the
    // parameters explicitly so they are not reported as unused.
    _ = json_str;
    _ = self;
}
/// Attempt to recover from audio device errors with detailed error reporting
fn recoverAudioDevice(self: *Session) Error!void {
if (self.alsa_capture) |*capture| {