switch from continuous speech to silence detection/batch process

parent 29e8df571f
commit 506b877da2

1 changed file with 94 additions and 174 deletions

src/stt.zig
@@ -802,7 +802,9 @@ pub const Session = struct {
        retry_count = 0;

        // Audio capture loop with comprehensive error handling and recovery
        var loop_count: u32 = 0;
        while (!self.should_stop.load(.acquire)) {
            loop_count += 1;
            // Read audio data from ALSA with detailed error handling
            _ = capture.readAudio() catch |err| {
                consecutive_errors += 1;
@@ -859,11 +861,20 @@ pub const Session = struct {
            retry_count = 0;
            consecutive_errors = 0;

            // Transfer audio data to Vosk processing buffer with error handling
            if (capture.availableSamples() >= 1024) { // Process in chunks of 1024 samples
                const chunk_size = @min(1024, self.processing_buffer.len);
            // Transfer audio data to Vosk processing buffer
            const available = capture.availableSamples();
            // 50ms is a good threshold for speech. At 16kHz, that would be
            // 800 samples. So we'll use 512 here
            if (available >= 512) {
                const chunk_size = @min(available, self.processing_buffer.len);
                const samples_read = capture.getAudioSamples(self.processing_buffer[0..chunk_size]);
                if (samples_read > 0) {
                    var max_before_resample: u16 = 0;
                    for (self.processing_buffer[0..samples_read]) |s| {
                        const abs_s = @abs(s);
                        if (abs_s > max_before_resample) max_before_resample = abs_s;
                    }

                    // Resample if needed, otherwise use samples directly
                    const samples_to_write = if (self.resample_buffer) |resample_buf| blk: {
                        const resampled_count = resample(
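The 512-sample gate above follows from the arithmetic in the comment: at 16kHz one millisecond is 16 samples, so 50ms of speech is 800 samples and 512 samples is 32ms. A minimal sketch of that conversion, assuming a fixed 16kHz capture rate (msToSamples is a hypothetical helper, not part of this commit):

    const std = @import("std");

    const sample_rate_hz: usize = 16_000;

    /// Samples needed to cover `ms` milliseconds at the fixed capture rate.
    fn msToSamples(ms: usize) usize {
        return ms * (sample_rate_hz / 1000); // 16 samples per millisecond
    }

    test "threshold arithmetic" {
        try std.testing.expectEqual(@as(usize, 800), msToSamples(50));
        try std.testing.expectEqual(@as(usize, 512), msToSamples(32));
    }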
@@ -897,15 +908,11 @@ pub const Session = struct {
                }
            }

    /// Vosk processing thread function with comprehensive error handling
    /// Vosk processing thread function. Audio handed to this function must be
    /// captured at 16kHz or resampled to 16kHz before the function is called.
    fn processingThreadFn(self: *Session) void {
        // Processing buffer for Vosk (4096 samples = ~256ms at 16kHz)
        const vosk_chunk_size = 4096;
        const min_chunk_size = 1024; // Minimum chunk size for processing

        const cpu_perf = getCpuPerformance() catch 100;
        if (cpu_perf < 50)
            std.log.debug("processing thread additional delay being added", .{});
        var vosk_buffer = self.allocator.alloc(i16, vosk_chunk_size) catch {
            const error_info = ErrorInfo.init(Error.OutOfMemory, "Failed to allocate Vosk processing buffer");
            self.options.event_handler.onDetailedError(error_info);
@@ -913,186 +920,112 @@ pub const Session = struct {
        };
        defer self.allocator.free(vosk_buffer);

        var error_count: u32 = 0;
        const max_errors = 10;
        const error_reset_threshold = 100; // Reset error count after this many successful operations
        var success_count: u32 = 0;
        var consecutive_failures: u32 = 0;
        const max_consecutive_failures = 5;
        // Silence detection parameters
        const silence_threshold: i16 = 500;
        const silence_duration_ms: u64 = 500;
        const min_speech_duration_ms: u64 = 300;
        const samples_per_ms = 16; // This assumes 16kHz audio
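        // Derived at 16kHz: the 500ms silence window is 8000 samples and the
        // 300ms speech minimum is 4800 samples.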

        var speech_buffer = self.allocator.alloc(i16, 16000 * 10) catch {
            const error_info = ErrorInfo.init(Error.OutOfMemory, "Failed to allocate speech buffer");
            self.options.event_handler.onDetailedError(error_info);
            return;
        };
        defer self.allocator.free(speech_buffer);
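        // speech_buffer holds 10 seconds of 16kHz audio; longer utterances are
        // truncated by the @min-bounded copies further down.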

        var in_speech = false;
        var silence_samples: usize = 0;
        var speech_samples: usize = 0;
        var speech_pos: usize = 0;

        while (!self.should_stop.load(.acquire)) {
            // Check if we have enough audio data for processing
            const available_samples = self.vosk_audio_buffer.available();

            if (available_samples >= min_chunk_size) {
                // Process in chunks, but don't exceed our buffer size
                const chunk_size = @min(available_samples, vosk_chunk_size);
                const samples_read = self.vosk_audio_buffer.read(vosk_buffer[0..chunk_size]);

                if (samples_read > 0 and self.vosk_recognizer != null) {
                    // Time the Vosk processing to identify bottlenecks
                    const start_time = std.time.nanoTimestamp();
                    const buffer_fill = (self.vosk_audio_buffer.available() * 100) / self.vosk_audio_buffer.buffer.len;

                    // Process audio with Vosk with comprehensive error handling
                    self.processVoskAudio(vosk_buffer[0..samples_read]) catch |err| {
                        error_count += 1;
                        consecutive_failures += 1;

                        // Create detailed error information
                        const error_info = switch (err) {
                            Error.InvalidState => ErrorInfo.initRecoverable(err, "Vosk recognizer is in invalid state", "Recognizer will be reinitialized"),
                            Error.OutOfMemory => ErrorInfo.init(err, "Out of memory during speech processing"),
                            Error.CallbackError => ErrorInfo.initWithContext(err, "Error in speech detection callback", "Check callback implementation"),
                            else => ErrorInfo.init(err, "Unexpected error during speech processing"),
                        };

                        self.options.event_handler.onDetailedError(error_info);

                        // Handle different error scenarios
                        if (error_count >= max_errors) {
                            const fatal_error = ErrorInfo.init(Error.CallbackError, "Too many Vosk processing errors, stopping processing thread");
                            self.options.event_handler.onDetailedError(fatal_error);
                            break;
                        }

                        if (consecutive_failures >= max_consecutive_failures) {
                            // Try to recover by reinitializing Vosk
                            const recovery_info = ErrorInfo.initRecoverable(Error.InternalError, "Multiple consecutive processing failures, attempting recovery", "Vosk recognizer will be reinitialized");
                            self.options.event_handler.onDetailedError(recovery_info);

                            self.reinitializeVosk() catch {
                                const recovery_failed = ErrorInfo.init(Error.ModelLoadError, "Failed to recover Vosk recognizer, stopping processing");
                                self.options.event_handler.onDetailedError(recovery_failed);
                                break;
                            };

                            consecutive_failures = 0;
                        }

                        // Add delay after error to prevent rapid error loops; the
                        // backoff grows linearly with consecutive failures
                        std.Thread.sleep(50 * std.time.ns_per_ms * consecutive_failures);
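                        // 256 samples is 16ms at 16kHz; below that, sleep briefly
                        // instead of spinning on a nearly empty buffer.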
                        const available = self.vosk_audio_buffer.available();
                        if (available < 256) {
                            std.Thread.sleep(10 * std.time.ns_per_ms);
                            continue;
                        };

                    // Log timing and buffer status for diagnostics
                    const end_time = std.time.nanoTimestamp();
                    const processing_ms = @divTrunc(end_time - start_time, std.time.ns_per_ms);
                    const realtime_ms = (samples_read * 1000) / 16000;
                    if (processing_ms > realtime_ms and buffer_fill > 20)
                        std.log.warn("Vosk processing behind realtime. {d} samples in {d}ms (realtime: {d}ms), buffer {d}% full", .{ samples_read, processing_ms, realtime_ms, buffer_fill });

                    // Reset error counters after successful operations
                    success_count += 1;
                    consecutive_failures = 0;
                    if (success_count >= error_reset_threshold) {
                        error_count = 0;
                        success_count = 0;
                    }
                }
            }

            // Adaptive delay based on buffer fill level and error state
            const base_delay_ms: u64 = if (available_samples > vosk_chunk_size * 2)
                1 // Fast processing when buffer is full
            else if (available_samples > min_chunk_size)
                5 // Normal processing
            else
                10; // Slower when buffer is low
            const read_size = @min(available, vosk_chunk_size);
            const samples_read = self.vosk_audio_buffer.read(vosk_buffer[0..read_size]);
            if (samples_read == 0) continue;

            // Increase delay if we're having errors
            const error_multiplier: u64 = if (consecutive_failures > 0) consecutive_failures + 1 else 1;
            var delay_ms = base_delay_ms * error_multiplier;

            // Add extra delay for slower hardware (Pi) to prevent buffer overruns
            if (cpu_perf < 50) {
                delay_ms += 100; // Extra 100ms delay for Pi-class hardware
            var is_silent = true;
            var max_amplitude: u16 = 0;
            for (vosk_buffer[0..samples_read]) |sample| {
                const abs_sample = @abs(sample);
                if (abs_sample > max_amplitude) max_amplitude = abs_sample;
                if (abs_sample > silence_threshold) {
                    is_silent = false;
                }
            }
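            // A silence_threshold of 500 is about 1.5% of the i16 full scale (32767).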

            std.Thread.sleep(delay_ms * std.time.ns_per_ms);
            if (!is_silent) {
                if (!in_speech) {
                    in_speech = true;
                    speech_samples = 0;
                    silence_samples = 0;
                    speech_pos = 0;
                }
                const copy_len = @min(samples_read, speech_buffer.len - speech_pos);
                @memcpy(speech_buffer[speech_pos .. speech_pos + copy_len], vosk_buffer[0..copy_len]);
                speech_pos += copy_len;
                speech_samples += samples_read;
                silence_samples = 0;
            } else if (in_speech) {
                silence_samples += samples_read;
                const copy_len = @min(samples_read, speech_buffer.len - speech_pos);
                @memcpy(speech_buffer[speech_pos .. speech_pos + copy_len], vosk_buffer[0..copy_len]);
                speech_pos += copy_len;

        // Final processing of any remaining audio data
        const remaining_samples = self.vosk_audio_buffer.available();
        if (remaining_samples > 0 and self.vosk_recognizer != null) {
            const final_chunk_size = @min(remaining_samples, vosk_chunk_size);
            const samples_read = self.vosk_audio_buffer.read(vosk_buffer[0..final_chunk_size]);
            if (samples_read > 0) {
                self.processVoskAudio(vosk_buffer[0..samples_read]) catch {
                    // Ignore errors during shutdown, but log them
                    const shutdown_error = ErrorInfo.init(Error.InternalError, "Error during final audio processing at shutdown");
                    self.options.event_handler.onDetailedError(shutdown_error);
                if (silence_samples >= silence_duration_ms * samples_per_ms) {
                    if (speech_samples >= min_speech_duration_ms * samples_per_ms) {
                        self.processVoskAudio(speech_buffer[0..speech_pos]) catch |err| {
                            std.log.err("Error processing speech: {}", .{err});
                        };
                    }

                    in_speech = false;
                    speech_pos = 0;
                    speech_samples = 0;
                    silence_samples = 0;

                    if (self.vosk_recognizer) |rec| {
                        c.vosk_recognizer_reset(rec);
                    }
                }
            }

    /// Process audio chunk with Vosk and handle results
            // Sleep to ease CPU pressure
            std.Thread.sleep(1 * std.time.ns_per_ms);
        }
    }
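The new loop reduces to a small accumulate-and-flush state machine: copy chunks into speech_buffer while any sample crosses silence_threshold, and once 500ms of silence follows at least 300ms of speech, hand the whole clip to Vosk and start over. A self-contained sketch of the same logic under this commit's thresholds (SilenceDetector and processClip are illustrative names, not part of the changeset):

    const SilenceDetector = struct {
        buf: []i16,
        pos: usize = 0,
        speech_samples: usize = 0,
        silence_samples: usize = 0,
        in_speech: bool = false,

        const silence_threshold: i16 = 500;
        const samples_per_ms: usize = 16; // 16kHz
        const silence_needed = 500 * samples_per_ms; // 500ms of trailing silence
        const speech_needed = 300 * samples_per_ms; // 300ms minimum utterance

        /// Feed one chunk; calls processClip with a complete utterance when one ends.
        fn feed(self: *SilenceDetector, chunk: []const i16, processClip: *const fn ([]const i16) void) void {
            var loud = false;
            for (chunk) |s| {
                if (@abs(s) > silence_threshold) loud = true;
            }
            if (loud) {
                if (!self.in_speech) self.restart();
                self.in_speech = true;
                self.append(chunk);
                self.speech_samples += chunk.len;
                self.silence_samples = 0;
            } else if (self.in_speech) {
                self.append(chunk); // keep trailing silence as context
                self.silence_samples += chunk.len;
                if (self.silence_samples >= silence_needed) {
                    if (self.speech_samples >= speech_needed) processClip(self.buf[0..self.pos]);
                    self.restart();
                    self.in_speech = false;
                }
            }
        }

        fn append(self: *SilenceDetector, chunk: []const i16) void {
            const n = @min(chunk.len, self.buf.len - self.pos); // truncate past buffer end
            @memcpy(self.buf[self.pos .. self.pos + n], chunk[0..n]);
            self.pos += n;
        }

        fn restart(self: *SilenceDetector) void {
            self.pos = 0;
            self.speech_samples = 0;
            self.silence_samples = 0;
        }
    };

In the session code the flush path additionally resets the Vosk recognizer, so no state carries into the next utterance.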

    /// Process complete audio clip with Vosk and handle results
    fn processVoskAudio(self: *Session, audio_data: []const i16) !void {
        if (self.vosk_recognizer == null) {
            return Error.InvalidState;
        }
        if (self.vosk_recognizer == null) return Error.InvalidState;

        // Use audio data directly without resampling
        const final_audio = audio_data;
        const audio_bytes = std.mem.sliceAsBytes(audio_data);
        _ = c.vosk_recognizer_accept_waveform(self.vosk_recognizer, audio_bytes.ptr, @intCast(audio_bytes.len));

        // Convert i16 samples to bytes for Vosk
        const audio_bytes = std.mem.sliceAsBytes(final_audio);

        // Feed audio to Vosk recognizer
        const accept_result = c.vosk_recognizer_accept_waveform(self.vosk_recognizer, audio_bytes.ptr, @intCast(audio_bytes.len));

        if (accept_result == 1) {
            // Final result available
            const result_cstr = c.vosk_recognizer_result(self.vosk_recognizer);
        // Get final result
        const result_cstr = c.vosk_recognizer_final_result(self.vosk_recognizer);
        if (result_cstr != null) {
            const result_str = std.mem.span(result_cstr);

            // Parse JSON result to extract text
            self.parseVoskResult(result_str) catch |err| {
                self.options.event_handler.onError(err, "Failed to parse Vosk result");
            };

            // Reset recognizer after getting final result to clear internal buffers
            c.vosk_recognizer_reset(self.vosk_recognizer);
        }
        } else if (accept_result == 0) {
            // Partial result available (optional - for real-time feedback)
            const partial_result_cstr = c.vosk_recognizer_partial_result(self.vosk_recognizer);
            if (partial_result_cstr != null) {
                const partial_str = std.mem.span(partial_result_cstr);

                // Parse partial result (could be used for real-time display)
                self.parseVoskPartialResult(partial_str) catch |parse_err| {
                    // Log partial result parsing errors but continue processing
                    const parse_error_info = switch (parse_err) {
                        Error.CallbackError => ErrorInfo.init(Error.CallbackError, "Failed to parse partial speech result"),
                        else => ErrorInfo.init(Error.CallbackError, "Unexpected error parsing partial speech result"),
                    };
                    self.options.event_handler.onDetailedError(parse_error_info);
                };
            }
        }
        // accept_result == -1 means error, but we continue processing
    }
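Batch mode collapses the old accept/partial/final branching into one shot per clip: feed all bytes, always pull the final result, then reset. A minimal sketch of that call pattern against the Vosk C API (recognizeClip is an illustrative standalone helper; error handling elided):

    const std = @import("std");
    const c = @cImport(@cInclude("vosk_api.h"));

    /// Run one complete utterance through the recognizer and return its JSON
    /// result, e.g. {"text": "hello world"}. The returned slice is owned by
    /// Vosk and only valid until the next call on this recognizer, so parse
    /// it before calling c.vosk_recognizer_reset(rec) for the next clip.
    fn recognizeClip(rec: ?*c.VoskRecognizer, clip: []const i16) ?[]const u8 {
        const bytes = std.mem.sliceAsBytes(clip);
        // The return value signals whether Vosk considers the utterance
        // complete; in batch mode we pull the final result regardless.
        _ = c.vosk_recognizer_accept_waveform(rec, bytes.ptr, @intCast(bytes.len));
        const result_cstr = c.vosk_recognizer_final_result(rec);
        if (result_cstr == null) return null;
        return std.mem.span(result_cstr);
    }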

    /// Parse Vosk JSON result and extract recognized text
    fn parseVoskResult(self: *Session, json_str: []const u8) !void {
        // Simple JSON parsing to extract "text" field
        // Vosk returns JSON like: {"text": "hello world"}

        if (json_str.len == 0) return;

        // Find "text" field in JSON
        const text_key = "\"text\"";
        if (std.mem.indexOf(u8, json_str, text_key)) |text_start| {
            const value_start = text_start + text_key.len;

            // Find the colon and opening quote
            if (std.mem.indexOf(u8, json_str[value_start..], ":")) |colon_pos| {
                const after_colon = value_start + colon_pos + 1;

                // Skip whitespace and find opening quote
                var quote_start: ?usize = null;
                for (json_str[after_colon..], 0..) |char, i| {
                    if (char == '"') {

@@ -1100,13 +1033,9 @@ pub const Session = struct {
                        break;
                    }
                }

                if (quote_start) |s| {
                    // Find closing quote
                    if (std.mem.indexOf(u8, json_str[s..], "\"")) |quote_end| {
                        const text = json_str[s .. s + quote_end];

                        // Only invoke callback if text is not empty
                        if (text.len > 0 and !std.mem.eql(u8, text, " ")) {
                            self.options.event_handler.onSpeech(text);
                        }

@@ -1116,15 +1045,6 @@ pub const Session = struct {
            }
        }

    /// Parse Vosk partial result (for real-time feedback)
    fn parseVoskPartialResult(self: *Session, json_str: []const u8) !void {
        // Similar to parseVoskResult but for partial results
        // For now, we don't use partial results, but this could be extended
        // to provide real-time transcription feedback
        _ = self;
        _ = json_str;
    }
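The hand-rolled scan above handles the flat {"text": "..."} payload but would stop early on an escaped quote inside the text. Since the result is plain JSON, std.json can do the extraction instead; a sketch assuming the Zig 0.11+ std.json.parseFromSlice API:

    const std = @import("std");

    const VoskResult = struct { text: []const u8 = "" };

    /// Extract the "text" field from a Vosk result JSON; null when empty.
    fn extractText(allocator: std.mem.Allocator, json_str: []const u8) !?[]u8 {
        const parsed = try std.json.parseFromSlice(VoskResult, allocator, json_str, .{
            .ignore_unknown_fields = true,
        });
        defer parsed.deinit();
        if (parsed.value.text.len == 0) return null;
        // Copy out: the parsed value is freed by deinit.
        return try allocator.dupe(u8, parsed.value.text);
    }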

    /// Attempt to recover from audio device errors with detailed error reporting
    fn recoverAudioDevice(self: *Session) Error!void {
        if (self.alsa_capture) |*capture| {