From 43a40e2d76fe4e261baf71b76c5d8dd51aeafa0e Mon Sep 17 00:00:00 2001 From: Emil Lerch Date: Mon, 6 Oct 2025 20:49:53 -0700 Subject: [PATCH] add back resampling/force Vosk to 16kHz --- src/stt.zig | 176 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 158 insertions(+), 18 deletions(-) diff --git a/src/stt.zig b/src/stt.zig index 019165d..c078699 100644 --- a/src/stt.zig +++ b/src/stt.zig @@ -173,6 +173,69 @@ pub const SpeechEventHandler = struct { } }; +/// Resample audio from input rate to output rate using linear interpolation (no low-pass filtering); returns the output sample count, capped at output_samples.len +fn resample(input_samples: []const i16, output_samples: []i16, input_rate: u32, output_rate: u32) usize { + if (input_rate == output_rate) { + const copy_len = @min(input_samples.len, output_samples.len); + @memcpy(output_samples[0..copy_len], input_samples[0..copy_len]); + return copy_len; + } + + const ratio = @as(f64, @floatFromInt(input_rate)) / @as(f64, @floatFromInt(output_rate)); + const output_len = @min(output_samples.len, @as(usize, @intFromFloat(@as(f64, @floatFromInt(input_samples.len)) / ratio))); + + for (0..output_len) |i| { + const src_pos = @as(f64, @floatFromInt(i)) * ratio; + const src_idx: usize = @intFromFloat(src_pos); + + if (src_idx >= input_samples.len) break; + + if (src_idx + 1 < input_samples.len) { + const frac = src_pos - @as(f64, @floatFromInt(src_idx)); + const sample1: f64 = @floatFromInt(input_samples[src_idx]); + const sample2: f64 = @floatFromInt(input_samples[src_idx + 1]); + const interpolated = sample1 + (sample2 - sample1) * frac; + output_samples[i] = @intFromFloat(@max(@min(interpolated, std.math.maxInt(i16)), std.math.minInt(i16))); + } else { + output_samples[i] = input_samples[src_idx]; + } + } + + return output_len; +} + +/// Get CPU performance metric from at most the first 4096 bytes of /proc/cpuinfo (first BogoMIPS or cpu MHz line; MHz is divided by 20) +fn getCpuPerformance() !u32 { + const file = try std.fs.openFileAbsolute("/proc/cpuinfo", .{}); + defer file.close(); + + var buf: [4096]u8 = undefined; + const bytes_read = try 
file.readAll(&buf); + + var lines = std.mem.splitScalar(u8, buf[0..bytes_read], '\n'); + while (lines.next()) |line| { + if (std.mem.startsWith(u8, line, "BogoMIPS")) { + var parts = std.mem.splitScalar(u8, line, ':'); + _ = parts.next(); // Skip key + if (parts.next()) |value| { + const trimmed = std.mem.trim(u8, value, " \t"); + return @intFromFloat(try std.fmt.parseFloat(f32, trimmed)); + } + } + if (std.mem.startsWith(u8, line, "cpu MHz")) { + var parts = std.mem.splitScalar(u8, line, ':'); + _ = parts.next(); // Skip key + if (parts.next()) |value| { + const trimmed = std.mem.trim(u8, value, " \t"); + // Convert MHz to equivalent BogoMIPS scale for consistent thresholds + const mhz = try std.fmt.parseFloat(f32, trimmed); + return @intFromFloat(mhz / 20.0); // Rough conversion to BogoMIPS scale + } + } + } + return error.PerformanceNotFound; // No BogoMIPS or cpu MHz line found; callers supply their own default +} + /// Audio buffer for managing audio data flow using std.io interfaces pub const AudioBuffer = struct { const Self = @This(); @@ -521,6 +584,8 @@ pub const Session = struct { should_stop: std.atomic.Value(bool) = std.atomic.Value(bool).init(false), /// Processing buffer for audio samples processing_buffer: []i16, + /// Resample buffer for converting hardware rate to 16kHz (null if not needed) + resample_buffer: ?[]i16, /// Vosk model vosk_model: ?*c.VoskModel = null, /// Vosk recognizer @@ -574,23 +639,32 @@ pub const Session = struct { alsa_capture_mut.deinit(); } - // Initialize Vosk audio buffer (larger buffer for processing) - const vosk_audio_buffer = AudioBuffer.init(allocator, alsa_capture.sample_rate * 2) catch { - const error_info = ErrorInfo.init(Error.OutOfMemory, "Failed to allocate Vosk audio buffer during initialization"); - options.event_handler.onDetailedError(error_info); - return Error.OutOfMemory; - }; - errdefer { - var vosk_audio_buffer_mut = vosk_audio_buffer; - vosk_audio_buffer_mut.deinit(); - } + const cpu_perf = getCpuPerformance() catch 100; + const buffer_multiplier: u32 = 
if (cpu_perf < 50) 8 else if (cpu_perf < 100) 4 else 2; + const new_buffer_size = 16000 * buffer_multiplier; + std.log.debug( + "Buffer multiplier {d} based on implied BogoMIPS of {d} (100 default in case of error)", + .{ buffer_multiplier, cpu_perf }, + ); + + // Size the Vosk buffer from the fixed 16kHz rate and the CPU-based multiplier + var vosk_buf = AudioBuffer.init( + allocator, + new_buffer_size, + ) catch { + const error_info = ErrorInfo.init(Error.OutOfMemory, "Failed to initialize Vosk buffer after ALSA open"); + options.event_handler.onDetailedError(error_info); + return error.InitializationFailed; + }; + errdefer vosk_buf.deinit(); var session = Session{ .allocator = allocator, .options = options, .alsa_capture = alsa_capture, .processing_buffer = processing_buffer, - .vosk_audio_buffer = vosk_audio_buffer, + .resample_buffer = null, + .vosk_audio_buffer = vosk_buf, }; // Initialize Vosk model and recognizer with detailed error reporting @@ -631,8 +705,8 @@ pub const Session = struct { return Error.ModelLoadError; } - // Create Vosk recognizer using actual hardware sample rate - self.vosk_recognizer = c.vosk_recognizer_new(self.vosk_model, @floatFromInt(self.alsa_capture.?.sample_rate)); + // Always create Vosk recognizer at 16kHz + self.vosk_recognizer = c.vosk_recognizer_new(self.vosk_model, 16000.0); if (self.vosk_recognizer == null) { if (self.vosk_model) |model| { c.vosk_model_free(model); @@ -714,6 +788,16 @@ pub const Session = struct { return; } + // Allocate resample buffer if hardware sample rate differs from 16kHz + if (capture.sample_rate != 16000) { + std.log.info("Hardware rate {d}Hz != 16kHz, enabling resampling", .{capture.sample_rate}); + self.resample_buffer = self.allocator.alloc(i16, 16000) catch { + const error_info = ErrorInfo.init(Error.OutOfMemory, "Failed to allocate resample buffer"); + self.options.event_handler.onDetailedError(error_info); + return; + }; + } + // Reset retry count for audio reading retry_count = 0; @@ -780,15 +864,26 @@ pub const 
Session = struct { const chunk_size = @min(1024, self.processing_buffer.len); const samples_read = capture.getAudioSamples(self.processing_buffer[0..chunk_size]); if (samples_read > 0) { + // Resample this chunk to 16kHz if a resample buffer was allocated, otherwise use samples directly + const samples_to_write = if (self.resample_buffer) |resample_buf| blk: { + const resampled_count = resample( + self.processing_buffer[0..samples_read], + resample_buf, + capture.sample_rate, + 16000, + ); + break :blk resample_buf[0..resampled_count]; + } else self.processing_buffer[0..samples_read]; + // Send audio to Vosk processing buffer with overflow protection - const written = self.vosk_audio_buffer.write(self.processing_buffer[0..samples_read]); - if (written < samples_read) { + const written = self.vosk_audio_buffer.write(samples_to_write); + if (written < samples_to_write.len) { // Buffer overflow - report warning and clear buffer const warning = ErrorInfo.initRecoverable(Error.InternalError, "Audio buffer overflow, clearing buffer to prevent data loss", "Consider increasing buffer size if this happens frequently"); self.options.event_handler.onDetailedError(warning); self.vosk_audio_buffer.clear(); - _ = self.vosk_audio_buffer.write(self.processing_buffer[0..samples_read]); + _ = self.vosk_audio_buffer.write(samples_to_write); } } } @@ -808,6 +903,9 @@ pub const Session = struct { const vosk_chunk_size = 4096; const min_chunk_size = 1024; // Minimum chunk size for processing + const cpu_perf = getCpuPerformance() catch 100; + if (cpu_perf < 50) + std.log.debug("processing thread additional delay being added", .{}); var vosk_buffer = self.allocator.alloc(i16, vosk_chunk_size) catch { const error_info = ErrorInfo.init(Error.OutOfMemory, "Failed to allocate Vosk processing buffer"); self.options.event_handler.onDetailedError(error_info); @@ -832,6 +930,10 @@ pub const Session = struct { const samples_read = self.vosk_audio_buffer.read(vosk_buffer[0..chunk_size]); if (samples_read > 0 and self.vosk_recognizer != null) 
{ + // Time the Vosk processing to identify bottlenecks + const start_time = std.time.nanoTimestamp(); + const buffer_fill = (self.vosk_audio_buffer.available() * 100) / self.vosk_audio_buffer.buffer.len; + // Process audio with Vosk with comprehensive error handling self.processVoskAudio(vosk_buffer[0..samples_read]) catch |err| { error_count += 1; @@ -873,6 +975,13 @@ pub const Session = struct { continue; }; + // Warn with timing and buffer status when processing took longer than the chunk's 16kHz real-time duration + const end_time = std.time.nanoTimestamp(); + const processing_ms = @divTrunc(end_time - start_time, std.time.ns_per_ms); + const realtime_ms = (samples_read * 1000) / 16000; + if (processing_ms > realtime_ms) + std.log.warn("Vosk processing behind realtime. {d} samples in {d}ms (realtime: {d}ms), buffer {d}% full", .{ samples_read, processing_ms, realtime_ms, buffer_fill }); + // Reset error counters after successful operations success_count += 1; consecutive_failures = 0; @@ -893,7 +1002,12 @@ pub const Session = struct { // Increase delay if we're having errors const error_multiplier: u64 = if (consecutive_failures > 0) consecutive_failures + 1 else 1; - const delay_ms = base_delay_ms * error_multiplier; + var delay_ms = base_delay_ms * error_multiplier; + + // Add extra delay for slower hardware (Pi) to prevent buffer overruns + if (cpu_perf < 50) { + delay_ms += 100; // Extra 100ms delay for Pi-class hardware + } std.Thread.sleep(delay_ms * std.time.ns_per_ms); } @@ -1051,7 +1165,7 @@ pub const Session = struct { // Reinitialize recognizer (model should still be valid) if (self.vosk_model) |model| { - self.vosk_recognizer = c.vosk_recognizer_new(model, @floatFromInt(self.alsa_capture.?.sample_rate)); + self.vosk_recognizer = c.vosk_recognizer_new(model, 16000.0); if (self.vosk_recognizer == null) { const error_info = ErrorInfo.init(Error.ModelLoadError, "Failed to reinitialize Vosk recognizer"); self.options.event_handler.onDetailedError(error_info); @@ -1259,6 +1373,11 @@ pub const Session = struct { // 
Free processing buffer self.allocator.free(self.processing_buffer); + // Free resample buffer if allocated + if (self.resample_buffer) |buf| { + self.allocator.free(buf); + } + // Clean up ALSA global configuration cache _ = c.snd_config_update_free_global(); @@ -1589,3 +1708,24 @@ test "Session status and recovery" { // which can cause segmentation faults during deinit return error.SkipZigTest; } + +test "resample function" { + // Test same sample rate (no conversion) + const input = [_]i16{ 100, 200, 300, 400 }; + var output: [4]i16 = undefined; + const count = resample(&input, &output, 16000, 16000); + try std.testing.expect(count == 4); + try std.testing.expectEqualSlices(i16, &input, output[0..count]); + + // Test downsampling (48kHz -> 16kHz, 3:1 ratio); only the output count is checked + const input_48k = [_]i16{ 100, 150, 200, 250, 300, 350 }; + var output_16k: [2]i16 = undefined; + const down_count = resample(&input_48k, &output_16k, 48000, 16000); + try std.testing.expect(down_count == 2); + + // Test upsampling (16kHz -> 48kHz, 1:3 ratio); only the output count is checked + const input_16k = [_]i16{ 100, 200 }; + var output_48k: [6]i16 = undefined; + const up_count = resample(&input_16k, &output_48k, 16000, 48000); + try std.testing.expect(up_count == 6); +}