add back resampling/force Vosk to 16kHz
This commit is contained in:
parent
e2490ec3e3
commit
43a40e2d76
1 changed files with 158 additions and 18 deletions
176
src/stt.zig
176
src/stt.zig
|
@ -173,6 +173,69 @@ pub const SpeechEventHandler = struct {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// Resample 16-bit PCM audio from `input_rate` to `output_rate` using linear interpolation.
///
/// Parameters:
/// - `input_samples`: source samples captured at `input_rate`.
/// - `output_samples`: destination buffer; at most `output_samples.len` samples are written.
/// - `input_rate`, `output_rate`: sample rates in Hz.
///
/// Returns the number of samples actually written to `output_samples`.
///
/// Equal rates copy the input verbatim (truncated to the destination capacity).
/// A zero rate on either side cannot be converted and yields 0.
/// No low-pass filter is applied before decimation.
fn resample(input_samples: []const i16, output_samples: []i16, input_rate: u32, output_rate: u32) usize {
    if (input_rate == output_rate) {
        const copy_len = @min(input_samples.len, output_samples.len);
        @memcpy(output_samples[0..copy_len], input_samples[0..copy_len]);
        return copy_len;
    }

    // A zero rate would make the ratio 0 or infinite, and converting the
    // resulting inf/NaN length to an integer is illegal behavior.
    if (input_rate == 0 or output_rate == 0) return 0;

    const ratio = @as(f64, @floatFromInt(input_rate)) / @as(f64, @floatFromInt(output_rate));

    // Compute the output length with integer arithmetic so floating-point
    // rounding can never add or drop a sample.
    const scaled_len = @as(u64, input_samples.len) * output_rate / input_rate;
    const output_len: usize = @intCast(@min(@as(u64, output_samples.len), scaled_len));

    for (0..output_len) |i| {
        // Fractional position of output sample `i` in the input stream.
        const src_pos = @as(f64, @floatFromInt(i)) * ratio;
        const src_idx: usize = @intFromFloat(src_pos);

        // Defensive: report only what was written if rounding pushed us past the input.
        if (src_idx >= input_samples.len) return i;

        if (src_idx + 1 < input_samples.len) {
            // Blend the two neighbouring input samples by the fractional offset.
            const frac = src_pos - @as(f64, @floatFromInt(src_idx));
            const sample1: f64 = @floatFromInt(input_samples[src_idx]);
            const sample2: f64 = @floatFromInt(input_samples[src_idx + 1]);
            const interpolated = sample1 + (sample2 - sample1) * frac;
            // Clamp to the i16 range before the float-to-int conversion.
            output_samples[i] = @intFromFloat(@max(@min(interpolated, std.math.maxInt(i16)), std.math.minInt(i16)));
        } else {
            // Last input sample has no right-hand neighbour; pass it through.
            output_samples[i] = input_samples[src_idx];
        }
    }

    return output_len;
}
|
||||||
|
|
||||||
|
/// Read a rough CPU speed metric from `/proc/cpuinfo`.
///
/// Only the first 4096 bytes of the file are examined. The first line that
/// starts with "BogoMIPS" or "cpu MHz" decides the result: a BogoMIPS value is
/// returned as-is (truncated to an integer), a MHz value is divided by 20 so
/// both land on roughly the same scale.
///
/// Errors: any error from opening or reading the file, a float parse error
/// for a malformed value, or `error.PerformanceNotFound` when no usable line
/// is present.
fn getCpuPerformance() !u32 {
    const file = try std.fs.openFileAbsolute("/proc/cpuinfo", .{});
    defer file.close();

    var buf: [4096]u8 = undefined;
    const bytes_read = try file.readAll(&buf);

    return parseCpuPerformance(buf[0..bytes_read]);
}

/// Extract the performance metric from `/proc/cpuinfo`-style text (see `getCpuPerformance`).
fn parseCpuPerformance(contents: []const u8) !u32 {
    var lines = std.mem.splitScalar(u8, contents, '\n');
    while (lines.next()) |line| {
        // MHz readings are scaled down by 20 as a rough conversion to the BogoMIPS scale.
        const divisor: f32 = if (std.mem.startsWith(u8, line, "BogoMIPS"))
            1.0
        else if (std.mem.startsWith(u8, line, "cpu MHz"))
            20.0
        else
            continue;

        var parts = std.mem.splitScalar(u8, line, ':');
        _ = parts.next(); // Skip key
        const value = parts.next() orelse continue;
        const trimmed = std.mem.trim(u8, value, " \t");
        const metric = (try std.fmt.parseFloat(f32, trimmed)) / divisor;

        // Reject NaN, infinities, negatives and values that do not fit in u32;
        // converting those with @intFromFloat would be illegal behavior.
        if (!(metric >= 0 and metric < 4294967296.0)) return error.PerformanceNotFound;

        return @intFromFloat(metric);
    }

    // Neither key was present in the examined portion of the file.
    return error.PerformanceNotFound;
}
|
||||||
|
|
||||||
/// Audio buffer for managing audio data flow using std.io interfaces
|
/// Audio buffer for managing audio data flow using std.io interfaces
|
||||||
pub const AudioBuffer = struct {
|
pub const AudioBuffer = struct {
|
||||||
const Self = @This();
|
const Self = @This();
|
||||||
|
@ -521,6 +584,8 @@ pub const Session = struct {
|
||||||
should_stop: std.atomic.Value(bool) = std.atomic.Value(bool).init(false),
|
should_stop: std.atomic.Value(bool) = std.atomic.Value(bool).init(false),
|
||||||
/// Processing buffer for audio samples
|
/// Processing buffer for audio samples
|
||||||
processing_buffer: []i16,
|
processing_buffer: []i16,
|
||||||
|
/// Resample buffer for converting hardware rate to 16kHz (null if not needed)
|
||||||
|
resample_buffer: ?[]i16,
|
||||||
/// Vosk model
|
/// Vosk model
|
||||||
vosk_model: ?*c.VoskModel = null,
|
vosk_model: ?*c.VoskModel = null,
|
||||||
/// Vosk recognizer
|
/// Vosk recognizer
|
||||||
|
@ -574,23 +639,32 @@ pub const Session = struct {
|
||||||
alsa_capture_mut.deinit();
|
alsa_capture_mut.deinit();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Initialize Vosk audio buffer (larger buffer for processing)
|
const cpu_perf = getCpuPerformance() catch 100;
|
||||||
const vosk_audio_buffer = AudioBuffer.init(allocator, alsa_capture.sample_rate * 2) catch {
|
const buffer_multiplier: u32 = if (cpu_perf < 50) 8 else if (cpu_perf < 100) 4 else 2;
|
||||||
const error_info = ErrorInfo.init(Error.OutOfMemory, "Failed to allocate Vosk audio buffer during initialization");
|
const new_buffer_size = 16000 * buffer_multiplier;
|
||||||
options.event_handler.onDetailedError(error_info);
|
|
||||||
return Error.OutOfMemory;
|
|
||||||
};
|
|
||||||
errdefer {
|
|
||||||
var vosk_audio_buffer_mut = vosk_audio_buffer;
|
|
||||||
vosk_audio_buffer_mut.deinit();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
std.log.debug(
|
||||||
|
"Buffer multiplier {d} based on implied BogoMIPS of {d} (100 default in case of error)",
|
||||||
|
.{ buffer_multiplier, cpu_perf },
|
||||||
|
);
|
||||||
|
|
||||||
|
// Allocate the Vosk buffer sized for 16kHz audio, scaled by the CPU-based multiplier
|
||||||
|
var vosk_buf = AudioBuffer.init(
|
||||||
|
allocator,
|
||||||
|
new_buffer_size,
|
||||||
|
) catch {
|
||||||
|
const error_info = ErrorInfo.init(Error.OutOfMemory, "Failed to initialize Vosk buffer after ALSA open");
|
||||||
|
options.event_handler.onDetailedError(error_info);
|
||||||
|
return error.InitializationFailed;
|
||||||
|
};
|
||||||
|
errdefer vosk_buf.deinit();
|
||||||
var session = Session{
|
var session = Session{
|
||||||
.allocator = allocator,
|
.allocator = allocator,
|
||||||
.options = options,
|
.options = options,
|
||||||
.alsa_capture = alsa_capture,
|
.alsa_capture = alsa_capture,
|
||||||
.processing_buffer = processing_buffer,
|
.processing_buffer = processing_buffer,
|
||||||
.vosk_audio_buffer = vosk_audio_buffer,
|
.resample_buffer = null,
|
||||||
|
.vosk_audio_buffer = vosk_buf,
|
||||||
};
|
};
|
||||||
|
|
||||||
// Initialize Vosk model and recognizer with detailed error reporting
|
// Initialize Vosk model and recognizer with detailed error reporting
|
||||||
|
@ -631,8 +705,8 @@ pub const Session = struct {
|
||||||
return Error.ModelLoadError;
|
return Error.ModelLoadError;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create Vosk recognizer using actual hardware sample rate
|
// Always create Vosk recognizer at 16kHz
|
||||||
self.vosk_recognizer = c.vosk_recognizer_new(self.vosk_model, @floatFromInt(self.alsa_capture.?.sample_rate));
|
self.vosk_recognizer = c.vosk_recognizer_new(self.vosk_model, 16000.0);
|
||||||
if (self.vosk_recognizer == null) {
|
if (self.vosk_recognizer == null) {
|
||||||
if (self.vosk_model) |model| {
|
if (self.vosk_model) |model| {
|
||||||
c.vosk_model_free(model);
|
c.vosk_model_free(model);
|
||||||
|
@ -714,6 +788,16 @@ pub const Session = struct {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Allocate resample buffer if hardware sample rate differs from 16kHz
|
||||||
|
if (capture.sample_rate != 16000) {
|
||||||
|
std.log.info("Hardware rate {d}Hz != 16kHz, enabling resampling", .{capture.sample_rate});
|
||||||
|
self.resample_buffer = self.allocator.alloc(i16, 16000) catch {
|
||||||
|
const error_info = ErrorInfo.init(Error.OutOfMemory, "Failed to allocate resample buffer");
|
||||||
|
self.options.event_handler.onDetailedError(error_info);
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
// Reset retry count for audio reading
|
// Reset retry count for audio reading
|
||||||
retry_count = 0;
|
retry_count = 0;
|
||||||
|
|
||||||
|
@ -780,15 +864,26 @@ pub const Session = struct {
|
||||||
const chunk_size = @min(1024, self.processing_buffer.len);
|
const chunk_size = @min(1024, self.processing_buffer.len);
|
||||||
const samples_read = capture.getAudioSamples(self.processing_buffer[0..chunk_size]);
|
const samples_read = capture.getAudioSamples(self.processing_buffer[0..chunk_size]);
|
||||||
if (samples_read > 0) {
|
if (samples_read > 0) {
|
||||||
|
// Resample if needed, otherwise use samples directly
|
||||||
|
const samples_to_write = if (self.resample_buffer) |resample_buf| blk: {
|
||||||
|
const resampled_count = resample(
|
||||||
|
self.processing_buffer[0..samples_read],
|
||||||
|
resample_buf,
|
||||||
|
capture.sample_rate,
|
||||||
|
16000,
|
||||||
|
);
|
||||||
|
break :blk resample_buf[0..resampled_count];
|
||||||
|
} else self.processing_buffer[0..samples_read];
|
||||||
|
|
||||||
// Send audio to Vosk processing buffer with overflow protection
|
// Send audio to Vosk processing buffer with overflow protection
|
||||||
const written = self.vosk_audio_buffer.write(self.processing_buffer[0..samples_read]);
|
const written = self.vosk_audio_buffer.write(samples_to_write);
|
||||||
if (written < samples_read) {
|
if (written < samples_to_write.len) {
|
||||||
// Buffer overflow - report warning and clear buffer
|
// Buffer overflow - report warning and clear buffer
|
||||||
const warning = ErrorInfo.initRecoverable(Error.InternalError, "Audio buffer overflow, clearing buffer to prevent data loss", "Consider increasing buffer size if this happens frequently");
|
const warning = ErrorInfo.initRecoverable(Error.InternalError, "Audio buffer overflow, clearing buffer to prevent data loss", "Consider increasing buffer size if this happens frequently");
|
||||||
self.options.event_handler.onDetailedError(warning);
|
self.options.event_handler.onDetailedError(warning);
|
||||||
|
|
||||||
self.vosk_audio_buffer.clear();
|
self.vosk_audio_buffer.clear();
|
||||||
_ = self.vosk_audio_buffer.write(self.processing_buffer[0..samples_read]);
|
_ = self.vosk_audio_buffer.write(samples_to_write);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -808,6 +903,9 @@ pub const Session = struct {
|
||||||
const vosk_chunk_size = 4096;
|
const vosk_chunk_size = 4096;
|
||||||
const min_chunk_size = 1024; // Minimum chunk size for processing
|
const min_chunk_size = 1024; // Minimum chunk size for processing
|
||||||
|
|
||||||
|
const cpu_perf = getCpuPerformance() catch 100;
|
||||||
|
if (cpu_perf < 50)
|
||||||
|
std.log.debug("processing thread additional delay being added", .{});
|
||||||
var vosk_buffer = self.allocator.alloc(i16, vosk_chunk_size) catch {
|
var vosk_buffer = self.allocator.alloc(i16, vosk_chunk_size) catch {
|
||||||
const error_info = ErrorInfo.init(Error.OutOfMemory, "Failed to allocate Vosk processing buffer");
|
const error_info = ErrorInfo.init(Error.OutOfMemory, "Failed to allocate Vosk processing buffer");
|
||||||
self.options.event_handler.onDetailedError(error_info);
|
self.options.event_handler.onDetailedError(error_info);
|
||||||
|
@ -832,6 +930,10 @@ pub const Session = struct {
|
||||||
const samples_read = self.vosk_audio_buffer.read(vosk_buffer[0..chunk_size]);
|
const samples_read = self.vosk_audio_buffer.read(vosk_buffer[0..chunk_size]);
|
||||||
|
|
||||||
if (samples_read > 0 and self.vosk_recognizer != null) {
|
if (samples_read > 0 and self.vosk_recognizer != null) {
|
||||||
|
// Time the Vosk processing to identify bottlenecks
|
||||||
|
const start_time = std.time.nanoTimestamp();
|
||||||
|
const buffer_fill = (self.vosk_audio_buffer.available() * 100) / self.vosk_audio_buffer.buffer.len;
|
||||||
|
|
||||||
// Process audio with Vosk with comprehensive error handling
|
// Process audio with Vosk with comprehensive error handling
|
||||||
self.processVoskAudio(vosk_buffer[0..samples_read]) catch |err| {
|
self.processVoskAudio(vosk_buffer[0..samples_read]) catch |err| {
|
||||||
error_count += 1;
|
error_count += 1;
|
||||||
|
@ -873,6 +975,13 @@ pub const Session = struct {
|
||||||
continue;
|
continue;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Log timing and buffer status for diagnostics
|
||||||
|
const end_time = std.time.nanoTimestamp();
|
||||||
|
const processing_ms = @divTrunc(end_time - start_time, std.time.ns_per_ms);
|
||||||
|
const realtime_ms = (samples_read * 1000) / 16000;
|
||||||
|
if (processing_ms > realtime_ms)
|
||||||
|
std.log.warn("Vosk processing behind realtime. {d} samples in {d}ms (realtime: {d}ms), buffer {d}% full", .{ samples_read, processing_ms, realtime_ms, buffer_fill });
|
||||||
|
|
||||||
// Reset error counters after successful operations
|
// Reset error counters after successful operations
|
||||||
success_count += 1;
|
success_count += 1;
|
||||||
consecutive_failures = 0;
|
consecutive_failures = 0;
|
||||||
|
@ -893,7 +1002,12 @@ pub const Session = struct {
|
||||||
|
|
||||||
// Increase delay if we're having errors
|
// Increase delay if we're having errors
|
||||||
const error_multiplier: u64 = if (consecutive_failures > 0) consecutive_failures + 1 else 1;
|
const error_multiplier: u64 = if (consecutive_failures > 0) consecutive_failures + 1 else 1;
|
||||||
const delay_ms = base_delay_ms * error_multiplier;
|
var delay_ms = base_delay_ms * error_multiplier;
|
||||||
|
|
||||||
|
// Add extra delay for slower hardware (Pi) to prevent buffer overruns
|
||||||
|
if (cpu_perf < 50) {
|
||||||
|
delay_ms += 100; // Extra 100ms delay for Pi-class hardware (NOTE(review): comment previously said 10ms — confirm intended value)
|
||||||
|
}
|
||||||
|
|
||||||
std.Thread.sleep(delay_ms * std.time.ns_per_ms);
|
std.Thread.sleep(delay_ms * std.time.ns_per_ms);
|
||||||
}
|
}
|
||||||
|
@ -1051,7 +1165,7 @@ pub const Session = struct {
|
||||||
|
|
||||||
// Reinitialize recognizer (model should still be valid)
|
// Reinitialize recognizer (model should still be valid)
|
||||||
if (self.vosk_model) |model| {
|
if (self.vosk_model) |model| {
|
||||||
self.vosk_recognizer = c.vosk_recognizer_new(model, @floatFromInt(self.alsa_capture.?.sample_rate));
|
self.vosk_recognizer = c.vosk_recognizer_new(model, 16000.0);
|
||||||
if (self.vosk_recognizer == null) {
|
if (self.vosk_recognizer == null) {
|
||||||
const error_info = ErrorInfo.init(Error.ModelLoadError, "Failed to reinitialize Vosk recognizer");
|
const error_info = ErrorInfo.init(Error.ModelLoadError, "Failed to reinitialize Vosk recognizer");
|
||||||
self.options.event_handler.onDetailedError(error_info);
|
self.options.event_handler.onDetailedError(error_info);
|
||||||
|
@ -1259,6 +1373,11 @@ pub const Session = struct {
|
||||||
// Free processing buffer
|
// Free processing buffer
|
||||||
self.allocator.free(self.processing_buffer);
|
self.allocator.free(self.processing_buffer);
|
||||||
|
|
||||||
|
// Free resample buffer if allocated
|
||||||
|
if (self.resample_buffer) |buf| {
|
||||||
|
self.allocator.free(buf);
|
||||||
|
}
|
||||||
|
|
||||||
// Clean up ALSA global configuration cache
|
// Clean up ALSA global configuration cache
|
||||||
_ = c.snd_config_update_free_global();
|
_ = c.snd_config_update_free_global();
|
||||||
|
|
||||||
|
@ -1589,3 +1708,24 @@ test "Session status and recovery" {
|
||||||
// which can cause segmentation faults during deinit
|
// which can cause segmentation faults during deinit
|
||||||
return error.SkipZigTest;
|
return error.SkipZigTest;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
test "resample function" {
    // Same sample rate: samples are copied unchanged.
    const input = [_]i16{ 100, 200, 300, 400 };
    var output: [4]i16 = undefined;
    const count = resample(&input, &output, 16000, 16000);
    try std.testing.expectEqual(@as(usize, 4), count);
    try std.testing.expectEqualSlices(i16, &input, output[0..count]);

    // Downsampling (48kHz -> 16kHz, 3:1 ratio): every third input sample is picked.
    const input_48k = [_]i16{ 100, 150, 200, 250, 300, 350 };
    var output_16k: [2]i16 = undefined;
    const down_count = resample(&input_48k, &output_16k, 48000, 16000);
    try std.testing.expectEqual(@as(usize, 2), down_count);
    try std.testing.expectEqualSlices(i16, &[_]i16{ 100, 250 }, output_16k[0..down_count]);

    // Upsampling (16kHz -> 48kHz, 1:3 ratio): values are interpolated between neighbours.
    const input_16k = [_]i16{ 100, 200 };
    var output_48k: [6]i16 = undefined;
    const up_count = resample(&input_16k, &output_48k, 16000, 48000);
    try std.testing.expectEqual(@as(usize, 6), up_count);
    try std.testing.expectEqual(@as(i16, 100), output_48k[0]);
    try std.testing.expectEqual(@as(i16, 133), output_48k[1]);
    try std.testing.expectEqual(@as(i16, 166), output_48k[2]);
    try std.testing.expectEqual(@as(i16, 200), output_48k[5]);
    // Every interpolated sample must stay between the two source samples.
    for (output_48k[0..up_count]) |sample| {
        try std.testing.expect(sample >= 100 and sample <= 200);
    }
}
|
||||||
|
|
Loading…
Add table
Reference in a new issue