set vosk initializer sample rate to actual hardware/remove resample

This commit is contained in:
Emil Lerch 2025-10-06 13:01:22 -07:00
parent 43cf222a98
commit e2490ec3e3
Signed by: lobo
GPG key ID: A7B62D657EF764F8
3 changed files with 6 additions and 85 deletions

1
.gitignore vendored
View file

@ -1,3 +1,4 @@
.zig-cache/
zig-out/
alsa-*.conf
alsa.conf

View file

@ -277,39 +277,6 @@ pub const AudioConverter = struct {
return frames;
}
/// Convert `input_samples` from `input_rate` to `output_rate` using basic linear interpolation.
///
/// Results are written to `output_samples`; the return value is the number of samples
/// written and never exceeds `output_samples.len`.
///
/// - Equal rates: the input is copied unchanged, truncated to the shorter buffer.
/// - Differing rates where either rate is zero: nothing is written and 0 is returned
///   (a zero `input_rate` would otherwise produce a non-finite length).
/// - Interpolated values are truncated toward zero when converted back to `i16`.
///
/// Note: This is a basic implementation. For production use, consider more sophisticated algorithms
pub fn resample(input_samples: []const i16, output_samples: []i16, input_rate: u32, output_rate: u32) usize {
    if (input_rate == output_rate) {
        const copy_len = @min(input_samples.len, output_samples.len);
        @memcpy(output_samples[0..copy_len], input_samples[0..copy_len]);
        return copy_len;
    }

    // A zero rate cannot describe a real stream. With input_rate == 0 the ratio below
    // would be 0 and the length computation would hand inf/NaN to @intFromFloat.
    if (input_rate == 0 or output_rate == 0) return 0;

    // Number of input samples consumed per output sample.
    const ratio = @as(f64, @floatFromInt(input_rate)) / @as(f64, @floatFromInt(output_rate));
    const ideal_len: usize = @intFromFloat(@as(f64, @floatFromInt(input_samples.len)) / ratio);
    const output_len = @min(output_samples.len, ideal_len);

    // Clamp bounds as floats so the comparison types are explicit.
    const max_sample: f64 = std.math.maxInt(i16);
    const min_sample: f64 = std.math.minInt(i16);

    for (0..output_len) |i| {
        const src_pos = @as(f64, @floatFromInt(i)) * ratio;
        const src_idx: usize = @intFromFloat(src_pos);

        if (src_idx >= input_samples.len) break;

        if (src_idx + 1 < input_samples.len) {
            // Linear interpolation between the two neighbouring input samples.
            const frac = src_pos - @as(f64, @floatFromInt(src_idx));
            const sample1: f64 = @floatFromInt(input_samples[src_idx]);
            const sample2: f64 = @floatFromInt(input_samples[src_idx + 1]);
            const interpolated = sample1 + (sample2 - sample1) * frac;
            output_samples[i] = @intFromFloat(@max(@min(interpolated, max_sample), min_sample));
        } else {
            // Last input sample has no right-hand neighbour; repeat it as-is.
            output_samples[i] = input_samples[src_idx];
        }
    }

    return output_len;
}
};
/// ALSA audio capture configuration and state
@ -664,8 +631,8 @@ pub const Session = struct {
return Error.ModelLoadError;
}
// Create Vosk recognizer
self.vosk_recognizer = c.vosk_recognizer_new(self.vosk_model, @floatFromInt(self.options.sample_rate));
// Create Vosk recognizer using actual hardware sample rate
self.vosk_recognizer = c.vosk_recognizer_new(self.vosk_model, @floatFromInt(self.alsa_capture.?.sample_rate));
if (self.vosk_recognizer == null) {
if (self.vosk_model) |model| {
c.vosk_model_free(model);
@ -952,12 +919,8 @@ pub const Session = struct {
return Error.InvalidState;
}
// Resample to 16kHz if needed
var resampled_buffer: [4096]i16 = undefined;
const final_audio = if (self.alsa_capture.?.sample_rate != 16000) blk: {
const resampled_count = AudioConverter.resample(audio_data, &resampled_buffer, self.alsa_capture.?.sample_rate, 16000);
break :blk resampled_buffer[0..resampled_count];
} else audio_data;
// Use audio data directly without resampling
const final_audio = audio_data;
// Convert i16 samples to bytes for Vosk
const audio_bytes = std.mem.sliceAsBytes(final_audio);
@ -1088,7 +1051,7 @@ pub const Session = struct {
// Reinitialize recognizer (model should still be valid)
if (self.vosk_model) |model| {
self.vosk_recognizer = c.vosk_recognizer_new(model, @floatFromInt(self.options.sample_rate));
self.vosk_recognizer = c.vosk_recognizer_new(model, @floatFromInt(self.alsa_capture.?.sample_rate));
if (self.vosk_recognizer == null) {
const error_info = ErrorInfo.init(Error.ModelLoadError, "Failed to reinitialize Vosk recognizer");
self.options.event_handler.onDetailedError(error_info);
@ -1510,28 +1473,6 @@ test "AudioConverter stereo to mono conversion" {
try testing.expect(mono_samples[2] == 550); // (500 + 600) / 2
}
// Verifies AudioConverter.resample for the pass-through case and a 2:1 downsample.
test "AudioConverter sample rate conversion" {
    const testing = std.testing;

    // Test same sample rate (should copy directly)
    const input_samples = [_]i16{ 100, 200, 300, 400 };
    var output_samples: [4]i16 = undefined;
    const converted = AudioConverter.resample(&input_samples, &output_samples, 16000, 16000);
    // expectEqual / expectEqualSlices print expected vs. actual on failure,
    // unlike expect(a == b) which only reports "false".
    try testing.expectEqual(@as(usize, 4), converted);
    try testing.expectEqualSlices(i16, &input_samples, output_samples[0..converted]);

    // Test downsampling (2:1 ratio)
    var downsampled: [2]i16 = undefined;
    const downsampled_count = AudioConverter.resample(&input_samples, &downsampled, 16000, 8000);
    try testing.expectEqual(@as(usize, 2), downsampled_count);
    // Source positions 0.0 and 2.0 land exactly on input samples 0 and 2.
    try testing.expectEqualSlices(i16, &[_]i16{ 100, 300 }, downsampled[0..downsampled_count]);
}
test "AlsaCapture initialization" {
const testing = std.testing;
var gpa = std.heap.GeneralPurposeAllocator(.{}){};

View file

@ -228,27 +228,6 @@ test "AudioConverter stereo to mono conversion" {
try testing.expect(overflow_mono[0] == std.math.maxInt(i16)); // Should clamp to max
}
// Verifies stt.AudioConverter.resample for pass-through, 2:1 downsampling and 1:2 upsampling,
// checking both the returned counts and the produced sample values.
test "AudioConverter sample rate conversion" {
    // Test same sample rate (no conversion)
    const input_samples = [_]i16{ 100, 200, 300, 400 };
    var output_samples: [4]i16 = undefined;
    const converted = stt.AudioConverter.resample(&input_samples, &output_samples, 44100, 44100);
    try testing.expectEqual(@as(usize, 4), converted);
    try testing.expectEqualSlices(i16, &input_samples, output_samples[0..converted]);

    // Test downsampling (44100 -> 22050, 2:1 ratio)
    var downsampled: [2]i16 = undefined;
    const downsampled_count = stt.AudioConverter.resample(&input_samples, &downsampled, 44100, 22050);
    try testing.expectEqual(@as(usize, 2), downsampled_count);
    // Source positions 0.0 and 2.0 fall exactly on input samples 0 and 2.
    try testing.expectEqualSlices(i16, &[_]i16{ 100, 300 }, downsampled[0..downsampled_count]);

    // Test upsampling (22050 -> 44100, 1:2 ratio)
    const small_input = [_]i16{ 100, 200 };
    var upsampled: [4]i16 = undefined;
    const upsampled_count = stt.AudioConverter.resample(&small_input, &upsampled, 22050, 44100);
    try testing.expectEqual(@as(usize, 4), upsampled_count);
    // Position 0.5 interpolates to 150; positions 1.0 and 1.5 have no right-hand
    // neighbour, so the last input sample is repeated.
    try testing.expectEqualSlices(i16, &[_]i16{ 100, 150, 200, 200 }, upsampled[0..upsampled_count]);
}
test "Session initialization error handling" {
var test_handler = TestEventHandler.init(test_allocator);
defer test_handler.deinit();