diff --git a/.gitignore b/.gitignore index 0577261..1faef7e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ .zig-cache/ zig-out/ alsa-*.conf +alsa.conf diff --git a/src/stt.zig b/src/stt.zig index 9122b65..019165d 100644 --- a/src/stt.zig +++ b/src/stt.zig @@ -277,39 +277,6 @@ pub const AudioConverter = struct { return frames; } - - /// Simple sample rate conversion (basic linear interpolation) - /// Note: This is a basic implementation. For production use, consider more sophisticated algorithms - pub fn resample(input_samples: []const i16, output_samples: []i16, input_rate: u32, output_rate: u32) usize { - if (input_rate == output_rate) { - const copy_len = @min(input_samples.len, output_samples.len); - @memcpy(output_samples[0..copy_len], input_samples[0..copy_len]); - return copy_len; - } - - const ratio = @as(f64, @floatFromInt(input_rate)) / @as(f64, @floatFromInt(output_rate)); - const output_len = @min(output_samples.len, @as(usize, @intFromFloat(@as(f64, @floatFromInt(input_samples.len)) / ratio))); - - for (0..output_len) |i| { - const src_pos = @as(f64, @floatFromInt(i)) * ratio; - const src_idx: usize = @intFromFloat(src_pos); - - if (src_idx >= input_samples.len) break; - - if (src_idx + 1 < input_samples.len) { - // Linear interpolation - const frac = src_pos - @as(f64, @floatFromInt(src_idx)); - const sample1: f64 = @floatFromInt(input_samples[src_idx]); - const sample2: f64 = @floatFromInt(input_samples[src_idx + 1]); - const interpolated = sample1 + (sample2 - sample1) * frac; - output_samples[i] = @intFromFloat(@max(@min(interpolated, std.math.maxInt(i16)), std.math.minInt(i16))); - } else { - output_samples[i] = input_samples[src_idx]; - } - } - - return output_len; - } }; /// ALSA audio capture configuration and state @@ -664,8 +631,8 @@ pub const Session = struct { return Error.ModelLoadError; } - // Create Vosk recognizer - self.vosk_recognizer = c.vosk_recognizer_new(self.vosk_model, @floatFromInt(self.options.sample_rate)); + // Create Vosk recognizer using actual hardware sample rate + self.vosk_recognizer = c.vosk_recognizer_new(self.vosk_model, @floatFromInt(self.alsa_capture.?.sample_rate)); if (self.vosk_recognizer == null) { if (self.vosk_model) |model| { c.vosk_model_free(model); @@ -952,12 +919,8 @@ pub const Session = struct { return Error.InvalidState; } - // Resample to 16kHz if needed - var resampled_buffer: [4096]i16 = undefined; - const final_audio = if (self.alsa_capture.?.sample_rate != 16000) blk: { - const resampled_count = AudioConverter.resample(audio_data, &resampled_buffer, self.alsa_capture.?.sample_rate, 16000); - break :blk resampled_buffer[0..resampled_count]; - } else audio_data; + // Use audio data directly without resampling + const final_audio = audio_data; // Convert i16 samples to bytes for Vosk const audio_bytes = std.mem.sliceAsBytes(final_audio); @@ -1088,7 +1051,7 @@ pub const Session = struct { // Reinitialize recognizer (model should still be valid) if (self.vosk_model) |model| { - self.vosk_recognizer = c.vosk_recognizer_new(model, @floatFromInt(self.options.sample_rate)); + self.vosk_recognizer = c.vosk_recognizer_new(model, @floatFromInt(self.alsa_capture.?.sample_rate)); if (self.vosk_recognizer == null) { const error_info = ErrorInfo.init(Error.ModelLoadError, "Failed to reinitialize Vosk recognizer"); self.options.event_handler.onDetailedError(error_info); @@ -1510,28 +1473,6 @@ test "AudioConverter stereo to mono conversion" { try testing.expect(mono_samples[2] == 550); // (500 + 600) / 2 } -test "AudioConverter sample rate conversion" { - const testing = std.testing; - - // Test same sample rate (should copy directly) - const input_samples = [_]i16{ 100, 200, 300, 400 }; - var output_samples: [4]i16 = undefined; - - const converted = AudioConverter.resample(&input_samples, &output_samples, 16000, 16000); - try testing.expect(converted == 4); - try testing.expect(output_samples[0] == 100); - try testing.expect(output_samples[1] == 200); - try testing.expect(output_samples[2] == 300); - try testing.expect(output_samples[3] == 400); - - // Test downsampling (2:1 ratio) - var downsampled: [2]i16 = undefined; - const downsampled_count = AudioConverter.resample(&input_samples, &downsampled, 16000, 8000); - try testing.expect(downsampled_count == 2); - try testing.expect(downsampled[0] == 100); // First sample - try testing.expect(downsampled[1] == 300); // Interpolated sample -} - test "AlsaCapture initialization" { const testing = std.testing; var gpa = std.heap.GeneralPurposeAllocator(.{}){}; diff --git a/src/test.zig b/src/test.zig index 1dbb16f..56c22fc 100644 --- a/src/test.zig +++ b/src/test.zig @@ -228,27 +228,6 @@ test "AudioConverter stereo to mono conversion" { try testing.expect(overflow_mono[0] == std.math.maxInt(i16)); // Should clamp to max } -test "AudioConverter sample rate conversion" { - // Test same sample rate (no conversion) - const input_samples = [_]i16{ 100, 200, 300, 400 }; - var output_samples: [4]i16 = undefined; - - const converted = stt.AudioConverter.resample(&input_samples, &output_samples, 44100, 44100); - try testing.expect(converted == 4); - try testing.expectEqualSlices(i16, &input_samples, output_samples[0..converted]); - - // Test downsampling (44100 -> 22050, 2:1 ratio) - var downsampled: [2]i16 = undefined; - const downsampled_count = stt.AudioConverter.resample(&input_samples, &downsampled, 44100, 22050); - try testing.expect(downsampled_count == 2); - - // Test upsampling (22050 -> 44100, 1:2 ratio) - const small_input = [_]i16{ 100, 200 }; - var upsampled: [4]i16 = undefined; - const upsampled_count = stt.AudioConverter.resample(&small_input, &upsampled, 22050, 44100); - try testing.expect(upsampled_count == 4); -} - test "Session initialization error handling" { var test_handler = TestEventHandler.init(test_allocator); defer test_handler.deinit();