Adjust threshold and report min/max amplitude
parent 4ad75b1b07
commit 8a07e365c3

3 changed files with 37 additions and 31 deletions
@@ -39,7 +39,8 @@ const SpeechHandler = struct {
         var stdout_writer = std.fs.File.stdout().writer(&stdout_buffer);
         const stdout = &stdout_writer.interface;
         defer stdout.flush() catch std.log.warn("Caught error writing speech data to stdout", .{});
-        stdout.print("[amp:{}] Speech {}->{?s}: {s}\n", .{
+        stdout.print("[{}-{}] Speech {}->{?s}: {s}\n", .{
+            event.min_amplitude,
             event.max_amplitude,
             self.speech_count,
             self.exec_program,

@@ -525,7 +526,7 @@ test "handler callbacks" {
     };

     // Test that callbacks can be invoked without crashing
-    const event = stt.SpeechEvent{ .text = "test speech", .max_amplitude = 500 };
+    const event = stt.SpeechEvent{ .text = "test speech", .max_amplitude = 500, .min_amplitude = 200 };
     speech_handler.onSpeech(event);
     speech_handler.onError(stt.Error.AudioDeviceError, "test error");
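With the new format string, each stdout line carries the segment's minimum and maximum amplitude in place of the single amp: value. An illustrative line (hypothetical values and program name, not captured output):

    [200-500] Speech 3->my-program: hello world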
src/stt.zig (42 changes)
@@ -109,7 +109,11 @@ pub const SpeechEvent = struct {
     /// Recognized text
     text: []const u8,
     /// Maximum amplitude detected in the speech segment
-    max_amplitude: u16,
+    max_amplitude: u16 = std.math.maxInt(u16),
+    /// Minimum chunk amplitude that triggered speech detection
+    min_amplitude: u16 = 0,
+    /// Audio data for the speech segment
+    audio_data: []const i16 = &[_]i16{},
 };

 /// Callback function type for speech detection events
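With every field except `text` defaulted, call sites can build an event from just the recognized text, which is exactly what the updated tests below do. A minimal sketch of what the defaults give you (the struct is mirrored locally so the snippet compiles on its own under `zig test`):

    const std = @import("std");

    // Local mirror of the SpeechEvent fields above, so this snippet stands alone.
    const SpeechEvent = struct {
        text: []const u8,
        max_amplitude: u16 = std.math.maxInt(u16),
        min_amplitude: u16 = 0,
        audio_data: []const i16 = &[_]i16{},
    };

    test "SpeechEvent field defaults" {
        const event = SpeechEvent{ .text = "hello" };
        try std.testing.expect(event.max_amplitude == std.math.maxInt(u16));
        try std.testing.expect(event.min_amplitude == 0);
        try std.testing.expect(event.audio_data.len == 0);
    }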
@@ -966,7 +970,7 @@ pub const Session = struct {
         defer self.allocator.free(vosk_buffer);

         // Silence detection parameters
-        const silence_threshold: i16 = 500;
+        const silence_threshold: i16 = 300;
         const silence_duration_ms: u64 = 500;
         const min_speech_duration_ms: u64 = 300;
         const samples_per_ms = 16; // This assumes 16kHz audio
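For scale: at 16 kHz the 500 ms silence window is 500 × 16 = 8000 samples and the 300 ms minimum speech length is 300 × 16 = 4800 samples. Lowering silence_threshold from 500 to 300 (against an i16 sample range of ±32767) drops the peak-amplitude bar from roughly 1.5% to 0.9% of full scale, so quieter speech now starts and sustains a segment.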
@@ -983,6 +987,7 @@ pub const Session = struct {
         var speech_samples: usize = 0;
         var speech_pos: usize = 0;
         var speech_max_amplitude: u16 = 0;
+        var speech_min_amplitude: u16 = std.math.maxInt(u16);

         while (!self.should_stop.load(.acquire)) {
             const available = self.vosk_audio_buffer.available();

@@ -1012,8 +1017,10 @@ pub const Session = struct {
                 silence_samples = 0;
                 speech_pos = 0;
                 speech_max_amplitude = 0;
+                speech_min_amplitude = std.math.maxInt(u16);
             }
             if (max_amplitude > speech_max_amplitude) speech_max_amplitude = max_amplitude;
+            if (max_amplitude < speech_min_amplitude) speech_min_amplitude = max_amplitude;
             const copy_len = @min(samples_read, speech_buffer.len - speech_pos);
             @memcpy(speech_buffer[speech_pos .. speech_pos + copy_len], vosk_buffer[0..copy_len]);
             speech_pos += copy_len;
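The max_amplitude folded into the running min/max here is the peak of the current chunk; its computation falls outside this diff. A hedged sketch of a typical per-chunk peak scan (the chunkPeak helper is an assumption, not repo code). Note the consequence: speech_min_amplitude tracks the quietest chunk peak in the segment, not the quietest individual sample, matching the field's doc comment.

    const std = @import("std");

    // Assumed helper (not shown in this diff): peak absolute sample value in one chunk.
    fn chunkPeak(samples: []const i16) u16 {
        var peak: u16 = 0;
        for (samples) |s| {
            const mag: u16 = @abs(s); // @abs on an i16 yields a u16; -32768 maps to 32768
            if (mag > peak) peak = mag;
        }
        return peak;
    }

    test "chunkPeak finds the loudest sample" {
        const chunk = [_]i16{ 12, -300, 250, -32768 };
        try std.testing.expect(chunkPeak(&chunk) == 32768);
    }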
@@ -1027,7 +1034,14 @@ pub const Session = struct {

             if (silence_samples >= silence_duration_ms * samples_per_ms) {
                 if (speech_samples >= min_speech_duration_ms * samples_per_ms) {
-                    self.processVoskAudio(speech_buffer[0..speech_pos], speech_max_amplitude) catch |err| {
+                    const event = SpeechEvent{
+                        // SAFETY: .text is assigned in parseVoskResult before the event is delivered
+                        .text = undefined,
+                        .max_amplitude = speech_max_amplitude,
+                        .min_amplitude = speech_min_amplitude,
+                        .audio_data = speech_buffer[0..speech_pos],
+                    };
+                    self.processVoskAudio(event) catch |err| {
                         std.log.err("Error processing speech: {}", .{err});
                     };
                 }

@@ -1037,6 +1051,7 @@ pub const Session = struct {
                 speech_samples = 0;
                 silence_samples = 0;
                 speech_max_amplitude = 0;
+                speech_min_amplitude = std.math.maxInt(u16);

                 if (self.vosk_recognizer) |rec| {
                     c.vosk_recognizer_reset(rec);

@@ -1050,24 +1065,24 @@ pub const Session = struct {
     }

     /// Process complete audio clip with Vosk and handle results
-    fn processVoskAudio(self: *Session, audio_data: []const i16, max_amplitude: u16) !void {
+    fn processVoskAudio(self: *Session, event: SpeechEvent) !void {
         if (self.vosk_recognizer == null) return Error.InvalidState;

-        const audio_bytes = std.mem.sliceAsBytes(audio_data);
+        const audio_bytes = std.mem.sliceAsBytes(event.audio_data);
         _ = c.vosk_recognizer_accept_waveform(self.vosk_recognizer, audio_bytes.ptr, @intCast(audio_bytes.len));

         // Get final result
         const result_cstr = c.vosk_recognizer_final_result(self.vosk_recognizer);
         if (result_cstr != null) {
             const result_str = std.mem.span(result_cstr);
-            self.parseVoskResult(result_str, max_amplitude) catch |err| {
+            self.parseVoskResult(result_str, event) catch |err| {
                 self.options.event_handler.onError(err, "Failed to parse Vosk result");
             };
         }
     }

     /// Parse Vosk JSON result and extract recognized text
-    fn parseVoskResult(self: *Session, json_str: []const u8, max_amplitude: u16) !void {
+    fn parseVoskResult(self: *Session, json_str: []const u8, event: SpeechEvent) !void {
         if (json_str.len == 0) return;

         const text_key = "\"text\"";

@@ -1086,11 +1101,9 @@ pub const Session = struct {
                 if (std.mem.indexOf(u8, json_str[s..], "\"")) |quote_end| {
                     const text = json_str[s .. s + quote_end];
                     if (text.len > 0 and !std.mem.eql(u8, text, " ")) {
-                        const event = SpeechEvent{
-                            .text = text,
-                            .max_amplitude = max_amplitude,
-                        };
-                        self.options.event_handler.onSpeech(event);
+                        var result_event = event;
+                        result_event.text = text;
+                        self.options.event_handler.onSpeech(result_event);
                     }
                 }
             }
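The rewrite above leans on Zig's value semantics: parseVoskResult receives event by value, so `var result_event = event;` makes an independent copy, and assigning .text on the copy never touches the caller's struct. A standalone illustration (struct mirrored locally so it runs under `zig test`):

    const std = @import("std");

    const SpeechEvent = struct {
        text: []const u8,
        max_amplitude: u16 = std.math.maxInt(u16),
        min_amplitude: u16 = 0,
    };

    test "assigning a struct copies it by value" {
        const original = SpeechEvent{ .text = "pending", .max_amplitude = 500, .min_amplitude = 200 };
        var copy = original;
        copy.text = "recognized words";
        try std.testing.expectEqualStrings("pending", original.text);
        try std.testing.expectEqualStrings("recognized words", copy.text);
        try std.testing.expect(copy.max_amplitude == original.max_amplitude);
    }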
@@ -1493,11 +1506,10 @@ test "SpeechEventHandler interface" {
     };

     // Test speech callback
-    const event = SpeechEvent{ .text = "hello world", .max_amplitude = 1234 };
-    event_handler.onSpeech(event);
+    event_handler.onSpeech(.{ .text = "hello world" });
     try testing.expect(handler.speech_called);
     try testing.expectEqualStrings("hello world", handler.last_text);
-    try testing.expect(handler.last_amplitude == 1234);
+    try testing.expect(handler.last_amplitude == std.math.maxInt(u16));

     // Test error callback
     event_handler.onError(Error.AudioDeviceError, "test error");
src/test.zig (21 changes)
@@ -154,8 +154,7 @@ test "SpeechEventHandler callback invocation" {
     const speech_handler = test_handler.getSpeechEventHandler();

     // Test speech callback
-    const event = stt.SpeechEvent{ .text = "Hello world", .max_amplitude = 1000 };
-    speech_handler.onSpeech(event);
+    speech_handler.onSpeech(.{ .text = "Hello world" });
     try testing.expect(test_handler.speech_events.items.len == 1);
     try testing.expectEqualStrings("Hello world", test_handler.speech_events.items[0]);

@@ -352,15 +351,13 @@ test "Callback error handling robustness" {
     for (0..100) |i| {
         const text = std.fmt.allocPrint(test_allocator, "Speech event {}", .{i}) catch continue;
         defer test_allocator.free(text);
-        const event = stt.SpeechEvent{ .text = text, .max_amplitude = @intCast(i) };
-        speech_handler.onSpeech(event);
+        speech_handler.onSpeech(.{ .text = text });
     }

     try testing.expect(test_handler.speech_events.items.len == 100);

     // Test mixed callback types
-    const final_event = stt.SpeechEvent{ .text = "Final speech", .max_amplitude = 800 };
-    speech_handler.onSpeech(final_event);
+    speech_handler.onSpeech(.{ .text = "Final speech" });
     speech_handler.onError(stt.Error.CallbackError, "Callback error");

     const final_error = stt.ErrorInfo.init(stt.Error.InternalError, "Internal error");

@@ -390,8 +387,7 @@ test "Memory management and resource cleanup" {
     defer test_handler.deinit(); // Should not leak memory

     const speech_handler = test_handler.getSpeechEventHandler();
-    const event = stt.SpeechEvent{ .text = "Test speech", .max_amplitude = 500 };
-    speech_handler.onSpeech(event);
+    speech_handler.onSpeech(.{ .text = "Test speech" });
     speech_handler.onError(stt.Error.AudioDeviceError, "Test error");

     const error_info = stt.ErrorInfo.initWithContext(stt.Error.ModelLoadError, "Test detailed error", "test context");

@@ -465,18 +461,15 @@ test "Complete workflow simulation" {
     try testing.expect(processed_count == audio_samples.len);

     // 3. Speech detection phase
-    const event1 = stt.SpeechEvent{ .text = "Hello world", .max_amplitude = 2000 };
-    speech_handler.onSpeech(event1);
-    const event2 = stt.SpeechEvent{ .text = "This is a test", .max_amplitude = 1500 };
-    speech_handler.onSpeech(event2);
+    speech_handler.onSpeech(.{ .text = "Hello world" });
+    speech_handler.onSpeech(.{ .text = "This is a test" });

     // 4. Error handling phase
     const recoverable_error = stt.ErrorInfo.initRecoverable(stt.Error.AudioDeviceBusy, "Audio device temporarily busy", "Retrying in 100ms");
     speech_handler.onDetailedError(recoverable_error);

     // 5. Recovery phase
-    const resume_event = stt.SpeechEvent{ .text = "Speech recognition resumed", .max_amplitude = 1800 };
-    speech_handler.onSpeech(resume_event);
+    speech_handler.onSpeech(.{ .text = "Speech recognition resumed" });

     // 6. Cleanup phase
     const cleanup_info = stt.ErrorInfo.initRecoverable(stt.Error.InternalError, "STT session cleanup completed", "All resources freed");