diff --git a/src/main.zig b/src/main.zig index 51bc5af..ad92bc5 100644 --- a/src/main.zig +++ b/src/main.zig @@ -39,7 +39,8 @@ const SpeechHandler = struct { var stdout_writer = std.fs.File.stdout().writer(&stdout_buffer); const stdout = &stdout_writer.interface; defer stdout.flush() catch std.log.warn("Caught error writing speech data to stdout", .{}); - stdout.print("[amp:{}] Speech {}->{?s}: {s}\n", .{ + stdout.print("[{}-{}] Speech {}->{?s}: {s}\n", .{ + event.min_amplitude, event.max_amplitude, self.speech_count, self.exec_program, @@ -525,7 +526,7 @@ test "handler callbacks" { }; // Test that callbacks can be invoked without crashing - const event = stt.SpeechEvent{ .text = "test speech", .max_amplitude = 500 }; + const event = stt.SpeechEvent{ .text = "test speech", .max_amplitude = 500, .min_amplitude = 200 }; speech_handler.onSpeech(event); speech_handler.onError(stt.Error.AudioDeviceError, "test error"); diff --git a/src/stt.zig b/src/stt.zig index 92999ca..ff7bdf8 100644 --- a/src/stt.zig +++ b/src/stt.zig @@ -109,7 +109,11 @@ pub const SpeechEvent = struct { /// Recognized text text: []const u8, /// Maximum amplitude detected in the speech segment - max_amplitude: u16, + max_amplitude: u16 = std.math.maxInt(u16), + /// Minimum chunk amplitude that triggered speech detection + min_amplitude: u16 = 0, + /// Audio data for the speech segment + audio_data: []const i16 = &[_]i16{}, }; /// Callback function type for speech detection events @@ -966,7 +970,7 @@ pub const Session = struct { defer self.allocator.free(vosk_buffer); // Silence detection parameters - const silence_threshold: i16 = 500; + const silence_threshold: i16 = 300; const silence_duration_ms: u64 = 500; const min_speech_duration_ms: u64 = 300; const samples_per_ms = 16; // This assumes 16kHz audio @@ -983,6 +987,7 @@ pub const Session = struct { var speech_samples: usize = 0; var speech_pos: usize = 0; var speech_max_amplitude: u16 = 0; + var speech_min_amplitude: u16 = std.math.maxInt(u16); while (!self.should_stop.load(.acquire)) { const available = self.vosk_audio_buffer.available(); @@ -1012,8 +1017,10 @@ pub const Session = struct { silence_samples = 0; speech_pos = 0; speech_max_amplitude = 0; + speech_min_amplitude = std.math.maxInt(u16); } if (max_amplitude > speech_max_amplitude) speech_max_amplitude = max_amplitude; + if (max_amplitude < speech_min_amplitude) speech_min_amplitude = max_amplitude; const copy_len = @min(samples_read, speech_buffer.len - speech_pos); @memcpy(speech_buffer[speech_pos .. speech_pos + copy_len], vosk_buffer[0..copy_len]); speech_pos += copy_len; @@ -1027,7 +1034,14 @@ pub const Session = struct { if (silence_samples >= silence_duration_ms * samples_per_ms) { if (speech_samples >= min_speech_duration_ms * samples_per_ms) { - self.processVoskAudio(speech_buffer[0..speech_pos], speech_max_amplitude) catch |err| { + const event = SpeechEvent{ + // SAFETY: This will be defined in the next line when we process the audio data + .text = undefined, + .max_amplitude = speech_max_amplitude, + .min_amplitude = speech_min_amplitude, + .audio_data = speech_buffer[0..speech_pos], + }; + self.processVoskAudio(event) catch |err| { std.log.err("Error processing speech: {}", .{err}); }; } @@ -1037,6 +1051,7 @@ pub const Session = struct { speech_samples = 0; silence_samples = 0; speech_max_amplitude = 0; + speech_min_amplitude = std.math.maxInt(u16); if (self.vosk_recognizer) |rec| { c.vosk_recognizer_reset(rec); @@ -1050,24 +1065,24 @@ pub const Session = struct { } /// Process complete audio clip with Vosk and handle results - fn processVoskAudio(self: *Session, audio_data: []const i16, max_amplitude: u16) !void { + fn processVoskAudio(self: *Session, event: SpeechEvent) !void { if (self.vosk_recognizer == null) return Error.InvalidState; - const audio_bytes = std.mem.sliceAsBytes(audio_data); + const audio_bytes = std.mem.sliceAsBytes(event.audio_data); _ = c.vosk_recognizer_accept_waveform(self.vosk_recognizer, audio_bytes.ptr, @intCast(audio_bytes.len)); // Get final result const result_cstr = c.vosk_recognizer_final_result(self.vosk_recognizer); if (result_cstr != null) { const result_str = std.mem.span(result_cstr); - self.parseVoskResult(result_str, max_amplitude) catch |err| { + self.parseVoskResult(result_str, event) catch |err| { self.options.event_handler.onError(err, "Failed to parse Vosk result"); }; } } /// Parse Vosk JSON result and extract recognized text - fn parseVoskResult(self: *Session, json_str: []const u8, max_amplitude: u16) !void { + fn parseVoskResult(self: *Session, json_str: []const u8, event: SpeechEvent) !void { if (json_str.len == 0) return; const text_key = "\"text\""; @@ -1086,11 +1101,9 @@ pub const Session = struct { if (std.mem.indexOf(u8, json_str[s..], "\"")) |quote_end| { const text = json_str[s .. s + quote_end]; if (text.len > 0 and !std.mem.eql(u8, text, " ")) { - const event = SpeechEvent{ - .text = text, - .max_amplitude = max_amplitude, - }; - self.options.event_handler.onSpeech(event); + var result_event = event; + result_event.text = text; + self.options.event_handler.onSpeech(result_event); } } } @@ -1493,11 +1506,10 @@ test "SpeechEventHandler interface" { }; // Test speech callback - const event = SpeechEvent{ .text = "hello world", .max_amplitude = 1234 }; - event_handler.onSpeech(event); + event_handler.onSpeech(.{ .text = "hello world" }); try testing.expect(handler.speech_called); try testing.expectEqualStrings("hello world", handler.last_text); - try testing.expect(handler.last_amplitude == 1234); + try testing.expect(handler.last_amplitude == std.math.maxInt(u16)); // Test error callback event_handler.onError(Error.AudioDeviceError, "test error"); diff --git a/src/test.zig b/src/test.zig index 84fff4d..f3bc7a3 100644 --- a/src/test.zig +++ b/src/test.zig @@ -154,8 +154,7 @@ test "SpeechEventHandler callback invocation" { const speech_handler = test_handler.getSpeechEventHandler(); // Test speech callback - const event = stt.SpeechEvent{ .text = "Hello world", .max_amplitude = 1000 }; - speech_handler.onSpeech(event); + speech_handler.onSpeech(.{ .text = "Hello world" }); try testing.expect(test_handler.speech_events.items.len == 1); try testing.expectEqualStrings("Hello world", test_handler.speech_events.items[0]); @@ -352,15 +351,13 @@ test "Callback error handling robustness" { for (0..100) |i| { const text = std.fmt.allocPrint(test_allocator, "Speech event {}", .{i}) catch continue; defer test_allocator.free(text); - const event = stt.SpeechEvent{ .text = text, .max_amplitude = @intCast(i) }; - speech_handler.onSpeech(event); + speech_handler.onSpeech(.{ .text = text }); } try testing.expect(test_handler.speech_events.items.len == 100); // Test mixed callback types - const final_event = stt.SpeechEvent{ .text = "Final speech", .max_amplitude = 800 }; - speech_handler.onSpeech(final_event); + speech_handler.onSpeech(.{ .text = "Final speech" }); speech_handler.onError(stt.Error.CallbackError, "Callback error"); const final_error = stt.ErrorInfo.init(stt.Error.InternalError, "Internal error"); @@ -390,8 +387,7 @@ test "Memory management and resource cleanup" { defer test_handler.deinit(); // Should not leak memory const speech_handler = test_handler.getSpeechEventHandler(); - const event = stt.SpeechEvent{ .text = "Test speech", .max_amplitude = 500 }; - speech_handler.onSpeech(event); + speech_handler.onSpeech(.{ .text = "Test speech" }); speech_handler.onError(stt.Error.AudioDeviceError, "Test error"); const error_info = stt.ErrorInfo.initWithContext(stt.Error.ModelLoadError, "Test detailed error", "test context"); @@ -465,18 +461,15 @@ test "Complete workflow simulation" { try testing.expect(processed_count == audio_samples.len); // 3. Speech detection phase - const event1 = stt.SpeechEvent{ .text = "Hello world", .max_amplitude = 2000 }; - speech_handler.onSpeech(event1); - const event2 = stt.SpeechEvent{ .text = "This is a test", .max_amplitude = 1500 }; - speech_handler.onSpeech(event2); + speech_handler.onSpeech(.{ .text = "Hello world" }); + speech_handler.onSpeech(.{ .text = "This is a test" }); // 4. Error handling phase const recoverable_error = stt.ErrorInfo.initRecoverable(stt.Error.AudioDeviceBusy, "Audio device temporarily busy", "Retrying in 100ms"); speech_handler.onDetailedError(recoverable_error); // 5. Recovery phase - const resume_event = stt.SpeechEvent{ .text = "Speech recognition resumed", .max_amplitude = 1800 }; - speech_handler.onSpeech(resume_event); + speech_handler.onSpeech(.{ .text = "Speech recognition resumed" }); // 6. Cleanup phase const cleanup_info = stt.ErrorInfo.initRecoverable(stt.Error.InternalError, "STT session cleanup completed", "All resources freed");