From 4ad75b1b07e722f3742832f0206c284b6fbc525e Mon Sep 17 00:00:00 2001 From: Emil Lerch Date: Wed, 29 Oct 2025 10:44:39 -0700 Subject: [PATCH] report max amplitude in speech output --- src/main.zig | 20 +++++++++--------- src/stt.zig | 58 +++++++++++++++++++++++++++++++++++----------------- src/test.zig | 25 ++++++++++++++-------- 3 files changed, 65 insertions(+), 38 deletions(-) diff --git a/src/main.zig b/src/main.zig index cd50228..51bc5af 100644 --- a/src/main.zig +++ b/src/main.zig @@ -26,29 +26,28 @@ const SpeechHandler = struct { const max_children = 5; const Process = struct { child: ?*std.process.Child, start: i64, id: std.process.Child.Id }; // why id? /// Handle detected speech - fn onSpeech(ctx: *anyopaque, text: []const u8) void { + fn onSpeech(ctx: *anyopaque, event: stt.SpeechEvent) void { if (builtin.is_test) return; // Suppress output during tests // Look for noise words and skip it if so - if (std.mem.eql(u8, text, "huh")) return; - if (std.mem.eql(u8, text, "but")) return; + if (std.mem.eql(u8, event.text, "huh")) return; + if (std.mem.eql(u8, event.text, "but")) return; const self: *SpeechHandler = @ptrCast(@alignCast(ctx)); self.speech_count += 1; - // Print with timestamp for better experience - const timestamp = std.time.timestamp(); + // Print with amplitude for better experience var stdout_buffer: [1024]u8 = undefined; var stdout_writer = std.fs.File.stdout().writer(&stdout_buffer); const stdout = &stdout_writer.interface; defer stdout.flush() catch std.log.warn("Caught error writing speech data to stdout", .{}); - stdout.print("[{}] Speech {}->{?s}: {s}\n", .{ - timestamp, + stdout.print("[amp:{}] Speech {}->{?s}: {s}\n", .{ + event.max_amplitude, self.speech_count, self.exec_program, - text, + event.text, }) catch std.log.warn("Caught error writing speech data to stdout", .{}); // Execute program if specified - if (self.exec_program) |program| self.exec(text) catch |err| { + if (self.exec_program) |program| self.exec(event.text) catch |err| { std.log.err("Failed to execute program '{s}': {}", .{ program, err }); }; } @@ -526,7 +525,8 @@ test "handler callbacks" { }; // Test that callbacks can be invoked without crashing - speech_handler.onSpeech("test speech"); + const event = stt.SpeechEvent{ .text = "test speech", .max_amplitude = 500 }; + speech_handler.onSpeech(event); speech_handler.onError(stt.Error.AudioDeviceError, "test error"); // If we get here without crashing, the test passes diff --git a/src/stt.zig b/src/stt.zig index 0911db8..92999ca 100644 --- a/src/stt.zig +++ b/src/stt.zig @@ -104,12 +104,20 @@ pub const ErrorInfo = struct { } }; +/// Speech detection event data +pub const SpeechEvent = struct { + /// Recognized text + text: []const u8, + /// Maximum amplitude detected in the speech segment + max_amplitude: u16, +}; + /// Callback function type for speech detection events /// /// Parameters: -/// - text: Null-terminated string containing the detected speech +/// - event: Speech event data containing text and amplitude /// - user_data: Optional user-provided context data -pub const SpeechCallback = *const fn (text: [*:0]const u8, user_data: ?*anyopaque) void; +pub const SpeechCallback = *const fn (event: SpeechEvent, user_data: ?*anyopaque) void; /// Callback function type for error events /// @@ -132,7 +140,7 @@ pub const DetailedErrorCallback = *const fn (error_info: ErrorInfo, user_data: ? /// with both speech detection and error handling callbacks. pub const SpeechEventHandler = struct { /// Function to call when speech is detected - onSpeechFn: *const fn (ctx: *anyopaque, text: []const u8) void, + onSpeechFn: *const fn (ctx: *anyopaque, event: SpeechEvent) void, /// Function to call when an error occurs onErrorFn: *const fn (ctx: *anyopaque, error_code: Error, message: []const u8) void, /// Optional function to call for detailed error information @@ -141,11 +149,11 @@ pub const SpeechEventHandler = struct { ctx: *anyopaque, /// Invoke the speech detection callback with error handling - pub fn onSpeech(self: SpeechEventHandler, text: []const u8) void { + pub fn onSpeech(self: SpeechEventHandler, event: SpeechEvent) void { // Call the speech callback function // Note: If the callback panics or causes undefined behavior, // there's not much we can do to recover gracefully in Zig - self.onSpeechFn(self.ctx, text); + self.onSpeechFn(self.ctx, event); } /// Invoke the error callback @@ -974,6 +982,7 @@ pub const Session = struct { var silence_samples: usize = 0; var speech_samples: usize = 0; var speech_pos: usize = 0; + var speech_max_amplitude: u16 = 0; while (!self.should_stop.load(.acquire)) { const available = self.vosk_audio_buffer.available(); @@ -1002,7 +1011,9 @@ pub const Session = struct { speech_samples = 0; silence_samples = 0; speech_pos = 0; + speech_max_amplitude = 0; } + if (max_amplitude > speech_max_amplitude) speech_max_amplitude = max_amplitude; const copy_len = @min(samples_read, speech_buffer.len - speech_pos); @memcpy(speech_buffer[speech_pos .. speech_pos + copy_len], vosk_buffer[0..copy_len]); speech_pos += copy_len; @@ -1016,7 +1027,7 @@ pub const Session = struct { if (silence_samples >= silence_duration_ms * samples_per_ms) { if (speech_samples >= min_speech_duration_ms * samples_per_ms) { - self.processVoskAudio(speech_buffer[0..speech_pos]) catch |err| { + self.processVoskAudio(speech_buffer[0..speech_pos], speech_max_amplitude) catch |err| { std.log.err("Error processing speech: {}", .{err}); }; } @@ -1025,6 +1036,7 @@ pub const Session = struct { speech_pos = 0; speech_samples = 0; silence_samples = 0; + speech_max_amplitude = 0; if (self.vosk_recognizer) |rec| { c.vosk_recognizer_reset(rec); @@ -1038,7 +1050,7 @@ pub const Session = struct { } /// Process complete audio clip with Vosk and handle results - fn processVoskAudio(self: *Session, audio_data: []const i16) !void { + fn processVoskAudio(self: *Session, audio_data: []const i16, max_amplitude: u16) !void { if (self.vosk_recognizer == null) return Error.InvalidState; const audio_bytes = std.mem.sliceAsBytes(audio_data); @@ -1048,14 +1060,14 @@ pub const Session = struct { const result_cstr = c.vosk_recognizer_final_result(self.vosk_recognizer); if (result_cstr != null) { const result_str = std.mem.span(result_cstr); - self.parseVoskResult(result_str) catch |err| { + self.parseVoskResult(result_str, max_amplitude) catch |err| { self.options.event_handler.onError(err, "Failed to parse Vosk result"); }; } } /// Parse Vosk JSON result and extract recognized text - fn parseVoskResult(self: *Session, json_str: []const u8) !void { + fn parseVoskResult(self: *Session, json_str: []const u8, max_amplitude: u16) !void { if (json_str.len == 0) return; const text_key = "\"text\""; @@ -1074,7 +1086,11 @@ pub const Session = struct { if (std.mem.indexOf(u8, json_str[s..], "\"")) |quote_end| { const text = json_str[s .. s + quote_end]; if (text.len > 0 and !std.mem.eql(u8, text, " ")) { - self.options.event_handler.onSpeech(text); + const event = SpeechEvent{ + .text = text, + .max_amplitude = max_amplitude, + }; + self.options.event_handler.onSpeech(event); } } } @@ -1376,9 +1392,9 @@ test "Options validation" { // Test valid options const DummyHandler = struct { - fn onSpeech(ctx: *anyopaque, text: []const u8) void { + fn onSpeech(ctx: *anyopaque, event: SpeechEvent) void { _ = ctx; - _ = text; + _ = event; } fn onError(ctx: *anyopaque, error_code: Error, message: []const u8) void { _ = ctx; @@ -1412,9 +1428,9 @@ test "Session state management" { const testing = std.testing; const DummyHandler = struct { - fn onSpeech(ctx: *anyopaque, text: []const u8) void { + fn onSpeech(ctx: *anyopaque, event: SpeechEvent) void { _ = ctx; - _ = text; + _ = event; } fn onError(ctx: *anyopaque, error_code: Error, message: []const u8) void { _ = ctx; @@ -1451,12 +1467,14 @@ test "SpeechEventHandler interface" { speech_called: bool = false, error_called: bool = false, last_text: []const u8 = "", + last_amplitude: u16 = 0, last_error: Error = Error.InitializationFailed, - fn onSpeech(ctx: *anyopaque, text: []const u8) void { + fn onSpeech(ctx: *anyopaque, event: SpeechEvent) void { const self: *@This() = @ptrCast(@alignCast(ctx)); self.speech_called = true; - self.last_text = text; + self.last_text = event.text; + self.last_amplitude = event.max_amplitude; } fn onError(ctx: *anyopaque, error_code: Error, message: []const u8) void { @@ -1475,9 +1493,11 @@ test "SpeechEventHandler interface" { }; // Test speech callback - event_handler.onSpeech("hello world"); + const event = SpeechEvent{ .text = "hello world", .max_amplitude = 1234 }; + event_handler.onSpeech(event); try testing.expect(handler.speech_called); try testing.expectEqualStrings("hello world", handler.last_text); + try testing.expect(handler.last_amplitude == 1234); // Test error callback event_handler.onError(Error.AudioDeviceError, "test error"); @@ -1615,10 +1635,10 @@ test "Session session management API" { speech_count: u32 = 0, error_count: u32 = 0, - fn onSpeech(ctx: *anyopaque, text: []const u8) void { + fn onSpeech(ctx: *anyopaque, event: SpeechEvent) void { const self: *@This() = @ptrCast(@alignCast(ctx)); self.speech_count += 1; - _ = text; + _ = event; } fn onError(ctx: *anyopaque, error_code: Error, message: []const u8) void { diff --git a/src/test.zig b/src/test.zig index 56c22fc..84fff4d 100644 --- a/src/test.zig +++ b/src/test.zig @@ -57,9 +57,9 @@ const TestEventHandler = struct { self.detailed_error_events.deinit(self.allocator); } - fn onSpeech(ctx: *anyopaque, text: []const u8) void { + fn onSpeech(ctx: *anyopaque, event: stt.SpeechEvent) void { const self: *TestEventHandler = @ptrCast(@alignCast(ctx)); - const owned_text = self.allocator.dupe(u8, text) catch return; + const owned_text = self.allocator.dupe(u8, event.text) catch return; self.speech_events.append(self.allocator, owned_text) catch return; } @@ -154,7 +154,8 @@ test "SpeechEventHandler callback invocation" { const speech_handler = test_handler.getSpeechEventHandler(); // Test speech callback - speech_handler.onSpeech("Hello world"); + const event = stt.SpeechEvent{ .text = "Hello world", .max_amplitude = 1000 }; + speech_handler.onSpeech(event); try testing.expect(test_handler.speech_events.items.len == 1); try testing.expectEqualStrings("Hello world", test_handler.speech_events.items[0]); @@ -351,13 +352,15 @@ test "Callback error handling robustness" { for (0..100) |i| { const text = std.fmt.allocPrint(test_allocator, "Speech event {}", .{i}) catch continue; defer test_allocator.free(text); - speech_handler.onSpeech(text); + const event = stt.SpeechEvent{ .text = text, .max_amplitude = @intCast(i) }; + speech_handler.onSpeech(event); } try testing.expect(test_handler.speech_events.items.len == 100); // Test mixed callback types - speech_handler.onSpeech("Final speech"); + const final_event = stt.SpeechEvent{ .text = "Final speech", .max_amplitude = 800 }; + speech_handler.onSpeech(final_event); speech_handler.onError(stt.Error.CallbackError, "Callback error"); const final_error = stt.ErrorInfo.init(stt.Error.InternalError, "Internal error"); @@ -387,7 +390,8 @@ test "Memory management and resource cleanup" { defer test_handler.deinit(); // Should not leak memory const speech_handler = test_handler.getSpeechEventHandler(); - speech_handler.onSpeech("Test speech"); + const event = stt.SpeechEvent{ .text = "Test speech", .max_amplitude = 500 }; + speech_handler.onSpeech(event); speech_handler.onError(stt.Error.AudioDeviceError, "Test error"); const error_info = stt.ErrorInfo.initWithContext(stt.Error.ModelLoadError, "Test detailed error", "test context"); @@ -461,15 +465,18 @@ test "Complete workflow simulation" { try testing.expect(processed_count == audio_samples.len); // 3. Speech detection phase - speech_handler.onSpeech("Hello world"); - speech_handler.onSpeech("This is a test"); + const event1 = stt.SpeechEvent{ .text = "Hello world", .max_amplitude = 2000 }; + speech_handler.onSpeech(event1); + const event2 = stt.SpeechEvent{ .text = "This is a test", .max_amplitude = 1500 }; + speech_handler.onSpeech(event2); // 4. Error handling phase const recoverable_error = stt.ErrorInfo.initRecoverable(stt.Error.AudioDeviceBusy, "Audio device temporarily busy", "Retrying in 100ms"); speech_handler.onDetailedError(recoverable_error); // 5. Recovery phase - speech_handler.onSpeech("Speech recognition resumed"); + const resume_event = stt.SpeechEvent{ .text = "Speech recognition resumed", .max_amplitude = 1800 }; + speech_handler.onSpeech(resume_event); // 6. Cleanup phase const cleanup_info = stt.ErrorInfo.initRecoverable(stt.Error.InternalError, "STT session cleanup completed", "All resources freed");