From 4ad75b1b07e722f3742832f0206c284b6fbc525e Mon Sep 17 00:00:00 2001
From: Emil Lerch <emil@lerch.org>
Date: Wed, 29 Oct 2025 10:44:39 -0700
Subject: [PATCH] report max amplitude in speech output

---
 src/main.zig | 20 +++++++++---------
 src/stt.zig  | 58 +++++++++++++++++++++++++++++++++++-----------------
 src/test.zig | 25 ++++++++++++++--------
 3 files changed, 65 insertions(+), 38 deletions(-)

diff --git a/src/main.zig b/src/main.zig
index cd50228..51bc5af 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -26,29 +26,28 @@ const SpeechHandler = struct {
     const max_children = 5;
     const Process = struct { child: ?*std.process.Child, start: i64, id: std.process.Child.Id }; // why id?
     /// Handle detected speech
-    fn onSpeech(ctx: *anyopaque, text: []const u8) void {
+    fn onSpeech(ctx: *anyopaque, event: stt.SpeechEvent) void {
         if (builtin.is_test) return; // Suppress output during tests
         // Look for noise words and skip it if so
-        if (std.mem.eql(u8, text, "huh")) return;
-        if (std.mem.eql(u8, text, "but")) return;
+        if (std.mem.eql(u8, event.text, "huh")) return;
+        if (std.mem.eql(u8, event.text, "but")) return;
         const self: *SpeechHandler = @ptrCast(@alignCast(ctx));
         self.speech_count += 1;
 
-        // Print with timestamp for better experience
-        const timestamp = std.time.timestamp();
+        // Print the segment's peak amplitude alongside the recognized text
         var stdout_buffer: [1024]u8 = undefined;
         var stdout_writer = std.fs.File.stdout().writer(&stdout_buffer);
         const stdout = &stdout_writer.interface;
         defer stdout.flush() catch std.log.warn("Caught error writing speech data to stdout", .{});
-        stdout.print("[{}] Speech {}->{?s}: {s}\n", .{
-            timestamp,
+        stdout.print("[amp:{}] Speech {}->{?s}: {s}\n", .{
+            event.max_amplitude,
             self.speech_count,
             self.exec_program,
-            text,
+            event.text,
         }) catch std.log.warn("Caught error writing speech data to stdout", .{});
 
         // Execute program if specified
-        if (self.exec_program) |program| self.exec(text) catch |err| {
+        if (self.exec_program) |program| self.exec(event.text) catch |err| {
             std.log.err("Failed to execute program '{s}': {}", .{ program, err });
         };
     }
@@ -526,7 +525,8 @@ test "handler callbacks" {
     };
 
     // Test that callbacks can be invoked without crashing
-    speech_handler.onSpeech("test speech");
+    const event = stt.SpeechEvent{ .text = "test speech", .max_amplitude = 500 };
+    speech_handler.onSpeech(event);
     speech_handler.onError(stt.Error.AudioDeviceError, "test error");
 
     // If we get here without crashing, the test passes
diff --git a/src/stt.zig b/src/stt.zig
index 0911db8..92999ca 100644
--- a/src/stt.zig
+++ b/src/stt.zig
@@ -104,12 +104,20 @@ pub const ErrorInfo = struct {
     }
 };
 
+/// Payload handed to speech callbacks: recognized text plus peak amplitude
+pub const SpeechEvent = struct {
+    /// Recognized text; slice is borrowed (presumably valid only during the callback — TODO confirm), so copy to keep
+    text: []const u8,
+    /// Running maximum of the amplitude values observed while the speech segment was buffered
+    max_amplitude: u16,
+};
+
 /// Callback function type for speech detection events
 ///
 /// Parameters:
-/// - text: Null-terminated string containing the detected speech
+/// - event: Speech event data containing text and amplitude
 /// - user_data: Optional user-provided context data
-pub const SpeechCallback = *const fn (text: [*:0]const u8, user_data: ?*anyopaque) void;
+pub const SpeechCallback = *const fn (event: SpeechEvent, user_data: ?*anyopaque) void;
 
 /// Callback function type for error events
 ///
@@ -132,7 +140,7 @@ pub const DetailedErrorCallback = *const fn (error_info: ErrorInfo, user_data: ?
 /// with both speech detection and error handling callbacks.
 pub const SpeechEventHandler = struct {
     /// Function to call when speech is detected
-    onSpeechFn: *const fn (ctx: *anyopaque, text: []const u8) void,
+    onSpeechFn: *const fn (ctx: *anyopaque, event: SpeechEvent) void,
     /// Function to call when an error occurs
     onErrorFn: *const fn (ctx: *anyopaque, error_code: Error, message: []const u8) void,
     /// Optional function to call for detailed error information
@@ -141,11 +149,11 @@ pub const SpeechEventHandler = struct {
     ctx: *anyopaque,
 
     /// Invoke the speech detection callback with error handling
-    pub fn onSpeech(self: SpeechEventHandler, text: []const u8) void {
+    pub fn onSpeech(self: SpeechEventHandler, event: SpeechEvent) void {
         // Call the speech callback function
         // Note: If the callback panics or causes undefined behavior,
         // there's not much we can do to recover gracefully in Zig
-        self.onSpeechFn(self.ctx, text);
+        self.onSpeechFn(self.ctx, event);
     }
 
     /// Invoke the error callback
@@ -974,6 +982,7 @@ pub const Session = struct {
         var silence_samples: usize = 0;
         var speech_samples: usize = 0;
         var speech_pos: usize = 0;
+        var speech_max_amplitude: u16 = 0;
 
         while (!self.should_stop.load(.acquire)) {
             const available = self.vosk_audio_buffer.available();
@@ -1002,7 +1011,9 @@ pub const Session = struct {
                     speech_samples = 0;
                     silence_samples = 0;
                     speech_pos = 0;
+                    speech_max_amplitude = 0;
                 }
+                if (max_amplitude > speech_max_amplitude) speech_max_amplitude = max_amplitude;
                 const copy_len = @min(samples_read, speech_buffer.len - speech_pos);
                 @memcpy(speech_buffer[speech_pos .. speech_pos + copy_len], vosk_buffer[0..copy_len]);
                 speech_pos += copy_len;
@@ -1016,7 +1027,7 @@ pub const Session = struct {
 
                 if (silence_samples >= silence_duration_ms * samples_per_ms) {
                     if (speech_samples >= min_speech_duration_ms * samples_per_ms) {
-                        self.processVoskAudio(speech_buffer[0..speech_pos]) catch |err| {
+                        self.processVoskAudio(speech_buffer[0..speech_pos], speech_max_amplitude) catch |err| {
                             std.log.err("Error processing speech: {}", .{err});
                         };
                     }
@@ -1025,6 +1036,7 @@ pub const Session = struct {
                     speech_pos = 0;
                     speech_samples = 0;
                     silence_samples = 0;
+                    speech_max_amplitude = 0;
 
                     if (self.vosk_recognizer) |rec| {
                         c.vosk_recognizer_reset(rec);
@@ -1038,7 +1050,7 @@ pub const Session = struct {
     }
 
     /// Process complete audio clip with Vosk and handle results
-    fn processVoskAudio(self: *Session, audio_data: []const i16) !void {
+    fn processVoskAudio(self: *Session, audio_data: []const i16, max_amplitude: u16) !void {
         if (self.vosk_recognizer == null) return Error.InvalidState;
 
         const audio_bytes = std.mem.sliceAsBytes(audio_data);
@@ -1048,14 +1060,14 @@ pub const Session = struct {
         const result_cstr = c.vosk_recognizer_final_result(self.vosk_recognizer);
         if (result_cstr != null) {
             const result_str = std.mem.span(result_cstr);
-            self.parseVoskResult(result_str) catch |err| {
+            self.parseVoskResult(result_str, max_amplitude) catch |err| {
                 self.options.event_handler.onError(err, "Failed to parse Vosk result");
             };
         }
     }
 
     /// Parse Vosk JSON result and extract recognized text
-    fn parseVoskResult(self: *Session, json_str: []const u8) !void {
+    fn parseVoskResult(self: *Session, json_str: []const u8, max_amplitude: u16) !void {
         if (json_str.len == 0) return;
 
         const text_key = "\"text\"";
@@ -1074,7 +1086,11 @@ pub const Session = struct {
                     if (std.mem.indexOf(u8, json_str[s..], "\"")) |quote_end| {
                         const text = json_str[s .. s + quote_end];
                         if (text.len > 0 and !std.mem.eql(u8, text, " ")) {
-                            self.options.event_handler.onSpeech(text);
+                            const event = SpeechEvent{
+                                .text = text,
+                                .max_amplitude = max_amplitude,
+                            };
+                            self.options.event_handler.onSpeech(event);
                         }
                     }
                 }
@@ -1376,9 +1392,9 @@ test "Options validation" {
 
     // Test valid options
     const DummyHandler = struct {
-        fn onSpeech(ctx: *anyopaque, text: []const u8) void {
+        fn onSpeech(ctx: *anyopaque, event: SpeechEvent) void {
             _ = ctx;
-            _ = text;
+            _ = event;
         }
         fn onError(ctx: *anyopaque, error_code: Error, message: []const u8) void {
             _ = ctx;
@@ -1412,9 +1428,9 @@ test "Session state management" {
     const testing = std.testing;
 
     const DummyHandler = struct {
-        fn onSpeech(ctx: *anyopaque, text: []const u8) void {
+        fn onSpeech(ctx: *anyopaque, event: SpeechEvent) void {
             _ = ctx;
-            _ = text;
+            _ = event;
         }
         fn onError(ctx: *anyopaque, error_code: Error, message: []const u8) void {
             _ = ctx;
@@ -1451,12 +1467,14 @@ test "SpeechEventHandler interface" {
         speech_called: bool = false,
         error_called: bool = false,
         last_text: []const u8 = "",
+        last_amplitude: u16 = 0,
         last_error: Error = Error.InitializationFailed,
 
-        fn onSpeech(ctx: *anyopaque, text: []const u8) void {
+        fn onSpeech(ctx: *anyopaque, event: SpeechEvent) void {
             const self: *@This() = @ptrCast(@alignCast(ctx));
             self.speech_called = true;
-            self.last_text = text;
+            self.last_text = event.text;
+            self.last_amplitude = event.max_amplitude;
         }
 
         fn onError(ctx: *anyopaque, error_code: Error, message: []const u8) void {
@@ -1475,9 +1493,11 @@ test "SpeechEventHandler interface" {
     };
 
     // Test speech callback
-    event_handler.onSpeech("hello world");
+    const event = SpeechEvent{ .text = "hello world", .max_amplitude = 1234 };
+    event_handler.onSpeech(event);
     try testing.expect(handler.speech_called);
     try testing.expectEqualStrings("hello world", handler.last_text);
+    try testing.expect(handler.last_amplitude == 1234);
 
     // Test error callback
     event_handler.onError(Error.AudioDeviceError, "test error");
@@ -1615,10 +1635,10 @@ test "Session session management API" {
         speech_count: u32 = 0,
         error_count: u32 = 0,
 
-        fn onSpeech(ctx: *anyopaque, text: []const u8) void {
+        fn onSpeech(ctx: *anyopaque, event: SpeechEvent) void {
             const self: *@This() = @ptrCast(@alignCast(ctx));
             self.speech_count += 1;
-            _ = text;
+            _ = event;
         }
 
         fn onError(ctx: *anyopaque, error_code: Error, message: []const u8) void {
diff --git a/src/test.zig b/src/test.zig
index 56c22fc..84fff4d 100644
--- a/src/test.zig
+++ b/src/test.zig
@@ -57,9 +57,9 @@ const TestEventHandler = struct {
         self.detailed_error_events.deinit(self.allocator);
     }
 
-    fn onSpeech(ctx: *anyopaque, text: []const u8) void {
+    fn onSpeech(ctx: *anyopaque, event: stt.SpeechEvent) void {
         const self: *TestEventHandler = @ptrCast(@alignCast(ctx));
-        const owned_text = self.allocator.dupe(u8, text) catch return;
+        const owned_text = self.allocator.dupe(u8, event.text) catch return;
         self.speech_events.append(self.allocator, owned_text) catch return;
     }
 
@@ -154,7 +154,8 @@ test "SpeechEventHandler callback invocation" {
     const speech_handler = test_handler.getSpeechEventHandler();
 
     // Test speech callback
-    speech_handler.onSpeech("Hello world");
+    const event = stt.SpeechEvent{ .text = "Hello world", .max_amplitude = 1000 };
+    speech_handler.onSpeech(event);
     try testing.expect(test_handler.speech_events.items.len == 1);
     try testing.expectEqualStrings("Hello world", test_handler.speech_events.items[0]);
 
@@ -351,13 +352,15 @@ test "Callback error handling robustness" {
     for (0..100) |i| {
         const text = std.fmt.allocPrint(test_allocator, "Speech event {}", .{i}) catch continue;
         defer test_allocator.free(text);
-        speech_handler.onSpeech(text);
+        const event = stt.SpeechEvent{ .text = text, .max_amplitude = @intCast(i) };
+        speech_handler.onSpeech(event);
     }
 
     try testing.expect(test_handler.speech_events.items.len == 100);
 
     // Test mixed callback types
-    speech_handler.onSpeech("Final speech");
+    const final_event = stt.SpeechEvent{ .text = "Final speech", .max_amplitude = 800 };
+    speech_handler.onSpeech(final_event);
     speech_handler.onError(stt.Error.CallbackError, "Callback error");
 
     const final_error = stt.ErrorInfo.init(stt.Error.InternalError, "Internal error");
@@ -387,7 +390,8 @@ test "Memory management and resource cleanup" {
         defer test_handler.deinit(); // Should not leak memory
 
         const speech_handler = test_handler.getSpeechEventHandler();
-        speech_handler.onSpeech("Test speech");
+        const event = stt.SpeechEvent{ .text = "Test speech", .max_amplitude = 500 };
+        speech_handler.onSpeech(event);
         speech_handler.onError(stt.Error.AudioDeviceError, "Test error");
 
         const error_info = stt.ErrorInfo.initWithContext(stt.Error.ModelLoadError, "Test detailed error", "test context");
@@ -461,15 +465,18 @@ test "Complete workflow simulation" {
     try testing.expect(processed_count == audio_samples.len);
 
     // 3. Speech detection phase
-    speech_handler.onSpeech("Hello world");
-    speech_handler.onSpeech("This is a test");
+    const event1 = stt.SpeechEvent{ .text = "Hello world", .max_amplitude = 2000 };
+    speech_handler.onSpeech(event1);
+    const event2 = stt.SpeechEvent{ .text = "This is a test", .max_amplitude = 1500 };
+    speech_handler.onSpeech(event2);
 
     // 4. Error handling phase
     const recoverable_error = stt.ErrorInfo.initRecoverable(stt.Error.AudioDeviceBusy, "Audio device temporarily busy", "Retrying in 100ms");
     speech_handler.onDetailedError(recoverable_error);
 
     // 5. Recovery phase
-    speech_handler.onSpeech("Speech recognition resumed");
+    const resume_event = stt.SpeechEvent{ .text = "Speech recognition resumed", .max_amplitude = 1800 };
+    speech_handler.onSpeech(resume_event);
 
     // 6. Cleanup phase
     const cleanup_info = stt.ErrorInfo.initRecoverable(stt.Error.InternalError, "STT session cleanup completed", "All resources freed");