From e82b119b3646b992e6cb903f82ef4ea2ab6ea314 Mon Sep 17 00:00:00 2001 From: Emil Lerch Date: Wed, 29 Oct 2025 17:25:59 -0700 Subject: [PATCH] add silence threshold for great good --- src/main.zig | 21 +++++++++++++++++---- src/stt.zig | 8 +++++++- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/src/main.zig b/src/main.zig index 5e932de..5be549c 100644 --- a/src/main.zig +++ b/src/main.zig @@ -418,6 +418,7 @@ pub fn main() !void { var model_path: ?[]const u8 = null; var exec_program: ?[]const u8 = null; var measure_levels = false; + var silence_threshold: ?i16 = null; // Parse arguments for (args[1..]) |arg| { @@ -426,10 +427,11 @@ pub fn main() !void { _ = try stdout.writeAll("USAGE:\n"); _ = try stdout.writeAll(" stt [OPTIONS]\n\n"); _ = try stdout.writeAll("OPTIONS:\n"); - _ = try stdout.writeAll(" --model= Path to Vosk model directory\n"); - _ = try stdout.writeAll(" --exec= Program to execute with recognized text\n"); - _ = try stdout.writeAll(" --measure-levels Display real-time audio level histogram\n"); - _ = try stdout.writeAll(" --help, -h Show this help message\n\n"); + _ = try stdout.writeAll(" --model= Path to Vosk model directory\n"); + _ = try stdout.writeAll(" --exec= Program to execute with recognized text\n"); + _ = try stdout.writeAll(" --silence-threshold= Silence detection threshold (default: 300)\n"); + _ = try stdout.writeAll(" --measure-levels Display real-time audio level histogram\n"); + _ = try stdout.writeAll(" --help, -h Show this help message\n\n"); _ = try stdout.writeAll("EXAMPLES:\n"); _ = try stdout.writeAll(" stt\n"); _ = try stdout.writeAll(" stt --model=../share/vosk/models/vosk-model-small-en-us-0.15\n"); @@ -445,6 +447,16 @@ pub fn main() !void { exec_program = arg[7..]; // Skip "--exec=" } else if (std.mem.eql(u8, arg, "--measure-levels")) { measure_levels = true; + } else if (std.mem.startsWith(u8, arg, "--silence-threshold=")) { + const threshold_str = arg[20..]; // Skip "--silence-threshold=" + 
silence_threshold = std.fmt.parseInt(i16, threshold_str, 10) catch { + std.log.err("Invalid silence threshold: {s}", .{threshold_str}); + return error.InvalidArgument; + }; + if (silence_threshold.? < 0) { + std.log.err("Invalid silence threshold: {s}", .{threshold_str}); + return error.InvalidArgument; + } } } @@ -516,6 +528,7 @@ pub fn main() !void { .event_handler = speech_handler, .sample_rate = 16000, // Standard sample rate for speech recognition .buffer_size = 256, // Existing buffer size for low latency + .silence_threshold = silence_threshold orelse 300, }; std.log.debug("Initializing STT library...", .{}); diff --git a/src/stt.zig b/src/stt.zig index 48e744d..a268a57 100644 --- a/src/stt.zig +++ b/src/stt.zig @@ -606,6 +606,8 @@ pub const Options = struct { // channels: u32 = 2, /// Audio buffer size in frames (default: 256) buffer_size: u32 = 256, + /// Silence detection threshold (default: 300) + silence_threshold: i16 = 300, }; /// Main STT session handle @@ -970,11 +972,15 @@ pub const Session = struct { defer self.allocator.free(vosk_buffer); // Silence detection parameters - const silence_threshold: i16 = 300; + const silence_threshold: i16 = self.options.silence_threshold; const silence_duration_ms: u64 = 500; const min_speech_duration_ms: u64 = 300; const samples_per_ms = 16; // This assumes 16kHz audio + std.log.info( + "Listening with silence threshold {}. If wrong, use --measure-levels to find appropriate val", + .{silence_threshold}, + ); var speech_buffer = self.allocator.alloc(i16, 16000 * 10) catch { const error_info = ErrorInfo.init(Error.OutOfMemory, "Failed to allocate speech buffer"); self.options.event_handler.onDetailedError(error_info);