add silence threshold for great good

2025-10-29 17:25:59 -07:00 · 2025-10-29 17:25:59 -07:00 · e82b119b36
commit e82b119b36
parent 07308a548a
2 changed files with 24 additions and 5 deletions
--- a/src/main.zig
+++ b/src/main.zig
@@ -418,6 +418,7 @@ pub fn main() !void {
    var model_path: ?[]const u8 = null;
    var exec_program: ?[]const u8 = null;
    var measure_levels = false;
+    var silence_threshold: ?i16 = null;

    // Parse arguments
    for (args[1..]) |arg| {
@@ -428,6 +429,7 @@ pub fn main() !void {
            _ = try stdout.writeAll("OPTIONS:\n");
            _ = try stdout.writeAll("    --model=<path>            Path to Vosk model directory\n");
            _ = try stdout.writeAll("    --exec=<program>          Program to execute with recognized text\n");
+            _ = try stdout.writeAll("    --silence-threshold=<n>   Silence detection threshold (default: 300)\n");
            _ = try stdout.writeAll("    --measure-levels          Display real-time audio level histogram\n");
            _ = try stdout.writeAll("    --help, -h                Show this help message\n\n");
            _ = try stdout.writeAll("EXAMPLES:\n");
@@ -445,6 +447,16 @@ pub fn main() !void {
            exec_program = arg[7..]; // Skip "--exec="
        } else if (std.mem.eql(u8, arg, "--measure-levels")) {
            measure_levels = true;
+        } else if (std.mem.startsWith(u8, arg, "--silence-threshold=")) {
+            const threshold_str = arg[20..]; // Skip "--silence-threshold="
+            silence_threshold = std.fmt.parseInt(i16, threshold_str, 10) catch {
+                std.log.err("Invalid silence threshold: {s}", .{threshold_str});
+                return error.InvalidArgument;
+            };
+            if (silence_threshold.? < 0) {
+                std.log.err("Invalid silence threshold: {s}", .{threshold_str});
+                return error.InvalidArgument;
+            }
        }
    }

@@ -516,6 +528,7 @@ pub fn main() !void {
        .event_handler = speech_handler,
        .sample_rate = 16000, // Standard sample rate for speech recognition
        .buffer_size = 256, // Existing buffer size for low latency
+        .silence_threshold = silence_threshold orelse 300,
    };

    std.log.debug("Initializing STT library...", .{});
--- a/src/stt.zig
+++ b/src/stt.zig
@@ -606,6 +606,8 @@ pub const Options = struct {
    // channels: u32 = 2,
    /// Audio buffer size in frames (default: 256)
    buffer_size: u32 = 256,
+    /// Silence detection threshold (default: 300)
+    silence_threshold: i16 = 300,
 };

 /// Main STT session handle
@@ -970,11 +972,15 @@ pub const Session = struct {
        defer self.allocator.free(vosk_buffer);

        // Silence detection parameters
-        const silence_threshold: i16 = 300;
+        const silence_threshold: i16 = self.options.silence_threshold;
        const silence_duration_ms: u64 = 500;
        const min_speech_duration_ms: u64 = 300;
        const samples_per_ms = 16; // This assumes 16kHz audio

+        std.log.info(
+            "Listening with silence threshold {}. If wrong, use --measure-levels to find appropriate value",
+            .{silence_threshold},
+        );
        var speech_buffer = self.allocator.alloc(i16, 16000 * 10) catch {
            const error_info = ErrorInfo.init(Error.OutOfMemory, "Failed to allocate speech buffer");
            self.options.event_handler.onDetailedError(error_info);