add silence threshold for great good

2025-10-29 17:25:59 -07:00 · 2025-10-29 17:25:59 -07:00 · e82b119b36
commit e82b119b36
parent 07308a548a
2 changed files with 24 additions and 5 deletions
--- a/src/main.zig
+++ b/src/main.zig
@ -418,6 +418,7 @@ pub fn main() !void {
    var model_path: ?[]const u8 = null;
    var exec_program: ?[]const u8 = null;
    var measure_levels = false;
    var silence_threshold: ?i16 = null;
    // Parse arguments
    for (args[1..]) |arg| {
@ -426,10 +427,11 @@ pub fn main() !void {
            _ = try stdout.writeAll("USAGE:\n");
            _ = try stdout.writeAll("    stt [OPTIONS]\n\n");
            _ = try stdout.writeAll("OPTIONS:\n");
-            _ = try stdout.writeAll("    --model=<path>       Path to Vosk model directory\n");
+            _ = try stdout.writeAll("    --model=<path>            Path to Vosk model directory\n");
-            _ = try stdout.writeAll("    --exec=<program>     Program to execute with recognized text\n");
+            _ = try stdout.writeAll("    --exec=<program>          Program to execute with recognized text\n");
-            _ = try stdout.writeAll("    --measure-levels     Display real-time audio level histogram\n");
+            _ = try stdout.writeAll("    --silence-threshold=<n>   Silence detection threshold (default: 300)\n");
-            _ = try stdout.writeAll("    --help, -h           Show this help message\n\n");
+            _ = try stdout.writeAll("    --measure-levels          Display real-time audio level histogram\n");
            _ = try stdout.writeAll("    --help, -h                Show this help message\n\n");
            _ = try stdout.writeAll("EXAMPLES:\n");
            _ = try stdout.writeAll("    stt\n");
            _ = try stdout.writeAll("    stt --model=../share/vosk/models/vosk-model-small-en-us-0.15\n");
@ -445,6 +447,16 @@ pub fn main() !void {
            exec_program = arg[7..]; // Skip "--exec="
        } else if (std.mem.eql(u8, arg, "--measure-levels")) {
            measure_levels = true;
        } else if (std.mem.startsWith(u8, arg, "--silence-threshold=")) {
            const threshold_str = arg[20..]; // Skip "--silence-threshold="
            silence_threshold = std.fmt.parseInt(i16, threshold_str, 10) catch {
                std.log.err("Invalid silence threshold: {s}", .{threshold_str});
                return error.InvalidArgument;
            };
            if (silence_threshold.? < 0) {
                std.log.err("Invalid silence threshold: {s}", .{threshold_str});
                return error.InvalidArgument;
            }
        }
    }
@ -516,6 +528,7 @@ pub fn main() !void {
        .event_handler = speech_handler,
        .sample_rate = 16000, // Standard sample rate for speech recognition
        .buffer_size = 256, // Existing buffer size for low latency
        .silence_threshold = silence_threshold orelse 300,
    };
    std.log.debug("Initializing STT library...", .{});
--- a/src/stt.zig
+++ b/src/stt.zig
@ -606,6 +606,8 @@ pub const Options = struct {
    // channels: u32 = 2,
    /// Audio buffer size in frames (default: 256)
    buffer_size: u32 = 256,
    /// Silence detection threshold (default: 300)
    silence_threshold: i16 = 300,
 };
 /// Main STT session handle
@ -970,11 +972,15 @@ pub const Session = struct {
        defer self.allocator.free(vosk_buffer);
        // Silence detection parameters
-        const silence_threshold: i16 = 300;
+        const silence_threshold: i16 = self.options.silence_threshold;
        const silence_duration_ms: u64 = 500;
        const min_speech_duration_ms: u64 = 300;
        const samples_per_ms = 16; // This assumes 16kHz audio
        std.log.info(
            "Listening with silence threshold {}. If wrong, use --measure-levels to find approriate val",
            .{silence_threshold},
        );
        var speech_buffer = self.allocator.alloc(i16, 16000 * 10) catch {
            const error_info = ErrorInfo.init(Error.OutOfMemory, "Failed to allocate speech buffer");
            self.options.event_handler.onDetailedError(error_info);