Compare commits
5 commits
b1c2ee20ff
...
e82b119b36
| Author | SHA1 | Date | |
|---|---|---|---|
| e82b119b36 | |||
| 07308a548a | |||
| 362be00d07 | |||
| 8a07e365c3 | |||
| 4ad75b1b07 |
3 changed files with 180 additions and 48 deletions
128
src/main.zig
128
src/main.zig
|
|
@ -26,31 +26,36 @@ const SpeechHandler = struct {
|
||||||
const max_children = 5;
|
const max_children = 5;
|
||||||
const Process = struct { child: ?*std.process.Child, start: i64, id: std.process.Child.Id }; // why id?
|
const Process = struct { child: ?*std.process.Child, start: i64, id: std.process.Child.Id }; // why id?
|
||||||
/// Handle detected speech
|
/// Handle detected speech
|
||||||
fn onSpeech(ctx: *anyopaque, text: []const u8) void {
|
fn onSpeech(ctx: *anyopaque, event: stt.SpeechEvent) void {
|
||||||
if (builtin.is_test) return; // Suppress output during tests
|
if (builtin.is_test) return; // Suppress output during tests
|
||||||
// Look for noise words and skip it if so
|
// Look for noise words and skip it if so
|
||||||
if (std.mem.eql(u8, text, "huh")) return;
|
if (std.mem.eql(u8, event.text, "huh")) return;
|
||||||
if (std.mem.eql(u8, text, "but")) return;
|
if (std.mem.eql(u8, event.text, "but")) return;
|
||||||
const self: *SpeechHandler = @ptrCast(@alignCast(ctx));
|
const self: *SpeechHandler = @ptrCast(@alignCast(ctx));
|
||||||
self.speech_count += 1;
|
self.speech_count += 1;
|
||||||
|
|
||||||
// Print with timestamp for better experience
|
// Print with amplitude for better experience
|
||||||
const timestamp = std.time.timestamp();
|
|
||||||
var stdout_buffer: [1024]u8 = undefined;
|
var stdout_buffer: [1024]u8 = undefined;
|
||||||
var stdout_writer = std.fs.File.stdout().writer(&stdout_buffer);
|
var stdout_writer = std.fs.File.stdout().writer(&stdout_buffer);
|
||||||
const stdout = &stdout_writer.interface;
|
const stdout = &stdout_writer.interface;
|
||||||
defer stdout.flush() catch std.log.warn("Caught error writing speech data to stdout", .{});
|
defer stdout.flush() catch std.log.warn("Caught error writing speech data to stdout", .{});
|
||||||
stdout.print("[{}] Speech {}->{?s}: {s}\n", .{
|
stdout.print("[{}-{}] Speech {}->{?s}: {s}\n", .{
|
||||||
timestamp,
|
event.min_amplitude,
|
||||||
|
event.max_amplitude,
|
||||||
self.speech_count,
|
self.speech_count,
|
||||||
self.exec_program,
|
self.exec_program,
|
||||||
text,
|
event.text,
|
||||||
}) catch std.log.warn("Caught error writing speech data to stdout", .{});
|
}) catch std.log.warn("Caught error writing speech data to stdout", .{});
|
||||||
|
|
||||||
// Execute program if specified
|
// Execute program if specified
|
||||||
if (self.exec_program) |program| self.exec(text) catch |err| {
|
if (self.exec_program) |program| {
|
||||||
std.log.err("Failed to execute program '{s}': {}", .{ program, err });
|
self.exec(event.text) catch |err| {
|
||||||
};
|
std.log.err("Failed to execute program '{s}': {}", .{ program, err });
|
||||||
|
};
|
||||||
|
self.reclaimProcessesPosix(false) catch |err| {
|
||||||
|
std.log.err("Failed to reclaim processes: {}", .{err});
|
||||||
|
};
|
||||||
|
}
|
||||||
}
|
}
|
||||||
fn exec(self: *SpeechHandler, text: []const u8) !void {
|
fn exec(self: *SpeechHandler, text: []const u8) !void {
|
||||||
const program = self.exec_program.?; // should only be called when exec_program is not null
|
const program = self.exec_program.?; // should only be called when exec_program is not null
|
||||||
|
|
@ -78,7 +83,6 @@ const SpeechHandler = struct {
|
||||||
try process.child.?.spawn();
|
try process.child.?.spawn();
|
||||||
try process.child.?.waitForSpawn();
|
try process.child.?.waitForSpawn();
|
||||||
process.id = process.child.?.id;
|
process.id = process.child.?.id;
|
||||||
try self.reclaimProcessesPosix(false);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn reclaimProcessesPosix(self: *SpeechHandler, reap_all: bool) !void {
|
fn reclaimProcessesPosix(self: *SpeechHandler, reap_all: bool) !void {
|
||||||
|
|
@ -282,6 +286,70 @@ fn signalHandler(sig: i32) callconv(.c) void {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Audio level meter: repeatedly read from the capture device and, roughly
/// once per second, report the largest absolute sample seen in that window.
///
/// Runs until `should_exit` is observed set. When stdout is a TTY each report
/// is a histogram row (`value |████`), scaled so that an amplitude of 10000
/// fills 60 columns; louder input is clamped to the full-width bar. When
/// stdout is not a TTY only the number is printed, one per line.
///
/// Parameters:
/// - allocator: passed through to `stt.AlsaCapture.init`
///
/// Errors from initialising/opening the capture device, from `readAudio`, and
/// from writing to stdout are returned to the caller.
fn runMeasureLevels(allocator: std.mem.Allocator) !void {
    // Histogram scale: `bar_full_scale` amplitude maps onto `bar_columns` cells.
    const bar_columns: u32 = 60;
    const bar_full_scale: u32 = 10000;
    const report_interval_ms: i64 = 1000;

    const stdout = std.fs.File.stdout();
    const is_tty = stdout.isTty();

    // NOTE(review): 16000 / 1024 are presumably sample rate and buffer size,
    // matching how the session is configured elsewhere; confirm against
    // AlsaCapture.init.
    var capture = try stt.AlsaCapture.init(allocator, "default", 16000, 1024);
    defer capture.deinit();
    try capture.open();

    // A single buffered writer carries all output for the run, so each report
    // is emitted with one flush instead of one write call per bar cell.
    var write_buffer: [256]u8 = undefined;
    var writer = stdout.writer(&write_buffer);
    const out = &writer.interface;

    try out.writeAll("Measuring audio levels... Press Ctrl+C to exit\n");
    if (is_tty) try out.writeAll("Histogram (0-10000):\n");
    try out.flush();

    var buffer: [4096]i16 = undefined;
    var window_peak: u16 = 0; // largest |sample| since the last report
    var last_print = std.time.milliTimestamp();

    while (!should_exit.load(.acquire)) {
        _ = try capture.readAudio();
        const samples_read = capture.getAudioSamples(&buffer);
        if (samples_read == 0) {
            // Nothing buffered yet; back off briefly rather than spin.
            std.Thread.sleep(10 * std.time.ns_per_ms);
            continue;
        }

        // @abs on an i16 yields a u16, so even -32768 is representable.
        for (buffer[0..samples_read]) |sample| {
            window_peak = @max(window_peak, @abs(sample));
        }

        const now = std.time.milliTimestamp();
        if (now - last_print >= report_interval_ms) {
            if (is_tty) {
                // Clamp so peaks above the advertised 0-10000 scale do not
                // overflow the 60-column bar and wrap the terminal line.
                const scaled = (@as(u32, window_peak) * bar_columns) / bar_full_scale;
                const bar_width = @min(scaled, bar_columns);

                try out.print("{d:5} |", .{window_peak});
                for (0..bar_width) |_| try out.writeAll("█");
                try out.writeAll("\n");
            } else {
                try out.print("{d}\n", .{window_peak});
            }
            try out.flush();

            window_peak = 0;
            last_print = now;
        }

        std.Thread.sleep(50 * std.time.ns_per_ms);
    }

    if (is_tty) {
        try out.writeAll("\n");
        try out.flush();
    }
}
|
||||||
|
|
||||||
fn signalAction(sig: i32, info: *const std.posix.siginfo_t, _: ?*anyopaque) callconv(.c) void {
|
fn signalAction(sig: i32, info: *const std.posix.siginfo_t, _: ?*anyopaque) callconv(.c) void {
|
||||||
// NOTE: info only works correctly if std.posix.SA.SIGINFO is in the flags
|
// NOTE: info only works correctly if std.posix.SA.SIGINFO is in the flags
|
||||||
// std.log.debug("signal action. sig {d}", .{sig});
|
// std.log.debug("signal action. sig {d}", .{sig});
|
||||||
|
|
@ -349,6 +417,8 @@ pub fn main() !void {
|
||||||
|
|
||||||
var model_path: ?[]const u8 = null;
|
var model_path: ?[]const u8 = null;
|
||||||
var exec_program: ?[]const u8 = null;
|
var exec_program: ?[]const u8 = null;
|
||||||
|
var measure_levels = false;
|
||||||
|
var silence_threshold: ?i16 = null;
|
||||||
|
|
||||||
// Parse arguments
|
// Parse arguments
|
||||||
for (args[1..]) |arg| {
|
for (args[1..]) |arg| {
|
||||||
|
|
@ -357,13 +427,16 @@ pub fn main() !void {
|
||||||
_ = try stdout.writeAll("USAGE:\n");
|
_ = try stdout.writeAll("USAGE:\n");
|
||||||
_ = try stdout.writeAll(" stt [OPTIONS]\n\n");
|
_ = try stdout.writeAll(" stt [OPTIONS]\n\n");
|
||||||
_ = try stdout.writeAll("OPTIONS:\n");
|
_ = try stdout.writeAll("OPTIONS:\n");
|
||||||
_ = try stdout.writeAll(" --model=<path> Path to Vosk model directory\n");
|
_ = try stdout.writeAll(" --model=<path> Path to Vosk model directory\n");
|
||||||
_ = try stdout.writeAll(" --exec=<program> Program to execute with recognized text\n");
|
_ = try stdout.writeAll(" --exec=<program> Program to execute with recognized text\n");
|
||||||
_ = try stdout.writeAll(" --help, -h Show this help message\n\n");
|
_ = try stdout.writeAll(" --silence-threshold=<n> Silence detection threshold (default: 300)\n");
|
||||||
|
_ = try stdout.writeAll(" --measure-levels Display real-time audio level histogram\n");
|
||||||
|
_ = try stdout.writeAll(" --help, -h Show this help message\n\n");
|
||||||
_ = try stdout.writeAll("EXAMPLES:\n");
|
_ = try stdout.writeAll("EXAMPLES:\n");
|
||||||
_ = try stdout.writeAll(" stt\n");
|
_ = try stdout.writeAll(" stt\n");
|
||||||
_ = try stdout.writeAll(" stt --model=../share/vosk/models/vosk-model-small-en-us-0.15\n");
|
_ = try stdout.writeAll(" stt --model=../share/vosk/models/vosk-model-small-en-us-0.15\n");
|
||||||
_ = try stdout.writeAll(" stt --exec=echo\n\n");
|
_ = try stdout.writeAll(" stt --exec=echo\n");
|
||||||
|
_ = try stdout.writeAll(" stt --measure-levels\n\n");
|
||||||
_ = try stdout.writeAll("The application will search for models in these locations:\n");
|
_ = try stdout.writeAll("The application will search for models in these locations:\n");
|
||||||
_ = try stdout.writeAll(" vosk-model-small-en-us-0.15\n");
|
_ = try stdout.writeAll(" vosk-model-small-en-us-0.15\n");
|
||||||
_ = try stdout.writeAll(" <binary_dir>/../share/vosk/models/vosk-model-small-en-us-0.15\n");
|
_ = try stdout.writeAll(" <binary_dir>/../share/vosk/models/vosk-model-small-en-us-0.15\n");
|
||||||
|
|
@ -372,6 +445,18 @@ pub fn main() !void {
|
||||||
model_path = arg[8..]; // Skip "--model="
|
model_path = arg[8..]; // Skip "--model="
|
||||||
} else if (std.mem.startsWith(u8, arg, "--exec=")) {
|
} else if (std.mem.startsWith(u8, arg, "--exec=")) {
|
||||||
exec_program = arg[7..]; // Skip "--exec="
|
exec_program = arg[7..]; // Skip "--exec="
|
||||||
|
} else if (std.mem.eql(u8, arg, "--measure-levels")) {
|
||||||
|
measure_levels = true;
|
||||||
|
} else if (std.mem.startsWith(u8, arg, "--silence-threshold=")) {
|
||||||
|
const threshold_str = arg[20..]; // Skip "--silence-threshold="
|
||||||
|
silence_threshold = std.fmt.parseInt(i16, threshold_str, 10) catch {
|
||||||
|
std.log.err("Invalid silence threshold: {s}", .{threshold_str});
|
||||||
|
return error.InvalidArgument;
|
||||||
|
};
|
||||||
|
if (silence_threshold.? < 0) {
|
||||||
|
std.log.err("Invalid silence threshold: {s}", .{threshold_str});
|
||||||
|
return error.InvalidArgument;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -381,6 +466,13 @@ pub fn main() !void {
|
||||||
.exec_program = exec_program,
|
.exec_program = exec_program,
|
||||||
};
|
};
|
||||||
defer handler.deinit();
|
defer handler.deinit();
|
||||||
|
|
||||||
|
// If measure-levels mode, run that instead of normal STT
|
||||||
|
if (measure_levels) {
|
||||||
|
try runMeasureLevels(allocator);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
const speech_handler = stt.SpeechEventHandler{
|
const speech_handler = stt.SpeechEventHandler{
|
||||||
.onSpeechFn = SpeechHandler.onSpeech,
|
.onSpeechFn = SpeechHandler.onSpeech,
|
||||||
.onErrorFn = SpeechHandler.onError,
|
.onErrorFn = SpeechHandler.onError,
|
||||||
|
|
@ -436,6 +528,7 @@ pub fn main() !void {
|
||||||
.event_handler = speech_handler,
|
.event_handler = speech_handler,
|
||||||
.sample_rate = 16000, // Standard sample rate for speech recognition
|
.sample_rate = 16000, // Standard sample rate for speech recognition
|
||||||
.buffer_size = 256, // Existing buffer size for low latency
|
.buffer_size = 256, // Existing buffer size for low latency
|
||||||
|
.silence_threshold = silence_threshold orelse 300,
|
||||||
};
|
};
|
||||||
|
|
||||||
std.log.debug("Initializing STT library...", .{});
|
std.log.debug("Initializing STT library...", .{});
|
||||||
|
|
@ -526,7 +619,8 @@ test "handler callbacks" {
|
||||||
};
|
};
|
||||||
|
|
||||||
// Test that callbacks can be invoked without crashing
|
// Test that callbacks can be invoked without crashing
|
||||||
speech_handler.onSpeech("test speech");
|
const event = stt.SpeechEvent{ .text = "test speech", .max_amplitude = 500, .min_amplitude = 200 };
|
||||||
|
speech_handler.onSpeech(event);
|
||||||
speech_handler.onError(stt.Error.AudioDeviceError, "test error");
|
speech_handler.onError(stt.Error.AudioDeviceError, "test error");
|
||||||
|
|
||||||
// If we get here without crashing, the test passes
|
// If we get here without crashing, the test passes
|
||||||
|
|
|
||||||
82
src/stt.zig
82
src/stt.zig
|
|
@ -104,12 +104,24 @@ pub const ErrorInfo = struct {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// Payload delivered to speech callbacks for one recognized utterance.
///
/// Only `text` is required; every other field has a default so callers can
/// build an event with `.{ .text = "..." }`.
pub const SpeechEvent = struct {
    /// The recognized words.
    /// NOTE(review): the visible producer points this into the recognizer's
    /// result string rather than copying it, so handlers that keep the text
    /// should presumably duplicate it; confirm the lifetime.
    text: []const u8,

    /// Loudest per-chunk peak observed over the speech segment.
    /// Defaults to the largest u16 when the producer does not supply one.
    max_amplitude: u16 = std.math.maxInt(u16),

    /// Quietest per-chunk peak among the chunks that counted as speech.
    min_amplitude: u16 = 0,

    /// Raw samples of the speech segment; empty unless the producer fills it.
    audio_data: []const i16 = &.{},
};
|
||||||
|
|
||||||
/// Callback function type for speech detection events
|
/// Callback function type for speech detection events
|
||||||
///
|
///
|
||||||
/// Parameters:
|
/// Parameters:
|
||||||
/// - text: Null-terminated string containing the detected speech
|
/// - event: Speech event data containing text and amplitude
|
||||||
/// - user_data: Optional user-provided context data
|
/// - user_data: Optional user-provided context data
|
||||||
pub const SpeechCallback = *const fn (text: [*:0]const u8, user_data: ?*anyopaque) void;
|
pub const SpeechCallback = *const fn (event: SpeechEvent, user_data: ?*anyopaque) void;
|
||||||
|
|
||||||
/// Callback function type for error events
|
/// Callback function type for error events
|
||||||
///
|
///
|
||||||
|
|
@ -132,7 +144,7 @@ pub const DetailedErrorCallback = *const fn (error_info: ErrorInfo, user_data: ?
|
||||||
/// with both speech detection and error handling callbacks.
|
/// with both speech detection and error handling callbacks.
|
||||||
pub const SpeechEventHandler = struct {
|
pub const SpeechEventHandler = struct {
|
||||||
/// Function to call when speech is detected
|
/// Function to call when speech is detected
|
||||||
onSpeechFn: *const fn (ctx: *anyopaque, text: []const u8) void,
|
onSpeechFn: *const fn (ctx: *anyopaque, event: SpeechEvent) void,
|
||||||
/// Function to call when an error occurs
|
/// Function to call when an error occurs
|
||||||
onErrorFn: *const fn (ctx: *anyopaque, error_code: Error, message: []const u8) void,
|
onErrorFn: *const fn (ctx: *anyopaque, error_code: Error, message: []const u8) void,
|
||||||
/// Optional function to call for detailed error information
|
/// Optional function to call for detailed error information
|
||||||
|
|
@ -141,11 +153,11 @@ pub const SpeechEventHandler = struct {
|
||||||
ctx: *anyopaque,
|
ctx: *anyopaque,
|
||||||
|
|
||||||
/// Invoke the speech detection callback with error handling
|
/// Invoke the speech detection callback with error handling
|
||||||
pub fn onSpeech(self: SpeechEventHandler, text: []const u8) void {
|
pub fn onSpeech(self: SpeechEventHandler, event: SpeechEvent) void {
|
||||||
// Call the speech callback function
|
// Call the speech callback function
|
||||||
// Note: If the callback panics or causes undefined behavior,
|
// Note: If the callback panics or causes undefined behavior,
|
||||||
// there's not much we can do to recover gracefully in Zig
|
// there's not much we can do to recover gracefully in Zig
|
||||||
self.onSpeechFn(self.ctx, text);
|
self.onSpeechFn(self.ctx, event);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Invoke the error callback
|
/// Invoke the error callback
|
||||||
|
|
@ -519,7 +531,7 @@ pub const AlsaCapture = struct {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Read audio data from ALSA device and process it
|
/// Read audio data from ALSA device and process it
|
||||||
fn readAudio(self: *Self) !usize {
|
pub fn readAudio(self: *Self) !usize {
|
||||||
if (self.pcm_handle == null)
|
if (self.pcm_handle == null)
|
||||||
return Error.AudioDeviceError;
|
return Error.AudioDeviceError;
|
||||||
|
|
||||||
|
|
@ -594,6 +606,8 @@ pub const Options = struct {
|
||||||
// channels: u32 = 2,
|
// channels: u32 = 2,
|
||||||
/// Audio buffer size in frames (default: 256)
|
/// Audio buffer size in frames (default: 256)
|
||||||
buffer_size: u32 = 256,
|
buffer_size: u32 = 256,
|
||||||
|
/// Silence detection threshold (default: 300)
|
||||||
|
silence_threshold: i16 = 300,
|
||||||
};
|
};
|
||||||
|
|
||||||
/// Main STT session handle
|
/// Main STT session handle
|
||||||
|
|
@ -958,11 +972,15 @@ pub const Session = struct {
|
||||||
defer self.allocator.free(vosk_buffer);
|
defer self.allocator.free(vosk_buffer);
|
||||||
|
|
||||||
// Silence detection parameters
|
// Silence detection parameters
|
||||||
const silence_threshold: i16 = 500;
|
const silence_threshold: i16 = self.options.silence_threshold;
|
||||||
const silence_duration_ms: u64 = 500;
|
const silence_duration_ms: u64 = 500;
|
||||||
const min_speech_duration_ms: u64 = 300;
|
const min_speech_duration_ms: u64 = 300;
|
||||||
const samples_per_ms = 16; // This assumes 16kHz audio
|
const samples_per_ms = 16; // This assumes 16kHz audio
|
||||||
|
|
||||||
|
std.log.info(
|
||||||
|
"Listening with silence threshold {}. If wrong, use --measure-levels to find approriate val",
|
||||||
|
.{silence_threshold},
|
||||||
|
);
|
||||||
var speech_buffer = self.allocator.alloc(i16, 16000 * 10) catch {
|
var speech_buffer = self.allocator.alloc(i16, 16000 * 10) catch {
|
||||||
const error_info = ErrorInfo.init(Error.OutOfMemory, "Failed to allocate speech buffer");
|
const error_info = ErrorInfo.init(Error.OutOfMemory, "Failed to allocate speech buffer");
|
||||||
self.options.event_handler.onDetailedError(error_info);
|
self.options.event_handler.onDetailedError(error_info);
|
||||||
|
|
@ -974,6 +992,8 @@ pub const Session = struct {
|
||||||
var silence_samples: usize = 0;
|
var silence_samples: usize = 0;
|
||||||
var speech_samples: usize = 0;
|
var speech_samples: usize = 0;
|
||||||
var speech_pos: usize = 0;
|
var speech_pos: usize = 0;
|
||||||
|
var speech_max_amplitude: u16 = 0;
|
||||||
|
var speech_min_amplitude: u16 = std.math.maxInt(u16);
|
||||||
|
|
||||||
while (!self.should_stop.load(.acquire)) {
|
while (!self.should_stop.load(.acquire)) {
|
||||||
const available = self.vosk_audio_buffer.available();
|
const available = self.vosk_audio_buffer.available();
|
||||||
|
|
@ -1002,7 +1022,11 @@ pub const Session = struct {
|
||||||
speech_samples = 0;
|
speech_samples = 0;
|
||||||
silence_samples = 0;
|
silence_samples = 0;
|
||||||
speech_pos = 0;
|
speech_pos = 0;
|
||||||
|
speech_max_amplitude = 0;
|
||||||
|
speech_min_amplitude = std.math.maxInt(u16);
|
||||||
}
|
}
|
||||||
|
if (max_amplitude > speech_max_amplitude) speech_max_amplitude = max_amplitude;
|
||||||
|
if (max_amplitude < speech_min_amplitude) speech_min_amplitude = max_amplitude;
|
||||||
const copy_len = @min(samples_read, speech_buffer.len - speech_pos);
|
const copy_len = @min(samples_read, speech_buffer.len - speech_pos);
|
||||||
@memcpy(speech_buffer[speech_pos .. speech_pos + copy_len], vosk_buffer[0..copy_len]);
|
@memcpy(speech_buffer[speech_pos .. speech_pos + copy_len], vosk_buffer[0..copy_len]);
|
||||||
speech_pos += copy_len;
|
speech_pos += copy_len;
|
||||||
|
|
@ -1016,7 +1040,14 @@ pub const Session = struct {
|
||||||
|
|
||||||
if (silence_samples >= silence_duration_ms * samples_per_ms) {
|
if (silence_samples >= silence_duration_ms * samples_per_ms) {
|
||||||
if (speech_samples >= min_speech_duration_ms * samples_per_ms) {
|
if (speech_samples >= min_speech_duration_ms * samples_per_ms) {
|
||||||
self.processVoskAudio(speech_buffer[0..speech_pos]) catch |err| {
|
const event = SpeechEvent{
|
||||||
|
// SAFETY: This will be defined in the next line when we process the audio data
|
||||||
|
.text = undefined,
|
||||||
|
.max_amplitude = speech_max_amplitude,
|
||||||
|
.min_amplitude = speech_min_amplitude,
|
||||||
|
.audio_data = speech_buffer[0..speech_pos],
|
||||||
|
};
|
||||||
|
self.processVoskAudio(event) catch |err| {
|
||||||
std.log.err("Error processing speech: {}", .{err});
|
std.log.err("Error processing speech: {}", .{err});
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
@ -1025,6 +1056,8 @@ pub const Session = struct {
|
||||||
speech_pos = 0;
|
speech_pos = 0;
|
||||||
speech_samples = 0;
|
speech_samples = 0;
|
||||||
silence_samples = 0;
|
silence_samples = 0;
|
||||||
|
speech_max_amplitude = 0;
|
||||||
|
speech_min_amplitude = std.math.maxInt(u16);
|
||||||
|
|
||||||
if (self.vosk_recognizer) |rec| {
|
if (self.vosk_recognizer) |rec| {
|
||||||
c.vosk_recognizer_reset(rec);
|
c.vosk_recognizer_reset(rec);
|
||||||
|
|
@ -1038,24 +1071,24 @@ pub const Session = struct {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Process complete audio clip with Vosk and handle results
|
/// Process complete audio clip with Vosk and handle results
|
||||||
fn processVoskAudio(self: *Session, audio_data: []const i16) !void {
|
fn processVoskAudio(self: *Session, event: SpeechEvent) !void {
|
||||||
if (self.vosk_recognizer == null) return Error.InvalidState;
|
if (self.vosk_recognizer == null) return Error.InvalidState;
|
||||||
|
|
||||||
const audio_bytes = std.mem.sliceAsBytes(audio_data);
|
const audio_bytes = std.mem.sliceAsBytes(event.audio_data);
|
||||||
_ = c.vosk_recognizer_accept_waveform(self.vosk_recognizer, audio_bytes.ptr, @intCast(audio_bytes.len));
|
_ = c.vosk_recognizer_accept_waveform(self.vosk_recognizer, audio_bytes.ptr, @intCast(audio_bytes.len));
|
||||||
|
|
||||||
// Get final result
|
// Get final result
|
||||||
const result_cstr = c.vosk_recognizer_final_result(self.vosk_recognizer);
|
const result_cstr = c.vosk_recognizer_final_result(self.vosk_recognizer);
|
||||||
if (result_cstr != null) {
|
if (result_cstr != null) {
|
||||||
const result_str = std.mem.span(result_cstr);
|
const result_str = std.mem.span(result_cstr);
|
||||||
self.parseVoskResult(result_str) catch |err| {
|
self.parseVoskResult(result_str, event) catch |err| {
|
||||||
self.options.event_handler.onError(err, "Failed to parse Vosk result");
|
self.options.event_handler.onError(err, "Failed to parse Vosk result");
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Parse Vosk JSON result and extract recognized text
|
/// Parse Vosk JSON result and extract recognized text
|
||||||
fn parseVoskResult(self: *Session, json_str: []const u8) !void {
|
fn parseVoskResult(self: *Session, json_str: []const u8, event: SpeechEvent) !void {
|
||||||
if (json_str.len == 0) return;
|
if (json_str.len == 0) return;
|
||||||
|
|
||||||
const text_key = "\"text\"";
|
const text_key = "\"text\"";
|
||||||
|
|
@ -1074,7 +1107,9 @@ pub const Session = struct {
|
||||||
if (std.mem.indexOf(u8, json_str[s..], "\"")) |quote_end| {
|
if (std.mem.indexOf(u8, json_str[s..], "\"")) |quote_end| {
|
||||||
const text = json_str[s .. s + quote_end];
|
const text = json_str[s .. s + quote_end];
|
||||||
if (text.len > 0 and !std.mem.eql(u8, text, " ")) {
|
if (text.len > 0 and !std.mem.eql(u8, text, " ")) {
|
||||||
self.options.event_handler.onSpeech(text);
|
var result_event = event;
|
||||||
|
result_event.text = text;
|
||||||
|
self.options.event_handler.onSpeech(result_event);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -1376,9 +1411,9 @@ test "Options validation" {
|
||||||
|
|
||||||
// Test valid options
|
// Test valid options
|
||||||
const DummyHandler = struct {
|
const DummyHandler = struct {
|
||||||
fn onSpeech(ctx: *anyopaque, text: []const u8) void {
|
fn onSpeech(ctx: *anyopaque, event: SpeechEvent) void {
|
||||||
_ = ctx;
|
_ = ctx;
|
||||||
_ = text;
|
_ = event;
|
||||||
}
|
}
|
||||||
fn onError(ctx: *anyopaque, error_code: Error, message: []const u8) void {
|
fn onError(ctx: *anyopaque, error_code: Error, message: []const u8) void {
|
||||||
_ = ctx;
|
_ = ctx;
|
||||||
|
|
@ -1412,9 +1447,9 @@ test "Session state management" {
|
||||||
const testing = std.testing;
|
const testing = std.testing;
|
||||||
|
|
||||||
const DummyHandler = struct {
|
const DummyHandler = struct {
|
||||||
fn onSpeech(ctx: *anyopaque, text: []const u8) void {
|
fn onSpeech(ctx: *anyopaque, event: SpeechEvent) void {
|
||||||
_ = ctx;
|
_ = ctx;
|
||||||
_ = text;
|
_ = event;
|
||||||
}
|
}
|
||||||
fn onError(ctx: *anyopaque, error_code: Error, message: []const u8) void {
|
fn onError(ctx: *anyopaque, error_code: Error, message: []const u8) void {
|
||||||
_ = ctx;
|
_ = ctx;
|
||||||
|
|
@ -1451,12 +1486,14 @@ test "SpeechEventHandler interface" {
|
||||||
speech_called: bool = false,
|
speech_called: bool = false,
|
||||||
error_called: bool = false,
|
error_called: bool = false,
|
||||||
last_text: []const u8 = "",
|
last_text: []const u8 = "",
|
||||||
|
last_amplitude: u16 = 0,
|
||||||
last_error: Error = Error.InitializationFailed,
|
last_error: Error = Error.InitializationFailed,
|
||||||
|
|
||||||
fn onSpeech(ctx: *anyopaque, text: []const u8) void {
|
fn onSpeech(ctx: *anyopaque, event: SpeechEvent) void {
|
||||||
const self: *@This() = @ptrCast(@alignCast(ctx));
|
const self: *@This() = @ptrCast(@alignCast(ctx));
|
||||||
self.speech_called = true;
|
self.speech_called = true;
|
||||||
self.last_text = text;
|
self.last_text = event.text;
|
||||||
|
self.last_amplitude = event.max_amplitude;
|
||||||
}
|
}
|
||||||
|
|
||||||
fn onError(ctx: *anyopaque, error_code: Error, message: []const u8) void {
|
fn onError(ctx: *anyopaque, error_code: Error, message: []const u8) void {
|
||||||
|
|
@ -1475,9 +1512,10 @@ test "SpeechEventHandler interface" {
|
||||||
};
|
};
|
||||||
|
|
||||||
// Test speech callback
|
// Test speech callback
|
||||||
event_handler.onSpeech("hello world");
|
event_handler.onSpeech(.{ .text = "hello world" });
|
||||||
try testing.expect(handler.speech_called);
|
try testing.expect(handler.speech_called);
|
||||||
try testing.expectEqualStrings("hello world", handler.last_text);
|
try testing.expectEqualStrings("hello world", handler.last_text);
|
||||||
|
try testing.expect(handler.last_amplitude == std.math.maxInt(u16));
|
||||||
|
|
||||||
// Test error callback
|
// Test error callback
|
||||||
event_handler.onError(Error.AudioDeviceError, "test error");
|
event_handler.onError(Error.AudioDeviceError, "test error");
|
||||||
|
|
@ -1615,10 +1653,10 @@ test "Session session management API" {
|
||||||
speech_count: u32 = 0,
|
speech_count: u32 = 0,
|
||||||
error_count: u32 = 0,
|
error_count: u32 = 0,
|
||||||
|
|
||||||
fn onSpeech(ctx: *anyopaque, text: []const u8) void {
|
fn onSpeech(ctx: *anyopaque, event: SpeechEvent) void {
|
||||||
const self: *@This() = @ptrCast(@alignCast(ctx));
|
const self: *@This() = @ptrCast(@alignCast(ctx));
|
||||||
self.speech_count += 1;
|
self.speech_count += 1;
|
||||||
_ = text;
|
_ = event;
|
||||||
}
|
}
|
||||||
|
|
||||||
fn onError(ctx: *anyopaque, error_code: Error, message: []const u8) void {
|
fn onError(ctx: *anyopaque, error_code: Error, message: []const u8) void {
|
||||||
|
|
|
||||||
18
src/test.zig
18
src/test.zig
|
|
@ -57,9 +57,9 @@ const TestEventHandler = struct {
|
||||||
self.detailed_error_events.deinit(self.allocator);
|
self.detailed_error_events.deinit(self.allocator);
|
||||||
}
|
}
|
||||||
|
|
||||||
fn onSpeech(ctx: *anyopaque, text: []const u8) void {
|
fn onSpeech(ctx: *anyopaque, event: stt.SpeechEvent) void {
|
||||||
const self: *TestEventHandler = @ptrCast(@alignCast(ctx));
|
const self: *TestEventHandler = @ptrCast(@alignCast(ctx));
|
||||||
const owned_text = self.allocator.dupe(u8, text) catch return;
|
const owned_text = self.allocator.dupe(u8, event.text) catch return;
|
||||||
self.speech_events.append(self.allocator, owned_text) catch return;
|
self.speech_events.append(self.allocator, owned_text) catch return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -154,7 +154,7 @@ test "SpeechEventHandler callback invocation" {
|
||||||
const speech_handler = test_handler.getSpeechEventHandler();
|
const speech_handler = test_handler.getSpeechEventHandler();
|
||||||
|
|
||||||
// Test speech callback
|
// Test speech callback
|
||||||
speech_handler.onSpeech("Hello world");
|
speech_handler.onSpeech(.{ .text = "Hello world" });
|
||||||
try testing.expect(test_handler.speech_events.items.len == 1);
|
try testing.expect(test_handler.speech_events.items.len == 1);
|
||||||
try testing.expectEqualStrings("Hello world", test_handler.speech_events.items[0]);
|
try testing.expectEqualStrings("Hello world", test_handler.speech_events.items[0]);
|
||||||
|
|
||||||
|
|
@ -351,13 +351,13 @@ test "Callback error handling robustness" {
|
||||||
for (0..100) |i| {
|
for (0..100) |i| {
|
||||||
const text = std.fmt.allocPrint(test_allocator, "Speech event {}", .{i}) catch continue;
|
const text = std.fmt.allocPrint(test_allocator, "Speech event {}", .{i}) catch continue;
|
||||||
defer test_allocator.free(text);
|
defer test_allocator.free(text);
|
||||||
speech_handler.onSpeech(text);
|
speech_handler.onSpeech(.{ .text = text });
|
||||||
}
|
}
|
||||||
|
|
||||||
try testing.expect(test_handler.speech_events.items.len == 100);
|
try testing.expect(test_handler.speech_events.items.len == 100);
|
||||||
|
|
||||||
// Test mixed callback types
|
// Test mixed callback types
|
||||||
speech_handler.onSpeech("Final speech");
|
speech_handler.onSpeech(.{ .text = "Final speech" });
|
||||||
speech_handler.onError(stt.Error.CallbackError, "Callback error");
|
speech_handler.onError(stt.Error.CallbackError, "Callback error");
|
||||||
|
|
||||||
const final_error = stt.ErrorInfo.init(stt.Error.InternalError, "Internal error");
|
const final_error = stt.ErrorInfo.init(stt.Error.InternalError, "Internal error");
|
||||||
|
|
@ -387,7 +387,7 @@ test "Memory management and resource cleanup" {
|
||||||
defer test_handler.deinit(); // Should not leak memory
|
defer test_handler.deinit(); // Should not leak memory
|
||||||
|
|
||||||
const speech_handler = test_handler.getSpeechEventHandler();
|
const speech_handler = test_handler.getSpeechEventHandler();
|
||||||
speech_handler.onSpeech("Test speech");
|
speech_handler.onSpeech(.{ .text = "Test speech" });
|
||||||
speech_handler.onError(stt.Error.AudioDeviceError, "Test error");
|
speech_handler.onError(stt.Error.AudioDeviceError, "Test error");
|
||||||
|
|
||||||
const error_info = stt.ErrorInfo.initWithContext(stt.Error.ModelLoadError, "Test detailed error", "test context");
|
const error_info = stt.ErrorInfo.initWithContext(stt.Error.ModelLoadError, "Test detailed error", "test context");
|
||||||
|
|
@ -461,15 +461,15 @@ test "Complete workflow simulation" {
|
||||||
try testing.expect(processed_count == audio_samples.len);
|
try testing.expect(processed_count == audio_samples.len);
|
||||||
|
|
||||||
// 3. Speech detection phase
|
// 3. Speech detection phase
|
||||||
speech_handler.onSpeech("Hello world");
|
speech_handler.onSpeech(.{ .text = "Hello world" });
|
||||||
speech_handler.onSpeech("This is a test");
|
speech_handler.onSpeech(.{ .text = "This is a test" });
|
||||||
|
|
||||||
// 4. Error handling phase
|
// 4. Error handling phase
|
||||||
const recoverable_error = stt.ErrorInfo.initRecoverable(stt.Error.AudioDeviceBusy, "Audio device temporarily busy", "Retrying in 100ms");
|
const recoverable_error = stt.ErrorInfo.initRecoverable(stt.Error.AudioDeviceBusy, "Audio device temporarily busy", "Retrying in 100ms");
|
||||||
speech_handler.onDetailedError(recoverable_error);
|
speech_handler.onDetailedError(recoverable_error);
|
||||||
|
|
||||||
// 5. Recovery phase
|
// 5. Recovery phase
|
||||||
speech_handler.onSpeech("Speech recognition resumed");
|
speech_handler.onSpeech(.{ .text = "Speech recognition resumed" });
|
||||||
|
|
||||||
// 6. Cleanup phase
|
// 6. Cleanup phase
|
||||||
const cleanup_info = stt.ErrorInfo.initRecoverable(stt.Error.InternalError, "STT session cleanup completed", "All resources freed");
|
const cleanup_info = stt.ErrorInfo.initRecoverable(stt.Error.InternalError, "STT session cleanup completed", "All resources freed");
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue