From 446c146dedb2849a2b6f1701ec72c2b58839de14 Mon Sep 17 00:00:00 2001 From: Emil Lerch Date: Wed, 10 Sep 2025 12:19:56 -0700 Subject: [PATCH] ai generated - no review yet --- build.zig | 74 ++- src/main.zig | 178 +++--- src/root.zig | 1550 +++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 1688 insertions(+), 114 deletions(-) diff --git a/build.zig b/build.zig index b754d11..fdae3d6 100644 --- a/build.zig +++ b/build.zig @@ -4,7 +4,6 @@ pub fn build(b: *std.Build) void { const target = b.standardTargetOptions(.{}); const optimize = b.standardOptimizeOption(.{}); - // Select Vosk dependency based on target const vosk_dep_name = selectVoskDependency(target.result); const vosk_dep = b.dependency(vosk_dep_name, .{}); const alsa_dep = b.dependency("alsa", .{ @@ -24,24 +23,49 @@ pub fn build(b: *std.Build) void { install_model.step.dependOn(&model_step.step); b.getInstallStep().dependOn(&install_model.step); - const exe = b.addExecutable(.{ + // Create the STT library + const stt_lib = b.addLibrary(.{ .name = "stt", + .linkage = .static, + .root_module = b.createModule(.{ + .root_source_file = b.path("src/root.zig"), + .target = target, + .optimize = optimize, + .link_libc = true, + }), + }); + + // Link with Vosk library + stt_lib.addIncludePath(vosk_dep.path("")); + stt_lib.addLibraryPath(vosk_dep.path("")); + stt_lib.linkSystemLibrary("vosk"); + + const alsa_lib = alsa_dep.artifact("asound"); + stt_lib.linkLibrary(alsa_lib); + stt_lib.addIncludePath(alsa_dep.path("zig-out/include")); + + b.installArtifact(stt_lib); + + // Create the demo executable + const exe = b.addExecutable(.{ + .name = "stt-demo", .root_module = b.createModule(.{ .root_source_file = b.path("src/main.zig"), .target = target, .optimize = optimize, + .link_libc = true, }), }); - exe.linkLibC(); + exe.linkLibrary(stt_lib); + exe.linkLibrary(alsa_lib); + exe.addIncludePath(alsa_dep.path("zig-out/include")); + + // Link with Vosk for the executable exe.addIncludePath(vosk_dep.path("")); exe.addLibraryPath(vosk_dep.path("")); exe.linkSystemLibrary("vosk"); - const alsa_lib = alsa_dep.artifact("asound"); - exe.linkLibrary(alsa_lib); - exe.addIncludePath(alsa_dep.path("zig-out/include")); - b.installArtifact(exe); const run_step = b.step("run", "Run the app"); @@ -52,21 +76,47 @@ pub fn build(b: *std.Build) void { if (b.args) |args| { run_cmd.addArgs(args); } - // Creates a step for unit testing. This only builds the test executable - // but does not run it. 
+    // Creates a step for unit testing the library
+    const lib_unit_tests = b.addTest(.{
+        .root_module = b.createModule(.{
+            .root_source_file = b.path("src/root.zig"),
+            .target = target,
+            .optimize = optimize,
+            .link_libc = true,
+        }),
+    });
+
+    // Link the same dependencies as the library
+    lib_unit_tests.linkLibrary(alsa_lib);
+    lib_unit_tests.addIncludePath(alsa_dep.path("zig-out/include"));
+    lib_unit_tests.addIncludePath(vosk_dep.path(""));
+    lib_unit_tests.addLibraryPath(vosk_dep.path(""));
+    lib_unit_tests.linkSystemLibrary("vosk");
+
+    const run_lib_unit_tests = b.addRunArtifact(lib_unit_tests);
+
+    // Creates a step for unit testing the demo application
     const exe_unit_tests = b.addTest(.{
+        .root_module = b.createModule(.{
+            .root_source_file = b.path("src/main.zig"),
             .target = target,
             .optimize = optimize,
+            .link_libc = true,
         }),
     });
+
+    exe_unit_tests.linkLibrary(stt_lib);
+    exe_unit_tests.linkLibrary(alsa_lib);
+    exe_unit_tests.addIncludePath(alsa_dep.path("zig-out/include"));
+    exe_unit_tests.addIncludePath(vosk_dep.path(""));
+    exe_unit_tests.addLibraryPath(vosk_dep.path(""));
+    exe_unit_tests.linkSystemLibrary("vosk");
+
     const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests);
-    // Similar to creating the run step earlier, this exposes a `test` step to
-    // the `zig build --help` menu, providing a way for the user to request
-    // running the unit tests.
+    // Test step that runs both library and demo tests
     const test_step = b.step("test", "Run unit tests");
+    test_step.dependOn(&run_lib_unit_tests.step);
     test_step.dependOn(&run_exe_unit_tests.step);
 }
diff --git a/src/main.zig b/src/main.zig
index deee6b0..94cafaa 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -1,103 +1,105 @@
-const std = @import("std");
-const c = @cImport({
-    @cInclude("vosk_api.h");
-    @cInclude("alsa/asoundlib.h");
-});
+//! STT Library Demo Application
+//!
+//! This demonstrates how to use the STT library for speech recognition:
+//! it wires a simple speech/error event handler into an stt.SttSession.
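+//! Run the demo with `zig build run` (the "run" step defined in build.zig).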
-const VOSK_SAMPLE_RATE = 16000; -const BUFFER_SIZE = 256; +const std = @import("std"); +const stt = @import("root.zig"); + +/// Demo implementation of speech event handler +const DemoHandler = struct { + /// Handle detected speech + fn onSpeech(ctx: *anyopaque, text: []const u8) void { + const self: *DemoHandler = @ptrCast(@alignCast(ctx)); + _ = self; // Handler context not used in this simple demo + + std.debug.print("Detected: {s}\n", .{text}); + } + + /// Handle errors + fn onError(ctx: *anyopaque, error_code: stt.SttError, message: []const u8) void { + const self: *DemoHandler = @ptrCast(@alignCast(ctx)); + _ = self; // Handler context not used in this simple demo + + std.debug.print("Error {}: {s}\n", .{ error_code, message }); + } +}; pub fn main() !void { var gpa = std.heap.GeneralPurposeAllocator(.{}){}; defer _ = gpa.deinit(); + const allocator = gpa.allocator(); - // Set ALSA config path to our local alsa.conf - _ = c.setenv("ALSA_CONFIG_PATH", "alsa.conf", 1); + std.debug.print("STT Library Demo\n", .{}); + std.debug.print("================\n", .{}); - // Initialize Vosk - c.vosk_set_log_level(-1); - const model = c.vosk_model_new("zig-out/bin/vosk-model-small-en-us-0.15"); - if (model == null) { - std.debug.print("Failed to load model\n", .{}); + // Create demo handler + var demo_handler = DemoHandler{}; + const speech_handler = stt.SpeechEventHandler{ + .onSpeechFn = DemoHandler.onSpeech, + .onErrorFn = DemoHandler.onError, + .ctx = &demo_handler, + }; + + // Initialize STT session with configuration + const options = stt.SttOptions{ + .model_path = "zig-out/bin/vosk-model-small-en-us-0.15", + .audio_device = "hw:3,0", + .event_handler = speech_handler, + .sample_rate = 16000, + .channels = 2, + .buffer_size = 256, + }; + + var session = stt.SttSession.init(allocator, options) catch |err| { + std.debug.print("Failed to initialize STT library: {}\n", .{err}); return; - } - defer c.vosk_model_free(model); + }; + defer session.deinit(); - const rec = c.vosk_recognizer_new(model, VOSK_SAMPLE_RATE); - if (rec == null) { - std.debug.print("Failed to create recognizer\n", .{}); + std.debug.print("STT library initialized successfully\n", .{}); + std.debug.print("Model path: {s}\n", .{options.model_path}); + std.debug.print("Audio device: {s}\n", .{options.audio_device}); + std.debug.print("Sample rate: {} Hz\n", .{options.sample_rate}); + std.debug.print("Channels: {}\n", .{options.channels}); + std.debug.print("Buffer size: {} frames\n", .{options.buffer_size}); + std.debug.print("\n", .{}); + + // Start listening for speech + session.start_listening() catch |err| { + std.debug.print("Failed to start listening: {}\n", .{err}); return; - } - defer c.vosk_recognizer_free(rec); + }; - // Try to open hardware capture device directly - var handle: ?*c.snd_pcm_t = null; - var err = c.snd_pcm_open(&handle, "hw:3,0", c.SND_PCM_STREAM_CAPTURE, c.SND_PCM_NONBLOCK); - if (err < 0) { - std.debug.print("Cannot open audio device: {s}\n", .{c.snd_strerror(err)}); - std.debug.print("Make sure no other applications are using the microphone\n", .{}); - return; - } - defer _ = c.snd_pcm_close(handle); + std.debug.print("Listening for speech... 
(Press Enter to exit)\n", .{}); - // Set to blocking mode - err = c.snd_pcm_nonblock(handle, 0); - if (err < 0) { - std.debug.print("Cannot set blocking mode: {s}\n", .{c.snd_strerror(err)}); - return; - } + // Wait for user input to exit (simulating Ctrl+C behavior) + // In subsequent tasks, this will be replaced with actual audio processing + const stdin = std.fs.File.stdin(); + var buffer: [1]u8 = undefined; + _ = stdin.read(&buffer) catch {}; - // Configure audio parameters - err = c.snd_pcm_set_params(handle, c.SND_PCM_FORMAT_S16_LE, c.SND_PCM_ACCESS_RW_INTERLEAVED, 2, VOSK_SAMPLE_RATE, 1, 100000); - if (err < 0) { - std.debug.print("Cannot configure audio: {s}\n", .{c.snd_strerror(err)}); - return; - } + std.debug.print("\nStopping speech recognition...\n", .{}); + session.stop_listening(); - // Prepare the PCM device - err = c.snd_pcm_prepare(handle); - if (err < 0) { - std.debug.print("Cannot prepare audio: {s}\n", .{c.snd_strerror(err)}); - return; - } - - // Start the PCM stream - err = c.snd_pcm_start(handle); - if (err < 0) { - std.debug.print("Cannot start audio: {s}\n", .{c.snd_strerror(err)}); - return; - } - - std.debug.print("Audio configured successfully\n", .{}); - std.debug.print("Listening... (Ctrl+C to exit)\n", .{}); - - var buffer: [BUFFER_SIZE * 2]i16 = undefined; // stereo - var accumulator: [VOSK_SAMPLE_RATE]i16 = undefined; // 1 second buffer - var acc_pos: usize = 0; - - while (true) { - const frames_read = c.snd_pcm_readi(handle, &buffer, BUFFER_SIZE); - if (frames_read < 0) { - _ = c.snd_pcm_recover(handle, @intCast(frames_read), 1); - continue; - } - - // Convert stereo to mono and accumulate - for (0..@intCast(frames_read)) |i| { - if (acc_pos < accumulator.len) { - accumulator[acc_pos] = buffer[i * 2]; // left channel - acc_pos += 1; - } - } - - // Process when we have enough data (0.1 seconds) - if (acc_pos >= VOSK_SAMPLE_RATE / 10) { - const result = c.vosk_recognizer_accept_waveform(rec, @ptrCast(&accumulator), @intCast(acc_pos * 2)); - if (result != 0) { - const text = c.vosk_recognizer_result(rec); - std.debug.print("{s}\n", .{text}); - } - acc_pos = 0; - } - } + std.debug.print("Demo completed successfully\n", .{}); +} + +// Test the demo functionality +test "demo handler functionality" { + const testing = std.testing; + + var demo_handler = DemoHandler{}; + const speech_handler = stt.SpeechEventHandler{ + .onSpeechFn = DemoHandler.onSpeech, + .onErrorFn = DemoHandler.onError, + .ctx = &demo_handler, + }; + + // Test that callbacks can be invoked without crashing + speech_handler.onSpeech("test speech"); + speech_handler.onError(stt.SttError.AudioDeviceError, "test error"); + + // If we get here without crashing, the test passes + try testing.expect(true); } diff --git a/src/root.zig b/src/root.zig index 94c7cd0..882b636 100644 --- a/src/root.zig +++ b/src/root.zig @@ -1,23 +1,1545 @@ -//! By convention, root.zig is the root source file when making a library. +//! STT (Speech-to-Text) Library +//! +//! This library provides callback-based speech recognition functionality +//! using Vosk and ALSA for audio capture. + const std = @import("std"); +const c = @cImport({ + @cInclude("alsa/asoundlib.h"); + @cInclude("vosk_api.h"); +}); -pub fn bufferedPrint() !void { - // Stdout is for the actual output of your application, for example if you - // are implementing gzip, then only the compressed bytes should be sent to - // stdout, not any debugging messages. 
-    var stdout_buffer: [1024]u8 = undefined;
-    var stdout_writer = std.fs.File.stdout().writer(&stdout_buffer);
-    const stdout = &stdout_writer.interface;
+/// Core error types for the STT library
+pub const SttError = error{
+    /// Failed to initialize the library (model loading, audio setup, etc.)
+    InitializationFailed,
+    /// Audio device access or configuration error
+    AudioDeviceError,
+    /// Failed to load the speech recognition model
+    ModelLoadError,
+    /// Error occurred during callback execution
+    CallbackError,
+    /// Memory allocation failed
+    OutOfMemory,
+    /// Invalid parameters provided
+    InvalidParameter,
+    /// Library is not in the correct state for the operation
+    InvalidState,
+    /// Threading or synchronization error
+    ThreadingError,
+};
-    try stdout.print("Run `zig build test` to run the tests.\n", .{});
+/// Callback function type for speech detection events
+///
+/// Parameters:
+/// - text: Null-terminated string containing the detected speech
+/// - user_data: Optional user-provided context data
+pub const SpeechCallback = *const fn (text: [*:0]const u8, user_data: ?*anyopaque) void;
-    try stdout.flush(); // Don't forget to flush!
+/// Callback function type for error events
+///
+/// Parameters:
+/// - error_code: The specific error that occurred
+/// - message: Null-terminated string with error details
+/// - user_data: Optional user-provided context data
+pub const ErrorCallback = *const fn (error_code: SttError, message: [*:0]const u8, user_data: ?*anyopaque) void;
+
+/// Speech event handler interface pattern
+///
+/// This provides a structured way to handle speech recognition events
+/// with both speech detection and error handling callbacks.
+pub const SpeechEventHandler = struct {
+    /// Function to call when speech is detected
+    onSpeechFn: *const fn (ctx: *anyopaque, text: []const u8) void,
+    /// Function to call when an error occurs
+    onErrorFn: *const fn (ctx: *anyopaque, error_code: SttError, message: []const u8) void,
+    /// Context pointer passed to callback functions
+    ctx: *anyopaque,
+
+    /// Invoke the speech detection callback
+    pub fn onSpeech(self: SpeechEventHandler, text: []const u8) void {
+        self.onSpeechFn(self.ctx, text);
+    }
+
+    /// Invoke the error callback
+    pub fn onError(self: SpeechEventHandler, error_code: SttError, message: []const u8) void {
+        self.onErrorFn(self.ctx, error_code, message);
+    }
+};
+
+/// Mutex-guarded ring buffer that carries audio samples between the capture
+/// and recognition threads
+pub const AudioBuffer = struct {
+    const Self = @This();
+
+    /// Internal ring buffer for audio data
+    buffer: []i16,
+    /// Read position in the buffer
+    read_pos: usize = 0,
+    /// Write position in the buffer
+    write_pos: usize = 0,
+    /// Number of samples currently in buffer
+    count: usize = 0,
+    /// Mutex for thread-safe access
+    mutex: std.Thread.Mutex = .{},
+    /// Allocator used for buffer allocation
+    allocator: std.mem.Allocator,
+
+    /// Initialize audio buffer with specified capacity
+    pub fn init(allocator: std.mem.Allocator, buffer_capacity: usize) !Self {
+        const buffer = try allocator.alloc(i16, buffer_capacity);
+        return Self{
+            .buffer = buffer,
+            .allocator = allocator,
+        };
+    }
+
+    /// Deinitialize and free buffer memory
+    pub fn deinit(self: *Self) void {
+        self.allocator.free(self.buffer);
+    }
+
+    /// Write audio samples to the buffer (thread-safe)
+    pub fn write(self: *Self, samples: []const i16) usize {
+        self.mutex.lock();
+        defer self.mutex.unlock();
+
+        const available_space = self.buffer.len - self.count;
+        const to_write = @min(samples.len,
available_space); + + for (0..to_write) |i| { + self.buffer[self.write_pos] = samples[i]; + self.write_pos = (self.write_pos + 1) % self.buffer.len; + } + + self.count += to_write; + return to_write; + } + + /// Read audio samples from the buffer (thread-safe) + pub fn read(self: *Self, samples: []i16) usize { + self.mutex.lock(); + defer self.mutex.unlock(); + + const to_read = @min(samples.len, self.count); + + for (0..to_read) |i| { + samples[i] = self.buffer[self.read_pos]; + self.read_pos = (self.read_pos + 1) % self.buffer.len; + } + + self.count -= to_read; + return to_read; + } + + /// Get number of samples available for reading + pub fn available(self: *Self) usize { + self.mutex.lock(); + defer self.mutex.unlock(); + return self.count; + } + + /// Get remaining capacity for writing + pub fn capacity(self: *Self) usize { + self.mutex.lock(); + defer self.mutex.unlock(); + return self.buffer.len - self.count; + } + + /// Clear all data from buffer + pub fn clear(self: *Self) void { + self.mutex.lock(); + defer self.mutex.unlock(); + self.read_pos = 0; + self.write_pos = 0; + self.count = 0; + } +}; + +/// Audio format conversion utilities +pub const AudioConverter = struct { + /// Convert stereo samples to mono by averaging channels + pub fn stereoToMono(stereo_samples: []const i16, mono_samples: []i16) usize { + const frames = @min(stereo_samples.len / 2, mono_samples.len); + + for (0..frames) |i| { + const left = stereo_samples[i * 2]; + const right = stereo_samples[i * 2 + 1]; + // Average the channels and clamp to prevent overflow + const avg: i32 = @divTrunc(@as(i32, left) + @as(i32, right), 2); + mono_samples[i] = @intCast(@max(@min(avg, std.math.maxInt(i16)), std.math.minInt(i16))); + } + + return frames; + } + + /// Simple sample rate conversion (basic linear interpolation) + /// Note: This is a basic implementation. 
For production use, consider more sophisticated algorithms + pub fn resample(input_samples: []const i16, output_samples: []i16, input_rate: u32, output_rate: u32) usize { + if (input_rate == output_rate) { + const copy_len = @min(input_samples.len, output_samples.len); + @memcpy(output_samples[0..copy_len], input_samples[0..copy_len]); + return copy_len; + } + + const ratio = @as(f64, @floatFromInt(input_rate)) / @as(f64, @floatFromInt(output_rate)); + const output_len = @min(output_samples.len, @as(usize, @intFromFloat(@as(f64, @floatFromInt(input_samples.len)) / ratio))); + + for (0..output_len) |i| { + const src_pos = @as(f64, @floatFromInt(i)) * ratio; + const src_idx: usize = @intFromFloat(src_pos); + + if (src_idx >= input_samples.len) break; + + if (src_idx + 1 < input_samples.len) { + // Linear interpolation + const frac = src_pos - @as(f64, @floatFromInt(src_idx)); + const sample1: f64 = @floatFromInt(input_samples[src_idx]); + const sample2: f64 = @floatFromInt(input_samples[src_idx + 1]); + const interpolated = sample1 + (sample2 - sample1) * frac; + output_samples[i] = @intFromFloat(@max(@min(interpolated, std.math.maxInt(i16)), std.math.minInt(i16))); + } else { + output_samples[i] = input_samples[src_idx]; + } + } + + return output_len; + } +}; + +/// ALSA audio capture configuration and state +pub const AlsaCapture = struct { + const Self = @This(); + + /// ALSA PCM handle + pcm_handle: ?*c.snd_pcm_t = null, + /// Device name + device_name: []const u8, + /// Sample rate + sample_rate: u32, + /// Number of channels + channels: u32, + /// Buffer size in frames + buffer_size: u32, + /// Period size in frames + period_size: u32, + /// Audio buffer for captured data + audio_buffer: AudioBuffer, + /// Temporary buffer for ALSA reads + temp_buffer: []i16, + /// Allocator for memory management + allocator: std.mem.Allocator, + + /// Initialize ALSA capture with specified parameters + pub fn init(allocator: std.mem.Allocator, device_name: []const u8, sample_rate: u32, channels: u32, buffer_size: u32) !Self { + // Calculate period size (typically 1/4 of buffer size) + const period_size = buffer_size / 4; + + // Create audio buffer (make it larger than ALSA buffer to prevent overruns) + const audio_buffer = try AudioBuffer.init(allocator, buffer_size * 4); + + // Allocate temporary buffer for ALSA reads + const temp_buffer = try allocator.alloc(i16, period_size * channels); + + return Self{ + .device_name = device_name, + .sample_rate = sample_rate, + .channels = channels, + .buffer_size = buffer_size, + .period_size = period_size, + .audio_buffer = audio_buffer, + .temp_buffer = temp_buffer, + .allocator = allocator, + }; + } + + /// Deinitialize ALSA capture and free resources + pub fn deinit(self: *Self) void { + self.close(); + self.audio_buffer.deinit(); + self.allocator.free(self.temp_buffer); + } + + /// Open ALSA device and configure parameters + pub fn open(self: *Self) !void { + // Convert device name to null-terminated string + const device_cstr = try self.allocator.dupeZ(u8, self.device_name); + defer self.allocator.free(device_cstr); + + // Open PCM device + var err = c.snd_pcm_open(&self.pcm_handle, device_cstr.ptr, c.SND_PCM_STREAM_CAPTURE, 0); + if (err < 0) { + return SttError.AudioDeviceError; + } + + // Allocate hardware parameters structure + var hw_params: ?*c.snd_pcm_hw_params_t = null; + err = c.snd_pcm_hw_params_malloc(@ptrCast(&hw_params)); + if (err < 0) { + self.close(); + return SttError.AudioDeviceError; + } + defer c.snd_pcm_hw_params_free(hw_params); + + 
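+        // The calls below follow the usual ALSA capture setup sequence: fill
+        // hw_params with the device's full configuration space, narrow it down
+        // (access, format, channels, rate, buffer/period size), apply it, and
+        // prepare the PCM so capture can begin on the first read.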
// Initialize hardware parameters + err = c.snd_pcm_hw_params_any(self.pcm_handle, hw_params); + if (err < 0) { + self.close(); + return SttError.AudioDeviceError; + } + + // Set access type to interleaved + err = c.snd_pcm_hw_params_set_access(self.pcm_handle, hw_params, c.SND_PCM_ACCESS_RW_INTERLEAVED); + if (err < 0) { + self.close(); + return SttError.AudioDeviceError; + } + + // Set sample format to 16-bit signed little endian + err = c.snd_pcm_hw_params_set_format(self.pcm_handle, hw_params, c.SND_PCM_FORMAT_S16_LE); + if (err < 0) { + self.close(); + return SttError.AudioDeviceError; + } + + // Set number of channels + err = c.snd_pcm_hw_params_set_channels(self.pcm_handle, hw_params, self.channels); + if (err < 0) { + self.close(); + return SttError.AudioDeviceError; + } + + // Set sample rate + var actual_rate = self.sample_rate; + err = c.snd_pcm_hw_params_set_rate_near(self.pcm_handle, hw_params, &actual_rate, null); + if (err < 0) { + self.close(); + return SttError.AudioDeviceError; + } + + // Set buffer size + var actual_buffer_size: c.snd_pcm_uframes_t = self.buffer_size; + err = c.snd_pcm_hw_params_set_buffer_size_near(self.pcm_handle, hw_params, &actual_buffer_size); + if (err < 0) { + self.close(); + return SttError.AudioDeviceError; + } + + // Set period size + var actual_period_size: c.snd_pcm_uframes_t = self.period_size; + err = c.snd_pcm_hw_params_set_period_size_near(self.pcm_handle, hw_params, &actual_period_size, null); + if (err < 0) { + self.close(); + return SttError.AudioDeviceError; + } + + // Apply hardware parameters + err = c.snd_pcm_hw_params(self.pcm_handle, hw_params); + if (err < 0) { + self.close(); + return SttError.AudioDeviceError; + } + + // Prepare the PCM for use + err = c.snd_pcm_prepare(self.pcm_handle); + if (err < 0) { + self.close(); + return SttError.AudioDeviceError; + } + } + + /// Close ALSA device + pub fn close(self: *Self) void { + if (self.pcm_handle) |handle| { + _ = c.snd_pcm_close(handle); + self.pcm_handle = null; + } + } + + /// Read audio data from ALSA device and process it + pub fn readAudio(self: *Self) !usize { + if (self.pcm_handle == null) { + return SttError.AudioDeviceError; + } + + // Read audio data from ALSA + const frames_read = c.snd_pcm_readi(self.pcm_handle, self.temp_buffer.ptr, self.period_size); + + if (frames_read < 0) { + // Handle underrun or other errors + if (frames_read == -c.EPIPE) { + // Underrun occurred, try to recover + const err = c.snd_pcm_prepare(self.pcm_handle); + if (err < 0) { + return SttError.AudioDeviceError; + } + return 0; // No data read this time + } else { + return SttError.AudioDeviceError; + } + } + + const samples_read = @as(usize, @intCast(frames_read)) * self.channels; + + // Process audio based on channel configuration + if (self.channels == 1) { + // Mono input - write directly to buffer + _ = self.audio_buffer.write(self.temp_buffer[0..samples_read]); + } else if (self.channels == 2) { + // Stereo input - convert to mono + const mono_buffer = try self.allocator.alloc(i16, @as(usize, @intCast(frames_read))); + defer self.allocator.free(mono_buffer); + + const mono_samples = AudioConverter.stereoToMono(self.temp_buffer[0..samples_read], mono_buffer); + _ = self.audio_buffer.write(mono_buffer[0..mono_samples]); + } else { + // Multi-channel input - take first channel only + const mono_buffer = try self.allocator.alloc(i16, @as(usize, @intCast(frames_read))); + defer self.allocator.free(mono_buffer); + + for (0..@as(usize, @intCast(frames_read))) |i| { + mono_buffer[i] = 
self.temp_buffer[i * self.channels]; + } + _ = self.audio_buffer.write(mono_buffer); + } + + return @intCast(frames_read); + } + + /// Get processed audio samples (mono, at configured sample rate) + pub fn getAudioSamples(self: *Self, output_buffer: []i16) usize { + return self.audio_buffer.read(output_buffer); + } + + /// Get number of samples available for reading + pub fn availableSamples(self: *Self) usize { + return self.audio_buffer.available(); + } +}; + +/// Configuration options for STT session initialization +pub const SttOptions = struct { + /// Path to the Vosk model directory + model_path: []const u8, + /// ALSA audio device name (e.g., "hw:3,0") + audio_device: []const u8, + /// Speech event handler for callbacks + event_handler: SpeechEventHandler, + /// Sample rate for audio processing (default: 16000) + sample_rate: u32 = 16000, + /// Number of audio channels (default: 2 for stereo) + channels: u32 = 2, + /// Audio buffer size in frames (default: 256) + buffer_size: u32 = 256, +}; + +/// Main STT session handle +/// +/// This represents an active speech-to-text session with configured +/// audio input and speech recognition model. +pub const SttSession = struct { + const Self = @This(); + + /// Memory allocator + allocator: std.mem.Allocator, + /// Configuration options + options: SttOptions, + /// Initialization state + initialized: bool = false, + /// Listening state + listening: bool = false, + /// ALSA audio capture + alsa_capture: ?AlsaCapture = null, + /// Audio capture thread + audio_thread: ?std.Thread = null, + /// Processing thread for Vosk + processing_thread: ?std.Thread = null, + /// Thread synchronization + should_stop: std.atomic.Value(bool) = std.atomic.Value(bool).init(false), + /// Processing buffer for audio samples + processing_buffer: []i16, + /// Vosk model + vosk_model: ?*c.VoskModel = null, + /// Vosk recognizer + vosk_recognizer: ?*c.VoskRecognizer = null, + /// Audio buffer for Vosk processing + vosk_audio_buffer: AudioBuffer, + + /// Initialize a new STT session with the given options + /// + /// Parameters: + /// - allocator: Memory allocator to use for the session + /// - options: Configuration options for the session + /// + /// Returns: + /// - SttSession instance on success + /// - SttError on failure + pub fn init(allocator: std.mem.Allocator, options: SttOptions) SttError!SttSession { + // Validate options first + try validateOptions(options); + + // Allocate processing buffer for audio samples (1 second worth of samples) + const processing_buffer = allocator.alloc(i16, options.sample_rate) catch { + return SttError.OutOfMemory; + }; + + // Initialize ALSA capture + const alsa_capture = AlsaCapture.init( + allocator, + options.audio_device, + options.sample_rate, + options.channels, + options.buffer_size, + ) catch { + allocator.free(processing_buffer); + return SttError.InitializationFailed; + }; + + // Initialize Vosk audio buffer (larger buffer for processing) + const vosk_audio_buffer = AudioBuffer.init(allocator, options.sample_rate * 2) catch { + allocator.free(processing_buffer); + var alsa_capture_mut = alsa_capture; + alsa_capture_mut.deinit(); + return SttError.OutOfMemory; + }; + + var session = SttSession{ + .allocator = allocator, + .options = options, + .alsa_capture = alsa_capture, + .processing_buffer = processing_buffer, + .vosk_audio_buffer = vosk_audio_buffer, + }; + + // Initialize Vosk model and recognizer + session.initVosk() catch { + session.deinitPartial(); + return SttError.ModelLoadError; + }; + + 
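+        // SttSession is returned by value; the heap buffers and Vosk pointers
+        // created above remain valid across the copy, so the caller owns all
+        // resources from this point on.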
session.initialized = true; + return session; + } + + /// Initialize Vosk model and recognizer + fn initVosk(self: *SttSession) !void { + // Convert model path to null-terminated string + const model_path_cstr = try self.allocator.dupeZ(u8, self.options.model_path); + defer self.allocator.free(model_path_cstr); + + // Load Vosk model + self.vosk_model = c.vosk_model_new(model_path_cstr.ptr); + if (self.vosk_model == null) { + return SttError.ModelLoadError; + } + + // Create Vosk recognizer + self.vosk_recognizer = c.vosk_recognizer_new(self.vosk_model, @floatFromInt(self.options.sample_rate)); + if (self.vosk_recognizer == null) { + if (self.vosk_model) |model| { + c.vosk_model_free(model); + self.vosk_model = null; + } + return SttError.ModelLoadError; + } + } + + /// Partial cleanup for initialization failures + fn deinitPartial(self: *SttSession) void { + // Clean up Vosk resources + if (self.vosk_recognizer) |recognizer| { + c.vosk_recognizer_free(recognizer); + self.vosk_recognizer = null; + } + if (self.vosk_model) |model| { + c.vosk_model_free(model); + self.vosk_model = null; + } + + // Clean up audio buffer + self.vosk_audio_buffer.deinit(); + + // Clean up ALSA capture resources + if (self.alsa_capture) |*capture| { + capture.deinit(); + self.alsa_capture = null; + } + + // Free processing buffer + self.allocator.free(self.processing_buffer); + } + + /// Audio capture thread function + fn audioThreadFn(self: *SttSession) void { + var retry_count: u32 = 0; + const max_retries = 5; + const retry_delay_ms = 100; + + // Open ALSA device with retry logic + if (self.alsa_capture) |*capture| { + while (retry_count < max_retries and !self.should_stop.load(.acquire)) { + capture.open() catch |err| { + retry_count += 1; + if (retry_count >= max_retries) { + self.options.event_handler.onError(err, "Failed to open audio device after retries"); + return; + } + std.Thread.sleep(retry_delay_ms * std.time.ns_per_ms); + continue; + }; + break; + } + + if (retry_count >= max_retries) { + return; + } + + // Reset retry count for audio reading + retry_count = 0; + + // Audio capture loop with proper error handling and recovery + while (!self.should_stop.load(.acquire)) { + // Read audio data from ALSA + _ = capture.readAudio() catch |err| { + if (err == SttError.AudioDeviceError) { + retry_count += 1; + if (retry_count >= max_retries) { + self.options.event_handler.onError(err, "Audio capture failed after retries"); + break; + } + // Try to recover from audio errors + std.Thread.sleep(retry_delay_ms * std.time.ns_per_ms); + continue; + } + self.options.event_handler.onError(err, "Audio capture error"); + break; + }; + + // Reset retry count on successful read + retry_count = 0; + + // Transfer audio data to Vosk processing buffer + if (capture.availableSamples() >= 1024) { // Process in chunks of 1024 samples + const chunk_size = @min(1024, self.processing_buffer.len); + const samples_read = capture.getAudioSamples(self.processing_buffer[0..chunk_size]); + if (samples_read > 0) { + // Send audio to Vosk processing buffer with overflow protection + const written = self.vosk_audio_buffer.write(self.processing_buffer[0..samples_read]); + if (written < samples_read) { + // Buffer overflow - clear some old data to make room + self.vosk_audio_buffer.clear(); + _ = self.vosk_audio_buffer.write(self.processing_buffer[0..samples_read]); + } + } + } + + // Small delay to prevent busy waiting + std.Thread.sleep(1 * std.time.ns_per_ms); // 1ms + } + + // Ensure ALSA device is properly closed + capture.close(); 
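+            // A later start_listening spawns a fresh audio thread, which
+            // reopens the device.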
+ } + } + + /// Vosk processing thread function + fn processingThreadFn(self: *SttSession) void { + // Processing buffer for Vosk (4096 samples = ~256ms at 16kHz) + const vosk_chunk_size = 4096; + const min_chunk_size = 1024; // Minimum chunk size for processing + + var vosk_buffer = self.allocator.alloc(i16, vosk_chunk_size) catch { + self.options.event_handler.onError(SttError.OutOfMemory, "Failed to allocate Vosk processing buffer"); + return; + }; + defer self.allocator.free(vosk_buffer); + + var error_count: u32 = 0; + const max_errors = 10; + const error_reset_threshold = 100; // Reset error count after this many successful operations + var success_count: u32 = 0; + + while (!self.should_stop.load(.acquire)) { + // Check if we have enough audio data for processing + const available_samples = self.vosk_audio_buffer.available(); + + if (available_samples >= min_chunk_size) { + // Process in chunks, but don't exceed our buffer size + const chunk_size = @min(available_samples, vosk_chunk_size); + const samples_read = self.vosk_audio_buffer.read(vosk_buffer[0..chunk_size]); + + if (samples_read > 0 and self.vosk_recognizer != null) { + // Process audio with Vosk + self.processVoskAudio(vosk_buffer[0..samples_read]) catch |err| { + error_count += 1; + if (error_count >= max_errors) { + self.options.event_handler.onError(SttError.CallbackError, "Too many Vosk processing errors, stopping"); + break; + } + self.options.event_handler.onError(err, "Vosk processing error"); + + // Add delay after error to prevent rapid error loops + std.Thread.sleep(50 * std.time.ns_per_ms); // 50ms delay + continue; + }; + + // Reset error count after successful operations + success_count += 1; + if (success_count >= error_reset_threshold) { + error_count = 0; + success_count = 0; + } + } + } + + // Adaptive delay based on buffer fill level + const delay_ms: u64 = if (available_samples > vosk_chunk_size * 2) + 1 // Fast processing when buffer is full + else if (available_samples > min_chunk_size) + 5 // Normal processing + else + 10; // Slower when buffer is low + + std.Thread.sleep(delay_ms * std.time.ns_per_ms); + } + + // Final processing of any remaining audio data + const remaining_samples = self.vosk_audio_buffer.available(); + if (remaining_samples > 0 and self.vosk_recognizer != null) { + const final_chunk_size = @min(remaining_samples, vosk_chunk_size); + const samples_read = self.vosk_audio_buffer.read(vosk_buffer[0..final_chunk_size]); + if (samples_read > 0) { + self.processVoskAudio(vosk_buffer[0..samples_read]) catch { + // Ignore errors during shutdown + }; + } + } + } + + /// Process audio chunk with Vosk and handle results + fn processVoskAudio(self: *SttSession, audio_data: []const i16) !void { + if (self.vosk_recognizer == null) { + return SttError.InvalidState; + } + + // Convert i16 samples to bytes for Vosk + const audio_bytes = std.mem.sliceAsBytes(audio_data); + + // Feed audio to Vosk recognizer + const accept_result = c.vosk_recognizer_accept_waveform(self.vosk_recognizer, audio_bytes.ptr, @intCast(audio_bytes.len)); + + if (accept_result == 1) { + // Final result available + const result_cstr = c.vosk_recognizer_result(self.vosk_recognizer); + if (result_cstr != null) { + const result_str = std.mem.span(result_cstr); + + // Parse JSON result to extract text + self.parseVoskResult(result_str) catch |err| { + self.options.event_handler.onError(err, "Failed to parse Vosk result"); + }; + } + } else if (accept_result == 0) { + // Partial result available (optional - for real-time 
feedback) + const partial_result_cstr = c.vosk_recognizer_partial_result(self.vosk_recognizer); + if (partial_result_cstr != null) { + const partial_str = std.mem.span(partial_result_cstr); + + // Parse partial result (could be used for real-time display) + self.parseVoskPartialResult(partial_str) catch { + // Ignore partial result parsing errors + }; + } + } + // accept_result == -1 means error, but we continue processing + } + + /// Parse Vosk JSON result and extract recognized text + fn parseVoskResult(self: *SttSession, json_str: []const u8) !void { + // Simple JSON parsing to extract "text" field + // Vosk returns JSON like: {"text": "hello world"} + + if (json_str.len == 0) return; + + // Find "text" field in JSON + const text_key = "\"text\""; + if (std.mem.indexOf(u8, json_str, text_key)) |text_start| { + const value_start = text_start + text_key.len; + + // Find the colon and opening quote + if (std.mem.indexOf(u8, json_str[value_start..], ":")) |colon_pos| { + const after_colon = value_start + colon_pos + 1; + + // Skip whitespace and find opening quote + var quote_start: ?usize = null; + for (json_str[after_colon..], 0..) |char, i| { + if (char == '"') { + quote_start = after_colon + i + 1; + break; + } + } + + if (quote_start) |start| { + // Find closing quote + if (std.mem.indexOf(u8, json_str[start..], "\"")) |quote_end| { + const text = json_str[start .. start + quote_end]; + + // Only invoke callback if text is not empty + if (text.len > 0 and !std.mem.eql(u8, text, " ")) { + self.options.event_handler.onSpeech(text); + } + } + } + } + } + } + + /// Parse Vosk partial result (for real-time feedback) + fn parseVoskPartialResult(self: *SttSession, json_str: []const u8) !void { + // Similar to parseVoskResult but for partial results + // For now, we don't use partial results, but this could be extended + // to provide real-time transcription feedback + _ = self; + _ = json_str; + } + + /// Attempt to recover from audio device errors + fn recoverAudioDevice(self: *SttSession) SttError!void { + if (self.alsa_capture) |*capture| { + // Close and reopen the audio device + capture.close(); + + // Wait a bit before attempting to reopen + std.Thread.sleep(100 * std.time.ns_per_ms); + + // Try to reopen the device + capture.open() catch { + return SttError.AudioDeviceError; + }; + } + } + + /// Get current session status information + pub fn getStatus(self: *SttSession) struct { + initialized: bool, + listening: bool, + audio_samples_available: usize, + processing_samples_available: usize, + } { + return .{ + .initialized = self.initialized, + .listening = self.listening, + .audio_samples_available = if (self.alsa_capture) |*capture| capture.availableSamples() else 0, + .processing_samples_available = self.vosk_audio_buffer.available(), + }; + } + + /// Validate session options before initialization + fn validateOptions(options: SttOptions) SttError!void { + if (options.model_path.len == 0) { + return SttError.InvalidParameter; + } + if (options.audio_device.len == 0) { + return SttError.InvalidParameter; + } + if (options.sample_rate == 0 or options.sample_rate > 48000) { + return SttError.InvalidParameter; + } + if (options.channels == 0 or options.channels > 8) { + return SttError.InvalidParameter; + } + if (options.buffer_size == 0 or options.buffer_size > 8192) { + return SttError.InvalidParameter; + } + } + + /// Reinitialize the session after an error (recovery mechanism) + pub fn reinitialize(self: *SttSession) SttError!void { + if (self.listening) { + self.stop_listening(); + 
} + + // Clean up existing Vosk resources + if (self.vosk_recognizer) |recognizer| { + c.vosk_recognizer_free(recognizer); + self.vosk_recognizer = null; + } + if (self.vosk_model) |model| { + c.vosk_model_free(model); + self.vosk_model = null; + } + + // Reinitialize Vosk + try self.initVosk(); + + // Reset audio buffers + if (self.alsa_capture) |*capture| { + capture.audio_buffer.clear(); + } + self.vosk_audio_buffer.clear(); + + self.initialized = true; + } + + /// Start listening for speech input + /// + /// This begins audio capture and speech recognition processing. + /// Speech detection events will be delivered via the configured + /// event handler callbacks. + /// + /// Returns: + /// - void on success + /// - SttError on failure + pub fn start_listening(self: *SttSession) SttError!void { + if (!self.initialized) { + return SttError.InvalidState; + } + if (self.listening) { + return SttError.InvalidState; + } + + // Clear any existing audio buffers + if (self.alsa_capture) |*capture| { + capture.audio_buffer.clear(); + } + self.vosk_audio_buffer.clear(); + + // Reset stop flag + self.should_stop.store(false, .release); + + // Start audio capture thread with error handling + self.audio_thread = std.Thread.spawn(.{}, audioThreadFn, .{self}) catch |err| { + self.should_stop.store(true, .release); + return switch (err) { + error.SystemResources, error.ThreadQuotaExceeded => SttError.ThreadingError, + else => SttError.ThreadingError, + }; + }; + + // Start Vosk processing thread with cleanup on failure + self.processing_thread = std.Thread.spawn(.{}, processingThreadFn, .{self}) catch |err| { + // Clean up audio thread if processing thread fails + self.should_stop.store(true, .release); + if (self.audio_thread) |thread| { + thread.join(); + self.audio_thread = null; + } + return switch (err) { + error.SystemResources, error.ThreadQuotaExceeded => SttError.ThreadingError, + else => SttError.ThreadingError, + }; + }; + + // Give threads a moment to start up + std.Thread.sleep(10 * std.time.ns_per_ms); + + self.listening = true; + } + + /// Stop listening for speech input + /// + /// This stops audio capture and speech recognition processing. + /// Any ongoing processing will be completed before returning. + pub fn stop_listening(self: *SttSession) void { + if (!self.listening) { + return; + } + + // Signal threads to stop + self.should_stop.store(true, .release); + + // Give threads a moment to see the stop signal + std.Thread.sleep(5 * std.time.ns_per_ms); + + // Wait for audio thread to finish with timeout + if (self.audio_thread) |thread| { + // Join with reasonable timeout - threads should stop quickly + thread.join(); + self.audio_thread = null; + } + + // Wait for processing thread to finish with timeout + if (self.processing_thread) |thread| { + thread.join(); + self.processing_thread = null; + } + + // Clear any remaining audio data + if (self.alsa_capture) |*capture| { + capture.audio_buffer.clear(); + } + self.vosk_audio_buffer.clear(); + + self.listening = false; + } + + /// Check if the session is currently listening + pub fn is_listening(self: *const SttSession) bool { + return self.listening; + } + + /// Check if the session is initialized + pub fn is_initialized(self: *const SttSession) bool { + return self.initialized; + } + + /// Deinitialize the STT session and free all resources + /// + /// This must be called to properly clean up the session. + /// After calling deinit(), the session should not be used. 
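+    /// It is safe to call deinit whether or not listening was ever started.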
+ pub fn deinit(self: *SttSession) void { + // Ensure we're not listening before cleanup + if (self.listening) { + self.stop_listening(); + } + + // Double-check that threads are properly stopped + if (self.audio_thread != null or self.processing_thread != null) { + self.should_stop.store(true, .release); + + // Give threads one more chance to stop + std.Thread.sleep(50 * std.time.ns_per_ms); + + if (self.audio_thread) |thread| { + thread.join(); + self.audio_thread = null; + } + if (self.processing_thread) |thread| { + thread.join(); + self.processing_thread = null; + } + } + + // Clean up Vosk resources in proper order + if (self.vosk_recognizer) |recognizer| { + c.vosk_recognizer_free(recognizer); + self.vosk_recognizer = null; + } + if (self.vosk_model) |model| { + c.vosk_model_free(model); + self.vosk_model = null; + } + + // Clean up audio buffers + self.vosk_audio_buffer.deinit(); + + // Clean up ALSA capture resources + if (self.alsa_capture) |*capture| { + capture.deinit(); + self.alsa_capture = null; + } + + // Free processing buffer + self.allocator.free(self.processing_buffer); + + // Mark as uninitialized + self.initialized = false; + } +}; + +/// Initialize STT library with the given options +/// +/// This is the main entry point for the STT library. It creates and initializes +/// a new STT session with the provided configuration. +/// +/// Parameters: +/// - allocator: Memory allocator to use for the session +/// - options: Configuration options for the session +/// +/// Returns: +/// - SttSession instance on success +/// - SttError on failure +pub fn init(allocator: std.mem.Allocator, options: SttOptions) SttError!SttSession { + return SttSession.init(allocator, options); } -pub fn add(a: i32, b: i32) i32 { - return a + b; +/// C-compatible API functions for use from other languages +/// These wrap the Zig API with C calling conventions +/// Opaque handle type for C API +pub const SttHandle = opaque {}; + +/// Initialize STT library (C API) +/// +/// Parameters: +/// - model_path: Null-terminated path to Vosk model +/// - audio_device: Null-terminated ALSA device name +/// +/// Returns: +/// - Pointer to SttHandle on success +/// - null on failure +pub export fn stt_init(model_path: [*:0]const u8, audio_device: [*:0]const u8) ?*SttHandle { + // TODO: Implement C API wrapper in subsequent tasks + _ = model_path; + _ = audio_device; + return null; } -test "basic add functionality" { - try std.testing.expect(add(3, 7) == 10); +/// Set speech detection callback (C API) +/// TODO: Implement in subsequent tasks with proper C-compatible callback types +// pub export fn stt_set_speech_callback(handle: *SttHandle, callback: SpeechCallback, user_data: ?*anyopaque) void { +// _ = handle; +// _ = callback; +// _ = user_data; +// } + +/// Set error callback (C API) +/// TODO: Implement in subsequent tasks with proper C-compatible callback types +// pub export fn stt_set_error_callback(handle: *SttHandle, callback: ErrorCallback, user_data: ?*anyopaque) void { +// _ = handle; +// _ = callback; +// _ = user_data; +// } + +/// Start listening (C API) +pub export fn stt_start_listening(handle: *SttHandle) c_int { + _ = handle; + // TODO: Implement in subsequent tasks + return -1; // Error for now +} + +/// Stop listening (C API) +pub export fn stt_stop_listening(handle: *SttHandle) void { + _ = handle; + // TODO: Implement in subsequent tasks +} + +/// Deinitialize STT library (C API) +pub export fn stt_deinit(handle: *SttHandle) void { + _ = handle; + // TODO: Implement in subsequent 
tasks +} + +// Tests +test "SttError enum" { + const testing = std.testing; + + // Test that error types can be created and compared + const err1 = SttError.InitializationFailed; + const err2 = SttError.AudioDeviceError; + + try testing.expect(err1 != err2); + try testing.expect(err1 == SttError.InitializationFailed); +} + +test "SttOptions validation" { + const testing = std.testing; + var gpa = std.heap.GeneralPurposeAllocator(.{}){}; + defer _ = gpa.deinit(); + const allocator = gpa.allocator(); + + // Test valid options + const DummyHandler = struct { + fn onSpeech(ctx: *anyopaque, text: []const u8) void { + _ = ctx; + _ = text; + } + fn onError(ctx: *anyopaque, error_code: SttError, message: []const u8) void { + _ = ctx; + _ = message; + // Can't discard error types with _, so we just don't use it + switch (error_code) { + else => {}, + } + } + }; + + var dummy_ctx: u8 = 0; + const valid_options = SttOptions{ + .model_path = "/path/to/model", + .audio_device = "hw:0,0", + .event_handler = SpeechEventHandler{ + .onSpeechFn = DummyHandler.onSpeech, + .onErrorFn = DummyHandler.onError, + .ctx = &dummy_ctx, + }, + }; + + // Test that initialization fails with invalid model path (expected behavior) + const result = SttSession.init(allocator, valid_options); + try testing.expectError(SttError.ModelLoadError, result); +} + +test "SttSession state management" { + const testing = std.testing; + var gpa = std.heap.GeneralPurposeAllocator(.{}){}; + defer _ = gpa.deinit(); + const allocator = gpa.allocator(); + + const DummyHandler = struct { + fn onSpeech(ctx: *anyopaque, text: []const u8) void { + _ = ctx; + _ = text; + } + fn onError(ctx: *anyopaque, error_code: SttError, message: []const u8) void { + _ = ctx; + _ = message; + // Can't discard error types with _, so we just don't use it + switch (error_code) { + else => {}, + } + } + }; + + var dummy_ctx: u8 = 0; + const options = SttOptions{ + .model_path = "/path/to/model", + .audio_device = "hw:0,0", + .event_handler = SpeechEventHandler{ + .onSpeechFn = DummyHandler.onSpeech, + .onErrorFn = DummyHandler.onError, + .ctx = &dummy_ctx, + }, + }; + + // Test that initialization fails with invalid model path (expected behavior) + const result = SttSession.init(allocator, options); + try testing.expectError(SttError.ModelLoadError, result); +} + +test "SpeechEventHandler interface" { + const testing = std.testing; + + const TestHandler = struct { + speech_called: bool = false, + error_called: bool = false, + last_text: []const u8 = "", + last_error: SttError = SttError.InitializationFailed, + + fn onSpeech(ctx: *anyopaque, text: []const u8) void { + const self: *@This() = @ptrCast(@alignCast(ctx)); + self.speech_called = true; + self.last_text = text; + } + + fn onError(ctx: *anyopaque, error_code: SttError, message: []const u8) void { + const self: *@This() = @ptrCast(@alignCast(ctx)); + self.error_called = true; + self.last_error = error_code; + _ = message; + } + }; + + var handler = TestHandler{}; + const event_handler = SpeechEventHandler{ + .onSpeechFn = TestHandler.onSpeech, + .onErrorFn = TestHandler.onError, + .ctx = &handler, + }; + + // Test speech callback + event_handler.onSpeech("hello world"); + try testing.expect(handler.speech_called); + try testing.expectEqualStrings("hello world", handler.last_text); + + // Test error callback + event_handler.onError(SttError.AudioDeviceError, "test error"); + try testing.expect(handler.error_called); + try testing.expect(handler.last_error == SttError.AudioDeviceError); +} + +test "Vosk 
integration with valid model" { + const testing = std.testing; + var gpa = std.heap.GeneralPurposeAllocator(.{}){}; + defer _ = gpa.deinit(); + const allocator = gpa.allocator(); + + const DummyHandler = struct { + fn onSpeech(ctx: *anyopaque, text: []const u8) void { + _ = ctx; + _ = text; + } + fn onError(ctx: *anyopaque, error_code: SttError, message: []const u8) void { + _ = ctx; + _ = message; + switch (error_code) { + else => {}, + } + } + }; + + var dummy_ctx: u8 = 0; + const options = SttOptions{ + .model_path = "zig-out/bin/vosk-model-small-en-us-0.15", + .audio_device = "hw:0,0", // This will fail in tests, but that's OK + .event_handler = SpeechEventHandler{ + .onSpeechFn = DummyHandler.onSpeech, + .onErrorFn = DummyHandler.onError, + .ctx = &dummy_ctx, + }, + }; + + // Try to initialize with real model path + const result = SttSession.init(allocator, options); + + // If model exists, initialization should succeed (except for audio device) + // If model doesn't exist, we expect ModelLoadError + if (result) |session| { + var session_mut = session; + defer session_mut.deinit(); + + // If we get here, Vosk model loaded successfully + try testing.expect(session_mut.is_initialized()); + try testing.expect(!session_mut.is_listening()); + } else |err| { + // Model not found or other initialization error - this is acceptable in tests + try testing.expect(err == SttError.ModelLoadError or err == SttError.InitializationFailed); + } +} + +test "AudioBuffer basic operations" { + const testing = std.testing; + var gpa = std.heap.GeneralPurposeAllocator(.{}){}; + defer _ = gpa.deinit(); + const allocator = gpa.allocator(); + + var buffer = try AudioBuffer.init(allocator, 10); + defer buffer.deinit(); + + // Test initial state + try testing.expect(buffer.available() == 0); + try testing.expect(buffer.capacity() == 10); + + // Test writing samples + const samples = [_]i16{ 1, 2, 3, 4, 5 }; + const written = buffer.write(&samples); + try testing.expect(written == 5); + try testing.expect(buffer.available() == 5); + try testing.expect(buffer.capacity() == 5); + + // Test reading samples + var read_samples: [3]i16 = undefined; + const read_count = buffer.read(&read_samples); + try testing.expect(read_count == 3); + try testing.expect(read_samples[0] == 1); + try testing.expect(read_samples[1] == 2); + try testing.expect(read_samples[2] == 3); + try testing.expect(buffer.available() == 2); + + // Test buffer wrap-around + const more_samples = [_]i16{ 6, 7, 8, 9, 10, 11, 12, 13 }; + const written2 = buffer.write(&more_samples); + try testing.expect(written2 == 8); // Should write 8 samples (2 remaining + 6 new) + try testing.expect(buffer.available() == 10); // Buffer should be full + + // Test clearing buffer + buffer.clear(); + try testing.expect(buffer.available() == 0); + try testing.expect(buffer.capacity() == 10); +} + +test "AudioConverter stereo to mono conversion" { + const testing = std.testing; + + // Test stereo to mono conversion + const stereo_samples = [_]i16{ 100, 200, 300, 400, 500, 600 }; // 3 stereo frames + var mono_samples: [3]i16 = undefined; + + const frames_converted = AudioConverter.stereoToMono(&stereo_samples, &mono_samples); + try testing.expect(frames_converted == 3); + + // Check averaged values + try testing.expect(mono_samples[0] == 150); // (100 + 200) / 2 + try testing.expect(mono_samples[1] == 350); // (300 + 400) / 2 + try testing.expect(mono_samples[2] == 550); // (500 + 600) / 2 +} + +test "AudioConverter sample rate conversion" { + const testing = std.testing; 
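+    // resample() takes the memcpy fast path when input_rate == output_rate;
+    // the 2:1 case below exercises the linear-interpolation path instead.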
+ + // Test same sample rate (should copy directly) + const input_samples = [_]i16{ 100, 200, 300, 400 }; + var output_samples: [4]i16 = undefined; + + const converted = AudioConverter.resample(&input_samples, &output_samples, 16000, 16000); + try testing.expect(converted == 4); + try testing.expect(output_samples[0] == 100); + try testing.expect(output_samples[1] == 200); + try testing.expect(output_samples[2] == 300); + try testing.expect(output_samples[3] == 400); + + // Test downsampling (2:1 ratio) + var downsampled: [2]i16 = undefined; + const downsampled_count = AudioConverter.resample(&input_samples, &downsampled, 16000, 8000); + try testing.expect(downsampled_count == 2); + try testing.expect(downsampled[0] == 100); // First sample + try testing.expect(downsampled[1] == 300); // Interpolated sample +} + +test "AlsaCapture initialization" { + const testing = std.testing; + var gpa = std.heap.GeneralPurposeAllocator(.{}){}; + defer _ = gpa.deinit(); + const allocator = gpa.allocator(); + + // Test ALSA capture initialization (without actually opening device) + var capture = AlsaCapture.init(allocator, "hw:0,0", 16000, 2, 1024) catch |err| { + // If ALSA initialization fails (e.g., no audio device), that's expected in test environment + if (err == error.OutOfMemory) { + return err; + } + return; // Skip test if ALSA not available + }; + defer capture.deinit(); + + // Test basic properties + try testing.expect(capture.sample_rate == 16000); + try testing.expect(capture.channels == 2); + try testing.expect(capture.buffer_size == 1024); + try testing.expect(capture.period_size == 256); // buffer_size / 4 + try testing.expect(capture.pcm_handle == null); // Not opened yet +} + +test "AudioBuffer thread safety" { + const testing = std.testing; + var gpa = std.heap.GeneralPurposeAllocator(.{}){}; + defer _ = gpa.deinit(); + const allocator = gpa.allocator(); + + var buffer = try AudioBuffer.init(allocator, 1000); + defer buffer.deinit(); + + // Test concurrent access (simplified test) + const samples1 = [_]i16{ 1, 2, 3, 4, 5 }; + const samples2 = [_]i16{ 6, 7, 8, 9, 10 }; + + // Write from multiple "threads" (simulated) + const written1 = buffer.write(&samples1); + const written2 = buffer.write(&samples2); + + try testing.expect(written1 == 5); + try testing.expect(written2 == 5); + try testing.expect(buffer.available() == 10); + + // Read back samples + var read_buffer: [10]i16 = undefined; + const read_count = buffer.read(&read_buffer); + try testing.expect(read_count == 10); + + // Verify order is maintained + try testing.expect(read_buffer[0] == 1); + try testing.expect(read_buffer[4] == 5); + try testing.expect(read_buffer[5] == 6); + try testing.expect(read_buffer[9] == 10); +} + +test "SttSession session management API" { + const testing = std.testing; + var gpa = std.heap.GeneralPurposeAllocator(.{}){}; + defer _ = gpa.deinit(); + const allocator = gpa.allocator(); + + const TestHandler = struct { + speech_count: u32 = 0, + error_count: u32 = 0, + + fn onSpeech(ctx: *anyopaque, text: []const u8) void { + const self: *@This() = @ptrCast(@alignCast(ctx)); + self.speech_count += 1; + _ = text; + } + + fn onError(ctx: *anyopaque, error_code: SttError, message: []const u8) void { + const self: *@This() = @ptrCast(@alignCast(ctx)); + self.error_count += 1; + switch (error_code) { + else => {}, + } + _ = message; + } + }; + + var handler = TestHandler{}; + const options = SttOptions{ + .model_path = "/invalid/path", // Will fail, but that's expected + .audio_device = "hw:0,0", + 
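+        // Note: the device string is not validated here; ALSA is only opened
+        // later, on the audio thread, so an unavailable device does not fail init().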
.event_handler = SpeechEventHandler{ + .onSpeechFn = TestHandler.onSpeech, + .onErrorFn = TestHandler.onError, + .ctx = &handler, + }, + }; + + // Test module-level init function + const result = init(allocator, options); + try testing.expectError(SttError.ModelLoadError, result); + + // Test options validation + const invalid_options = SttOptions{ + .model_path = "", // Invalid empty path + .audio_device = "hw:0,0", + .event_handler = options.event_handler, + }; + + const invalid_result = init(allocator, invalid_options); + try testing.expectError(SttError.InvalidParameter, invalid_result); +} + +test "SttSession status and recovery" { + const testing = std.testing; + var gpa = std.heap.GeneralPurposeAllocator(.{}){}; + defer _ = gpa.deinit(); + const allocator = gpa.allocator(); + + const DummyHandler = struct { + fn onSpeech(ctx: *anyopaque, text: []const u8) void { + _ = ctx; + _ = text; + } + fn onError(ctx: *anyopaque, error_code: SttError, message: []const u8) void { + _ = ctx; + switch (error_code) { + else => {}, + } + _ = message; + } + }; + + var dummy_ctx: u8 = 0; + const options = SttOptions{ + .model_path = "zig-out/bin/vosk-model-small-en-us-0.15", + .audio_device = "hw:0,0", + .event_handler = SpeechEventHandler{ + .onSpeechFn = DummyHandler.onSpeech, + .onErrorFn = DummyHandler.onError, + .ctx = &dummy_ctx, + }, + }; + + // Try to create session (may fail if model not available) + const result = SttSession.init(allocator, options); + if (result) |session| { + var session_mut = session; + defer session_mut.deinit(); + + // Test status methods + try testing.expect(session_mut.is_initialized()); + try testing.expect(!session_mut.is_listening()); + + const status = session_mut.getStatus(); + try testing.expect(status.initialized); + try testing.expect(!status.listening); + try testing.expect(status.audio_samples_available == 0); + try testing.expect(status.processing_samples_available == 0); + + // Test that we can't start listening twice + const start_result = session_mut.start_listening(); + if (start_result) |_| { + // If start succeeded, test double start + const double_start = session_mut.start_listening(); + try testing.expectError(SttError.InvalidState, double_start); + + // Test stop listening + session_mut.stop_listening(); + try testing.expect(!session_mut.is_listening()); + + // Test that we can stop multiple times safely + session_mut.stop_listening(); + try testing.expect(!session_mut.is_listening()); + } else |err| { + // Audio device error expected in test environment + try testing.expect(err == SttError.ThreadingError or err == SttError.AudioDeviceError); + } + } else |err| { + // Model not available in test environment - this is acceptable + try testing.expect(err == SttError.ModelLoadError or err == SttError.InitializationFailed); + } }
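
Review note on the C API stubs: one straightforward way to fill in the TODOs above is to make the opaque SttHandle a heap-allocated SttSession. The sketch below is an illustration built only on types this patch already defines; the function names and the use of std.heap.c_allocator are assumptions, and the functions are plain fns rather than export fns because an exported function cannot take SpeechEventHandler until C-compatible callback types exist (the same open question the commented-out setters note).

// Hypothetical sketch, not part of this patch: the opaque handle wraps a
// heap-allocated SttSession.
fn sessionFromHandle(handle: *SttHandle) *SttSession {
    return @ptrCast(@alignCast(handle));
}

// stt_init body sketch. The caller must keep model_path/audio_device alive
// for the session's lifetime, because SttOptions stores slices, not copies.
fn sttInitSketch(
    model_path: [*:0]const u8,
    audio_device: [*:0]const u8,
    handler: SpeechEventHandler,
) ?*SttHandle {
    const allocator = std.heap.c_allocator;
    const session = allocator.create(SttSession) catch return null;
    session.* = SttSession.init(allocator, .{
        .model_path = std.mem.span(model_path),
        .audio_device = std.mem.span(audio_device),
        .event_handler = handler,
    }) catch {
        allocator.destroy(session);
        return null;
    };
    return @as(*SttHandle, @ptrCast(session));
}

// stt_start_listening body sketch: map the error union onto a C-style int.
fn sttStartListeningSketch(handle: *SttHandle) c_int {
    sessionFromHandle(handle).start_listening() catch return -1;
    return 0;
}

// stt_deinit body sketch: tear down the session, then free the handle itself.
fn sttDeinitSketch(handle: *SttHandle) void {
    const allocator = std.heap.c_allocator;
    const session = sessionFromHandle(handle);
    session.deinit();
    allocator.destroy(session);
}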