From 446c146dedb2849a2b6f1701ec72c2b58839de14 Mon Sep 17 00:00:00 2001 From: Emil Lerch Date: Wed, 10 Sep 2025 12:19:56 -0700 Subject: [PATCH] ai generated - no review yet --- build.zig | 74 ++- src/main.zig | 178 +++--- src/root.zig | 1550 +++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 1688 insertions(+), 114 deletions(-) diff --git a/build.zig b/build.zig index b754d11..fdae3d6 100644 --- a/build.zig +++ b/build.zig @@ -4,7 +4,6 @@ pub fn build(b: *std.Build) void { const target = b.standardTargetOptions(.{}); const optimize = b.standardOptimizeOption(.{}); - // Select Vosk dependency based on target const vosk_dep_name = selectVoskDependency(target.result); const vosk_dep = b.dependency(vosk_dep_name, .{}); const alsa_dep = b.dependency("alsa", .{ @@ -24,24 +23,49 @@ pub fn build(b: *std.Build) void { install_model.step.dependOn(&model_step.step); b.getInstallStep().dependOn(&install_model.step); - const exe = b.addExecutable(.{ + // Create the STT library + const stt_lib = b.addLibrary(.{ .name = "stt", + .linkage = .static, + .root_module = b.createModule(.{ + .root_source_file = b.path("src/root.zig"), + .target = target, + .optimize = optimize, + .link_libc = true, + }), + }); + + // Link with Vosk library + stt_lib.addIncludePath(vosk_dep.path("")); + stt_lib.addLibraryPath(vosk_dep.path("")); + stt_lib.linkSystemLibrary("vosk"); + + const alsa_lib = alsa_dep.artifact("asound"); + stt_lib.linkLibrary(alsa_lib); + stt_lib.addIncludePath(alsa_dep.path("zig-out/include")); + + b.installArtifact(stt_lib); + + // Create the demo executable + const exe = b.addExecutable(.{ + .name = "stt-demo", .root_module = b.createModule(.{ .root_source_file = b.path("src/main.zig"), .target = target, .optimize = optimize, + .link_libc = true, }), }); - exe.linkLibC(); + exe.linkLibrary(stt_lib); + exe.linkLibrary(alsa_lib); + exe.addIncludePath(alsa_dep.path("zig-out/include")); + + // Link with Vosk for the executable exe.addIncludePath(vosk_dep.path("")); exe.addLibraryPath(vosk_dep.path("")); exe.linkSystemLibrary("vosk"); - const alsa_lib = alsa_dep.artifact("asound"); - exe.linkLibrary(alsa_lib); - exe.addIncludePath(alsa_dep.path("zig-out/include")); - b.installArtifact(exe); const run_step = b.step("run", "Run the app"); @@ -52,21 +76,47 @@ pub fn build(b: *std.Build) void { if (b.args) |args| { run_cmd.addArgs(args); } - // Creates a step for unit testing. This only builds the test executable - // but does not run it. 
+    // Creates a step for unit testing the library
+    const lib_unit_tests = b.addTest(.{
+        .root_module = b.createModule(.{
+            .root_source_file = b.path("src/root.zig"),
+            .target = target,
+            .optimize = optimize,
+            .link_libc = true,
+        }),
+    });
+
+    // Link the same dependencies as the library
+    lib_unit_tests.linkLibrary(alsa_lib);
+    lib_unit_tests.addIncludePath(alsa_dep.path("zig-out/include"));
+    lib_unit_tests.addIncludePath(vosk_dep.path(""));
+    lib_unit_tests.addLibraryPath(vosk_dep.path(""));
+    lib_unit_tests.linkSystemLibrary("vosk");
+
+    const run_lib_unit_tests = b.addRunArtifact(lib_unit_tests);
+
+    // Creates a step for unit testing the demo application
     const exe_unit_tests = b.addTest(.{
+        .root_module = b.createModule(.{
+            .root_source_file = b.path("src/main.zig"),
             .target = target,
             .optimize = optimize,
+            .link_libc = true,
         }),
     });
+
+    exe_unit_tests.linkLibrary(stt_lib);
+    exe_unit_tests.linkLibrary(alsa_lib);
+    exe_unit_tests.addIncludePath(alsa_dep.path("zig-out/include"));
+    exe_unit_tests.addIncludePath(vosk_dep.path(""));
+    exe_unit_tests.addLibraryPath(vosk_dep.path(""));
+    exe_unit_tests.linkSystemLibrary("vosk");
+
     const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests);
-    // Similar to creating the run step earlier, this exposes a `test` step to
-    // the `zig build --help` menu, providing a way for the user to request
-    // running the unit tests.
+    // Test step that runs both library and demo tests
     const test_step = b.step("test", "Run unit tests");
+    test_step.dependOn(&run_lib_unit_tests.step);
     test_step.dependOn(&run_exe_unit_tests.step);
 }
diff --git a/src/main.zig b/src/main.zig
index deee6b0..94cafaa 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -1,103 +1,105 @@
-const std = @import("std");
-const c = @cImport({
-    @cInclude("vosk_api.h");
-    @cInclude("alsa/asoundlib.h");
-});
+//! STT Library Demo Application
+//!
+//! This demonstrates how to use the STT library for speech recognition:
+//! it wires a simple speech/error event handler into an stt.SttSession.
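+//! Run the demo with `zig build run` (the "run" step defined in build.zig).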
-const VOSK_SAMPLE_RATE = 16000; -const BUFFER_SIZE = 256; +const std = @import("std"); +const stt = @import("root.zig"); + +/// Demo implementation of speech event handler +const DemoHandler = struct { + /// Handle detected speech + fn onSpeech(ctx: *anyopaque, text: []const u8) void { + const self: *DemoHandler = @ptrCast(@alignCast(ctx)); + _ = self; // Handler context not used in this simple demo + + std.debug.print("Detected: {s}\n", .{text}); + } + + /// Handle errors + fn onError(ctx: *anyopaque, error_code: stt.SttError, message: []const u8) void { + const self: *DemoHandler = @ptrCast(@alignCast(ctx)); + _ = self; // Handler context not used in this simple demo + + std.debug.print("Error {}: {s}\n", .{ error_code, message }); + } +}; pub fn main() !void { var gpa = std.heap.GeneralPurposeAllocator(.{}){}; defer _ = gpa.deinit(); + const allocator = gpa.allocator(); - // Set ALSA config path to our local alsa.conf - _ = c.setenv("ALSA_CONFIG_PATH", "alsa.conf", 1); + std.debug.print("STT Library Demo\n", .{}); + std.debug.print("================\n", .{}); - // Initialize Vosk - c.vosk_set_log_level(-1); - const model = c.vosk_model_new("zig-out/bin/vosk-model-small-en-us-0.15"); - if (model == null) { - std.debug.print("Failed to load model\n", .{}); + // Create demo handler + var demo_handler = DemoHandler{}; + const speech_handler = stt.SpeechEventHandler{ + .onSpeechFn = DemoHandler.onSpeech, + .onErrorFn = DemoHandler.onError, + .ctx = &demo_handler, + }; + + // Initialize STT session with configuration + const options = stt.SttOptions{ + .model_path = "zig-out/bin/vosk-model-small-en-us-0.15", + .audio_device = "hw:3,0", + .event_handler = speech_handler, + .sample_rate = 16000, + .channels = 2, + .buffer_size = 256, + }; + + var session = stt.SttSession.init(allocator, options) catch |err| { + std.debug.print("Failed to initialize STT library: {}\n", .{err}); return; - } - defer c.vosk_model_free(model); + }; + defer session.deinit(); - const rec = c.vosk_recognizer_new(model, VOSK_SAMPLE_RATE); - if (rec == null) { - std.debug.print("Failed to create recognizer\n", .{}); + std.debug.print("STT library initialized successfully\n", .{}); + std.debug.print("Model path: {s}\n", .{options.model_path}); + std.debug.print("Audio device: {s}\n", .{options.audio_device}); + std.debug.print("Sample rate: {} Hz\n", .{options.sample_rate}); + std.debug.print("Channels: {}\n", .{options.channels}); + std.debug.print("Buffer size: {} frames\n", .{options.buffer_size}); + std.debug.print("\n", .{}); + + // Start listening for speech + session.start_listening() catch |err| { + std.debug.print("Failed to start listening: {}\n", .{err}); return; - } - defer c.vosk_recognizer_free(rec); + }; - // Try to open hardware capture device directly - var handle: ?*c.snd_pcm_t = null; - var err = c.snd_pcm_open(&handle, "hw:3,0", c.SND_PCM_STREAM_CAPTURE, c.SND_PCM_NONBLOCK); - if (err < 0) { - std.debug.print("Cannot open audio device: {s}\n", .{c.snd_strerror(err)}); - std.debug.print("Make sure no other applications are using the microphone\n", .{}); - return; - } - defer _ = c.snd_pcm_close(handle); + std.debug.print("Listening for speech... 
(Press Enter to exit)\n", .{}); - // Set to blocking mode - err = c.snd_pcm_nonblock(handle, 0); - if (err < 0) { - std.debug.print("Cannot set blocking mode: {s}\n", .{c.snd_strerror(err)}); - return; - } + // Wait for user input to exit (simulating Ctrl+C behavior) + // In subsequent tasks, this will be replaced with actual audio processing + const stdin = std.fs.File.stdin(); + var buffer: [1]u8 = undefined; + _ = stdin.read(&buffer) catch {}; - // Configure audio parameters - err = c.snd_pcm_set_params(handle, c.SND_PCM_FORMAT_S16_LE, c.SND_PCM_ACCESS_RW_INTERLEAVED, 2, VOSK_SAMPLE_RATE, 1, 100000); - if (err < 0) { - std.debug.print("Cannot configure audio: {s}\n", .{c.snd_strerror(err)}); - return; - } + std.debug.print("\nStopping speech recognition...\n", .{}); + session.stop_listening(); - // Prepare the PCM device - err = c.snd_pcm_prepare(handle); - if (err < 0) { - std.debug.print("Cannot prepare audio: {s}\n", .{c.snd_strerror(err)}); - return; - } - - // Start the PCM stream - err = c.snd_pcm_start(handle); - if (err < 0) { - std.debug.print("Cannot start audio: {s}\n", .{c.snd_strerror(err)}); - return; - } - - std.debug.print("Audio configured successfully\n", .{}); - std.debug.print("Listening... (Ctrl+C to exit)\n", .{}); - - var buffer: [BUFFER_SIZE * 2]i16 = undefined; // stereo - var accumulator: [VOSK_SAMPLE_RATE]i16 = undefined; // 1 second buffer - var acc_pos: usize = 0; - - while (true) { - const frames_read = c.snd_pcm_readi(handle, &buffer, BUFFER_SIZE); - if (frames_read < 0) { - _ = c.snd_pcm_recover(handle, @intCast(frames_read), 1); - continue; - } - - // Convert stereo to mono and accumulate - for (0..@intCast(frames_read)) |i| { - if (acc_pos < accumulator.len) { - accumulator[acc_pos] = buffer[i * 2]; // left channel - acc_pos += 1; - } - } - - // Process when we have enough data (0.1 seconds) - if (acc_pos >= VOSK_SAMPLE_RATE / 10) { - const result = c.vosk_recognizer_accept_waveform(rec, @ptrCast(&accumulator), @intCast(acc_pos * 2)); - if (result != 0) { - const text = c.vosk_recognizer_result(rec); - std.debug.print("{s}\n", .{text}); - } - acc_pos = 0; - } - } + std.debug.print("Demo completed successfully\n", .{}); +} + +// Test the demo functionality +test "demo handler functionality" { + const testing = std.testing; + + var demo_handler = DemoHandler{}; + const speech_handler = stt.SpeechEventHandler{ + .onSpeechFn = DemoHandler.onSpeech, + .onErrorFn = DemoHandler.onError, + .ctx = &demo_handler, + }; + + // Test that callbacks can be invoked without crashing + speech_handler.onSpeech("test speech"); + speech_handler.onError(stt.SttError.AudioDeviceError, "test error"); + + // If we get here without crashing, the test passes + try testing.expect(true); } diff --git a/src/root.zig b/src/root.zig index 94c7cd0..882b636 100644 --- a/src/root.zig +++ b/src/root.zig @@ -1,23 +1,1545 @@ -//! By convention, root.zig is the root source file when making a library. +//! STT (Speech-to-Text) Library +//! +//! This library provides callback-based speech recognition functionality +//! using Vosk and ALSA for audio capture. + const std = @import("std"); +const c = @cImport({ + @cInclude("alsa/asoundlib.h"); + @cInclude("vosk_api.h"); +}); -pub fn bufferedPrint() !void { - // Stdout is for the actual output of your application, for example if you - // are implementing gzip, then only the compressed bytes should be sent to - // stdout, not any debugging messages. 
-    var stdout_buffer: [1024]u8 = undefined;
-    var stdout_writer = std.fs.File.stdout().writer(&stdout_buffer);
-    const stdout = &stdout_writer.interface;
+/// Core error types for the STT library
+pub const SttError = error{
+    /// Failed to initialize the library (model loading, audio setup, etc.)
+    InitializationFailed,
+    /// Audio device access or configuration error
+    AudioDeviceError,
+    /// Failed to load the speech recognition model
+    ModelLoadError,
+    /// Error occurred during callback execution
+    CallbackError,
+    /// Memory allocation failed
+    OutOfMemory,
+    /// Invalid parameters provided
+    InvalidParameter,
+    /// Library is not in the correct state for the operation
+    InvalidState,
+    /// Threading or synchronization error
+    ThreadingError,
+};
-    try stdout.print("Run `zig build test` to run the tests.\n", .{});
+/// Callback function type for speech detection events
+///
+/// Parameters:
+/// - text: Null-terminated string containing the detected speech
+/// - user_data: Optional user-provided context data
+pub const SpeechCallback = *const fn (text: [*:0]const u8, user_data: ?*anyopaque) void;
-    try stdout.flush(); // Don't forget to flush!
+/// Callback function type for error events
+///
+/// Parameters:
+/// - error_code: The specific error that occurred
+/// - message: Null-terminated string with error details
+/// - user_data: Optional user-provided context data
+pub const ErrorCallback = *const fn (error_code: SttError, message: [*:0]const u8, user_data: ?*anyopaque) void;
+
+/// Speech event handler interface pattern
+///
+/// This provides a structured way to handle speech recognition events
+/// with both speech detection and error handling callbacks.
+pub const SpeechEventHandler = struct {
+    /// Function to call when speech is detected
+    onSpeechFn: *const fn (ctx: *anyopaque, text: []const u8) void,
+    /// Function to call when an error occurs
+    onErrorFn: *const fn (ctx: *anyopaque, error_code: SttError, message: []const u8) void,
+    /// Context pointer passed to callback functions
+    ctx: *anyopaque,
+
+    /// Invoke the speech detection callback
+    pub fn onSpeech(self: SpeechEventHandler, text: []const u8) void {
+        self.onSpeechFn(self.ctx, text);
+    }
+
+    /// Invoke the error callback
+    pub fn onError(self: SpeechEventHandler, error_code: SttError, message: []const u8) void {
+        self.onErrorFn(self.ctx, error_code, message);
+    }
+};
+
+/// Mutex-guarded ring buffer that carries audio samples between the capture
+/// and recognition threads
+pub const AudioBuffer = struct {
+    const Self = @This();
+
+    /// Internal ring buffer for audio data
+    buffer: []i16,
+    /// Read position in the buffer
+    read_pos: usize = 0,
+    /// Write position in the buffer
+    write_pos: usize = 0,
+    /// Number of samples currently in buffer
+    count: usize = 0,
+    /// Mutex for thread-safe access
+    mutex: std.Thread.Mutex = .{},
+    /// Allocator used for buffer allocation
+    allocator: std.mem.Allocator,
+
+    /// Initialize audio buffer with specified capacity
+    pub fn init(allocator: std.mem.Allocator, buffer_capacity: usize) !Self {
+        const buffer = try allocator.alloc(i16, buffer_capacity);
+        return Self{
+            .buffer = buffer,
+            .allocator = allocator,
+        };
+    }
+
+    /// Deinitialize and free buffer memory
+    pub fn deinit(self: *Self) void {
+        self.allocator.free(self.buffer);
+    }
+
+    /// Write audio samples to the buffer (thread-safe)
+    pub fn write(self: *Self, samples: []const i16) usize {
+        self.mutex.lock();
+        defer self.mutex.unlock();
+
+        const available_space = self.buffer.len - self.count;
+        const to_write = @min(samples.len,
available_space); + + for (0..to_write) |i| { + self.buffer[self.write_pos] = samples[i]; + self.write_pos = (self.write_pos + 1) % self.buffer.len; + } + + self.count += to_write; + return to_write; + } + + /// Read audio samples from the buffer (thread-safe) + pub fn read(self: *Self, samples: []i16) usize { + self.mutex.lock(); + defer self.mutex.unlock(); + + const to_read = @min(samples.len, self.count); + + for (0..to_read) |i| { + samples[i] = self.buffer[self.read_pos]; + self.read_pos = (self.read_pos + 1) % self.buffer.len; + } + + self.count -= to_read; + return to_read; + } + + /// Get number of samples available for reading + pub fn available(self: *Self) usize { + self.mutex.lock(); + defer self.mutex.unlock(); + return self.count; + } + + /// Get remaining capacity for writing + pub fn capacity(self: *Self) usize { + self.mutex.lock(); + defer self.mutex.unlock(); + return self.buffer.len - self.count; + } + + /// Clear all data from buffer + pub fn clear(self: *Self) void { + self.mutex.lock(); + defer self.mutex.unlock(); + self.read_pos = 0; + self.write_pos = 0; + self.count = 0; + } +}; + +/// Audio format conversion utilities +pub const AudioConverter = struct { + /// Convert stereo samples to mono by averaging channels + pub fn stereoToMono(stereo_samples: []const i16, mono_samples: []i16) usize { + const frames = @min(stereo_samples.len / 2, mono_samples.len); + + for (0..frames) |i| { + const left = stereo_samples[i * 2]; + const right = stereo_samples[i * 2 + 1]; + // Average the channels and clamp to prevent overflow + const avg: i32 = @divTrunc(@as(i32, left) + @as(i32, right), 2); + mono_samples[i] = @intCast(@max(@min(avg, std.math.maxInt(i16)), std.math.minInt(i16))); + } + + return frames; + } + + /// Simple sample rate conversion (basic linear interpolation) + /// Note: This is a basic implementation. 
For production use, consider more sophisticated algorithms + pub fn resample(input_samples: []const i16, output_samples: []i16, input_rate: u32, output_rate: u32) usize { + if (input_rate == output_rate) { + const copy_len = @min(input_samples.len, output_samples.len); + @memcpy(output_samples[0..copy_len], input_samples[0..copy_len]); + return copy_len; + } + + const ratio = @as(f64, @floatFromInt(input_rate)) / @as(f64, @floatFromInt(output_rate)); + const output_len = @min(output_samples.len, @as(usize, @intFromFloat(@as(f64, @floatFromInt(input_samples.len)) / ratio))); + + for (0..output_len) |i| { + const src_pos = @as(f64, @floatFromInt(i)) * ratio; + const src_idx: usize = @intFromFloat(src_pos); + + if (src_idx >= input_samples.len) break; + + if (src_idx + 1 < input_samples.len) { + // Linear interpolation + const frac = src_pos - @as(f64, @floatFromInt(src_idx)); + const sample1: f64 = @floatFromInt(input_samples[src_idx]); + const sample2: f64 = @floatFromInt(input_samples[src_idx + 1]); + const interpolated = sample1 + (sample2 - sample1) * frac; + output_samples[i] = @intFromFloat(@max(@min(interpolated, std.math.maxInt(i16)), std.math.minInt(i16))); + } else { + output_samples[i] = input_samples[src_idx]; + } + } + + return output_len; + } +}; + +/// ALSA audio capture configuration and state +pub const AlsaCapture = struct { + const Self = @This(); + + /// ALSA PCM handle + pcm_handle: ?*c.snd_pcm_t = null, + /// Device name + device_name: []const u8, + /// Sample rate + sample_rate: u32, + /// Number of channels + channels: u32, + /// Buffer size in frames + buffer_size: u32, + /// Period size in frames + period_size: u32, + /// Audio buffer for captured data + audio_buffer: AudioBuffer, + /// Temporary buffer for ALSA reads + temp_buffer: []i16, + /// Allocator for memory management + allocator: std.mem.Allocator, + + /// Initialize ALSA capture with specified parameters + pub fn init(allocator: std.mem.Allocator, device_name: []const u8, sample_rate: u32, channels: u32, buffer_size: u32) !Self { + // Calculate period size (typically 1/4 of buffer size) + const period_size = buffer_size / 4; + + // Create audio buffer (make it larger than ALSA buffer to prevent overruns) + const audio_buffer = try AudioBuffer.init(allocator, buffer_size * 4); + + // Allocate temporary buffer for ALSA reads + const temp_buffer = try allocator.alloc(i16, period_size * channels); + + return Self{ + .device_name = device_name, + .sample_rate = sample_rate, + .channels = channels, + .buffer_size = buffer_size, + .period_size = period_size, + .audio_buffer = audio_buffer, + .temp_buffer = temp_buffer, + .allocator = allocator, + }; + } + + /// Deinitialize ALSA capture and free resources + pub fn deinit(self: *Self) void { + self.close(); + self.audio_buffer.deinit(); + self.allocator.free(self.temp_buffer); + } + + /// Open ALSA device and configure parameters + pub fn open(self: *Self) !void { + // Convert device name to null-terminated string + const device_cstr = try self.allocator.dupeZ(u8, self.device_name); + defer self.allocator.free(device_cstr); + + // Open PCM device + var err = c.snd_pcm_open(&self.pcm_handle, device_cstr.ptr, c.SND_PCM_STREAM_CAPTURE, 0); + if (err < 0) { + return SttError.AudioDeviceError; + } + + // Allocate hardware parameters structure + var hw_params: ?*c.snd_pcm_hw_params_t = null; + err = c.snd_pcm_hw_params_malloc(@ptrCast(&hw_params)); + if (err < 0) { + self.close(); + return SttError.AudioDeviceError; + } + defer c.snd_pcm_hw_params_free(hw_params); + + 
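+        // The calls below follow the usual ALSA capture setup sequence: fill
+        // hw_params with the device's full configuration space, narrow it down
+        // (access, format, channels, rate, buffer/period size), apply it, and
+        // prepare the PCM so capture can begin on the first read.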
// Initialize hardware parameters + err = c.snd_pcm_hw_params_any(self.pcm_handle, hw_params); + if (err < 0) { + self.close(); + return SttError.AudioDeviceError; + } + + // Set access type to interleaved + err = c.snd_pcm_hw_params_set_access(self.pcm_handle, hw_params, c.SND_PCM_ACCESS_RW_INTERLEAVED); + if (err < 0) { + self.close(); + return SttError.AudioDeviceError; + } + + // Set sample format to 16-bit signed little endian + err = c.snd_pcm_hw_params_set_format(self.pcm_handle, hw_params, c.SND_PCM_FORMAT_S16_LE); + if (err < 0) { + self.close(); + return SttError.AudioDeviceError; + } + + // Set number of channels + err = c.snd_pcm_hw_params_set_channels(self.pcm_handle, hw_params, self.channels); + if (err < 0) { + self.close(); + return SttError.AudioDeviceError; + } + + // Set sample rate + var actual_rate = self.sample_rate; + err = c.snd_pcm_hw_params_set_rate_near(self.pcm_handle, hw_params, &actual_rate, null); + if (err < 0) { + self.close(); + return SttError.AudioDeviceError; + } + + // Set buffer size + var actual_buffer_size: c.snd_pcm_uframes_t = self.buffer_size; + err = c.snd_pcm_hw_params_set_buffer_size_near(self.pcm_handle, hw_params, &actual_buffer_size); + if (err < 0) { + self.close(); + return SttError.AudioDeviceError; + } + + // Set period size + var actual_period_size: c.snd_pcm_uframes_t = self.period_size; + err = c.snd_pcm_hw_params_set_period_size_near(self.pcm_handle, hw_params, &actual_period_size, null); + if (err < 0) { + self.close(); + return SttError.AudioDeviceError; + } + + // Apply hardware parameters + err = c.snd_pcm_hw_params(self.pcm_handle, hw_params); + if (err < 0) { + self.close(); + return SttError.AudioDeviceError; + } + + // Prepare the PCM for use + err = c.snd_pcm_prepare(self.pcm_handle); + if (err < 0) { + self.close(); + return SttError.AudioDeviceError; + } + } + + /// Close ALSA device + pub fn close(self: *Self) void { + if (self.pcm_handle) |handle| { + _ = c.snd_pcm_close(handle); + self.pcm_handle = null; + } + } + + /// Read audio data from ALSA device and process it + pub fn readAudio(self: *Self) !usize { + if (self.pcm_handle == null) { + return SttError.AudioDeviceError; + } + + // Read audio data from ALSA + const frames_read = c.snd_pcm_readi(self.pcm_handle, self.temp_buffer.ptr, self.period_size); + + if (frames_read < 0) { + // Handle underrun or other errors + if (frames_read == -c.EPIPE) { + // Underrun occurred, try to recover + const err = c.snd_pcm_prepare(self.pcm_handle); + if (err < 0) { + return SttError.AudioDeviceError; + } + return 0; // No data read this time + } else { + return SttError.AudioDeviceError; + } + } + + const samples_read = @as(usize, @intCast(frames_read)) * self.channels; + + // Process audio based on channel configuration + if (self.channels == 1) { + // Mono input - write directly to buffer + _ = self.audio_buffer.write(self.temp_buffer[0..samples_read]); + } else if (self.channels == 2) { + // Stereo input - convert to mono + const mono_buffer = try self.allocator.alloc(i16, @as(usize, @intCast(frames_read))); + defer self.allocator.free(mono_buffer); + + const mono_samples = AudioConverter.stereoToMono(self.temp_buffer[0..samples_read], mono_buffer); + _ = self.audio_buffer.write(mono_buffer[0..mono_samples]); + } else { + // Multi-channel input - take first channel only + const mono_buffer = try self.allocator.alloc(i16, @as(usize, @intCast(frames_read))); + defer self.allocator.free(mono_buffer); + + for (0..@as(usize, @intCast(frames_read))) |i| { + mono_buffer[i] = 
self.temp_buffer[i * self.channels]; + } + _ = self.audio_buffer.write(mono_buffer); + } + + return @intCast(frames_read); + } + + /// Get processed audio samples (mono, at configured sample rate) + pub fn getAudioSamples(self: *Self, output_buffer: []i16) usize { + return self.audio_buffer.read(output_buffer); + } + + /// Get number of samples available for reading + pub fn availableSamples(self: *Self) usize { + return self.audio_buffer.available(); + } +}; + +/// Configuration options for STT session initialization +pub const SttOptions = struct { + /// Path to the Vosk model directory + model_path: []const u8, + /// ALSA audio device name (e.g., "hw:3,0") + audio_device: []const u8, + /// Speech event handler for callbacks + event_handler: SpeechEventHandler, + /// Sample rate for audio processing (default: 16000) + sample_rate: u32 = 16000, + /// Number of audio channels (default: 2 for stereo) + channels: u32 = 2, + /// Audio buffer size in frames (default: 256) + buffer_size: u32 = 256, +}; + +/// Main STT session handle +/// +/// This represents an active speech-to-text session with configured +/// audio input and speech recognition model. +pub const SttSession = struct { + const Self = @This(); + + /// Memory allocator + allocator: std.mem.Allocator, + /// Configuration options + options: SttOptions, + /// Initialization state + initialized: bool = false, + /// Listening state + listening: bool = false, + /// ALSA audio capture + alsa_capture: ?AlsaCapture = null, + /// Audio capture thread + audio_thread: ?std.Thread = null, + /// Processing thread for Vosk + processing_thread: ?std.Thread = null, + /// Thread synchronization + should_stop: std.atomic.Value(bool) = std.atomic.Value(bool).init(false), + /// Processing buffer for audio samples + processing_buffer: []i16, + /// Vosk model + vosk_model: ?*c.VoskModel = null, + /// Vosk recognizer + vosk_recognizer: ?*c.VoskRecognizer = null, + /// Audio buffer for Vosk processing + vosk_audio_buffer: AudioBuffer, + + /// Initialize a new STT session with the given options + /// + /// Parameters: + /// - allocator: Memory allocator to use for the session + /// - options: Configuration options for the session + /// + /// Returns: + /// - SttSession instance on success + /// - SttError on failure + pub fn init(allocator: std.mem.Allocator, options: SttOptions) SttError!SttSession { + // Validate options first + try validateOptions(options); + + // Allocate processing buffer for audio samples (1 second worth of samples) + const processing_buffer = allocator.alloc(i16, options.sample_rate) catch { + return SttError.OutOfMemory; + }; + + // Initialize ALSA capture + const alsa_capture = AlsaCapture.init( + allocator, + options.audio_device, + options.sample_rate, + options.channels, + options.buffer_size, + ) catch { + allocator.free(processing_buffer); + return SttError.InitializationFailed; + }; + + // Initialize Vosk audio buffer (larger buffer for processing) + const vosk_audio_buffer = AudioBuffer.init(allocator, options.sample_rate * 2) catch { + allocator.free(processing_buffer); + var alsa_capture_mut = alsa_capture; + alsa_capture_mut.deinit(); + return SttError.OutOfMemory; + }; + + var session = SttSession{ + .allocator = allocator, + .options = options, + .alsa_capture = alsa_capture, + .processing_buffer = processing_buffer, + .vosk_audio_buffer = vosk_audio_buffer, + }; + + // Initialize Vosk model and recognizer + session.initVosk() catch { + session.deinitPartial(); + return SttError.ModelLoadError; + }; + + 
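+        // SttSession is returned by value; the heap buffers and Vosk pointers
+        // created above remain valid across the copy, so the caller owns all
+        // resources from this point on.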
session.initialized = true; + return session; + } + + /// Initialize Vosk model and recognizer + fn initVosk(self: *SttSession) !void { + // Convert model path to null-terminated string + const model_path_cstr = try self.allocator.dupeZ(u8, self.options.model_path); + defer self.allocator.free(model_path_cstr); + + // Load Vosk model + self.vosk_model = c.vosk_model_new(model_path_cstr.ptr); + if (self.vosk_model == null) { + return SttError.ModelLoadError; + } + + // Create Vosk recognizer + self.vosk_recognizer = c.vosk_recognizer_new(self.vosk_model, @floatFromInt(self.options.sample_rate)); + if (self.vosk_recognizer == null) { + if (self.vosk_model) |model| { + c.vosk_model_free(model); + self.vosk_model = null; + } + return SttError.ModelLoadError; + } + } + + /// Partial cleanup for initialization failures + fn deinitPartial(self: *SttSession) void { + // Clean up Vosk resources + if (self.vosk_recognizer) |recognizer| { + c.vosk_recognizer_free(recognizer); + self.vosk_recognizer = null; + } + if (self.vosk_model) |model| { + c.vosk_model_free(model); + self.vosk_model = null; + } + + // Clean up audio buffer + self.vosk_audio_buffer.deinit(); + + // Clean up ALSA capture resources + if (self.alsa_capture) |*capture| { + capture.deinit(); + self.alsa_capture = null; + } + + // Free processing buffer + self.allocator.free(self.processing_buffer); + } + + /// Audio capture thread function + fn audioThreadFn(self: *SttSession) void { + var retry_count: u32 = 0; + const max_retries = 5; + const retry_delay_ms = 100; + + // Open ALSA device with retry logic + if (self.alsa_capture) |*capture| { + while (retry_count < max_retries and !self.should_stop.load(.acquire)) { + capture.open() catch |err| { + retry_count += 1; + if (retry_count >= max_retries) { + self.options.event_handler.onError(err, "Failed to open audio device after retries"); + return; + } + std.Thread.sleep(retry_delay_ms * std.time.ns_per_ms); + continue; + }; + break; + } + + if (retry_count >= max_retries) { + return; + } + + // Reset retry count for audio reading + retry_count = 0; + + // Audio capture loop with proper error handling and recovery + while (!self.should_stop.load(.acquire)) { + // Read audio data from ALSA + _ = capture.readAudio() catch |err| { + if (err == SttError.AudioDeviceError) { + retry_count += 1; + if (retry_count >= max_retries) { + self.options.event_handler.onError(err, "Audio capture failed after retries"); + break; + } + // Try to recover from audio errors + std.Thread.sleep(retry_delay_ms * std.time.ns_per_ms); + continue; + } + self.options.event_handler.onError(err, "Audio capture error"); + break; + }; + + // Reset retry count on successful read + retry_count = 0; + + // Transfer audio data to Vosk processing buffer + if (capture.availableSamples() >= 1024) { // Process in chunks of 1024 samples + const chunk_size = @min(1024, self.processing_buffer.len); + const samples_read = capture.getAudioSamples(self.processing_buffer[0..chunk_size]); + if (samples_read > 0) { + // Send audio to Vosk processing buffer with overflow protection + const written = self.vosk_audio_buffer.write(self.processing_buffer[0..samples_read]); + if (written < samples_read) { + // Buffer overflow - clear some old data to make room + self.vosk_audio_buffer.clear(); + _ = self.vosk_audio_buffer.write(self.processing_buffer[0..samples_read]); + } + } + } + + // Small delay to prevent busy waiting + std.Thread.sleep(1 * std.time.ns_per_ms); // 1ms + } + + // Ensure ALSA device is properly closed + capture.close(); 
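+            // A later start_listening spawns a fresh audio thread, which
+            // reopens the device.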
+ } + } + + /// Vosk processing thread function + fn processingThreadFn(self: *SttSession) void { + // Processing buffer for Vosk (4096 samples = ~256ms at 16kHz) + const vosk_chunk_size = 4096; + const min_chunk_size = 1024; // Minimum chunk size for processing + + var vosk_buffer = self.allocator.alloc(i16, vosk_chunk_size) catch { + self.options.event_handler.onError(SttError.OutOfMemory, "Failed to allocate Vosk processing buffer"); + return; + }; + defer self.allocator.free(vosk_buffer); + + var error_count: u32 = 0; + const max_errors = 10; + const error_reset_threshold = 100; // Reset error count after this many successful operations + var success_count: u32 = 0; + + while (!self.should_stop.load(.acquire)) { + // Check if we have enough audio data for processing + const available_samples = self.vosk_audio_buffer.available(); + + if (available_samples >= min_chunk_size) { + // Process in chunks, but don't exceed our buffer size + const chunk_size = @min(available_samples, vosk_chunk_size); + const samples_read = self.vosk_audio_buffer.read(vosk_buffer[0..chunk_size]); + + if (samples_read > 0 and self.vosk_recognizer != null) { + // Process audio with Vosk + self.processVoskAudio(vosk_buffer[0..samples_read]) catch |err| { + error_count += 1; + if (error_count >= max_errors) { + self.options.event_handler.onError(SttError.CallbackError, "Too many Vosk processing errors, stopping"); + break; + } + self.options.event_handler.onError(err, "Vosk processing error"); + + // Add delay after error to prevent rapid error loops + std.Thread.sleep(50 * std.time.ns_per_ms); // 50ms delay + continue; + }; + + // Reset error count after successful operations + success_count += 1; + if (success_count >= error_reset_threshold) { + error_count = 0; + success_count = 0; + } + } + } + + // Adaptive delay based on buffer fill level + const delay_ms: u64 = if (available_samples > vosk_chunk_size * 2) + 1 // Fast processing when buffer is full + else if (available_samples > min_chunk_size) + 5 // Normal processing + else + 10; // Slower when buffer is low + + std.Thread.sleep(delay_ms * std.time.ns_per_ms); + } + + // Final processing of any remaining audio data + const remaining_samples = self.vosk_audio_buffer.available(); + if (remaining_samples > 0 and self.vosk_recognizer != null) { + const final_chunk_size = @min(remaining_samples, vosk_chunk_size); + const samples_read = self.vosk_audio_buffer.read(vosk_buffer[0..final_chunk_size]); + if (samples_read > 0) { + self.processVoskAudio(vosk_buffer[0..samples_read]) catch { + // Ignore errors during shutdown + }; + } + } + } + + /// Process audio chunk with Vosk and handle results + fn processVoskAudio(self: *SttSession, audio_data: []const i16) !void { + if (self.vosk_recognizer == null) { + return SttError.InvalidState; + } + + // Convert i16 samples to bytes for Vosk + const audio_bytes = std.mem.sliceAsBytes(audio_data); + + // Feed audio to Vosk recognizer + const accept_result = c.vosk_recognizer_accept_waveform(self.vosk_recognizer, audio_bytes.ptr, @intCast(audio_bytes.len)); + + if (accept_result == 1) { + // Final result available + const result_cstr = c.vosk_recognizer_result(self.vosk_recognizer); + if (result_cstr != null) { + const result_str = std.mem.span(result_cstr); + + // Parse JSON result to extract text + self.parseVoskResult(result_str) catch |err| { + self.options.event_handler.onError(err, "Failed to parse Vosk result"); + }; + } + } else if (accept_result == 0) { + // Partial result available (optional - for real-time 
feedback) + const partial_result_cstr = c.vosk_recognizer_partial_result(self.vosk_recognizer); + if (partial_result_cstr != null) { + const partial_str = std.mem.span(partial_result_cstr); + + // Parse partial result (could be used for real-time display) + self.parseVoskPartialResult(partial_str) catch { + // Ignore partial result parsing errors + }; + } + } + // accept_result == -1 means error, but we continue processing + } + + /// Parse Vosk JSON result and extract recognized text + fn parseVoskResult(self: *SttSession, json_str: []const u8) !void { + // Simple JSON parsing to extract "text" field + // Vosk returns JSON like: {"text": "hello world"} + + if (json_str.len == 0) return; + + // Find "text" field in JSON + const text_key = "\"text\""; + if (std.mem.indexOf(u8, json_str, text_key)) |text_start| { + const value_start = text_start + text_key.len; + + // Find the colon and opening quote + if (std.mem.indexOf(u8, json_str[value_start..], ":")) |colon_pos| { + const after_colon = value_start + colon_pos + 1; + + // Skip whitespace and find opening quote + var quote_start: ?usize = null; + for (json_str[after_colon..], 0..) |char, i| { + if (char == '"') { + quote_start = after_colon + i + 1; + break; + } + } + + if (quote_start) |start| { + // Find closing quote + if (std.mem.indexOf(u8, json_str[start..], "\"")) |quote_end| { + const text = json_str[start .. start + quote_end]; + + // Only invoke callback if text is not empty + if (text.len > 0 and !std.mem.eql(u8, text, " ")) { + self.options.event_handler.onSpeech(text); + } + } + } + } + } + } + + /// Parse Vosk partial result (for real-time feedback) + fn parseVoskPartialResult(self: *SttSession, json_str: []const u8) !void { + // Similar to parseVoskResult but for partial results + // For now, we don't use partial results, but this could be extended + // to provide real-time transcription feedback + _ = self; + _ = json_str; + } + + /// Attempt to recover from audio device errors + fn recoverAudioDevice(self: *SttSession) SttError!void { + if (self.alsa_capture) |*capture| { + // Close and reopen the audio device + capture.close(); + + // Wait a bit before attempting to reopen + std.Thread.sleep(100 * std.time.ns_per_ms); + + // Try to reopen the device + capture.open() catch { + return SttError.AudioDeviceError; + }; + } + } + + /// Get current session status information + pub fn getStatus(self: *SttSession) struct { + initialized: bool, + listening: bool, + audio_samples_available: usize, + processing_samples_available: usize, + } { + return .{ + .initialized = self.initialized, + .listening = self.listening, + .audio_samples_available = if (self.alsa_capture) |*capture| capture.availableSamples() else 0, + .processing_samples_available = self.vosk_audio_buffer.available(), + }; + } + + /// Validate session options before initialization + fn validateOptions(options: SttOptions) SttError!void { + if (options.model_path.len == 0) { + return SttError.InvalidParameter; + } + if (options.audio_device.len == 0) { + return SttError.InvalidParameter; + } + if (options.sample_rate == 0 or options.sample_rate > 48000) { + return SttError.InvalidParameter; + } + if (options.channels == 0 or options.channels > 8) { + return SttError.InvalidParameter; + } + if (options.buffer_size == 0 or options.buffer_size > 8192) { + return SttError.InvalidParameter; + } + } + + /// Reinitialize the session after an error (recovery mechanism) + pub fn reinitialize(self: *SttSession) SttError!void { + if (self.listening) { + self.stop_listening(); + 
} + + // Clean up existing Vosk resources + if (self.vosk_recognizer) |recognizer| { + c.vosk_recognizer_free(recognizer); + self.vosk_recognizer = null; + } + if (self.vosk_model) |model| { + c.vosk_model_free(model); + self.vosk_model = null; + } + + // Reinitialize Vosk + try self.initVosk(); + + // Reset audio buffers + if (self.alsa_capture) |*capture| { + capture.audio_buffer.clear(); + } + self.vosk_audio_buffer.clear(); + + self.initialized = true; + } + + /// Start listening for speech input + /// + /// This begins audio capture and speech recognition processing. + /// Speech detection events will be delivered via the configured + /// event handler callbacks. + /// + /// Returns: + /// - void on success + /// - SttError on failure + pub fn start_listening(self: *SttSession) SttError!void { + if (!self.initialized) { + return SttError.InvalidState; + } + if (self.listening) { + return SttError.InvalidState; + } + + // Clear any existing audio buffers + if (self.alsa_capture) |*capture| { + capture.audio_buffer.clear(); + } + self.vosk_audio_buffer.clear(); + + // Reset stop flag + self.should_stop.store(false, .release); + + // Start audio capture thread with error handling + self.audio_thread = std.Thread.spawn(.{}, audioThreadFn, .{self}) catch |err| { + self.should_stop.store(true, .release); + return switch (err) { + error.SystemResources, error.ThreadQuotaExceeded => SttError.ThreadingError, + else => SttError.ThreadingError, + }; + }; + + // Start Vosk processing thread with cleanup on failure + self.processing_thread = std.Thread.spawn(.{}, processingThreadFn, .{self}) catch |err| { + // Clean up audio thread if processing thread fails + self.should_stop.store(true, .release); + if (self.audio_thread) |thread| { + thread.join(); + self.audio_thread = null; + } + return switch (err) { + error.SystemResources, error.ThreadQuotaExceeded => SttError.ThreadingError, + else => SttError.ThreadingError, + }; + }; + + // Give threads a moment to start up + std.Thread.sleep(10 * std.time.ns_per_ms); + + self.listening = true; + } + + /// Stop listening for speech input + /// + /// This stops audio capture and speech recognition processing. + /// Any ongoing processing will be completed before returning. + pub fn stop_listening(self: *SttSession) void { + if (!self.listening) { + return; + } + + // Signal threads to stop + self.should_stop.store(true, .release); + + // Give threads a moment to see the stop signal + std.Thread.sleep(5 * std.time.ns_per_ms); + + // Wait for audio thread to finish with timeout + if (self.audio_thread) |thread| { + // Join with reasonable timeout - threads should stop quickly + thread.join(); + self.audio_thread = null; + } + + // Wait for processing thread to finish with timeout + if (self.processing_thread) |thread| { + thread.join(); + self.processing_thread = null; + } + + // Clear any remaining audio data + if (self.alsa_capture) |*capture| { + capture.audio_buffer.clear(); + } + self.vosk_audio_buffer.clear(); + + self.listening = false; + } + + /// Check if the session is currently listening + pub fn is_listening(self: *const SttSession) bool { + return self.listening; + } + + /// Check if the session is initialized + pub fn is_initialized(self: *const SttSession) bool { + return self.initialized; + } + + /// Deinitialize the STT session and free all resources + /// + /// This must be called to properly clean up the session. + /// After calling deinit(), the session should not be used. 
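+    /// It is safe to call deinit whether or not listening was ever started.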
+ pub fn deinit(self: *SttSession) void { + // Ensure we're not listening before cleanup + if (self.listening) { + self.stop_listening(); + } + + // Double-check that threads are properly stopped + if (self.audio_thread != null or self.processing_thread != null) { + self.should_stop.store(true, .release); + + // Give threads one more chance to stop + std.Thread.sleep(50 * std.time.ns_per_ms); + + if (self.audio_thread) |thread| { + thread.join(); + self.audio_thread = null; + } + if (self.processing_thread) |thread| { + thread.join(); + self.processing_thread = null; + } + } + + // Clean up Vosk resources in proper order + if (self.vosk_recognizer) |recognizer| { + c.vosk_recognizer_free(recognizer); + self.vosk_recognizer = null; + } + if (self.vosk_model) |model| { + c.vosk_model_free(model); + self.vosk_model = null; + } + + // Clean up audio buffers + self.vosk_audio_buffer.deinit(); + + // Clean up ALSA capture resources + if (self.alsa_capture) |*capture| { + capture.deinit(); + self.alsa_capture = null; + } + + // Free processing buffer + self.allocator.free(self.processing_buffer); + + // Mark as uninitialized + self.initialized = false; + } +}; + +/// Initialize STT library with the given options +/// +/// This is the main entry point for the STT library. It creates and initializes +/// a new STT session with the provided configuration. +/// +/// Parameters: +/// - allocator: Memory allocator to use for the session +/// - options: Configuration options for the session +/// +/// Returns: +/// - SttSession instance on success +/// - SttError on failure +pub fn init(allocator: std.mem.Allocator, options: SttOptions) SttError!SttSession { + return SttSession.init(allocator, options); } -pub fn add(a: i32, b: i32) i32 { - return a + b; +/// C-compatible API functions for use from other languages +/// These wrap the Zig API with C calling conventions +/// Opaque handle type for C API +pub const SttHandle = opaque {}; + +/// Initialize STT library (C API) +/// +/// Parameters: +/// - model_path: Null-terminated path to Vosk model +/// - audio_device: Null-terminated ALSA device name +/// +/// Returns: +/// - Pointer to SttHandle on success +/// - null on failure +pub export fn stt_init(model_path: [*:0]const u8, audio_device: [*:0]const u8) ?*SttHandle { + // TODO: Implement C API wrapper in subsequent tasks + _ = model_path; + _ = audio_device; + return null; } -test "basic add functionality" { - try std.testing.expect(add(3, 7) == 10); +/// Set speech detection callback (C API) +/// TODO: Implement in subsequent tasks with proper C-compatible callback types +// pub export fn stt_set_speech_callback(handle: *SttHandle, callback: SpeechCallback, user_data: ?*anyopaque) void { +// _ = handle; +// _ = callback; +// _ = user_data; +// } + +/// Set error callback (C API) +/// TODO: Implement in subsequent tasks with proper C-compatible callback types +// pub export fn stt_set_error_callback(handle: *SttHandle, callback: ErrorCallback, user_data: ?*anyopaque) void { +// _ = handle; +// _ = callback; +// _ = user_data; +// } + +/// Start listening (C API) +pub export fn stt_start_listening(handle: *SttHandle) c_int { + _ = handle; + // TODO: Implement in subsequent tasks + return -1; // Error for now +} + +/// Stop listening (C API) +pub export fn stt_stop_listening(handle: *SttHandle) void { + _ = handle; + // TODO: Implement in subsequent tasks +} + +/// Deinitialize STT library (C API) +pub export fn stt_deinit(handle: *SttHandle) void { + _ = handle; + // TODO: Implement in subsequent 
tasks +} + +// Tests +test "SttError enum" { + const testing = std.testing; + + // Test that error types can be created and compared + const err1 = SttError.InitializationFailed; + const err2 = SttError.AudioDeviceError; + + try testing.expect(err1 != err2); + try testing.expect(err1 == SttError.InitializationFailed); +} + +test "SttOptions validation" { + const testing = std.testing; + var gpa = std.heap.GeneralPurposeAllocator(.{}){}; + defer _ = gpa.deinit(); + const allocator = gpa.allocator(); + + // Test valid options + const DummyHandler = struct { + fn onSpeech(ctx: *anyopaque, text: []const u8) void { + _ = ctx; + _ = text; + } + fn onError(ctx: *anyopaque, error_code: SttError, message: []const u8) void { + _ = ctx; + _ = message; + // Can't discard error types with _, so we just don't use it + switch (error_code) { + else => {}, + } + } + }; + + var dummy_ctx: u8 = 0; + const valid_options = SttOptions{ + .model_path = "/path/to/model", + .audio_device = "hw:0,0", + .event_handler = SpeechEventHandler{ + .onSpeechFn = DummyHandler.onSpeech, + .onErrorFn = DummyHandler.onError, + .ctx = &dummy_ctx, + }, + }; + + // Test that initialization fails with invalid model path (expected behavior) + const result = SttSession.init(allocator, valid_options); + try testing.expectError(SttError.ModelLoadError, result); +} + +test "SttSession state management" { + const testing = std.testing; + var gpa = std.heap.GeneralPurposeAllocator(.{}){}; + defer _ = gpa.deinit(); + const allocator = gpa.allocator(); + + const DummyHandler = struct { + fn onSpeech(ctx: *anyopaque, text: []const u8) void { + _ = ctx; + _ = text; + } + fn onError(ctx: *anyopaque, error_code: SttError, message: []const u8) void { + _ = ctx; + _ = message; + // Can't discard error types with _, so we just don't use it + switch (error_code) { + else => {}, + } + } + }; + + var dummy_ctx: u8 = 0; + const options = SttOptions{ + .model_path = "/path/to/model", + .audio_device = "hw:0,0", + .event_handler = SpeechEventHandler{ + .onSpeechFn = DummyHandler.onSpeech, + .onErrorFn = DummyHandler.onError, + .ctx = &dummy_ctx, + }, + }; + + // Test that initialization fails with invalid model path (expected behavior) + const result = SttSession.init(allocator, options); + try testing.expectError(SttError.ModelLoadError, result); +} + +test "SpeechEventHandler interface" { + const testing = std.testing; + + const TestHandler = struct { + speech_called: bool = false, + error_called: bool = false, + last_text: []const u8 = "", + last_error: SttError = SttError.InitializationFailed, + + fn onSpeech(ctx: *anyopaque, text: []const u8) void { + const self: *@This() = @ptrCast(@alignCast(ctx)); + self.speech_called = true; + self.last_text = text; + } + + fn onError(ctx: *anyopaque, error_code: SttError, message: []const u8) void { + const self: *@This() = @ptrCast(@alignCast(ctx)); + self.error_called = true; + self.last_error = error_code; + _ = message; + } + }; + + var handler = TestHandler{}; + const event_handler = SpeechEventHandler{ + .onSpeechFn = TestHandler.onSpeech, + .onErrorFn = TestHandler.onError, + .ctx = &handler, + }; + + // Test speech callback + event_handler.onSpeech("hello world"); + try testing.expect(handler.speech_called); + try testing.expectEqualStrings("hello world", handler.last_text); + + // Test error callback + event_handler.onError(SttError.AudioDeviceError, "test error"); + try testing.expect(handler.error_called); + try testing.expect(handler.last_error == SttError.AudioDeviceError); +} + +test "Vosk 
integration with valid model" { + const testing = std.testing; + var gpa = std.heap.GeneralPurposeAllocator(.{}){}; + defer _ = gpa.deinit(); + const allocator = gpa.allocator(); + + const DummyHandler = struct { + fn onSpeech(ctx: *anyopaque, text: []const u8) void { + _ = ctx; + _ = text; + } + fn onError(ctx: *anyopaque, error_code: SttError, message: []const u8) void { + _ = ctx; + _ = message; + switch (error_code) { + else => {}, + } + } + }; + + var dummy_ctx: u8 = 0; + const options = SttOptions{ + .model_path = "zig-out/bin/vosk-model-small-en-us-0.15", + .audio_device = "hw:0,0", // This will fail in tests, but that's OK + .event_handler = SpeechEventHandler{ + .onSpeechFn = DummyHandler.onSpeech, + .onErrorFn = DummyHandler.onError, + .ctx = &dummy_ctx, + }, + }; + + // Try to initialize with real model path + const result = SttSession.init(allocator, options); + + // If model exists, initialization should succeed (except for audio device) + // If model doesn't exist, we expect ModelLoadError + if (result) |session| { + var session_mut = session; + defer session_mut.deinit(); + + // If we get here, Vosk model loaded successfully + try testing.expect(session_mut.is_initialized()); + try testing.expect(!session_mut.is_listening()); + } else |err| { + // Model not found or other initialization error - this is acceptable in tests + try testing.expect(err == SttError.ModelLoadError or err == SttError.InitializationFailed); + } +} + +test "AudioBuffer basic operations" { + const testing = std.testing; + var gpa = std.heap.GeneralPurposeAllocator(.{}){}; + defer _ = gpa.deinit(); + const allocator = gpa.allocator(); + + var buffer = try AudioBuffer.init(allocator, 10); + defer buffer.deinit(); + + // Test initial state + try testing.expect(buffer.available() == 0); + try testing.expect(buffer.capacity() == 10); + + // Test writing samples + const samples = [_]i16{ 1, 2, 3, 4, 5 }; + const written = buffer.write(&samples); + try testing.expect(written == 5); + try testing.expect(buffer.available() == 5); + try testing.expect(buffer.capacity() == 5); + + // Test reading samples + var read_samples: [3]i16 = undefined; + const read_count = buffer.read(&read_samples); + try testing.expect(read_count == 3); + try testing.expect(read_samples[0] == 1); + try testing.expect(read_samples[1] == 2); + try testing.expect(read_samples[2] == 3); + try testing.expect(buffer.available() == 2); + + // Test buffer wrap-around + const more_samples = [_]i16{ 6, 7, 8, 9, 10, 11, 12, 13 }; + const written2 = buffer.write(&more_samples); + try testing.expect(written2 == 8); // Should write 8 samples (2 remaining + 6 new) + try testing.expect(buffer.available() == 10); // Buffer should be full + + // Test clearing buffer + buffer.clear(); + try testing.expect(buffer.available() == 0); + try testing.expect(buffer.capacity() == 10); +} + +test "AudioConverter stereo to mono conversion" { + const testing = std.testing; + + // Test stereo to mono conversion + const stereo_samples = [_]i16{ 100, 200, 300, 400, 500, 600 }; // 3 stereo frames + var mono_samples: [3]i16 = undefined; + + const frames_converted = AudioConverter.stereoToMono(&stereo_samples, &mono_samples); + try testing.expect(frames_converted == 3); + + // Check averaged values + try testing.expect(mono_samples[0] == 150); // (100 + 200) / 2 + try testing.expect(mono_samples[1] == 350); // (300 + 400) / 2 + try testing.expect(mono_samples[2] == 550); // (500 + 600) / 2 +} + +test "AudioConverter sample rate conversion" { + const testing = std.testing; 
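+    // resample() takes the memcpy fast path when input_rate == output_rate;
+    // the 2:1 case below exercises the linear-interpolation path instead.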
+ + // Test same sample rate (should copy directly) + const input_samples = [_]i16{ 100, 200, 300, 400 }; + var output_samples: [4]i16 = undefined; + + const converted = AudioConverter.resample(&input_samples, &output_samples, 16000, 16000); + try testing.expect(converted == 4); + try testing.expect(output_samples[0] == 100); + try testing.expect(output_samples[1] == 200); + try testing.expect(output_samples[2] == 300); + try testing.expect(output_samples[3] == 400); + + // Test downsampling (2:1 ratio) + var downsampled: [2]i16 = undefined; + const downsampled_count = AudioConverter.resample(&input_samples, &downsampled, 16000, 8000); + try testing.expect(downsampled_count == 2); + try testing.expect(downsampled[0] == 100); // First sample + try testing.expect(downsampled[1] == 300); // Interpolated sample +} + +test "AlsaCapture initialization" { + const testing = std.testing; + var gpa = std.heap.GeneralPurposeAllocator(.{}){}; + defer _ = gpa.deinit(); + const allocator = gpa.allocator(); + + // Test ALSA capture initialization (without actually opening device) + var capture = AlsaCapture.init(allocator, "hw:0,0", 16000, 2, 1024) catch |err| { + // If ALSA initialization fails (e.g., no audio device), that's expected in test environment + if (err == error.OutOfMemory) { + return err; + } + return; // Skip test if ALSA not available + }; + defer capture.deinit(); + + // Test basic properties + try testing.expect(capture.sample_rate == 16000); + try testing.expect(capture.channels == 2); + try testing.expect(capture.buffer_size == 1024); + try testing.expect(capture.period_size == 256); // buffer_size / 4 + try testing.expect(capture.pcm_handle == null); // Not opened yet +} + +test "AudioBuffer thread safety" { + const testing = std.testing; + var gpa = std.heap.GeneralPurposeAllocator(.{}){}; + defer _ = gpa.deinit(); + const allocator = gpa.allocator(); + + var buffer = try AudioBuffer.init(allocator, 1000); + defer buffer.deinit(); + + // Test concurrent access (simplified test) + const samples1 = [_]i16{ 1, 2, 3, 4, 5 }; + const samples2 = [_]i16{ 6, 7, 8, 9, 10 }; + + // Write from multiple "threads" (simulated) + const written1 = buffer.write(&samples1); + const written2 = buffer.write(&samples2); + + try testing.expect(written1 == 5); + try testing.expect(written2 == 5); + try testing.expect(buffer.available() == 10); + + // Read back samples + var read_buffer: [10]i16 = undefined; + const read_count = buffer.read(&read_buffer); + try testing.expect(read_count == 10); + + // Verify order is maintained + try testing.expect(read_buffer[0] == 1); + try testing.expect(read_buffer[4] == 5); + try testing.expect(read_buffer[5] == 6); + try testing.expect(read_buffer[9] == 10); +} + +test "SttSession session management API" { + const testing = std.testing; + var gpa = std.heap.GeneralPurposeAllocator(.{}){}; + defer _ = gpa.deinit(); + const allocator = gpa.allocator(); + + const TestHandler = struct { + speech_count: u32 = 0, + error_count: u32 = 0, + + fn onSpeech(ctx: *anyopaque, text: []const u8) void { + const self: *@This() = @ptrCast(@alignCast(ctx)); + self.speech_count += 1; + _ = text; + } + + fn onError(ctx: *anyopaque, error_code: SttError, message: []const u8) void { + const self: *@This() = @ptrCast(@alignCast(ctx)); + self.error_count += 1; + switch (error_code) { + else => {}, + } + _ = message; + } + }; + + var handler = TestHandler{}; + const options = SttOptions{ + .model_path = "/invalid/path", // Will fail, but that's expected + .audio_device = "hw:0,0", + 
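+        // Note: the device string is not validated here; ALSA is only opened
+        // later, on the audio thread, so an unavailable device does not fail init().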
.event_handler = SpeechEventHandler{ + .onSpeechFn = TestHandler.onSpeech, + .onErrorFn = TestHandler.onError, + .ctx = &handler, + }, + }; + + // Test module-level init function + const result = init(allocator, options); + try testing.expectError(SttError.ModelLoadError, result); + + // Test options validation + const invalid_options = SttOptions{ + .model_path = "", // Invalid empty path + .audio_device = "hw:0,0", + .event_handler = options.event_handler, + }; + + const invalid_result = init(allocator, invalid_options); + try testing.expectError(SttError.InvalidParameter, invalid_result); +} + +test "SttSession status and recovery" { + const testing = std.testing; + var gpa = std.heap.GeneralPurposeAllocator(.{}){}; + defer _ = gpa.deinit(); + const allocator = gpa.allocator(); + + const DummyHandler = struct { + fn onSpeech(ctx: *anyopaque, text: []const u8) void { + _ = ctx; + _ = text; + } + fn onError(ctx: *anyopaque, error_code: SttError, message: []const u8) void { + _ = ctx; + switch (error_code) { + else => {}, + } + _ = message; + } + }; + + var dummy_ctx: u8 = 0; + const options = SttOptions{ + .model_path = "zig-out/bin/vosk-model-small-en-us-0.15", + .audio_device = "hw:0,0", + .event_handler = SpeechEventHandler{ + .onSpeechFn = DummyHandler.onSpeech, + .onErrorFn = DummyHandler.onError, + .ctx = &dummy_ctx, + }, + }; + + // Try to create session (may fail if model not available) + const result = SttSession.init(allocator, options); + if (result) |session| { + var session_mut = session; + defer session_mut.deinit(); + + // Test status methods + try testing.expect(session_mut.is_initialized()); + try testing.expect(!session_mut.is_listening()); + + const status = session_mut.getStatus(); + try testing.expect(status.initialized); + try testing.expect(!status.listening); + try testing.expect(status.audio_samples_available == 0); + try testing.expect(status.processing_samples_available == 0); + + // Test that we can't start listening twice + const start_result = session_mut.start_listening(); + if (start_result) |_| { + // If start succeeded, test double start + const double_start = session_mut.start_listening(); + try testing.expectError(SttError.InvalidState, double_start); + + // Test stop listening + session_mut.stop_listening(); + try testing.expect(!session_mut.is_listening()); + + // Test that we can stop multiple times safely + session_mut.stop_listening(); + try testing.expect(!session_mut.is_listening()); + } else |err| { + // Audio device error expected in test environment + try testing.expect(err == SttError.ThreadingError or err == SttError.AudioDeviceError); + } + } else |err| { + // Model not available in test environment - this is acceptable + try testing.expect(err == SttError.ModelLoadError or err == SttError.InitializationFailed); + } }
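
Review note on the C API stubs: one straightforward way to fill in the TODOs above is to make the opaque SttHandle a heap-allocated SttSession. The sketch below is an illustration built only on types this patch already defines; the function names and the use of std.heap.c_allocator are assumptions, and the functions are plain fns rather than export fns because an exported function cannot take SpeechEventHandler until C-compatible callback types exist (the same open question the commented-out setters note).

// Hypothetical sketch, not part of this patch: the opaque handle wraps a
// heap-allocated SttSession.
fn sessionFromHandle(handle: *SttHandle) *SttSession {
    return @ptrCast(@alignCast(handle));
}

// stt_init body sketch. The caller must keep model_path/audio_device alive
// for the session's lifetime, because SttOptions stores slices, not copies.
fn sttInitSketch(
    model_path: [*:0]const u8,
    audio_device: [*:0]const u8,
    handler: SpeechEventHandler,
) ?*SttHandle {
    const allocator = std.heap.c_allocator;
    const session = allocator.create(SttSession) catch return null;
    session.* = SttSession.init(allocator, .{
        .model_path = std.mem.span(model_path),
        .audio_device = std.mem.span(audio_device),
        .event_handler = handler,
    }) catch {
        allocator.destroy(session);
        return null;
    };
    return @as(*SttHandle, @ptrCast(session));
}

// stt_start_listening body sketch: map the error union onto a C-style int.
fn sttStartListeningSketch(handle: *SttHandle) c_int {
    sessionFromHandle(handle).start_listening() catch return -1;
    return 0;
}

// stt_deinit body sketch: tear down the session, then free the handle itself.
fn sttDeinitSketch(handle: *SttHandle) void {
    const allocator = std.heap.c_allocator;
    const session = sessionFromHandle(handle);
    session.deinit();
    allocator.destroy(session);
}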