Compare commits
10 commits
e2490ec3e3
...
b509b88569
Author | SHA1 | Date | |
---|---|---|---|
b509b88569 | |||
11b5e1a267 | |||
186651dce0 | |||
0262ca6659 | |||
ebd4b531c7 | |||
d6cab46c99 | |||
15522c41a8 | |||
6148c30dbc | |||
67ee2b4091 | |||
43a40e2d76 |
7 changed files with 330 additions and 28 deletions
28
.gitea/workflows/zig-build.yaml
Normal file
28
.gitea/workflows/zig-build.yaml
Normal file
|
@ -0,0 +1,28 @@
|
|||
name: Generic zig build
|
||||
on:
|
||||
workflow_dispatch:
|
||||
push:
|
||||
branches:
|
||||
- '*'
|
||||
- '!zig-develop*'
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Check out repository code
|
||||
uses: actions/checkout@v4
|
||||
- name: Setup Zig
|
||||
uses: https://github.com/mlugg/setup-zig@v2.0.5
|
||||
- name: Build project
|
||||
run: zig build --summary all
|
||||
- name: Run tests
|
||||
run: zig build test --summary all
|
||||
- name: Notify
|
||||
uses: https://git.lerch.org/lobo/action-notify-ntfy@v2
|
||||
if: always() && env.GITEA_ACTIONS == 'true'
|
||||
with:
|
||||
host: ${{ secrets.NTFY_HOST }}
|
||||
topic: ${{ secrets.NTFY_TOPIC }}
|
||||
status: ${{ job.status }}
|
||||
user: ${{ secrets.NTFY_USER }}
|
||||
password: ${{ secrets.NTFY_PASSWORD }}
|
15
README.md
15
README.md
|
@ -20,6 +20,7 @@ The application uses ALSA's default device, which is configured in `alsa.conf`.
|
|||
### Prerequisites
|
||||
- Zig 0.15.1 (configured via mise)
|
||||
- Nix development environment configured for ALSA, and audio libraries
|
||||
- patchelf (for fixing ELF NEEDED entries in release builds): `nix-env -iA nixpkgs.patchelf`
|
||||
|
||||
### Vosk Model Download
|
||||
The application uses the Vosk small English model for speech recognition:
|
||||
|
@ -33,6 +34,20 @@ The application uses the Vosk small English model for speech recognition:
|
|||
2. Build application: `zig build`
|
||||
3. Run: `zig build run`
|
||||
|
||||
### Release Builds and Portability
|
||||
|
||||
When building in release mode (any non-Debug optimization, e.g. `-Doptimize=ReleaseSafe`), Zig embeds the full path to libvosk.so in the ELF NEEDED entries, making the binary non-portable. The build system automatically fixes this by running `fix_needed.sh`, which uses `patchelf` to replace the full path with just the library name.
|
||||
|
||||
**Automatic fix**: Just run `zig build -Doptimize=ReleaseSafe` - the NEEDED entries are fixed automatically.
|
||||
|
||||
**Manual fix**: If needed, you can run `./fix_needed.sh [binary_path] [library_name]` manually.
|
||||
|
||||
The script uses `patchelf` (via nix-shell if not installed) to replace entries like:
|
||||
- Before: `NEEDED: [/home/user/.cache/zig/.../libvosk.so]`
|
||||
- After: `NEEDED: [libvosk.so]`
|
||||
|
||||
This makes the binary portable while using the existing RPATH (`$ORIGIN/../lib`) to find the library at runtime.
|
||||
|
||||
## Usage
|
||||
The application will:
|
||||
- Initialize audio capture from default microphone
|
||||
|
|
29
build.zig
29
build.zig
|
@ -6,6 +6,12 @@ pub fn build(b: *std.Build) void {
|
|||
|
||||
const vosk_dep_name = selectVoskDependency(target.result);
|
||||
const vosk_dep = b.dependency(vosk_dep_name, .{});
|
||||
const install_vosk = b.addInstallFileWithDir(
|
||||
vosk_dep.path("libvosk.so"),
|
||||
.lib,
|
||||
"libvosk.so",
|
||||
);
|
||||
b.getInstallStep().dependOn(&install_vosk.step);
|
||||
const alsa_dep = b.dependency("alsa", .{
|
||||
.target = target,
|
||||
.optimize = optimize,
|
||||
|
@ -17,7 +23,7 @@ pub fn build(b: *std.Build) void {
|
|||
// Install the model to the output directory
|
||||
const install_model = b.addInstallDirectory(.{
|
||||
.source_dir = model_step.getOutputPath(),
|
||||
.install_dir = .bin,
|
||||
.install_dir = .{ .custom = "share/vosk/models" },
|
||||
.install_subdir = "vosk-model-small-en-us-0.15",
|
||||
});
|
||||
install_model.step.dependOn(&model_step.step);
|
||||
|
@ -26,7 +32,7 @@ pub fn build(b: *std.Build) void {
|
|||
// Create the STT library
|
||||
const stt_lib = b.addLibrary(.{
|
||||
.name = "stt",
|
||||
.linkage = .static,
|
||||
// .linkage = .static,
|
||||
.root_module = b.createModule(.{
|
||||
.root_source_file = b.path("src/stt.zig"),
|
||||
.target = target,
|
||||
|
@ -56,6 +62,7 @@ pub fn build(b: *std.Build) void {
|
|||
.link_libc = true,
|
||||
}),
|
||||
});
|
||||
exe.root_module.addRPathSpecial("$ORIGIN/../lib");
|
||||
|
||||
exe.linkLibrary(stt_lib);
|
||||
exe.linkLibrary(alsa_lib);
|
||||
|
@ -66,7 +73,23 @@ pub fn build(b: *std.Build) void {
|
|||
exe.addLibraryPath(vosk_dep.path(""));
|
||||
exe.linkSystemLibrary("vosk");
|
||||
|
||||
b.installArtifact(exe);
|
||||
const install_exe = b.addInstallArtifact(exe, .{});
|
||||
|
||||
// Fix NEEDED entries in release builds to make binary portable
|
||||
if (optimize != .Debug) {
|
||||
const script_path = b.pathFromRoot("fix_needed.sh");
|
||||
const fix_needed = b.addSystemCommand(&.{script_path});
|
||||
fix_needed.step.dependOn(&install_exe.step);
|
||||
fix_needed.addFileInput(install_exe.emitted_bin.?);
|
||||
fix_needed.has_side_effects = true;
|
||||
_ = fix_needed.captureStdOut();
|
||||
b.getInstallStep().dependOn(&fix_needed.step);
|
||||
|
||||
const fix_step = b.step("fix-needed", "Fix NEEDED entries to make binary portable");
|
||||
fix_step.dependOn(&fix_needed.step);
|
||||
} else {
|
||||
b.getInstallStep().dependOn(&install_exe.step);
|
||||
}
|
||||
|
||||
const run_step = b.step("run", "Run the app");
|
||||
const run_cmd = b.addRunArtifact(exe);
|
||||
|
|
55
fix_needed.sh
Executable file
55
fix_needed.sh
Executable file
|
@ -0,0 +1,55 @@
|
|||
#!/bin/bash
# Fix NEEDED entries in release builds
# This script replaces full paths with just library names in NEEDED entries
#
# Usage: fix_needed.sh [binary_path] [library_name]
# Exits non-zero when the binary is missing, when no tool is available to
# inspect or patch it, or when readelf/patchelf themselves fail.

if [[ "$1" == "--help" || "$1" == "-h" ]]; then
    echo "Usage: $0 [binary_path] [library_name]"
    echo "  binary_path: Path to binary (default: script_dir/zig-out/bin/stt)"
    echo "  library_name: Library to fix (default: libvosk.so)"
    echo ""
    echo "Example: $0 my_binary libfoo.so"
    exit 0
fi

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${1:-$SCRIPT_DIR/zig-out/bin/stt}"
LIBRARY="${2:-libvosk.so}"

if [ ! -f "$BINARY" ]; then
    echo "Binary not found: $BINARY" >&2
    exit 1
fi

echo "Fixing NEEDED entries for $BINARY..."

# Get the current NEEDED entries. The dynamic section is captured first so a
# readelf failure is reported as an error instead of "already portable".
if command -v readelf >/dev/null 2>&1; then
    if ! DYNAMIC=$(readelf --dynamic "$BINARY"); then
        echo "Error: readelf failed on $BINARY" >&2
        exit 1
    fi
elif command -v nix-shell >/dev/null 2>&1; then
    echo "Using nix-shell to run readelf..."
    # printf %q shell-quotes the path so spaces or quotes survive --run.
    if ! DYNAMIC=$(nix-shell -p binutils --run "readelf --dynamic $(printf '%q' "$BINARY")"); then
        echo "Error: readelf (via nix-shell) failed on $BINARY" >&2
        exit 1
    fi
else
    echo "Error: Neither readelf nor nix-shell found" >&2
    echo "Install binutils or nix to check NEEDED entries" >&2
    exit 1
fi

# grep -F matches the library name literally (the '.' in "libvosk.so" must
# not act as a regex wildcard).
FULL_PATH=$(printf '%s\n' "$DYNAMIC" | grep "NEEDED" | grep -F -- "$LIBRARY" | head -1 | sed 's/.*\[\(.*\)\]/\1/')

if [[ "$FULL_PATH" == *"/"* ]]; then
    echo "Found full path: $FULL_PATH"
    echo "Replacing with: $LIBRARY"

    # Try patchelf directly, fall back to nix-shell
    if command -v patchelf >/dev/null 2>&1; then
        if ! patchelf --replace-needed "$FULL_PATH" "$LIBRARY" "$BINARY"; then
            echo "Error: patchelf failed to update $BINARY" >&2
            exit 1
        fi
    elif command -v nix-shell >/dev/null 2>&1; then
        echo "Using nix-shell to run patchelf..."
        if ! nix-shell -p patchelf --run "patchelf --replace-needed $(printf '%q ' "$FULL_PATH" "$LIBRARY" "$BINARY")"; then
            echo "Error: patchelf (via nix-shell) failed to update $BINARY" >&2
            exit 1
        fi
    else
        echo "Error: Neither patchelf nor nix-shell found" >&2
        echo "Install patchelf or nix to fix NEEDED entries" >&2
        exit 1
    fi
    echo "Fixed!"
else
    echo "No full path found, binary is already portable"
fi
|
41
src/main.zig
41
src/main.zig
|
@ -26,7 +26,9 @@ const SpeechHandler = struct {
|
|||
/// Handle detected speech
|
||||
fn onSpeech(ctx: *anyopaque, text: []const u8) void {
|
||||
if (builtin.is_test) return; // Suppress output during tests
|
||||
|
||||
// Look for noise words and skip it if so
|
||||
if (std.mem.eql(u8, text, "huh")) return;
|
||||
if (std.mem.eql(u8, text, "but")) return;
|
||||
const self: *SpeechHandler = @ptrCast(@alignCast(ctx));
|
||||
self.speech_count += 1;
|
||||
|
||||
|
@ -323,9 +325,25 @@ pub fn main() !void {
|
|||
var model_path: ?[]const u8 = null;
|
||||
var exec_program: ?[]const u8 = null;
|
||||
|
||||
// Parse --model and --exec arguments
|
||||
// Parse arguments
|
||||
for (args[1..]) |arg| {
|
||||
if (std.mem.startsWith(u8, arg, "--model=")) {
|
||||
if (std.mem.eql(u8, arg, "--help") or std.mem.eql(u8, arg, "-h")) {
|
||||
_ = try stdout.writeAll("Real-time Speech Recognition\n\n");
|
||||
_ = try stdout.writeAll("USAGE:\n");
|
||||
_ = try stdout.writeAll(" stt [OPTIONS]\n\n");
|
||||
_ = try stdout.writeAll("OPTIONS:\n");
|
||||
_ = try stdout.writeAll(" --model=<path> Path to Vosk model directory\n");
|
||||
_ = try stdout.writeAll(" --exec=<program> Program to execute with recognized text\n");
|
||||
_ = try stdout.writeAll(" --help, -h Show this help message\n\n");
|
||||
_ = try stdout.writeAll("EXAMPLES:\n");
|
||||
_ = try stdout.writeAll(" stt\n");
|
||||
_ = try stdout.writeAll(" stt --model=../share/vosk/models/vosk-model-small-en-us-0.15\n");
|
||||
_ = try stdout.writeAll(" stt --exec=echo\n\n");
|
||||
_ = try stdout.writeAll("The application will search for models in these locations:\n");
|
||||
_ = try stdout.writeAll(" vosk-model-small-en-us-0.15\n");
|
||||
_ = try stdout.writeAll(" <binary_dir>/../share/vosk/models/vosk-model-small-en-us-0.15\n");
|
||||
return;
|
||||
} else if (std.mem.startsWith(u8, arg, "--model=")) {
|
||||
model_path = arg[8..]; // Skip "--model="
|
||||
} else if (std.mem.startsWith(u8, arg, "--exec=")) {
|
||||
exec_program = arg[7..]; // Skip "--exec="
|
||||
|
@ -346,10 +364,16 @@ pub fn main() !void {
|
|||
};
|
||||
|
||||
// If no model specified, try default locations
|
||||
var exe_path_buf: [std.fs.max_path_bytes]u8 = undefined;
|
||||
const exe_path = std.fs.selfExePath(&exe_path_buf) catch "stt";
|
||||
const exe_dir = std.fs.path.dirname(exe_path) orelse ".";
|
||||
|
||||
const xdg_model_path = try std.fmt.allocPrint(allocator, "{s}/../share/vosk/models/vosk-model-small-en-us-0.15", .{exe_dir});
|
||||
defer allocator.free(xdg_model_path);
|
||||
|
||||
const default_paths = [_][]const u8{
|
||||
"vosk-model-small-en-us-0.15",
|
||||
"zig-out/bin/vosk-model-small-en-us-0.15",
|
||||
"/usr/share/vosk/models/vosk-model-small-en-us-0.15",
|
||||
xdg_model_path,
|
||||
};
|
||||
|
||||
if (model_path == null) {
|
||||
|
@ -365,8 +389,11 @@ pub fn main() !void {
|
|||
_ = try stderr.writeAll("Error: Vosk model not found.\n\n");
|
||||
_ = try stderr.writeAll("Usage: stt [--model=<path>] [--exec=<program>]\n\n");
|
||||
_ = try stderr.writeAll("Locations searched:\n");
|
||||
inline for (default_paths) |path|
|
||||
_ = try stderr.writeAll("\t" ++ path ++ "\n");
|
||||
for (default_paths) |path| {
|
||||
_ = try stderr.writeAll("\t");
|
||||
_ = try stderr.writeAll(path);
|
||||
_ = try stderr.writeAll("\n");
|
||||
}
|
||||
_ = try stderr.writeAll("Please download the model. A fine model can be downloaded from:\n");
|
||||
_ = try stderr.writeAll("\thttps://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip\n");
|
||||
std.process.exit(1);
|
||||
|
|
176
src/stt.zig
176
src/stt.zig
|
@ -173,6 +173,69 @@ pub const SpeechEventHandler = struct {
|
|||
}
|
||||
};
|
||||
|
||||
/// Resample audio from input rate to output rate using linear interpolation.
///
/// Writes at most `output_samples.len` samples and returns how many were
/// written. When the rates are equal the samples are copied unchanged.
/// Returns 0 when either rate is zero (no meaningful conversion exists) or
/// when `input_samples` is empty.
///
/// Each call is independent: no fractional position is carried over between
/// consecutive chunks.
fn resample(input_samples: []const i16, output_samples: []i16, input_rate: u32, output_rate: u32) usize {
    if (input_rate == output_rate) {
        const copy_len = @min(input_samples.len, output_samples.len);
        @memcpy(output_samples[0..copy_len], input_samples[0..copy_len]);
        return copy_len;
    }

    // A zero rate would make the conversion ratio zero or infinite; bail out
    // instead of feeding a non-finite value to @intFromFloat.
    if (input_rate == 0 or output_rate == 0) return 0;

    const in_rate: u64 = input_rate;
    const out_rate: u64 = output_rate;

    // Exact integer form of floor(input_len * output_rate / input_rate);
    // avoids off-by-one results from a rounded floating-point ratio.
    const ideal_len = (@as(u64, input_samples.len) * out_rate) / in_rate;
    const output_len: usize = @intCast(@min(@as(u64, output_samples.len), ideal_len));

    const min_sample: f64 = std.math.minInt(i16);
    const max_sample: f64 = std.math.maxInt(i16);

    for (output_samples[0..output_len], 0..) |*out, i| {
        // Source position is i * in_rate / out_rate, kept as an exact
        // quotient/remainder pair. Because i < ideal_len, the quotient is
        // always a valid index into input_samples.
        const scaled_pos = @as(u64, i) * in_rate;
        const src_idx: usize = @intCast(scaled_pos / out_rate);

        if (src_idx + 1 >= input_samples.len) {
            // Last input sample has no right-hand neighbour to blend with.
            out.* = input_samples[src_idx];
            continue;
        }

        const frac = @as(f64, @floatFromInt(scaled_pos % out_rate)) / @as(f64, @floatFromInt(out_rate));
        const sample1: f64 = @floatFromInt(input_samples[src_idx]);
        const sample2: f64 = @floatFromInt(input_samples[src_idx + 1]);
        const interpolated = sample1 + (sample2 - sample1) * frac;
        out.* = @intFromFloat(std.math.clamp(interpolated, min_sample, max_sample));
    }

    return output_len;
}
|
||||
|
||||
/// Get CPU performance metric from /proc/cpuinfo (BogoMIPS or MHz).
///
/// Reads up to the first 4096 bytes of the file and returns the value of the
/// first line that starts with `BogoMIPS` (used as-is) or `cpu MHz` (divided
/// by 20 to land on roughly the same scale). The value is truncated to an
/// integer.
///
/// Errors: any error from opening/reading the file or from parsing the
/// number, and `error.PerformanceNotFound` when no matching line exists or
/// the parsed value cannot be represented as a `u32`.
fn getCpuPerformance() !u32 {
    const file = try std.fs.openFileAbsolute("/proc/cpuinfo", .{});
    defer file.close();

    var buf: [4096]u8 = undefined;
    const bytes_read = try file.readAll(&buf);

    var lines = std.mem.splitScalar(u8, buf[0..bytes_read], '\n');
    while (lines.next()) |line| {
        if (std.mem.startsWith(u8, line, "BogoMIPS")) {
            if (try parseCpuInfoValue(line)) |bogomips| return cpuMetricFromFloat(bogomips);
        }
        if (std.mem.startsWith(u8, line, "cpu MHz")) {
            // Convert MHz to equivalent BogoMIPS scale for consistent thresholds
            if (try parseCpuInfoValue(line)) |mhz| return cpuMetricFromFloat(mhz / 20.0);
        }
    }
    // No usable line found; callers decide on their own fallback value.
    return error.PerformanceNotFound;
}

/// Parse the numeric value of a `key : value` line from /proc/cpuinfo.
/// Returns null when the line has no ':' separator.
fn parseCpuInfoValue(line: []const u8) !?f32 {
    var parts = std.mem.splitScalar(u8, line, ':');
    _ = parts.next(); // Skip key
    const value = parts.next() orelse return null;
    return try std.fmt.parseFloat(f32, std.mem.trim(u8, value, " \t"));
}

/// Truncate a parsed metric to `u32`, rejecting values `@intFromFloat` cannot
/// represent (NaN, infinities, negatives, anything at or above 2^32).
fn cpuMetricFromFloat(value: f32) error{PerformanceNotFound}!u32 {
    if (!std.math.isFinite(value)) return error.PerformanceNotFound;
    if (value < 0 or value >= 4294967296.0) return error.PerformanceNotFound;
    return @intFromFloat(value);
}
|
||||
|
||||
/// Audio buffer for managing audio data flow using std.io interfaces
|
||||
pub const AudioBuffer = struct {
|
||||
const Self = @This();
|
||||
|
@ -521,6 +584,8 @@ pub const Session = struct {
|
|||
should_stop: std.atomic.Value(bool) = std.atomic.Value(bool).init(false),
|
||||
/// Processing buffer for audio samples
|
||||
processing_buffer: []i16,
|
||||
/// Resample buffer for converting hardware rate to 16kHz (null if not needed)
|
||||
resample_buffer: ?[]i16,
|
||||
/// Vosk model
|
||||
vosk_model: ?*c.VoskModel = null,
|
||||
/// Vosk recognizer
|
||||
|
@ -574,23 +639,32 @@ pub const Session = struct {
|
|||
alsa_capture_mut.deinit();
|
||||
}
|
||||
|
||||
// Initialize Vosk audio buffer (larger buffer for processing)
|
||||
const vosk_audio_buffer = AudioBuffer.init(allocator, alsa_capture.sample_rate * 2) catch {
|
||||
const error_info = ErrorInfo.init(Error.OutOfMemory, "Failed to allocate Vosk audio buffer during initialization");
|
||||
options.event_handler.onDetailedError(error_info);
|
||||
return Error.OutOfMemory;
|
||||
};
|
||||
errdefer {
|
||||
var vosk_audio_buffer_mut = vosk_audio_buffer;
|
||||
vosk_audio_buffer_mut.deinit();
|
||||
}
|
||||
const cpu_perf = getCpuPerformance() catch 100;
|
||||
const buffer_multiplier: u32 = if (cpu_perf < 50) 8 else if (cpu_perf < 100) 4 else 2;
|
||||
const new_buffer_size = 16000 * buffer_multiplier;
|
||||
|
||||
std.log.debug(
|
||||
"Buffer multiplier {d} based on implied BogoMIPS of {d} (100 default in case of error)",
|
||||
.{ buffer_multiplier, cpu_perf },
|
||||
);
|
||||
|
||||
// Resize the Vosk buffer with the actual sample rate
|
||||
var vosk_buf = AudioBuffer.init(
|
||||
allocator,
|
||||
new_buffer_size,
|
||||
) catch {
|
||||
const error_info = ErrorInfo.init(Error.OutOfMemory, "Failed to initialize Vosk buffer after ALSA open");
|
||||
options.event_handler.onDetailedError(error_info);
|
||||
return error.InitializationFailed;
|
||||
};
|
||||
errdefer vosk_buf.deinit();
|
||||
var session = Session{
|
||||
.allocator = allocator,
|
||||
.options = options,
|
||||
.alsa_capture = alsa_capture,
|
||||
.processing_buffer = processing_buffer,
|
||||
.vosk_audio_buffer = vosk_audio_buffer,
|
||||
.resample_buffer = null,
|
||||
.vosk_audio_buffer = vosk_buf,
|
||||
};
|
||||
|
||||
// Initialize Vosk model and recognizer with detailed error reporting
|
||||
|
@ -631,8 +705,8 @@ pub const Session = struct {
|
|||
return Error.ModelLoadError;
|
||||
}
|
||||
|
||||
// Create Vosk recognizer using actual hardware sample rate
|
||||
self.vosk_recognizer = c.vosk_recognizer_new(self.vosk_model, @floatFromInt(self.alsa_capture.?.sample_rate));
|
||||
// Always create Vosk recognizer at 16kHz
|
||||
self.vosk_recognizer = c.vosk_recognizer_new(self.vosk_model, 16000.0);
|
||||
if (self.vosk_recognizer == null) {
|
||||
if (self.vosk_model) |model| {
|
||||
c.vosk_model_free(model);
|
||||
|
@ -714,6 +788,16 @@ pub const Session = struct {
|
|||
return;
|
||||
}
|
||||
|
||||
// Allocate resample buffer if hardware sample rate differs from 16kHz
|
||||
if (capture.sample_rate != 16000) {
|
||||
std.log.info("Hardware rate {d}Hz != 16kHz, enabling resampling", .{capture.sample_rate});
|
||||
self.resample_buffer = self.allocator.alloc(i16, 16000) catch {
|
||||
const error_info = ErrorInfo.init(Error.OutOfMemory, "Failed to allocate resample buffer");
|
||||
self.options.event_handler.onDetailedError(error_info);
|
||||
return;
|
||||
};
|
||||
}
|
||||
|
||||
// Reset retry count for audio reading
|
||||
retry_count = 0;
|
||||
|
||||
|
@ -780,15 +864,26 @@ pub const Session = struct {
|
|||
const chunk_size = @min(1024, self.processing_buffer.len);
|
||||
const samples_read = capture.getAudioSamples(self.processing_buffer[0..chunk_size]);
|
||||
if (samples_read > 0) {
|
||||
// Resample if needed, otherwise use samples directly
|
||||
const samples_to_write = if (self.resample_buffer) |resample_buf| blk: {
|
||||
const resampled_count = resample(
|
||||
self.processing_buffer[0..samples_read],
|
||||
resample_buf,
|
||||
capture.sample_rate,
|
||||
16000,
|
||||
);
|
||||
break :blk resample_buf[0..resampled_count];
|
||||
} else self.processing_buffer[0..samples_read];
|
||||
|
||||
// Send audio to Vosk processing buffer with overflow protection
|
||||
const written = self.vosk_audio_buffer.write(self.processing_buffer[0..samples_read]);
|
||||
if (written < samples_read) {
|
||||
const written = self.vosk_audio_buffer.write(samples_to_write);
|
||||
if (written < samples_to_write.len) {
|
||||
// Buffer overflow - report warning and clear buffer
|
||||
const warning = ErrorInfo.initRecoverable(Error.InternalError, "Audio buffer overflow, clearing buffer to prevent data loss", "Consider increasing buffer size if this happens frequently");
|
||||
self.options.event_handler.onDetailedError(warning);
|
||||
|
||||
self.vosk_audio_buffer.clear();
|
||||
_ = self.vosk_audio_buffer.write(self.processing_buffer[0..samples_read]);
|
||||
_ = self.vosk_audio_buffer.write(samples_to_write);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -808,6 +903,9 @@ pub const Session = struct {
|
|||
const vosk_chunk_size = 4096;
|
||||
const min_chunk_size = 1024; // Minimum chunk size for processing
|
||||
|
||||
const cpu_perf = getCpuPerformance() catch 100;
|
||||
if (cpu_perf < 50)
|
||||
std.log.debug("processing thread additional delay being added", .{});
|
||||
var vosk_buffer = self.allocator.alloc(i16, vosk_chunk_size) catch {
|
||||
const error_info = ErrorInfo.init(Error.OutOfMemory, "Failed to allocate Vosk processing buffer");
|
||||
self.options.event_handler.onDetailedError(error_info);
|
||||
|
@ -832,6 +930,10 @@ pub const Session = struct {
|
|||
const samples_read = self.vosk_audio_buffer.read(vosk_buffer[0..chunk_size]);
|
||||
|
||||
if (samples_read > 0 and self.vosk_recognizer != null) {
|
||||
// Time the Vosk processing to identify bottlenecks
|
||||
const start_time = std.time.nanoTimestamp();
|
||||
const buffer_fill = (self.vosk_audio_buffer.available() * 100) / self.vosk_audio_buffer.buffer.len;
|
||||
|
||||
// Process audio with Vosk with comprehensive error handling
|
||||
self.processVoskAudio(vosk_buffer[0..samples_read]) catch |err| {
|
||||
error_count += 1;
|
||||
|
@ -873,6 +975,13 @@ pub const Session = struct {
|
|||
continue;
|
||||
};
|
||||
|
||||
// Log timing and buffer status for diagnostics
|
||||
const end_time = std.time.nanoTimestamp();
|
||||
const processing_ms = @divTrunc(end_time - start_time, std.time.ns_per_ms);
|
||||
const realtime_ms = (samples_read * 1000) / 16000;
|
||||
if (processing_ms > realtime_ms and buffer_fill > 20)
|
||||
std.log.warn("Vosk processing behind realtime. {d} samples in {d}ms (realtime: {d}ms), buffer {d}% full", .{ samples_read, processing_ms, realtime_ms, buffer_fill });
|
||||
|
||||
// Reset error counters after successful operations
|
||||
success_count += 1;
|
||||
consecutive_failures = 0;
|
||||
|
@ -893,7 +1002,12 @@ pub const Session = struct {
|
|||
|
||||
// Increase delay if we're having errors
|
||||
const error_multiplier: u64 = if (consecutive_failures > 0) consecutive_failures + 1 else 1;
|
||||
const delay_ms = base_delay_ms * error_multiplier;
|
||||
var delay_ms = base_delay_ms * error_multiplier;
|
||||
|
||||
// Add extra delay for slower hardware (Pi) to prevent buffer overruns
|
||||
if (cpu_perf < 50) {
|
||||
delay_ms += 100; // Extra 100ms delay for Pi-class hardware
|
||||
}
|
||||
|
||||
std.Thread.sleep(delay_ms * std.time.ns_per_ms);
|
||||
}
|
||||
|
@ -1051,7 +1165,7 @@ pub const Session = struct {
|
|||
|
||||
// Reinitialize recognizer (model should still be valid)
|
||||
if (self.vosk_model) |model| {
|
||||
self.vosk_recognizer = c.vosk_recognizer_new(model, @floatFromInt(self.alsa_capture.?.sample_rate));
|
||||
self.vosk_recognizer = c.vosk_recognizer_new(model, 16000.0);
|
||||
if (self.vosk_recognizer == null) {
|
||||
const error_info = ErrorInfo.init(Error.ModelLoadError, "Failed to reinitialize Vosk recognizer");
|
||||
self.options.event_handler.onDetailedError(error_info);
|
||||
|
@ -1259,6 +1373,11 @@ pub const Session = struct {
|
|||
// Free processing buffer
|
||||
self.allocator.free(self.processing_buffer);
|
||||
|
||||
// Free resample buffer if allocated
|
||||
if (self.resample_buffer) |buf| {
|
||||
self.allocator.free(buf);
|
||||
}
|
||||
|
||||
// Clean up ALSA global configuration cache
|
||||
_ = c.snd_config_update_free_global();
|
||||
|
||||
|
@ -1589,3 +1708,24 @@ test "Session status and recovery" {
|
|||
// which can cause segmentation faults during deinit
|
||||
return error.SkipZigTest;
|
||||
}
|
||||
|
||||
test "resample function" {
    // Same sample rate: samples are copied through unchanged.
    const input = [_]i16{ 100, 200, 300, 400 };
    var output: [4]i16 = undefined;
    const count = resample(&input, &output, 16000, 16000);
    try std.testing.expectEqual(@as(usize, 4), count);
    try std.testing.expectEqualSlices(i16, &input, output[0..count]);

    // Downsampling (48kHz -> 16kHz, 3:1 ratio): every third sample is kept.
    const input_48k = [_]i16{ 100, 150, 200, 250, 300, 350 };
    var output_16k: [2]i16 = undefined;
    const down_count = resample(&input_48k, &output_16k, 48000, 16000);
    try std.testing.expectEqual(@as(usize, 2), down_count);
    try std.testing.expectEqualSlices(i16, &[_]i16{ 100, 250 }, output_16k[0..down_count]);

    // The output buffer bounds the number of samples produced.
    var output_short: [1]i16 = undefined;
    const short_count = resample(&input_48k, &output_short, 48000, 16000);
    try std.testing.expectEqual(@as(usize, 1), short_count);
    try std.testing.expectEqual(@as(i16, 100), output_short[0]);

    // Upsampling (16kHz -> 48kHz, 1:3 ratio): starts at the first input
    // sample, ends at the last one, and never decreases in between for a
    // rising input.
    const input_16k = [_]i16{ 100, 200 };
    var output_48k: [6]i16 = undefined;
    const up_count = resample(&input_16k, &output_48k, 16000, 48000);
    try std.testing.expectEqual(@as(usize, 6), up_count);
    try std.testing.expectEqual(@as(i16, 100), output_48k[0]);
    try std.testing.expectEqual(@as(i16, 200), output_48k[up_count - 1]);
    for (output_48k[1..up_count], output_48k[0 .. up_count - 1]) |current, previous| {
        try std.testing.expect(current >= previous);
    }
}
|
||||
|
|
14
stt.service
Normal file
14
stt.service
Normal file
|
@ -0,0 +1,14 @@
|
|||
[Unit]
Description=Real-time Speech Recognition Service
After=sound.target

[Service]
Type=simple
WorkingDirectory=/home/stt
# stt only recognizes the --exec=<program> form, so the option and its value
# must be passed as a single argument.
ExecStart=/home/stt/.local/bin/stt --exec=/home/stt/.local/bin/pos
Restart=on-failure
RestartSec=5
User=stt

[Install]
WantedBy=multi-user.target
|
Loading…
Add table
Reference in a new issue