Process audio data incrementally while in speech
All checks were successful
Generic zig build / build (push) Successful in 50s

This commit is contained in:
Emil Lerch 2025-11-11 16:08:11 -08:00
parent 5543e122ea
commit 823e10641a
Signed by: lobo
GPG key ID: A7B62D657EF764F8

View file

@ -963,6 +963,8 @@ pub const Session = struct {
/// must be either captured at 16kHz or resampled to 16kHz prior to calling
/// the function
fn processingThreadFn(self: *Session) void {
if (self.vosk_recognizer == null) @panic("Session not initialized");
const rec = self.vosk_recognizer.?;
const vosk_chunk_size = 4096;
var vosk_buffer = self.allocator.alloc(i16, vosk_chunk_size) catch {
const error_info = ErrorInfo.init(Error.OutOfMemory, "Failed to allocate Vosk processing buffer");
@ -1037,7 +1039,7 @@ pub const Session = struct {
speech_max_amplitude = 0;
speech_min_amplitude = std.math.maxInt(u16);
// Copy pre-roll buffer to speech buffer
// Copy pre-roll buffer to speech buffer and feed to Vosk
if (preroll_filled) {
// Copy from preroll_pos to end
const first_part_len = preroll_samples - preroll_pos;
@ -1045,10 +1047,18 @@ pub const Session = struct {
// Copy from start to preroll_pos
@memcpy(speech_buffer[first_part_len..preroll_samples], preroll_buffer[0..preroll_pos]);
speech_pos = preroll_samples;
// Feed preroll to Vosk
const preroll_bytes = std.mem.sliceAsBytes(speech_buffer[0..preroll_samples]);
_ = c.vosk_recognizer_accept_waveform(rec, preroll_bytes.ptr, @intCast(preroll_bytes.len));
} else if (preroll_pos > 0) {
// Buffer not full yet, just copy what we have
@memcpy(speech_buffer[0..preroll_pos], preroll_buffer[0..preroll_pos]);
speech_pos = preroll_pos;
// Feed preroll to Vosk
const preroll_bytes = std.mem.sliceAsBytes(speech_buffer[0..preroll_pos]);
_ = c.vosk_recognizer_accept_waveform(rec, preroll_bytes.ptr, @intCast(preroll_bytes.len));
}
}
if (max_amplitude > speech_max_amplitude) speech_max_amplitude = max_amplitude;
@ -1058,12 +1068,20 @@ pub const Session = struct {
speech_pos += copy_len;
speech_samples += samples_read;
silence_samples = 0;
// Feed audio to Vosk in real-time
const audio_bytes = std.mem.sliceAsBytes(vosk_buffer[0..samples_read]);
_ = c.vosk_recognizer_accept_waveform(rec, audio_bytes.ptr, @intCast(audio_bytes.len));
} else if (in_speech) {
silence_samples += samples_read;
const copy_len = @min(samples_read, speech_buffer.len - speech_pos);
@memcpy(speech_buffer[speech_pos .. speech_pos + copy_len], vosk_buffer[0..copy_len]);
speech_pos += copy_len;
// Continue feeding silence to Vosk
const audio_bytes = std.mem.sliceAsBytes(vosk_buffer[0..samples_read]);
_ = c.vosk_recognizer_accept_waveform(rec, audio_bytes.ptr, @intCast(audio_bytes.len));
if (silence_samples >= silence_duration_ms * samples_per_ms) {
if (speech_samples >= min_speech_duration_ms * samples_per_ms) {
const event = SpeechEvent{
@ -1073,7 +1091,7 @@ pub const Session = struct {
.min_amplitude = speech_min_amplitude,
.audio_data = speech_buffer[0..speech_pos],
};
self.processVoskAudio(event) catch |err| {
self.finalizeVoskAudio(event) catch |err| {
std.log.err("Error processing speech: {}", .{err});
};
}
@ -1085,10 +1103,8 @@ pub const Session = struct {
speech_max_amplitude = 0;
speech_min_amplitude = std.math.maxInt(u16);
if (self.vosk_recognizer) |rec| {
c.vosk_recognizer_reset(rec);
}
}
} else {
// Not in speech - update pre-roll buffer
for (vosk_buffer[0..samples_read]) |sample| {
@ -1103,14 +1119,11 @@ pub const Session = struct {
}
}
/// Process complete audio clip with Vosk and handle results
fn processVoskAudio(self: *Session, event: SpeechEvent) !void {
/// Finalize Vosk processing and get result (audio already fed incrementally)
fn finalizeVoskAudio(self: *Session, event: SpeechEvent) !void {
if (self.vosk_recognizer == null) return Error.InvalidState;
const audio_bytes = std.mem.sliceAsBytes(event.audio_data);
_ = c.vosk_recognizer_accept_waveform(self.vosk_recognizer, audio_bytes.ptr, @intCast(audio_bytes.len));
// Get final result
// Get final result (audio was already fed incrementally)
const result_cstr = c.vosk_recognizer_final_result(self.vosk_recognizer);
if (result_cstr != null) {
const result_str = std.mem.span(result_cstr);