process audio data when in speech
All checks were successful
Generic zig build / build (push) Successful in 50s
All checks were successful
Generic zig build / build (push) Successful in 50s
This commit is contained in:
parent
5543e122ea
commit
823e10641a
1 changed file with 24 additions and 11 deletions
35
src/stt.zig
35
src/stt.zig
|
|
@@ -963,6 +963,8 @@ pub const Session = struct {
|
|||
/// must be either captured at 16kHz or resampled to 16kHz prior to calling
|
||||
/// the function
|
||||
fn processingThreadFn(self: *Session) void {
|
||||
if (self.vosk_recognizer == null) @panic("Session not initialized");
|
||||
const rec = self.vosk_recognizer.?;
|
||||
const vosk_chunk_size = 4096;
|
||||
var vosk_buffer = self.allocator.alloc(i16, vosk_chunk_size) catch {
|
||||
const error_info = ErrorInfo.init(Error.OutOfMemory, "Failed to allocate Vosk processing buffer");
|
||||
|
|
@@ -1037,7 +1039,7 @@ pub const Session = struct {
|
|||
speech_max_amplitude = 0;
|
||||
speech_min_amplitude = std.math.maxInt(u16);
|
||||
|
||||
// Copy pre-roll buffer to speech buffer
|
||||
// Copy pre-roll buffer to speech buffer and feed to Vosk
|
||||
if (preroll_filled) {
|
||||
// Copy from preroll_pos to end
|
||||
const first_part_len = preroll_samples - preroll_pos;
|
||||
|
|
@@ -1045,10 +1047,18 @@ pub const Session = struct {
|
|||
// Copy from start to preroll_pos
|
||||
@memcpy(speech_buffer[first_part_len..preroll_samples], preroll_buffer[0..preroll_pos]);
|
||||
speech_pos = preroll_samples;
|
||||
|
||||
// Feed preroll to Vosk
|
||||
const preroll_bytes = std.mem.sliceAsBytes(speech_buffer[0..preroll_samples]);
|
||||
_ = c.vosk_recognizer_accept_waveform(rec, preroll_bytes.ptr, @intCast(preroll_bytes.len));
|
||||
} else if (preroll_pos > 0) {
|
||||
// Buffer not full yet, just copy what we have
|
||||
@memcpy(speech_buffer[0..preroll_pos], preroll_buffer[0..preroll_pos]);
|
||||
speech_pos = preroll_pos;
|
||||
|
||||
// Feed preroll to Vosk
|
||||
const preroll_bytes = std.mem.sliceAsBytes(speech_buffer[0..preroll_pos]);
|
||||
_ = c.vosk_recognizer_accept_waveform(rec, preroll_bytes.ptr, @intCast(preroll_bytes.len));
|
||||
}
|
||||
}
|
||||
if (max_amplitude > speech_max_amplitude) speech_max_amplitude = max_amplitude;
|
||||
|
|
@@ -1058,12 +1068,20 @@ pub const Session = struct {
|
|||
speech_pos += copy_len;
|
||||
speech_samples += samples_read;
|
||||
silence_samples = 0;
|
||||
|
||||
// Feed audio to Vosk in real-time
|
||||
const audio_bytes = std.mem.sliceAsBytes(vosk_buffer[0..samples_read]);
|
||||
_ = c.vosk_recognizer_accept_waveform(rec, audio_bytes.ptr, @intCast(audio_bytes.len));
|
||||
} else if (in_speech) {
|
||||
silence_samples += samples_read;
|
||||
const copy_len = @min(samples_read, speech_buffer.len - speech_pos);
|
||||
@memcpy(speech_buffer[speech_pos .. speech_pos + copy_len], vosk_buffer[0..copy_len]);
|
||||
speech_pos += copy_len;
|
||||
|
||||
// Continue feeding silence to Vosk
|
||||
const audio_bytes = std.mem.sliceAsBytes(vosk_buffer[0..samples_read]);
|
||||
_ = c.vosk_recognizer_accept_waveform(rec, audio_bytes.ptr, @intCast(audio_bytes.len));
|
||||
|
||||
if (silence_samples >= silence_duration_ms * samples_per_ms) {
|
||||
if (speech_samples >= min_speech_duration_ms * samples_per_ms) {
|
||||
const event = SpeechEvent{
|
||||
|
|
@@ -1073,7 +1091,7 @@ pub const Session = struct {
|
|||
.min_amplitude = speech_min_amplitude,
|
||||
.audio_data = speech_buffer[0..speech_pos],
|
||||
};
|
||||
self.processVoskAudio(event) catch |err| {
|
||||
self.finalizeVoskAudio(event) catch |err| {
|
||||
std.log.err("Error processing speech: {}", .{err});
|
||||
};
|
||||
}
|
||||
|
|
@@ -1085,9 +1103,7 @@ pub const Session = struct {
|
|||
speech_max_amplitude = 0;
|
||||
speech_min_amplitude = std.math.maxInt(u16);
|
||||
|
||||
if (self.vosk_recognizer) |rec| {
|
||||
c.vosk_recognizer_reset(rec);
|
||||
}
|
||||
c.vosk_recognizer_reset(rec);
|
||||
}
|
||||
} else {
|
||||
// Not in speech - update pre-roll buffer
|
||||
|
|
@@ -1103,14 +1119,11 @@ pub const Session = struct {
|
|||
}
|
||||
}
|
||||
|
||||
/// Process complete audio clip with Vosk and handle results
|
||||
fn processVoskAudio(self: *Session, event: SpeechEvent) !void {
|
||||
/// Finalize Vosk processing and get result (audio already fed incrementally)
|
||||
fn finalizeVoskAudio(self: *Session, event: SpeechEvent) !void {
|
||||
if (self.vosk_recognizer == null) return Error.InvalidState;
|
||||
|
||||
const audio_bytes = std.mem.sliceAsBytes(event.audio_data);
|
||||
_ = c.vosk_recognizer_accept_waveform(self.vosk_recognizer, audio_bytes.ptr, @intCast(audio_bytes.len));
|
||||
|
||||
// Get final result
|
||||
// Get final result (audio was already fed incrementally)
|
||||
const result_cstr = c.vosk_recognizer_final_result(self.vosk_recognizer);
|
||||
if (result_cstr != null) {
|
||||
const result_str = std.mem.span(result_cstr);
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue