more resiliency based on samples from actual speech to text

This commit is contained in:
Emil Lerch 2025-09-29 08:52:07 -07:00
parent 036ddb7f49
commit 83a6824320
Signed by: lobo
GPG key ID: A7B62D657EF764F8

View file

@ -374,11 +374,22 @@ pub const Parser = struct {
var altered = sentence;
// Step 1: Replacements
for (replacement_keys, replacement_values) |key, value| {
for (replacement_keys, replacement_values) |k, v| {
var k_buf: [256]u8 = undefined;
var v_buf: [256]u8 = undefined;
// add spaces on either side so we match words
const key = try std.fmt.bufPrint(&k_buf, " {s} ", .{k});
const value = try std.fmt.bufPrint(&v_buf, " {s} ", .{v});
// and then we need our sentence to have a space on either side
// so the replacement works
var sent_buf: [1024]u8 = undefined;
const sent = try std.fmt.bufPrint(&sent_buf, " {s} ", .{altered});
var altered_buf: [1024]u8 = undefined;
const altered_size = std.mem.replacementSize(
u8,
altered,
sent,
key,
value,
);
@ -389,21 +400,28 @@ pub const Parser = struct {
}
const replacement_count = std.mem.replace(
u8,
altered,
sent,
key,
value,
&altered_buf,
);
altered_buf[altered_size] = 0; // add sentinel
@memcpy(final_buf[0 .. altered_size + 1], altered_buf[0 .. altered_size + 1]);
altered = final_buf[0..altered_size :0];
if (std.mem.trimRight(u8, altered_buf[0..altered_size], " ").len == 0) {
std.log.debug("Sentence empty after replacements", .{});
return error.SentenceEmptyAfterReplacements;
}
const start: usize = if (altered_buf[0] == ' ') 1 else 0;
const last_is_space = altered_buf[altered_size - 1] == ' ';
const end: usize = if (last_is_space) altered_size - 1 else altered_size;
altered_buf[end] = 0; // add sentinel
@memcpy(final_buf[start .. end + 1], altered_buf[start .. end + 1]);
altered = final_buf[start..end :0];
if (replacement_count > 0)
// we have altered the deal. Pray we don't alter it further
std.log.info("Replaced '{s}' in sentence with replacement '{s}' {d} times. Sentence now:\n\t{s}", .{
key,
value,
k,
v,
replacement_count,
altered,
});
@ -428,51 +446,121 @@ pub const Parser = struct {
});
// Step 2: replace null words
var nulls_removed = true;
while (nulls_removed) {
nulls_removed = false;
var this_pass_nulls_removed: usize = 1;
var total_nulls_removed: usize = 0;
var replacement_errors: usize = 0;
while (this_pass_nulls_removed - replacement_errors > 0) {
this_pass_nulls_removed = 0;
// var last_word_buf: [256]u8 = undefined;
var last_word: ?[]const u8 = null;
for (tree.words) |word| {
if (std.mem.indexOf(u8, word, "[?]")) |i| {
var altered_buf: [1024]u8 = undefined;
nulls_removed = true;
// We need to alter this further
const trimmed = word[0..i];
const removals = std.mem.replace(
u8,
altered,
trimmed,
"",
&altered_buf,
);
const len = altered.len - (removals * trimmed.len);
altered_buf[len] = 0;
@memcpy(final_buf[0 .. len + 1], altered_buf[0 .. len + 1]);
altered = final_buf[0..len :0];
const had_last_word = last_word != null;
const is_null = if (last_word == null)
std.mem.indexOf(u8, word, "[?]") != null
else
(word[0] == '[' and word[word.len - 1] == ']') or
std.mem.startsWith(u8, word, "'s.");
if (altered.len == 0) {
std.log.debug("Removed null word '{s}' in sentence {d} time(s). Sentence now empty", .{ trimmed, removals });
return error.SentenceEmptyAfterNullRemoval;
if (!is_null) {
if (last_word) |l| {
// We had no replacements, and this next word is unusable
std.log.warn("No replacements for word '{s}' and cannot combine with next word '{s}'. Continuing", .{ l, word });
last_word = null;
this_pass_nulls_removed += 1; // count as removal
replacement_errors += 1;
}
std.log.info("Removed null word '{s}' in sentence {d} time(s). Sentence now:\n\t{s}", .{
trimmed,
removals,
altered,
});
// Retry parsing with the word removed
tree.deinit();
tree = self.parse(altered) catch |err| {
tree_ptr = null;
if (shouldLog())
std.log.err("Failed to parse altered sentence: {}\n\t{s}", .{ err, altered });
// continue;
return err;
};
tree_ptr = &tree;
break; // we will remove these words conservatively...
continue;
}
// we are on a null, but we might have to skip processing if
// there was an earlier failure
if (this_pass_nulls_removed < replacement_errors) {
this_pass_nulls_removed += 1; // skip and move on
continue;
}
// We need to alter this further
const trimmed = if (std.mem.indexOf(u8, word, "[?]")) |i|
word[0..i]
else if (std.mem.startsWith(u8, word, "'s."))
word[0..2]
else
word[1 .. word.len - 1];
var needle_buf: [256]u8 = undefined;
var first_part: []const u8 = "";
if (last_word) |w| {
if (std.mem.indexOf(u8, w, "[?]")) |i|
first_part = w[0..i]
else
@panic("first part of null word does not have [?]. programming error");
}
const needle = try std.fmt.bufPrint(&needle_buf, " {s}{s} ", .{
first_part,
trimmed,
});
if (last_word) |w|
std.log.debug("last word: {s}, needle: {s}", .{ w, needle[1 .. needle.len - 1] });
last_word = null;
// and then we need our sentence to have a space on either side
// so the replacement works
var sent_buf: [1024]u8 = undefined;
const sent = try std.fmt.bufPrint(&sent_buf, " {s} ", .{altered});
const removals = std.mem.replace(
u8,
sent,
needle,
" ",
&sent_buf,
);
const len = sent.len - (removals * needle.len) + removals;
if (std.mem.trimRight(u8, sent_buf[0..len], " ").len == 0) {
std.log.debug("Removed null word '{s}' in sentence {d} time(s). Sentence now empty", .{ needle[1 .. needle.len - 1], removals });
return error.SentenceEmptyAfterNullRemoval;
}
const start: usize = if (sent_buf[0] == ' ') 1 else 0;
const last_is_space = sent_buf[len - 1] == ' ';
const end: usize = if (last_is_space) len - 1 else len;
if (removals == 0) {
if (had_last_word) {
// giving up
std.log.info("Could not find word to remove after combining with next null. Giving up", .{});
break;
}
// contractions are sometimes split across words in the array
std.log.debug("Could not find word to remove. Combining next word with this one ({s})", .{trimmed});
// last_word = try std.fmt.bufPrint(&last_word_buf, "{s}", .{word});
last_word = word; // I think this should work
continue;
}
std.log.info("Removed null word '{s}' in sentence {d} time(s). Sentence before:\n\t{s}\nafter:\n\t{s}", .{
needle[1 .. needle.len - 1],
removals,
altered, // this is our before...we will copy memory around just below
sent_buf[start..end],
});
sent_buf[end] = 0; // add sentinel
@memcpy(final_buf[start .. end + 1], sent_buf[start .. end + 1]);
altered = final_buf[start..end :0];
// Retry parsing with the word removed
tree.deinit();
tree = self.parse(altered) catch |err| {
tree_ptr = null;
if (shouldLog())
std.log.err("Failed to parse altered sentence: {}\n\t{s}", .{ err, altered });
// continue;
return err;
};
tree_ptr = &tree;
this_pass_nulls_removed += 1;
total_nulls_removed += 1;
break; // we will remove these words conservatively...
}
}
std.log.debug("{d} nulls removed ({d} replacement errors)", .{
total_nulls_removed,
replacement_errors,
});
// Bracketed words are "null"
// words with [?] are "unknown"
// If we have unknowns, I think we want to replace (or if no replacement
@ -494,7 +582,7 @@ pub const Parser = struct {
const action_words = tree.sentenceAction() catch |err| {
// This is the first catch, so we don't want to log here as it
// gets super noisy
std.log.debug("Failed to extract action: {}\n", .{err});
std.log.debug("Failed to extract action: {}", .{err});
return err;
// continue;
};
@ -759,7 +847,7 @@ test "adaptiveCommandParse no valid parse" {
// defer std.testing.log_level = ll;
// std.testing.log_level = .debug;
try std.testing.expectError(
error.SentenceCreationFailed,
error.SentenceEmptyAfterNullRemoval,
parser.adaptiveCommandParse(
sentence,
replacements,
@ -778,6 +866,20 @@ test "adaptiveCommandParse with word replacement and null removal" {
const sentence = "alexa turn on the kitchen lake";
// These should all be unit tests for the replacement and null processing
// Right now that code is all in the adaptiveCommandParse but should be factored out
// const sentence = "ah the next level the out wednesday october first i want alaska eighteen from seattle to boston";
// const sentence = "for for big waiver yeah i'm be a masses";
//
// This shows up as tom[.?] but then a proper 's.p, so breaks the typical pattern of an unknown word after a null word
// const sentence = "them sound indiscipline or doesn't have i say or okay so and so he's creating like an excel spreadsheet was tom's and records or whatever it to translate by so that's how he and he could have come conversation about";
//
// This one takes a lot of processing. Also, "i" ends up as null, so forces us to strip the word 'i' and not the letter 'i'
// const sentence = "i'm i seem to be breaking the website and so i'm training to or a multi city a reservation and it's telling me i can't get from seattle to portland or on monday september twenty ninth am and at that point i was like okay he something completely wrong effect";
// const ll = std.testing.log_level;
// defer std.testing.log_level = ll;
// std.testing.log_level = .debug;
var tree = try parser.adaptiveCommandParse(sentence, replacements);
defer tree.deinit();