more resiliency based on samples from actual speech to text
This commit is contained in:
parent
036ddb7f49
commit
83a6824320
1 changed file with 150 additions and 48 deletions
164
src/root.zig
164
src/root.zig
|
@ -374,11 +374,22 @@ pub const Parser = struct {
|
||||||
var altered = sentence;
|
var altered = sentence;
|
||||||
|
|
||||||
// Step 1: Replacements
|
// Step 1: Replacements
|
||||||
for (replacement_keys, replacement_values) |key, value| {
|
for (replacement_keys, replacement_values) |k, v| {
|
||||||
|
var k_buf: [256]u8 = undefined;
|
||||||
|
var v_buf: [256]u8 = undefined;
|
||||||
|
|
||||||
|
// add spaces on either side so we match words
|
||||||
|
const key = try std.fmt.bufPrint(&k_buf, " {s} ", .{k});
|
||||||
|
const value = try std.fmt.bufPrint(&v_buf, " {s} ", .{v});
|
||||||
|
|
||||||
|
// and then we need our sentence to have a space on either side
|
||||||
|
// so the replacement works
|
||||||
|
var sent_buf: [1024]u8 = undefined;
|
||||||
|
const sent = try std.fmt.bufPrint(&sent_buf, " {s} ", .{altered});
|
||||||
var altered_buf: [1024]u8 = undefined;
|
var altered_buf: [1024]u8 = undefined;
|
||||||
const altered_size = std.mem.replacementSize(
|
const altered_size = std.mem.replacementSize(
|
||||||
u8,
|
u8,
|
||||||
altered,
|
sent,
|
||||||
key,
|
key,
|
||||||
value,
|
value,
|
||||||
);
|
);
|
||||||
|
@ -389,21 +400,28 @@ pub const Parser = struct {
|
||||||
}
|
}
|
||||||
const replacement_count = std.mem.replace(
|
const replacement_count = std.mem.replace(
|
||||||
u8,
|
u8,
|
||||||
altered,
|
sent,
|
||||||
key,
|
key,
|
||||||
value,
|
value,
|
||||||
&altered_buf,
|
&altered_buf,
|
||||||
);
|
);
|
||||||
|
|
||||||
altered_buf[altered_size] = 0; // add sentinel
|
if (std.mem.trimRight(u8, altered_buf[0..altered_size], " ").len == 0) {
|
||||||
@memcpy(final_buf[0 .. altered_size + 1], altered_buf[0 .. altered_size + 1]);
|
std.log.debug("Sentence empty after replacements", .{});
|
||||||
altered = final_buf[0..altered_size :0];
|
return error.SentenceEmptyAfterReplacements;
|
||||||
|
}
|
||||||
|
const start: usize = if (altered_buf[0] == ' ') 1 else 0;
|
||||||
|
const last_is_space = altered_buf[altered_size - 1] == ' ';
|
||||||
|
const end: usize = if (last_is_space) altered_size - 1 else altered_size;
|
||||||
|
altered_buf[end] = 0; // add sentinel
|
||||||
|
@memcpy(final_buf[start .. end + 1], altered_buf[start .. end + 1]);
|
||||||
|
altered = final_buf[start..end :0];
|
||||||
|
|
||||||
if (replacement_count > 0)
|
if (replacement_count > 0)
|
||||||
// we have altered the deal. Pray we don't alter it further
|
// we have altered the deal. Pray we don't alter it further
|
||||||
std.log.info("Replaced '{s}' in sentence with replacement '{s}' {d} times. Sentence now:\n\t{s}", .{
|
std.log.info("Replaced '{s}' in sentence with replacement '{s}' {d} times. Sentence now:\n\t{s}", .{
|
||||||
key,
|
k,
|
||||||
value,
|
v,
|
||||||
replacement_count,
|
replacement_count,
|
||||||
altered,
|
altered,
|
||||||
});
|
});
|
||||||
|
@ -428,36 +446,101 @@ pub const Parser = struct {
|
||||||
});
|
});
|
||||||
|
|
||||||
// Step 2: replace null words
|
// Step 2: replace null words
|
||||||
var nulls_removed = true;
|
var this_pass_nulls_removed: usize = 1;
|
||||||
while (nulls_removed) {
|
var total_nulls_removed: usize = 0;
|
||||||
nulls_removed = false;
|
var replacement_errors: usize = 0;
|
||||||
|
while (this_pass_nulls_removed - replacement_errors > 0) {
|
||||||
|
this_pass_nulls_removed = 0;
|
||||||
|
// var last_word_buf: [256]u8 = undefined;
|
||||||
|
var last_word: ?[]const u8 = null;
|
||||||
for (tree.words) |word| {
|
for (tree.words) |word| {
|
||||||
if (std.mem.indexOf(u8, word, "[?]")) |i| {
|
const had_last_word = last_word != null;
|
||||||
var altered_buf: [1024]u8 = undefined;
|
const is_null = if (last_word == null)
|
||||||
nulls_removed = true;
|
std.mem.indexOf(u8, word, "[?]") != null
|
||||||
|
else
|
||||||
|
(word[0] == '[' and word[word.len - 1] == ']') or
|
||||||
|
std.mem.startsWith(u8, word, "'s.");
|
||||||
|
|
||||||
|
if (!is_null) {
|
||||||
|
if (last_word) |l| {
|
||||||
|
// We had no replacements, and this next word is unusable
|
||||||
|
std.log.warn("No replacements for word '{s}' and cannot combine with next word '{s}'. Continuing", .{ l, word });
|
||||||
|
last_word = null;
|
||||||
|
this_pass_nulls_removed += 1; // count as removal
|
||||||
|
replacement_errors += 1;
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// we are on a null, but we might have to skip processing if
|
||||||
|
// there was an earlier failure
|
||||||
|
if (this_pass_nulls_removed < replacement_errors) {
|
||||||
|
this_pass_nulls_removed += 1; // skip and move on
|
||||||
|
continue;
|
||||||
|
}
|
||||||
// We need to alter this further
|
// We need to alter this further
|
||||||
const trimmed = word[0..i];
|
const trimmed = if (std.mem.indexOf(u8, word, "[?]")) |i|
|
||||||
|
word[0..i]
|
||||||
|
else if (std.mem.startsWith(u8, word, "'s."))
|
||||||
|
word[0..2]
|
||||||
|
else
|
||||||
|
word[1 .. word.len - 1];
|
||||||
|
var needle_buf: [256]u8 = undefined;
|
||||||
|
var first_part: []const u8 = "";
|
||||||
|
if (last_word) |w| {
|
||||||
|
if (std.mem.indexOf(u8, w, "[?]")) |i|
|
||||||
|
first_part = w[0..i]
|
||||||
|
else
|
||||||
|
@panic("first part of null word does not have [?]. programming error");
|
||||||
|
}
|
||||||
|
const needle = try std.fmt.bufPrint(&needle_buf, " {s}{s} ", .{
|
||||||
|
first_part,
|
||||||
|
trimmed,
|
||||||
|
});
|
||||||
|
if (last_word) |w|
|
||||||
|
std.log.debug("last word: {s}, needle: {s}", .{ w, needle[1 .. needle.len - 1] });
|
||||||
|
last_word = null;
|
||||||
|
// and then we need our sentence to have a space on either side
|
||||||
|
// so the replacement works
|
||||||
|
var sent_buf: [1024]u8 = undefined;
|
||||||
|
const sent = try std.fmt.bufPrint(&sent_buf, " {s} ", .{altered});
|
||||||
const removals = std.mem.replace(
|
const removals = std.mem.replace(
|
||||||
u8,
|
u8,
|
||||||
altered,
|
sent,
|
||||||
trimmed,
|
needle,
|
||||||
" ",
|
" ",
|
||||||
&altered_buf,
|
&sent_buf,
|
||||||
);
|
);
|
||||||
const len = altered.len - (removals * trimmed.len);
|
const len = sent.len - (removals * needle.len) + removals;
|
||||||
altered_buf[len] = 0;
|
if (std.mem.trimRight(u8, sent_buf[0..len], " ").len == 0) {
|
||||||
@memcpy(final_buf[0 .. len + 1], altered_buf[0 .. len + 1]);
|
std.log.debug("Removed null word '{s}' in sentence {d} time(s). Sentence now empty", .{ needle[1 .. needle.len - 1], removals });
|
||||||
altered = final_buf[0..len :0];
|
|
||||||
|
|
||||||
if (altered.len == 0) {
|
|
||||||
std.log.debug("Removed null word '{s}' in sentence {d} time(s). Sentence now empty", .{ trimmed, removals });
|
|
||||||
return error.SentenceEmptyAfterNullRemoval;
|
return error.SentenceEmptyAfterNullRemoval;
|
||||||
}
|
}
|
||||||
std.log.info("Removed null word '{s}' in sentence {d} time(s). Sentence now:\n\t{s}", .{
|
const start: usize = if (sent_buf[0] == ' ') 1 else 0;
|
||||||
trimmed,
|
const last_is_space = sent_buf[len - 1] == ' ';
|
||||||
|
const end: usize = if (last_is_space) len - 1 else len;
|
||||||
|
|
||||||
|
if (removals == 0) {
|
||||||
|
if (had_last_word) {
|
||||||
|
// giving up
|
||||||
|
std.log.info("Could not find word to remove after combining with next null. Giving up", .{});
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
// contractions are sometimes split across words in the array
|
||||||
|
std.log.debug("Could not find word to remove. Combining next word with this one ({s})", .{trimmed});
|
||||||
|
// last_word = try std.fmt.bufPrint(&last_word_buf, "{s}", .{word});
|
||||||
|
last_word = word; // slice borrowed from tree.words; last_word is reset to null before tree.deinit() is reached
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
std.log.info("Removed null word '{s}' in sentence {d} time(s). Sentence before:\n\t{s}\nafter:\n\t{s}", .{
|
||||||
|
needle[1 .. needle.len - 1],
|
||||||
removals,
|
removals,
|
||||||
altered,
|
altered, // this is our before...we will copy memory around just below
|
||||||
|
sent_buf[start..end],
|
||||||
});
|
});
|
||||||
|
sent_buf[end] = 0; // add sentinel
|
||||||
|
@memcpy(final_buf[start .. end + 1], sent_buf[start .. end + 1]);
|
||||||
|
altered = final_buf[start..end :0];
|
||||||
|
|
||||||
// Retry parsing with the word removed
|
// Retry parsing with the word removed
|
||||||
tree.deinit();
|
tree.deinit();
|
||||||
tree = self.parse(altered) catch |err| {
|
tree = self.parse(altered) catch |err| {
|
||||||
|
@ -468,11 +551,16 @@ pub const Parser = struct {
|
||||||
return err;
|
return err;
|
||||||
};
|
};
|
||||||
tree_ptr = &tree;
|
tree_ptr = &tree;
|
||||||
|
this_pass_nulls_removed += 1;
|
||||||
|
total_nulls_removed += 1;
|
||||||
break; // we will remove these words conservatively...
|
break; // we will remove these words conservatively...
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
|
std.log.debug("{d} nulls removed ({d} replacement errors)", .{
|
||||||
|
total_nulls_removed,
|
||||||
|
replacement_errors,
|
||||||
|
});
|
||||||
// Bracketed words are "null"
|
// Bracketed words are "null"
|
||||||
// words with [?] are "unknown"
|
// words with [?] are "unknown"
|
||||||
// If we have unknowns, I think we want to replace (or if no replacement
|
// If we have unknowns, I think we want to replace (or if no replacement
|
||||||
|
@ -494,7 +582,7 @@ pub const Parser = struct {
|
||||||
const action_words = tree.sentenceAction() catch |err| {
|
const action_words = tree.sentenceAction() catch |err| {
|
||||||
// This is the first catch, so we don't want to log here as it
|
// This is the first catch, so we don't want to log here as it
|
||||||
// gets super noisy
|
// gets super noisy
|
||||||
std.log.debug("Failed to extract action: {}\n", .{err});
|
std.log.debug("Failed to extract action: {}", .{err});
|
||||||
return err;
|
return err;
|
||||||
// continue;
|
// continue;
|
||||||
};
|
};
|
||||||
|
@ -759,7 +847,7 @@ test "adaptiveCommandParse no valid parse" {
|
||||||
// defer std.testing.log_level = ll;
|
// defer std.testing.log_level = ll;
|
||||||
// std.testing.log_level = .debug;
|
// std.testing.log_level = .debug;
|
||||||
try std.testing.expectError(
|
try std.testing.expectError(
|
||||||
error.SentenceCreationFailed,
|
error.SentenceEmptyAfterNullRemoval,
|
||||||
parser.adaptiveCommandParse(
|
parser.adaptiveCommandParse(
|
||||||
sentence,
|
sentence,
|
||||||
replacements,
|
replacements,
|
||||||
|
@ -778,6 +866,20 @@ test "adaptiveCommandParse with word replacement and null removal" {
|
||||||
|
|
||||||
const sentence = "alexa turn on the kitchen lake";
|
const sentence = "alexa turn on the kitchen lake";
|
||||||
|
|
||||||
|
// These should all be unit tests for the replacement and null processing
|
||||||
|
// Right now that code is all in the adaptiveCommandParse but should be factored out
|
||||||
|
// const sentence = "ah the next level the out wednesday october first i want alaska eighteen from seattle to boston";
|
||||||
|
// const sentence = "for for big waiver yeah i'm be a masses";
|
||||||
|
//
|
||||||
|
// This shows up as tom[.?] but then a proper 's.p, so breaks the typical pattern of an unknown word after a null word
|
||||||
|
// const sentence = "them sound indiscipline or doesn't have i say or okay so and so he's creating like an excel spreadsheet was tom's and records or whatever it to translate by so that's how he and he could have come conversation about";
|
||||||
|
//
|
||||||
|
// This one takes a lot of processing. Also, "i" ends up as null, so forces us to strip the word 'i' and not the letter 'i'
|
||||||
|
// const sentence = "i'm i seem to be breaking the website and so i'm training to or a multi city a reservation and it's telling me i can't get from seattle to portland or on monday september twenty ninth am and at that point i was like okay he something completely wrong effect";
|
||||||
|
|
||||||
|
// const ll = std.testing.log_level;
|
||||||
|
// defer std.testing.log_level = ll;
|
||||||
|
// std.testing.log_level = .debug;
|
||||||
var tree = try parser.adaptiveCommandParse(sentence, replacements);
|
var tree = try parser.adaptiveCommandParse(sentence, replacements);
|
||||||
defer tree.deinit();
|
defer tree.deinit();
|
||||||
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue