more resiliency based on samples from actual speech to text

2025-09-29 08:52:07 -07:00 · 2025-09-29 08:52:07 -07:00 · 83a6824320
commit 83a6824320
parent 036ddb7f49
1 changed files with 150 additions and 48 deletions
--- a/src/root.zig
+++ b/src/root.zig
@ -374,11 +374,22 @@ pub const Parser = struct {
        var altered = sentence;

        // Step 1: Replacements
-        for (replacement_keys, replacement_values) |key, value| {
+        for (replacement_keys, replacement_values) |k, v| {
+            var k_buf: [256]u8 = undefined;
+            var v_buf: [256]u8 = undefined;
+
+            // add spaces on either side so we match words
+            const key = try std.fmt.bufPrint(&k_buf, " {s} ", .{k});
+            const value = try std.fmt.bufPrint(&v_buf, " {s} ", .{v});
+
+            // and then we need our sentence to have a space on either side
+            // so the replacement works
+            var sent_buf: [1024]u8 = undefined;
+            const sent = try std.fmt.bufPrint(&sent_buf, " {s} ", .{altered});
            var altered_buf: [1024]u8 = undefined;
            const altered_size = std.mem.replacementSize(
                u8,
-                altered,
+                sent,
                key,
                value,
            );
@ -389,21 +400,28 @@ pub const Parser = struct {
            }
            const replacement_count = std.mem.replace(
                u8,
-                altered,
+                sent,
                key,
                value,
                &altered_buf,
            );

-            altered_buf[altered_size] = 0; // add sentinel
-            @memcpy(final_buf[0 .. altered_size + 1], altered_buf[0 .. altered_size + 1]);
-            altered = final_buf[0..altered_size :0];
+            if (std.mem.trimRight(u8, altered_buf[0..altered_size], " ").len == 0) {
+                std.log.debug("Sentence empty after replacements", .{});
+                return error.SentenceEmptyAfterReplacements;
+            }
+            const start: usize = if (altered_buf[0] == ' ') 1 else 0;
+            const last_is_space = altered_buf[altered_size - 1] == ' ';
+            const end: usize = if (last_is_space) altered_size - 1 else altered_size;
+            altered_buf[end] = 0; // add sentinel
+            @memcpy(final_buf[start .. end + 1], altered_buf[start .. end + 1]);
+            altered = final_buf[start..end :0];

            if (replacement_count > 0)
                // we have altered the deal. Pray we don't alter it further
                std.log.info("Replaced '{s}' in sentence with replacement '{s}' {d} times. Sentence now:\n\t{s}", .{
-                    key,
-                    value,
+                    k,
+                    v,
                    replacement_count,
                    altered,
                });
@ -428,51 +446,121 @@ pub const Parser = struct {
        });

        // Step 2: replace null words
-        var nulls_removed = true;
-        while (nulls_removed) {
-            nulls_removed = false;
+        var this_pass_nulls_removed: usize = 1;
+        var total_nulls_removed: usize = 0;
+        var replacement_errors: usize = 0;
+        while (this_pass_nulls_removed - replacement_errors > 0) {
+            this_pass_nulls_removed = 0;
+            // var last_word_buf: [256]u8 = undefined;
+            var last_word: ?[]const u8 = null;
            for (tree.words) |word| {
-                if (std.mem.indexOf(u8, word, "[?]")) |i| {
-                    var altered_buf: [1024]u8 = undefined;
-                    nulls_removed = true;
-                    // We need to alter this further
-                    const trimmed = word[0..i];
-                    const removals = std.mem.replace(
-                        u8,
-                        altered,
-                        trimmed,
-                        "",
-                        &altered_buf,
-                    );
-                    const len = altered.len - (removals * trimmed.len);
-                    altered_buf[len] = 0;
-                    @memcpy(final_buf[0 .. len + 1], altered_buf[0 .. len + 1]);
-                    altered = final_buf[0..len :0];
+                const had_last_word = last_word != null;
+                const is_null = if (last_word == null)
+                    std.mem.indexOf(u8, word, "[?]") != null
+                else
+                    (word[0] == '[' and word[word.len - 1] == ']') or
+                        std.mem.startsWith(u8, word, "'s.");

-                    if (altered.len == 0) {
-                        std.log.debug("Removed null word '{s}' in sentence {d} time(s). Sentence now empty", .{ trimmed, removals });
-                        return error.SentenceEmptyAfterNullRemoval;
+                if (!is_null) {
+                    if (last_word) |l| {
+                        // We had no replacements, and this next word is unusable
+                        std.log.warn("No replacements for word '{s}' and cannot combine with next word '{s}'. Continuing", .{ l, word });
+                        last_word = null;
+                        this_pass_nulls_removed += 1; // count as removal
+                        replacement_errors += 1;
                    }
-                    std.log.info("Removed null word '{s}' in sentence {d} time(s). Sentence now:\n\t{s}", .{
-                        trimmed,
-                        removals,
-                        altered,
-                    });
-                    // Retry parsing with the word removed
-                    tree.deinit();
-                    tree = self.parse(altered) catch |err| {
-                        tree_ptr = null;
-                        if (shouldLog())
-                            std.log.err("Failed to parse altered sentence: {}\n\t{s}", .{ err, altered });
-                        // continue;
-                        return err;
-                    };
-                    tree_ptr = &tree;
-                    break; // we will remove these words conservatively...
+                    continue;
                }
+                // we are on a null, but we might have to skip processing if
+                // there was an earlier failure
+                if (this_pass_nulls_removed < replacement_errors) {
+                    this_pass_nulls_removed += 1; // skip and move on
+                    continue;
+                }
+                // We need to alter this further
+                const trimmed = if (std.mem.indexOf(u8, word, "[?]")) |i|
+                    word[0..i]
+                else if (std.mem.startsWith(u8, word, "'s."))
+                    word[0..2]
+                else
+                    word[1 .. word.len - 1];
+                var needle_buf: [256]u8 = undefined;
+                var first_part: []const u8 = "";
+                if (last_word) |w| {
+                    if (std.mem.indexOf(u8, w, "[?]")) |i|
+                        first_part = w[0..i]
+                    else
+                        @panic("first part of null word does not have [?]. programming error");
+                }
+                const needle = try std.fmt.bufPrint(&needle_buf, " {s}{s} ", .{
+                    first_part,
+                    trimmed,
+                });
+                if (last_word) |w|
+                    std.log.debug("last word: {s}, needle: {s}", .{ w, needle[1 .. needle.len - 1] });
+                last_word = null;
+                // and then we need our sentence to have a space on either side
+                // so the replacement works
+                var sent_buf: [1024]u8 = undefined;
+                const sent = try std.fmt.bufPrint(&sent_buf, " {s} ", .{altered});
+                const removals = std.mem.replace(
+                    u8,
+                    sent,
+                    needle,
+                    " ",
+                    &sent_buf,
+                );
+                const len = sent.len - (removals * needle.len) + removals;
+                if (std.mem.trimRight(u8, sent_buf[0..len], " ").len == 0) {
+                    std.log.debug("Removed null word '{s}' in sentence {d} time(s). Sentence now empty", .{ needle[1 .. needle.len - 1], removals });
+                    return error.SentenceEmptyAfterNullRemoval;
+                }
+                const start: usize = if (sent_buf[0] == ' ') 1 else 0;
+                const last_is_space = sent_buf[len - 1] == ' ';
+                const end: usize = if (last_is_space) len - 1 else len;
+
+                if (removals == 0) {
+                    if (had_last_word) {
+                        // giving up
+                        std.log.info("Could not find word to remove after combining with next null. Giving up", .{});
+                        break;
+                    }
+                    // contractions are sometimes split across words in the array
+                    std.log.debug("Could not find word to remove. Combining next word with this one ({s})", .{trimmed});
+                    // last_word = try std.fmt.bufPrint(&last_word_buf, "{s}", .{word});
+                    last_word = word; // I think this should work
+                    continue;
+                }
+                std.log.info("Removed null word '{s}' in sentence {d} time(s). Sentence before:\n\t{s}\nafter:\n\t{s}", .{
+                    needle[1 .. needle.len - 1],
+                    removals,
+                    altered, // this is our before...we will copy memory around just below
+                    sent_buf[start..end],
+                });
+                sent_buf[end] = 0; // add sentinel
+                @memcpy(final_buf[start .. end + 1], sent_buf[start .. end + 1]);
+                altered = final_buf[start..end :0];
+
+                // Retry parsing with the word removed
+                tree.deinit();
+                tree = self.parse(altered) catch |err| {
+                    tree_ptr = null;
+                    if (shouldLog())
+                        std.log.err("Failed to parse altered sentence: {}\n\t{s}", .{ err, altered });
+                    // continue;
+                    return err;
+                };
+                tree_ptr = &tree;
+                this_pass_nulls_removed += 1;
+                total_nulls_removed += 1;
+                break; // we will remove these words conservatively...
            }
        }

+        std.log.debug("{d} nulls removed ({d} replacement errors)", .{
+            total_nulls_removed,
+            replacement_errors,
+        });
        // Bracketed words are "null"
        // words with [?] are "unknown"
        // If we have unknowns, I think we want to replace (or if no replacement
@ -494,7 +582,7 @@ pub const Parser = struct {
        const action_words = tree.sentenceAction() catch |err| {
            // This is the first catch, so we don't want to log here as it
            // gets super noisy
-            std.log.debug("Failed to extract action: {}\n", .{err});
+            std.log.debug("Failed to extract action: {}", .{err});
            return err;
            // continue;
        };
@ -759,7 +847,7 @@ test "adaptiveCommandParse no valid parse" {
    // defer std.testing.log_level = ll;
    // std.testing.log_level = .debug;
    try std.testing.expectError(
-        error.SentenceCreationFailed,
+        error.SentenceEmptyAfterNullRemoval,
        parser.adaptiveCommandParse(
            sentence,
            replacements,
@ -778,6 +866,20 @@ test "adaptiveCommandParse with word replacement and null removal" {

    const sentence = "alexa turn on the kitchen lake";

+    // These should all be unit tests for the replacement and null processing
+    // Right now that code is all in the adaptiveCommandParse but should be factored out
+    // const sentence = "ah the next level the out wednesday october first i want alaska eighteen from seattle to boston";
+    // const sentence = "for for big waiver yeah i'm be a masses";
+    //
+    // This shows up as tom[.?] but then a proper 's.p, so breaks the typical pattern of an unknown word after a null word
+    // const sentence = "them sound indiscipline or doesn't have i say or okay so and so he's creating like an excel spreadsheet was tom's and records or whatever it to translate by so that's how he and he could have come conversation about";
+    //
+    // This one takes a lot of processing. Also, "i" ends up as null, so forces us to strip the word 'i' and not the letter 'i'
+    // const sentence = "i'm i seem to be breaking the website and so i'm training to or a multi city a reservation and it's telling me i can't get from seattle to portland or on monday september twenty ninth am and at that point i was like okay he something completely wrong effect";
+
+    // const ll = std.testing.log_level;
+    // defer std.testing.log_level = ll;
+    // std.testing.log_level = .debug;
    var tree = try parser.adaptiveCommandParse(sentence, replacements);
    defer tree.deinit();