//! Link Grammar bindings: wraps the C link-grammar library to parse
//! English sentences into linkages and constituent trees, and to extract
//! action/object word pairs from simple imperative commands.
const builtin = @import("builtin");
|
|
const build_options = @import("build_options");
|
|
|
|
const std = @import("std");
|
|
const c = @cImport({
|
|
@cInclude("link-includes.h");
|
|
});
|
|
|
|
pub const ConstituentNode = struct {
    /// Constituent label (e.g. "S", "VP", "NP"); heap copy owned by this node.
    label: []const u8,
    /// index of start for the linkage, does not include LEFT_WALL/RIGHT_WALL
    start: usize,
    /// index of end for the linkage, does not include LEFT_WALL/RIGHT_WALL
    end: usize,
    /// First child constituent (one level deeper), or null for a leaf.
    child: ?*ConstituentNode,
    /// Next sibling constituent at the same level, or null at end of chain.
    next: ?*ConstituentNode,
    /// Allocator that created this node and its label; used by deinit.
    allocator: std.mem.Allocator,

    /// Frees this node, its label, its child subtree, and every following
    /// sibling. Sibling chains are walked iteratively (the previous version
    /// recursed once per sibling), so stack depth is bounded by the tree
    /// depth only, not by the number of constituents at one level.
    pub fn deinit(self: *ConstituentNode) void {
        var current: ?*ConstituentNode = self;
        while (current) |node| {
            const sibling = node.next;
            node.allocator.free(node.label);
            if (node.child) |child| child.deinit();
            node.allocator.destroy(node);
            current = sibling;
        }
    }
};
|
|
|
|
/// One link of a linkage: a labeled connection between two words.
/// All strings are owned copies allocated by Parser.parse and freed by
/// ParseTree.deinit.
pub const Link = struct {
    /// Word text at the left end of the link (may carry a dictionary
    /// subscript such as ".v" or ".n").
    left_word: []const u8,
    /// Word text at the right end of the link.
    right_word: []const u8,
    /// Link-grammar connector label (e.g. "MVp", "Os").
    label: []const u8,
    /// Index of the left word in ParseTree.words (index 0 is LEFT-WALL).
    left_index: u32,
    /// Index of the right word in ParseTree.words.
    right_index: u32,
};
|
|
|
|
/// Result of parsing one sentence: the linkage's word list, its links, and
/// (optionally) a constituent tree, all deep-copied out of the C library so
/// the tree outlives the C linkage. Free with deinit().
pub const ParseTree = struct {
    /// All linkage words in order, including LEFT-WALL (index 0) and
    /// RIGHT-WALL (last). Dictionary words may carry subscripts ( ".v", ".n").
    words: [][]const u8,
    /// All links of the selected linkage; see Link for ownership.
    links: []Link,
    /// Root of the constituent (phrase-structure) tree, or null if the
    /// library produced none.
    constituent_tree: ?*ConstituentNode,
    /// Allocator that owns words/links/constituent_tree; used by deinit.
    allocator: std.mem.Allocator,

    /// Releases every owned string, the words/links slices, and the
    /// constituent tree. The ParseTree value itself is not heap-allocated
    /// and is not destroyed here.
    pub fn deinit(self: *ParseTree) void {
        for (self.words) |word| {
            self.allocator.free(word);
        }
        self.allocator.free(self.words);
        for (self.links) |link| {
            self.allocator.free(link.left_word);
            self.allocator.free(link.right_word);
            self.allocator.free(link.label);
        }
        self.allocator.free(self.links);
        if (self.constituent_tree) |tree| {
            tree.deinit();
        }
    }
    /// `{f}` formatter: ASCII link diagram, numbered word list, link list,
    /// and an indented constituent tree (or a placeholder when absent).
    /// NOTE(review): printConstituentNode spells the writer type
    /// `std.Io.Writer` while this uses `std.io.Writer` — presumably aliases
    /// in the targeted Zig version, but worth unifying.
    pub fn format(self: ParseTree, writer: *std.io.Writer) std.io.Writer.Error!void {
        // Print ASCII link diagram
        try self.printLinkDiagram(writer);

        try writer.writeAll("Words: ");
        for (self.words, 0..) |word, i| {
            try writer.print("{d}: '{s}' ", .{ i, word });
        }
        try writer.print("\n\nLinks ({} total):\n", .{self.links.len});

        for (self.links, 0..) |link, i| {
            try writer.print(" [{d}] {s} --{s}--> {s}\n", .{ i, link.left_word, link.label, link.right_word });
        }

        try writer.writeAll("\nConstituent Tree:\n");
        if (self.constituent_tree) |tree| {
            try self.printConstituentNode(writer, tree, 0);
        } else {
            try writer.writeAll(" (no constituent tree)\n");
        }
    }

    /// Best-effort ASCII diagram: a line of `+---LABEL---+` spans, a line of
    /// `|` connectors, then the words. Hard limits: at most 20 words and 200
    /// columns; links falling outside are silently skipped. Endpoints are
    /// resolved by word TEXT via findWordIndex, so a repeated word always
    /// maps to its first occurrence — the diagram can be misleading for
    /// sentences with duplicate words.
    fn printLinkDiagram(self: ParseTree, writer: *std.io.Writer) !void {
        // Simple ASCII diagram - just print links above words
        const max_width = 200;
        var line_buffer: [max_width]u8 = undefined;

        // Calculate word positions
        var word_positions: [20]usize = undefined; // max 20 words
        var total_width: usize = 0;

        for (self.words, 0..) |word, i| {
            if (i >= word_positions.len) break;
            word_positions[i] = total_width;
            total_width += word.len + 1;
        }

        // Print a simple link line
        @memset(&line_buffer, ' ');

        for (self.links) |link| {
            const left_idx = self.findWordIndex(link.left_word) orelse continue;
            const right_idx = self.findWordIndex(link.right_word) orelse continue;
            if (left_idx >= word_positions.len or right_idx >= word_positions.len) continue;

            // anchor each end roughly at the middle of its word
            const left_pos = word_positions[left_idx] + link.left_word.len / 2;
            const right_pos = word_positions[right_idx] + link.right_word.len / 2;
            const start_pos = @min(left_pos, right_pos);
            const end_pos = @max(left_pos, right_pos);

            if (end_pos < max_width) {
                if (start_pos < max_width) line_buffer[start_pos] = '+';
                if (end_pos < max_width) line_buffer[end_pos] = '+';
                for (start_pos + 1..end_pos) |i| {
                    if (i < max_width) line_buffer[i] = '-';
                }

                // Add label, centered between the endpoints (overwrites dashes)
                const label_start = (start_pos + end_pos) / 2;
                if (label_start >= link.label.len / 2 and label_start + link.label.len < max_width) {
                    const label_pos = label_start - link.label.len / 2;
                    for (link.label, 0..) |ch, i| {
                        if (label_pos + i < max_width) {
                            line_buffer[label_pos + i] = ch;
                        }
                    }
                }
            }
        }

        try writer.print("{s}\n", .{line_buffer[0..@min(total_width, max_width)]});

        // Print connector line
        @memset(&line_buffer, ' ');
        for (self.links) |link| {
            const left_idx = self.findWordIndex(link.left_word) orelse continue;
            const right_idx = self.findWordIndex(link.right_word) orelse continue;
            if (left_idx >= word_positions.len or right_idx >= word_positions.len) continue;

            const left_pos = word_positions[left_idx] + link.left_word.len / 2;
            const right_pos = word_positions[right_idx] + link.right_word.len / 2;

            if (left_pos < max_width) line_buffer[left_pos] = '|';
            if (right_pos < max_width) line_buffer[right_pos] = '|';
        }

        try writer.print("{s}\n", .{line_buffer[0..@min(total_width, max_width)]});

        // Print words
        for (self.words, 0..) |word, i| {
            if (i > 0) try writer.writeAll(" ");
            try writer.print("{s}", .{word});
        }
        try writer.writeAll("\n\n");
    }

    /// Linear scan for the first word equal to `word`; null if absent.
    fn findWordIndex(self: ParseTree, word: []const u8) ?usize {
        for (self.words, 0..) |w, i| {
            if (std.mem.eql(u8, w, word)) return i;
        }
        return null;
    }
    /// Returns the first word tagged with a ".v" subscript, suffix stripped,
    /// or null when no verb-tagged word exists. The returned slice borrows
    /// from self.words — do not free it and do not outlive the tree.
    pub fn firstVerb(self: *ParseTree) ?[]const u8 {
        for (self.words) |word| {
            if (std.mem.endsWith(u8, word, ".v")) {
                return word[0 .. word.len - 2];
            }
        }
        return null;
    }

    /// Extracts the action from a command sentence whose constituent tree is
    /// shaped S -> VP -> <single-word verb tagged ".v">. If an "MV*"
    /// (modifying-verb) link leaves that verb, returns { verb, particle }
    /// (e.g. { "turn", "on" }); otherwise returns an empty slice. Caller
    /// frees the returned slice with self.allocator; the strings inside
    /// borrow from this tree's words/links and must not outlive it.
    /// Errors: NoConstituentTree, NoSubjectNounFound, VerbBeLinkageNotFound,
    /// MultiWordVerbsNotSupported, VerbNotLabeledWithDotV.
    pub fn sentenceAction(self: *ParseTree) ![][]const u8 {
        var al: std.ArrayList([]const u8) = .{};
        // toOwnedSlice empties the list on success, so this deinit is a
        // no-op on the happy path and a cleanup on error paths
        defer al.deinit(self.allocator);

        if (self.constituent_tree == null) return error.NoConstituentTree;
        var node = self.constituent_tree.?;
        // https://www.link.cs.cmu.edu/link/dict/section-S.html
        if (!std.mem.startsWith(u8, node.label, "S"))
            return error.NoSubjectNounFound;
        if (node.child == null) @panic("S node must have a child");
        node = node.child.?;

        // I'm not entirely sure this will be universally true, but we'll see
        // in real life testing
        // https://www.link.cs.cmu.edu/link/dict/section-V.html
        // https://www.link.cs.cmu.edu/link/dict/section-P.html
        if (!std.mem.eql(u8, node.label, "VP"))
            return error.VerbBeLinkageNotFound;

        if (node.child == null) @panic("VP node must have a child");
        node = node.child.?;

        // This should be our action
        if (node.start != node.end)
            return error.MultiWordVerbsNotSupported; // this should be our issue...

        const verb = self.words[node.start + 1]; // +1 due to LEFT_WALL
        if (!std.mem.endsWith(u8, verb, ".v"))
            return error.VerbNotLabeledWithDotV;

        // From this verb, we can just look at the linkage to find what to append
        for (self.links) |l| {
            // We're looking for a modifying verb, see:
            // https://www.link.cs.cmu.edu/link/dict/section-MV.html
            if (@as(usize, l.left_index) == (node.start + 1) and
                std.mem.startsWith(u8, l.label, "MV"))
            {
                // this is an modifying verb
                try al.append(self.allocator, verb[0 .. verb.len - 2]);
                // strip any trailing ".x" subscript from the particle
                try al.append(
                    self.allocator,
                    l.right_word[0 .. std.mem.lastIndexOfScalar(u8, l.right_word, '.') orelse l.right_word.len],
                );
                return al.toOwnedSlice(self.allocator);
            }
        }
        return al.toOwnedSlice(self.allocator);
    }

    /// Extracts the object of a command sentence: walks S -> VP, fast-forwards
    /// to the LAST sibling at that level (expected "PP*"), steps to the PP's
    /// second child (expected "NP"), and collects every word in that NP's span
    /// tagged ".n", suffix stripped. Caller frees the returned slice; the
    /// strings inside borrow from self.words and must not outlive the tree.
    /// Errors: NoConstituentTree, NoSubjectNounFound, VerbBeLinkageNotFound,
    /// PastParticipleLinkageNotFound, NPLinkageNotFound.
    pub fn sentenceObject(self: *ParseTree) ![][]const u8 {
        var al: std.ArrayList([]const u8) = .{};
        // harmless after toOwnedSlice (list is emptied); frees on error paths
        defer al.deinit(self.allocator);

        if (self.constituent_tree == null) return error.NoConstituentTree;
        var node = self.constituent_tree.?;
        // https://www.link.cs.cmu.edu/link/dict/section-S.html
        if (!std.mem.startsWith(u8, node.label, "S"))
            return error.NoSubjectNounFound;
        if (node.child == null) @panic("S node must have a child");
        node = node.child.?;

        // I'm not entirely sure this will be universally true, but we'll see
        // in real life testing
        // https://www.link.cs.cmu.edu/link/dict/section-V.html
        // https://www.link.cs.cmu.edu/link/dict/section-P.html
        if (!std.mem.eql(u8, node.label, "VP"))
            return error.VerbBeLinkageNotFound;

        if (node.child == null) @panic("VP node must have a child");
        node = node.child.?;

        // We need the next node, which should be our PP node
        if (node.next == null) @panic("VP node must have a child with at least two members");
        while (node.next != null) // fast forward to the very last node at this level
            node = node.next.?;

        // https://opencog.github.io/link-grammar-website/dict/section-PP.html
        if (!std.mem.startsWith(u8, node.label, "PP"))
            return error.PastParticipleLinkageNotFound;

        if (node.child == null) @panic("PP node must have a child");
        node = node.child.?;

        // At this point we should be pointing to something like "on" or "off"
        // We need the next node, which is the money shot
        if (node.next == null) @panic("PP node must have a child with two members");
        node = node.next.?;

        // N is "No", which doesn't make sense here, but there is no specific
        // NP section, so I'm not sure what we're here
        if (!std.mem.eql(u8, node.label, "NP"))
            return error.NPLinkageNotFound;
        for (node.start..node.end + 1) |i| {
            // we need to add 1 to this index so we can avoid LEFT-WALL
            const inx = i + 1;
            // we only want the nouns out of this...
            const word = self.words[inx];
            if (!std.mem.endsWith(u8, word, ".n")) continue;
            const trimmed = word[0 .. word.len - 2];
            try al.append(self.allocator, trimmed);
        }
        return al.toOwnedSlice(self.allocator);
    }

    /// Recursive pretty-printer for the constituent tree: one line per node
    /// ("LABEL [start-end]"), children indented one level, siblings at the
    /// same level.
    fn printConstituentNode(self: *const ParseTree, writer: *std.Io.Writer, node: *const ConstituentNode, depth: usize) !void {
        for (0..depth) |_| {
            try writer.writeAll(" ");
        }
        try writer.print("{s} [{}-{}]\n", .{ node.label, node.start, node.end });
        if (node.child) |child| {
            try self.printConstituentNode(writer, child, depth + 1);
        }
        if (node.next) |next| {
            try self.printConstituentNode(writer, next, depth);
        }
    }
};
|
|
|
|
/// Wrapper around the link-grammar C library. Owns a Dictionary and
/// Parse_Options for its lifetime; free with deinit(). Not thread-safe to
/// share across threads without external synchronization is not established
/// here — the C handles are mutated by parse() (null-count options).
pub const Parser = struct {
    dict: c.Dictionary,
    opts: c.Parse_Options,
    allocator: std.mem.Allocator,

    /// Creates a parser using hard-coded data paths relative to the current
    /// working directory ("../share/link/4.0.*"). Use initWithDataDir to
    /// anchor the paths elsewhere.
    /// NOTE(review): if parse_options_create fails, the already-created
    /// dictionary is not deleted (leak on this error path).
    pub fn init(allocator: std.mem.Allocator) !Parser {
        const dict = c.dictionary_create(
            @ptrCast(@constCast("../share/link/4.0.dict")),
            @ptrCast(@constCast("../share/link/4.0.knowledge")),
            @ptrCast(@constCast("../share/link/4.0.constituent-knowledge")),
            @ptrCast(@constCast("../share/link/4.0.affix")),
        );
        if (dict == null) return error.DictionaryCreationFailed;

        const opts = c.parse_options_create();
        if (opts == null) return error.ParseOptionsCreationFailed;

        setOptions(opts);

        return Parser{
            .dict = dict,
            .opts = opts,
            .allocator = allocator,
        };
    }

    /// Shared option tuning for both init paths: quiet output, up to 100
    /// linkages, disjunct cost 2, and no null-linked words allowed initially
    /// (parse() relaxes the null counts as a fallback).
    fn setOptions(opts: anytype) void {
        c.parse_options_set_verbosity(opts, 0);
        c.parse_options_set_linkage_limit(opts, 100);
        c.parse_options_set_disjunct_cost(opts, 2);
        c.parse_options_set_min_null_count(opts, 0);
        c.parse_options_set_max_null_count(opts, 0);
    }

    /// Same as init, but joins the dictionary paths under `data_dir`
    /// (still using the "../share/link" suffix). Temporary path strings are
    /// freed before returning.
    /// NOTE(review): same opts-failure dictionary leak as init.
    pub fn initWithDataDir(allocator: std.mem.Allocator, data_dir: []const u8) !Parser {
        const dict_path = try std.fs.path.join(allocator, &[_][]const u8{ data_dir, "../share/link/4.0.dict" });
        defer allocator.free(dict_path);
        const knowledge_path = try std.fs.path.join(allocator, &[_][]const u8{ data_dir, "../share/link/4.0.knowledge" });
        defer allocator.free(knowledge_path);
        const constituent_path = try std.fs.path.join(allocator, &[_][]const u8{ data_dir, "../share/link/4.0.constituent-knowledge" });
        defer allocator.free(constituent_path);
        const affix_path = try std.fs.path.join(allocator, &[_][]const u8{ data_dir, "../share/link/4.0.affix" });
        defer allocator.free(affix_path);

        // the C API needs NUL-terminated strings
        const dict_cstr = try allocator.dupeZ(u8, dict_path);
        defer allocator.free(dict_cstr);
        const knowledge_cstr = try allocator.dupeZ(u8, knowledge_path);
        defer allocator.free(knowledge_cstr);
        const constituent_cstr = try allocator.dupeZ(u8, constituent_path);
        defer allocator.free(constituent_cstr);
        const affix_cstr = try allocator.dupeZ(u8, affix_path);
        defer allocator.free(affix_cstr);

        const dict = c.dictionary_create(
            @ptrCast(@constCast(dict_cstr.ptr)),
            @ptrCast(@constCast(knowledge_cstr.ptr)),
            @ptrCast(@constCast(constituent_cstr.ptr)),
            @ptrCast(@constCast(affix_cstr.ptr)),
        );
        if (dict == null) return error.DictionaryCreationFailed;

        const opts = c.parse_options_create();
        if (opts == null) return error.ParseOptionsCreationFailed;

        setOptions(opts);

        return Parser{
            .dict = dict,
            .opts = opts,
            .allocator = allocator,
        };
    }

    /// Releases the C parse options and dictionary.
    pub fn deinit(self: *Parser) void {
        _ = c.parse_options_delete(self.opts);
        _ = c.dictionary_delete(self.dict);
    }

    /// Applies each replacement key -> value to `sentence` as a whole-word
    /// substitution (keys/values are space-padded so substrings inside words
    /// never match). Keys are applied in map order, so later keys see the
    /// result of earlier substitutions. The final text is written into
    /// `final_buf` with a trailing 0 sentinel; the returned slice points
    /// INTO final_buf and is only valid while that buffer lives.
    /// Errors: SentenceTooLong (result would exceed 1023 bytes),
    /// SentenceEmptyAfterReplacements. Keys/values are limited to 254 bytes
    /// by the stack buffers below.
    fn applyReplacements(self: *Parser, sentence: []const u8, replacements: std.StaticStringMap([]const u8), final_buf: []u8) ![]const u8 {
        _ = self; // we don't want to remove this completely, as there
        // could be a time when we need to re-parse after replacement
        const replacement_keys = replacements.keys();
        const replacement_values = replacements.values();

        var altered = sentence;

        for (replacement_keys, replacement_values) |k, v| {
            var k_buf: [256]u8 = undefined;
            var v_buf: [256]u8 = undefined;

            // add spaces on either side so we match words
            const key = try std.fmt.bufPrint(&k_buf, " {s} ", .{k});
            const value = try std.fmt.bufPrint(&v_buf, " {s} ", .{v});

            // and then we need our sentence to have a space on either side
            // so the replacement works
            var sent_buf: [1024]u8 = undefined;
            const sent = try std.fmt.bufPrint(&sent_buf, " {s} ", .{altered});
            var altered_buf: [1024]u8 = undefined;
            const altered_size = std.mem.replacementSize(
                u8,
                sent,
                key,
                value,
            );
            if (altered_size > 1023) {
                if (shouldLog())
                    std.log.err("Sentence too long (>1023): {s}", .{altered});
                return error.SentenceTooLong;
            }
            const replacement_count = std.mem.replace(
                u8,
                sent,
                key,
                value,
                &altered_buf,
            );

            if (std.mem.trimRight(u8, altered_buf[0..altered_size], " ").len == 0) {
                std.log.debug("Sentence empty after replacements", .{});
                return error.SentenceEmptyAfterReplacements;
            }
            // strip the padding spaces we added, then sentinel-terminate
            const start: usize = if (altered_buf[0] == ' ') 1 else 0;
            const last_is_space = altered_buf[altered_size - 1] == ' ';
            const end: usize = if (last_is_space) altered_size - 1 else altered_size;
            altered_buf[end] = 0; // add sentinel
            @memcpy(final_buf[start .. end + 1], altered_buf[start .. end + 1]);
            altered = final_buf[start..end :0];

            if (replacement_count > 0)
                // we have altered the deal. Pray we don't alter it further
                std.log.info("Replaced '{s}' in sentence with replacement '{s}' {d} times. Sentence now:\n\t{s}", .{
                    k,
                    v,
                    replacement_count,
                    altered,
                });
        }
        return altered;
    }

    /// Iteratively strips "null" words from the sentence and re-parses after
    /// each removal, until a full pass removes nothing. Null words are those
    /// the linkage marks with "[?]" (unknown), wraps in brackets, or splits
    /// as a contraction ("'s."); split contractions are recombined with the
    /// following word before removal. On success `tree` has been replaced by
    /// the re-parse of the final sentence, and the returned sentence slice
    /// points into `final_buf`.
    /// Errors: SentenceEmptyAfterNullRemoval, plus anything parse() returns.
    fn removeNullWords(self: *Parser, altered_sentence: []const u8, tree: *ParseTree, final_buf: []u8) !struct { sentence: []const u8, nulls_removed: usize } {
        var altered = altered_sentence;
        // seeded to 1 so the outer while loop runs at least once
        var this_pass_nulls_removed: usize = 1;
        var total_nulls_removed: usize = 0;
        var replacement_errors: usize = 0;

        // loop until a pass makes no real progress (removals net of errors)
        while (this_pass_nulls_removed - replacement_errors > 0) {
            this_pass_nulls_removed = 0;
            var last_word: ?[]const u8 = null;
            for (tree.words) |word| {
                const had_last_word = last_word != null;
                // with a pending first half of a contraction, widen what
                // counts as "null" to bracketed words and "'s." fragments
                const is_null = if (last_word == null)
                    std.mem.indexOf(u8, word, "[?]") != null
                else
                    (word[0] == '[' and word[word.len - 1] == ']') or
                        std.mem.startsWith(u8, word, "'s.");

                if (!is_null) {
                    if (last_word) |l| {
                        // We had no replacements, and this next word is unusable
                        std.log.warn("No replacements for word '{s}' and cannot combine with next word '{s}'. Continuing", .{ l, word });
                        last_word = null;
                        this_pass_nulls_removed += 1; // count as removal
                        replacement_errors += 1;
                    }
                    continue;
                }
                // we are on a null, but we might have to skip processing if
                // there was an earlier failure
                if (this_pass_nulls_removed < replacement_errors) {
                    this_pass_nulls_removed += 1; // skip and move on
                    continue;
                }
                // We need to alter this further
                const trimmed = if (std.mem.indexOf(u8, word, "[?]")) |i|
                    word[0..i]
                else if (std.mem.startsWith(u8, word, "'s."))
                    word[0..2]
                else
                    word[1 .. word.len - 1];
                var needle_buf: [256]u8 = undefined;
                var first_part: []const u8 = "";
                if (last_word) |w| {
                    if (std.mem.indexOf(u8, w, "[?]")) |i|
                        first_part = w[0..i]
                    else
                        @panic("first part of null word does not have [?]. programming error");
                }
                // space-padded so we only remove whole words
                const needle = try std.fmt.bufPrint(&needle_buf, " {s}{s} ", .{
                    first_part,
                    trimmed,
                });
                if (last_word) |w|
                    std.log.debug("last word: {s}, needle: {s}", .{ w, needle[1 .. needle.len - 1] });
                last_word = null;
                // and then we need our sentence to have a space on either side
                // so the replacement works
                var sent_buf: [1024]u8 = undefined;
                const sent = try std.fmt.bufPrint(&sent_buf, " {s} ", .{altered});
                // NOTE(review): `sent` aliases the output buffer here; this
                // relies on the replacement (" ") being no longer than the
                // needle so writes never overtake reads — confirm this is
                // guaranteed by std.mem.replace for the targeted Zig version
                const removals = std.mem.replace(
                    u8,
                    sent,
                    needle,
                    " ",
                    &sent_buf,
                );
                // each removed needle was replaced by a single space
                const len = sent.len - (removals * needle.len) + removals;
                if (std.mem.trimRight(u8, sent_buf[0..len], " ").len == 0) {
                    std.log.debug("Removed null word '{s}' in sentence {d} time(s). Sentence now empty", .{ needle[1 .. needle.len - 1], removals });
                    return error.SentenceEmptyAfterNullRemoval;
                }
                const start: usize = if (sent_buf[0] == ' ') 1 else 0;
                const last_is_space = sent_buf[len - 1] == ' ';
                const end: usize = if (last_is_space) len - 1 else len;

                if (removals == 0) {
                    if (had_last_word) {
                        // giving up
                        std.log.info("Could not find word to remove after combining with next null. Giving up", .{});
                        break;
                    }
                    // contractions are sometimes split across words in the array
                    std.log.debug("Could not find word to remove. Combining next word with this one ({s})", .{trimmed});
                    // last_word = try std.fmt.bufPrint(&last_word_buf, "{s}", .{word});
                    last_word = word; // I think this should work
                    continue;
                }
                std.log.info("Removed null word '{s}' in sentence {d} time(s). Sentence before:\n\t{s}\nafter:\n\t{s}", .{
                    needle[1 .. needle.len - 1],
                    removals,
                    altered, // this is our before...we will copy memory around just below
                    sent_buf[start..end],
                });
                sent_buf[end] = 0; // add sentinel
                @memcpy(final_buf[start .. end + 1], sent_buf[start .. end + 1]);
                altered = final_buf[start..end :0];

                // Retry parsing with the word removed
                tree.deinit();
                tree.* = self.parse(altered) catch |err| {
                    if (shouldLog())
                        std.log.err("Failed to parse altered sentence: {}\n\t{s}", .{ err, altered });
                    // continue;
                    return err;
                };
                this_pass_nulls_removed += 1;
                total_nulls_removed += 1;
                break; // we will remove these words conservatively...
            }
        }
        return .{ .sentence = altered, .nulls_removed = total_nulls_removed };
    }

    /// Parses a sentence with an attempt to "fix" the sentence, assuming
    /// the sentence is a command with an action and an object. If a valid
    /// sentence is found, it will be returned, with the guarantee that
    /// sentenceObject and sentenceAction will return non-zero results. If that
    /// condition cannot be satisfied, error.NoValidParse will be returned
    /// Pipeline: applyReplacements -> parse -> removeNullWords -> validate
    /// via sentenceAction/sentenceObject. On any error after the first
    /// successful parse the tree is freed (errdefer below); on success the
    /// caller owns the returned tree and must call deinit().
    pub fn adaptiveCommandParse(self: *Parser, sentence: []const u8, replacements: std.StaticStringMap([]const u8)) !ParseTree {
        var final_buf: [1024]u8 = undefined;

        var altered = try self.applyReplacements(sentence, replacements, &final_buf);
        var tree = self.parse(altered) catch |err| {
            if (shouldLog()) {
                if (altered.len > 0)
                    std.log.err("Failed to parse sentence: {}\n\t{s}", .{ err, altered })
                else
                    std.log.err("Sentence is empty: not parsing", .{});
            }
            return err;
        };
        const tree_ptr: ?*ParseTree = &tree;
        errdefer if (tree_ptr) |p| p.deinit();

        std.log.debug("adaptiveCommandParse (step 1 - replacements):\n\toriginal:\n\t\t{s}\n\taltered:\n\t\t{s}\n{f}", .{
            sentence, altered, tree,
        });

        const result = try self.removeNullWords(altered, &tree, &final_buf);
        altered = result.sentence;
        std.log.debug("{d} nulls removed", .{result.nulls_removed});
        // Bracketed words are "null"
        // words with [?] are "unknown"
        // If we have unknowns, I think we want to replace (or if no replacement
        // is available, strip) them. Then re-parse immediately, because we're
        // in a bad enough state that we might screw something else up
        //
        // If there are nulls, then we should walk those nulls and look for
        // replacement values. If any replacements have been performed, then
        // try re-parsing at that point.
        //
        // This might all be best done in the library itself. Pass in the
        // map of replacement words and let it churn.
        //
        // For null words, I think we can use this replacement loop
        // if (tree.hasUnknowns()) // then what?
        // {}

        // Validate that we can extract action and object before returning
        const action_words = tree.sentenceAction() catch |err| {
            // This is the first catch, so we don't want to log here as it
            // gets super noisy
            std.log.debug("Failed to extract action: {}", .{err});
            return err;
            // continue;
        };
        defer self.allocator.free(action_words);

        if (action_words.len == 0) {
            std.log.info("Failed to extract action from sentence", .{});
            return error.SentenceActionNotFound;
            // continue;
        }

        const object_words = tree.sentenceObject() catch |err| {
            if (shouldLog())
                std.log.err("Failed to extract object: {}\n", .{err});
            // continue;
            return err;
        };
        defer self.allocator.free(object_words);

        if (object_words.len == 0) {
            std.log.info("Failed to extract object from sentence", .{});
            // continue;
            return error.SentenceObjectNotFound;
        }

        return tree;
    }

    /// True unless running under `zig test` with the default (.warn) test
    /// log level — used to keep expected-failure tests quiet.
    inline fn shouldLog() bool {
        return !builtin.is_test or std.testing.log_level != .warn; // .warn is default testing log level
    }

    /// Parses `input` and deep-copies the first linkage (index 0) into a
    /// ParseTree owned by self.allocator. If a strict parse yields no
    /// linkages, retries once allowing null-linked words.
    /// NOTE(review): the fallback mutates self.opts (min/max null counts)
    /// and never restores them, so subsequent parse() calls silently allow
    /// nulls from the start.
    /// NOTE(review): on error mid-copy (a failed dupe, or constituentTree
    /// failing), previously duped words/links are leaked — no errdefer
    /// cleanup here.
    pub fn parse(self: *Parser, input: []const u8) !ParseTree {
        const c_input = try self.allocator.dupeZ(u8, input);
        defer self.allocator.free(c_input);

        const sent = c.sentence_create(c_input.ptr, self.dict);
        if (sent == null) return error.SentenceCreationFailed;
        defer c.sentence_delete(sent);

        var num_linkages = c.sentence_parse(sent, self.opts);

        // If no linkages found, try with null links allowed
        if (num_linkages == 0) {
            c.parse_options_set_min_null_count(self.opts, 1);
            c.parse_options_set_max_null_count(self.opts, @intCast(c.sentence_length(sent)));
            num_linkages = c.sentence_parse(sent, self.opts);
        }

        if (num_linkages == 0) return error.NoLinkagesFound;

        const linkage = c.linkage_create(0, sent, self.opts);
        if (linkage == null) return error.LinkageCreationFailed;
        defer c.linkage_delete(linkage);

        const num_words = c.linkage_get_num_words(linkage);
        const num_links = c.linkage_get_num_links(linkage);

        var words = try self.allocator.alloc([]const u8, @intCast(num_words));
        for (0..@intCast(num_words)) |i| {
            const word_ptr = c.linkage_get_word(linkage, @intCast(i));
            words[i] = try self.allocator.dupe(u8, std.mem.span(word_ptr));
        }

        var links = try self.allocator.alloc(Link, @intCast(num_links));
        for (0..@intCast(num_links)) |i| {
            const left = c.linkage_get_link_lword(linkage, @intCast(i));
            const right = c.linkage_get_link_rword(linkage, @intCast(i));
            const label_ptr = c.linkage_get_link_label(linkage, @intCast(i));

            const left_word_ptr = c.linkage_get_word(linkage, left);
            const right_word_ptr = c.linkage_get_word(linkage, right);

            links[i] = Link{
                .left_word = try self.allocator.dupe(u8, std.mem.span(left_word_ptr)),
                .right_word = try self.allocator.dupe(u8, std.mem.span(right_word_ptr)),
                .label = try self.allocator.dupe(u8, std.mem.span(label_ptr)),
                .left_index = @intCast(left),
                .right_index = @intCast(right),
            };
        }

        // Extract constituent tree structure
        const constituent_tree = try self.constituentTree(linkage);

        return ParseTree{
            .words = words,
            .links = links,
            .constituent_tree = constituent_tree,
            .allocator = self.allocator,
        };
    }

    /// Asks the C library for the constituent tree of `linkage`, deep-copies
    /// it into Zig-owned ConstituentNodes, and frees the C-side tree.
    /// Returns null when the library produces no tree.
    fn constituentTree(self: *Parser, linkage: c.Linkage) !?*ConstituentNode {
        // struct CNode_s {
        //   char * label;
        //   CNode * child;
        //   CNode * next;
        //   int start, end;
        // };
        const constituent_tree = c.linkage_constituent_tree(linkage);
        if (constituent_tree == null) return null;
        defer c.linkage_free_constituent_tree(constituent_tree);

        return try parseCNode(self.allocator, constituent_tree);
    }
    /// Recursively deep-copies one C CNode (and its child/next chains) into
    /// an allocator-owned ConstituentNode.
    /// NOTE(review): if copying `.next` fails after `.child` succeeded, the
    /// already-copied child subtree is leaked — the errdefers here only cover
    /// this node and its label.
    fn parseCNode(allocator: std.mem.Allocator, cnode: *c.struct_CNode_s) !*ConstituentNode {
        const node = try allocator.create(ConstituentNode);
        errdefer allocator.destroy(node);
        const label = try allocator.dupe(u8, std.mem.span(cnode.label));
        errdefer allocator.free(label);
        node.* = ConstituentNode{
            .label = label,
            .start = @intCast(cnode.start),
            .end = @intCast(cnode.end),
            .child = if (cnode.child != null) try parseCNode(allocator, cnode.child) else null,
            .next = if (cnode.next != null) try parseCNode(allocator, cnode.next) else null,
            .allocator = allocator,
        };
        return node;
    }
};
|
|
|
|
test "basic C API functionality" {
|
|
const opts = c.parse_options_create();
|
|
defer _ = c.parse_options_delete(opts);
|
|
try std.testing.expect(opts != null);
|
|
}
|
|
|
|
test "parser functionality" {
|
|
var parser = try Parser.init(std.testing.allocator);
|
|
defer parser.deinit();
|
|
|
|
var tree = try parser.parse("The cat sat on the mat");
|
|
defer tree.deinit();
|
|
|
|
// LEFT-WALL
|
|
// the
|
|
// cat.n
|
|
// sat.v
|
|
// on
|
|
// the
|
|
// mat.n
|
|
// RIGHT-WALL
|
|
try std.testing.expect(tree.words.len == 8); // 6 + LEFT_WALL / RIGHT_WALL
|
|
}
|
|
test "from website" {
|
|
const sentence = "When your back is is against the whiteboard, I'll be back to back you up";
|
|
var parser = try Parser.init(std.testing.allocator);
|
|
defer parser.deinit();
|
|
|
|
var tree = try parser.parse(sentence);
|
|
defer tree.deinit();
|
|
}
|
|
test "real usage" {
|
|
var parser = try Parser.init(std.testing.allocator);
|
|
defer parser.deinit();
|
|
|
|
var tree = try parser.parse("turn on the bedroom light");
|
|
defer tree.deinit();
|
|
|
|
try std.testing.expect(tree.words.len == 7); // 5 + LEFT_WALL / RIGHT_WALL
|
|
try std.testing.expectEqualStrings("turn", tree.firstVerb().?);
|
|
const sentence_action = try tree.sentenceAction();
|
|
defer std.testing.allocator.free(sentence_action);
|
|
try std.testing.expect(sentence_action.len == 2);
|
|
try std.testing.expectEqualStrings("turn", sentence_action[0]);
|
|
try std.testing.expectEqualStrings("on", sentence_action[1]);
|
|
const sentence_object = try tree.sentenceObject();
|
|
defer std.testing.allocator.free(sentence_object);
|
|
try std.testing.expect(sentence_object.len == 2);
|
|
try std.testing.expectEqualStrings("bedroom", sentence_object[0]);
|
|
try std.testing.expectEqualStrings("light", sentence_object[1]);
|
|
}
|
|
test "real usage - jack" {
|
|
var parser = try Parser.init(std.testing.allocator);
|
|
defer parser.deinit();
|
|
|
|
var tree = try parser.parse("turn on jack bedroom light");
|
|
defer tree.deinit();
|
|
|
|
// std.debug.print("{f}\n", .{tree});
|
|
try std.testing.expect(tree.words.len == 7); // 5 + LEFT_WALL / RIGHT_WALL
|
|
const sentence_action = try tree.sentenceAction();
|
|
defer std.testing.allocator.free(sentence_action);
|
|
try std.testing.expect(sentence_action.len == 2);
|
|
try std.testing.expectEqualStrings("turn", sentence_action[0]);
|
|
try std.testing.expectEqualStrings("on", sentence_action[1]);
|
|
|
|
const sentence_object = try tree.sentenceObject();
|
|
defer std.testing.allocator.free(sentence_object);
|
|
try std.testing.expect(sentence_object.len == 3);
|
|
try std.testing.expectEqualStrings("jack", sentence_object[0]);
|
|
try std.testing.expectEqualStrings("bedroom", sentence_object[1]);
|
|
try std.testing.expectEqualStrings("light", sentence_object[2]);
|
|
}
|
|
test "adaptiveCommandParse successful without replacements" {
|
|
var parser = try Parser.init(std.testing.allocator);
|
|
defer parser.deinit();
|
|
|
|
const replacements = std.StaticStringMap([]const u8).initComptime(.{
|
|
.{ "lake", "light" },
|
|
.{ "like", "light" },
|
|
});
|
|
|
|
const sentence = "turn on the kitchen light";
|
|
var tree = try parser.adaptiveCommandParse(sentence, replacements);
|
|
defer tree.deinit();
|
|
|
|
const action_words = try tree.sentenceAction();
|
|
defer std.testing.allocator.free(action_words);
|
|
try std.testing.expect(action_words.len == 2);
|
|
try std.testing.expectEqualStrings("turn", action_words[0]);
|
|
try std.testing.expectEqualStrings("on", action_words[1]);
|
|
|
|
const object_words = try tree.sentenceObject();
|
|
defer std.testing.allocator.free(object_words);
|
|
try std.testing.expect(object_words.len == 2);
|
|
try std.testing.expectEqualStrings("kitchen", object_words[0]);
|
|
try std.testing.expectEqualStrings("light", object_words[1]);
|
|
}
|
|
|
|
test "adaptiveCommandParse with word replacement" {
|
|
var parser = try Parser.init(std.testing.allocator);
|
|
defer parser.deinit();
|
|
|
|
const replacements = std.StaticStringMap([]const u8).initComptime(.{
|
|
.{ "lake", "light" },
|
|
.{ "like", "light" },
|
|
});
|
|
|
|
const sentence = "turn on the kitchen lake";
|
|
|
|
var tree = try parser.adaptiveCommandParse(sentence, replacements);
|
|
defer tree.deinit();
|
|
|
|
const action_words = try tree.sentenceAction();
|
|
defer std.testing.allocator.free(action_words);
|
|
try std.testing.expect(action_words.len == 2);
|
|
try std.testing.expectEqualStrings("turn", action_words[0]);
|
|
try std.testing.expectEqualStrings("on", action_words[1]);
|
|
|
|
const object_words = try tree.sentenceObject();
|
|
defer std.testing.allocator.free(object_words);
|
|
try std.testing.expect(object_words.len == 2);
|
|
try std.testing.expectEqualStrings("kitchen", object_words[0]);
|
|
try std.testing.expectEqualStrings("light", object_words[1]);
|
|
}
|
|
|
|
test "adaptiveCommandParse no valid parse" {
|
|
var parser = try Parser.init(std.testing.allocator);
|
|
defer parser.deinit();
|
|
|
|
const replacements = std.StaticStringMap([]const u8).initComptime(.{
|
|
.{ "lake", "light" },
|
|
.{ "like", "light" },
|
|
});
|
|
|
|
const sentence = "xyz abc def";
|
|
|
|
// const ll = std.testing.log_level;
|
|
// defer std.testing.log_level = ll;
|
|
// std.testing.log_level = .debug;
|
|
try std.testing.expectError(
|
|
error.SentenceEmptyAfterNullRemoval,
|
|
parser.adaptiveCommandParse(
|
|
sentence,
|
|
replacements,
|
|
),
|
|
);
|
|
}
|
|
|
|
test "adaptiveCommandParse with word replacement and null removal" {
    var parser = try Parser.init(std.testing.allocator);
    defer parser.deinit();

    // Map common mis-transcriptions back to the intended word before
    // re-parsing ("kitchen lake" -> "kitchen light").
    const replacements = std.StaticStringMap([]const u8).initComptime(.{
        .{ "lake", "light" },
        .{ "like", "light" },
    });

    // "alexa" is expected to be dropped during null-word removal and
    // "lake" rewritten to "light" (see expected object words below).
    const sentence = "alexa turn on the kitchen lake";

    var tree = try parser.adaptiveCommandParse(sentence, replacements);
    defer tree.deinit();

    const action_words = try tree.sentenceAction();
    defer std.testing.allocator.free(action_words);
    // expectEqual reports expected vs. actual on failure, unlike the
    // opaque expect(action_words.len == 2).
    try std.testing.expectEqual(2, action_words.len);
    try std.testing.expectEqualStrings("turn", action_words[0]);
    try std.testing.expectEqualStrings("on", action_words[1]);

    const object_words = try tree.sentenceObject();
    defer std.testing.allocator.free(object_words);
    try std.testing.expectEqual(2, object_words.len);
    try std.testing.expectEqualStrings("kitchen", object_words[0]);
    try std.testing.expectEqualStrings("light", object_words[1]);
}
|
|
test "applyReplacements basic replacement" {
    var parser = try Parser.init(std.testing.allocator);
    defer parser.deinit();

    // Single mapping: "lake" -> "light".
    const replacements = std.StaticStringMap([]const u8).initComptime(.{
        .{ "lake", "light" },
    });

    var scratch: [1024]u8 = undefined;
    const rewritten = try parser.applyReplacements(
        "turn on the kitchen lake",
        replacements,
        &scratch,
    );
    try std.testing.expectEqualStrings("turn on the kitchen light", rewritten);
}
|
|
|
|
test "applyReplacements multiple replacements" {
    var parser = try Parser.init(std.testing.allocator);
    defer parser.deinit();

    // Two independent mappings applied in one pass.
    const replacements = std.StaticStringMap([]const u8).initComptime(.{
        .{ "lake", "light" },
        .{ "kitchen", "bedroom" },
    });

    var scratch: [1024]u8 = undefined;
    const rewritten = try parser.applyReplacements(
        "turn on the kitchen lake",
        replacements,
        &scratch,
    );
    try std.testing.expectEqualStrings("turn on the bedroom light", rewritten);
}
|
|
|
|
test "applyReplacements empty after replacement" {
    var parser = try Parser.init(std.testing.allocator);
    defer parser.deinit();

    // Replacing the only word with an empty string leaves nothing to parse,
    // which applyReplacements must reject.
    const replacements = std.StaticStringMap([]const u8).initComptime(.{
        .{ "test", "" },
    });

    var scratch: [1024]u8 = undefined;
    const outcome = parser.applyReplacements("test", replacements, &scratch);
    try std.testing.expectError(error.SentenceEmptyAfterReplacements, outcome);
}
|
|
|
|
test "removeNullWords no nulls" {
    var parser = try Parser.init(std.testing.allocator);
    defer parser.deinit();

    const sentence = "turn on the light";
    var tree = try parser.parse(sentence);
    // Defer immediately after acquisition: the original deferred deinit only
    // after the fallible removeNullWords call, leaking the tree if it errored.
    defer tree.deinit();

    var final_buf: [1024]u8 = undefined;
    const result = try parser.removeNullWords(sentence, &tree, &final_buf);

    // A fully grammatical sentence must pass through untouched.
    try std.testing.expectEqualStrings(sentence, result.sentence);
    try std.testing.expectEqual(0, result.nulls_removed);
}
|
|
test "removeNullWords - 'i' as null word" {
    var parser = try Parser.init(std.testing.allocator);
    defer parser.deinit();

    // Sentence from unique samples that originally identified "i" as a null
    // word. Naively removing the letter "i" mangled "eighteen", which forced
    // the whole-word matching logic this test locks in.
    const sentence = "ah the next level the out wednesday october first i want alaska eighteen from seattle to boston";
    var tree = try parser.parse(sentence);
    // Defer right after parse so the tree is freed even if removeNullWords
    // errors (the original deferred this only after the try, leaking on error).
    defer tree.deinit();

    var final_buf: [1024]u8 = undefined;
    const result = try parser.removeNullWords(sentence, &tree, &final_buf);

    try std.testing.expectEqualStrings("ah the next level the out first want eighteen from to", result.sentence);
    try std.testing.expectEqual(6, result.nulls_removed);
}
|
|
|
|
test "removeNullWords - was originally crashing" {
    var parser = try Parser.init(std.testing.allocator);
    defer parser.deinit();

    // Regression input that used to crash null-word removal.
    const sentence = "for for big waiver yeah i'm be a masses";
    var tree = try parser.parse(sentence);
    // Defer right after parse so the tree is freed even if removeNullWords
    // errors (the original deferred this only after the try, leaking on error).
    defer tree.deinit();

    var final_buf: [1024]u8 = undefined;
    const result = try parser.removeNullWords(sentence, &tree, &final_buf);

    try std.testing.expectEqualStrings("for for big waiver yeah be a masses", result.sentence);
    try std.testing.expectEqual(1, result.nulls_removed);
}
|
|
|
|
test "removeNullWords - null word followed by possessive" {
    var parser = try Parser.init(std.testing.allocator);
    defer parser.deinit();

    // "tom" shows up as tom[.?] but is followed by a proper 's.p, breaking
    // the typical pattern of an unknown word directly after a null word.
    const sentence = "them sound indiscipline or doesn't have i say or okay so and so he's creating like an excel spreadsheet was tom's and records or whatever it to translate by so that's how he and he could have come conversation about";
    var tree = try parser.parse(sentence);
    // Defer right after parse so the tree is freed even if removeNullWords
    // errors (the original deferred this only after the try, leaking on error).
    defer tree.deinit();

    var final_buf: [1024]u8 = undefined;
    const result = try parser.removeNullWords(sentence, &tree, &final_buf);

    try std.testing.expectEqualStrings("them sound or doesn't have say or okay so and so he's creating like an excel was and records or whatever it to translate by so that's how he and he could have come conversation about", result.sentence);
    try std.testing.expectEqual(4, result.nulls_removed);
}
|
|
|
|
test "removeNullWords - complex and long processing time" {
    // Skip before doing any work: the original initialized (and via defer
    // tore down) the parser even when the test was about to be skipped.
    if (!build_options.long_tests) return error.SkipZigTest;

    var parser = try Parser.init(std.testing.allocator);
    defer parser.deinit();

    // This one takes a lot of processing. Also, "i" ends up as null, forcing
    // us to strip the word 'i' and not the letter 'i'.
    const sentence = "i'm i seem to be breaking the website and so i'm training to or a multi city a reservation and it's telling me i can't get from seattle to portland or on monday september twenty ninth am and at that point i was like okay he something completely wrong effect";
    var tree = try parser.parse(sentence);
    // Defer right after parse so the tree is freed even if removeNullWords
    // errors (the original deferred this only after the try, leaking on error).
    defer tree.deinit();

    var final_buf: [1024]u8 = undefined;
    const result = try parser.removeNullWords(sentence, &tree, &final_buf);

    try std.testing.expectEqualStrings("seem to be breaking the website and so training to or a city a reservation and it's telling me can't get from to or on twenty ninth am and at that point was like okay he something completely wrong effect", result.sentence);
    try std.testing.expectEqual(7, result.nulls_removed);
}
|