441 lines
17 KiB
Zig
441 lines
17 KiB
Zig
const std = @import("std");
|
|
const c = @cImport({
|
|
@cInclude("link-includes.h");
|
|
});
|
|
|
|
/// One node of a constituent tree, stored in first-child / next-sibling form:
/// `child` is the first node one level down, `next` is the following node on
/// the same level. Each node owns its `label` and was itself created with
/// `allocator`.
pub const ConstituentNode = struct {
    label: []const u8,
    /// index of start for the linkage, does not include LEFT_WALL/RIGHT_WALL
    start: usize,
    /// index of end for the linkage, does not include LEFT_WALL/RIGHT_WALL
    end: usize,
    child: ?*ConstituentNode,
    next: ?*ConstituentNode,
    allocator: std.mem.Allocator,

    /// Releases this node, every sibling reachable through `next`, and every
    /// subtree reachable through `child`. The pointer (and all pointers into
    /// the tree) are invalid afterwards.
    pub fn deinit(self: *ConstituentNode) void {
        var cursor: ?*ConstituentNode = self;
        while (cursor) |current| {
            // Read the sibling link first: `current` is destroyed below.
            const following = current.next;
            if (current.child) |first_child| first_child.deinit();
            current.allocator.free(current.label);
            current.allocator.destroy(current);
            cursor = following;
        }
    }
};
|
|
|
|
/// One labelled link between two words of a linkage.
///
/// The three strings are separate heap copies; `ParseTree.deinit` frees them
/// with the tree's allocator.
pub const Link = struct {
    /// Text of the word at the left end of the link.
    left_word: []const u8,
    /// Text of the word at the right end of the link.
    right_word: []const u8,
    /// Link label as reported by the parser (e.g. a label starting with "MV").
    label: []const u8,
    /// Position of `left_word` in the linkage word list (wall entries included).
    left_index: u32,
    /// Position of `right_word` in the linkage word list (wall entries included).
    right_index: u32,
};
|
|
|
|
/// Owned result of parsing one sentence: the linkage words, the links between
/// them and, when available, the constituent tree.
///
/// Everything reachable from this struct was allocated with `allocator` and is
/// released by `deinit`.
pub const ParseTree = struct {
    /// Words of the linkage. Includes the wall entries, so index 0 is
    /// LEFT-WALL and constituent indices must be shifted by one.
    words: [][]const u8,
    links: []Link,
    constituent_tree: ?*ConstituentNode,
    allocator: std.mem.Allocator,

    /// Frees every word, every link string, both slices and the constituent
    /// tree. The tree must not be used afterwards.
    pub fn deinit(self: *ParseTree) void {
        for (self.words) |word| {
            self.allocator.free(word);
        }
        self.allocator.free(self.words);
        for (self.links) |link| {
            self.allocator.free(link.left_word);
            self.allocator.free(link.right_word);
            self.allocator.free(link.label);
        }
        self.allocator.free(self.links);
        if (self.constituent_tree) |tree| {
            tree.deinit();
        }
    }

    /// Writes a human-readable dump (words, links, constituent tree) to
    /// `writer`. Intended for the `{f}` format specifier.
    pub fn format(self: ParseTree, writer: *std.Io.Writer) std.Io.Writer.Error!void {
        try writer.writeAll("Words: ");
        for (self.words, 0..) |word, i| {
            try writer.print("{d}: '{s}' ", .{ i, word });
        }
        try writer.print("\n\nLinks ({} total):\n", .{self.links.len});

        for (self.links, 0..) |link, i| {
            try writer.print(" [{d}] '{s}' --{s}--> '{s}'\n", .{ i, link.left_word, link.label, link.right_word });
        }

        try writer.writeAll("\nConstituent Tree:\n");
        if (self.constituent_tree) |tree| {
            try self.printConstituentNode(writer, tree, 0);
        } else {
            try writer.writeAll(" (no constituent tree)\n");
        }
    }

    /// Returns the first word tagged `.v`, with the tag removed, or null when
    /// no word carries that tag. The result borrows from `words`.
    pub fn firstVerb(self: *ParseTree) ?[]const u8 {
        for (self.words) |word| {
            if (std.mem.endsWith(u8, word, ".v")) {
                return word[0 .. word.len - 2];
            }
        }
        return null;
    }

    /// Extracts the action of the sentence as `{ verb, modifier }`.
    ///
    /// The verb is the first child of the `S -> VP` path. The modifier is the
    /// right-hand word of the first link that starts at the verb and whose
    /// label begins with "MV"; both have their `.tag` suffix removed. When no
    /// such link exists the returned slice is empty.
    ///
    /// Caller owns the returned outer slice (free it with the tree's
    /// allocator). The strings inside borrow from this tree and become
    /// invalid after `deinit`.
    ///
    /// Unexpected tree shapes are reported as errors instead of panicking,
    /// because the shape depends on the sentence that was parsed.
    pub fn sentenceAction(self: *ParseTree) ![][]const u8 {
        const verb_node = try self.verbPhraseHead();

        // This should be our action
        if (verb_node.start != verb_node.end)
            return error.MultiWordVerbsNotSupported;

        const verb_index = verb_node.start + 1; // +1 due to LEFT_WALL
        if (verb_index >= self.words.len) return error.ConstituentOutOfRange;

        const verb = self.words[verb_index];
        if (!std.mem.endsWith(u8, verb, ".v"))
            return error.VerbNotLabeledWithDotV;

        var parts: std.ArrayList([]const u8) = .empty;
        defer parts.deinit(self.allocator);

        // From this verb, look at the linkage for a verb modifier, see:
        // https://www.link.cs.cmu.edu/link/dict/section-MV.html
        for (self.links) |link| {
            if (@as(usize, link.left_index) != verb_index) continue;
            if (!std.mem.startsWith(u8, link.label, "MV")) continue;

            const modifier = link.right_word;
            const stem_len = std.mem.lastIndexOfScalar(u8, modifier, '.') orelse modifier.len;
            try parts.append(self.allocator, verb[0 .. verb.len - 2]);
            try parts.append(self.allocator, modifier[0..stem_len]);
            break; // only the first matching link is used
        }
        return parts.toOwnedSlice(self.allocator);
    }

    /// Extracts the nouns of the object the action applies to.
    ///
    /// Walks `S -> VP`, takes the last child of the VP (expected to be
    /// labelled "PP..."), then the second child of that node (expected to be
    /// "NP"), and returns every word in the NP span that is tagged `.n`, with
    /// the tag removed.
    ///
    /// Caller owns the returned outer slice (free it with the tree's
    /// allocator). The strings inside borrow from this tree.
    ///
    /// Unexpected tree shapes are reported as errors instead of panicking.
    pub fn sentenceObject(self: *ParseTree) ![][]const u8 {
        var node = try self.verbPhraseHead();

        // The verb alone is not enough: we need at least one sibling after it.
        if (node.next == null) return error.VerbPhraseHasNoComplement;
        // fast forward to the very last node at this level
        while (node.next) |sibling| node = sibling;

        // NOTE(review): constituent labels look like phrase categories, so
        // "PP" here is most likely "prepositional phrase" rather than the PP
        // (past participle) link type - confirm. The error name is kept
        // unchanged for existing callers.
        if (!std.mem.startsWith(u8, node.label, "PP"))
            return error.PastParticipleLinkageNotFound;

        // First child should be something like "on" or "off"; the node after
        // it is the one we want.
        const preposition = node.child orelse return error.PrepositionalPhraseHasNoChild;
        const noun_phrase = preposition.next orelse return error.PrepositionalPhraseHasNoObject;

        // NOTE(review): "NP" is presumably the noun-phrase constituent label,
        // not a link type - confirm against the link-grammar documentation.
        if (!std.mem.eql(u8, noun_phrase.label, "NP"))
            return error.NPLinkageNotFound;

        // Constituent indices skip LEFT-WALL, so word index = index + 1.
        // Validate the whole span before touching `words`.
        if (noun_phrase.end < noun_phrase.start or noun_phrase.end + 1 >= self.words.len)
            return error.ConstituentOutOfRange;

        var nouns: std.ArrayList([]const u8) = .empty;
        defer nouns.deinit(self.allocator);

        for (noun_phrase.start + 1..noun_phrase.end + 2) |word_index| {
            // we only want the nouns out of this...
            const word = self.words[word_index];
            if (!std.mem.endsWith(u8, word, ".n")) continue;
            try nouns.append(self.allocator, word[0 .. word.len - 2]);
        }
        return nouns.toOwnedSlice(self.allocator);
    }

    /// Shared start of the action/object extraction: checks that the root is
    /// an "S..." node whose first child is "VP" and returns the first child of
    /// that VP.
    fn verbPhraseHead(self: *const ParseTree) !*ConstituentNode {
        const root = self.constituent_tree orelse return error.NoConstituentTree;
        if (!std.mem.startsWith(u8, root.label, "S"))
            return error.NoSubjectNounFound;

        const phrase = root.child orelse return error.SentenceNodeHasNoChild;

        // Not certain this holds for every sentence; real-life testing will tell.
        if (!std.mem.eql(u8, phrase.label, "VP"))
            return error.VerbBeLinkageNotFound;

        const head = phrase.child orelse return error.VerbPhraseHasNoChild;
        return head;
    }

    /// Prints `node`, its subtree (one level deeper) and then its siblings at
    /// the same depth, one node per line.
    fn printConstituentNode(self: *const ParseTree, writer: *std.Io.Writer, node: *const ConstituentNode, depth: usize) std.Io.Writer.Error!void {
        var cursor: ?*const ConstituentNode = node;
        while (cursor) |current| : (cursor = current.next) {
            for (0..depth) |_| {
                try writer.writeAll(" ");
            }
            try writer.print("[{s}] ({}-{})\n", .{ current.label, current.start, current.end });
            if (current.child) |child| {
                try self.printConstituentNode(writer, child, depth + 1);
            }
        }
    }
};
|
|
|
|
/// Wrapper around a link-grammar dictionary plus parse options.
///
/// Create with `init` or `initWithDataDir`, release with `deinit`. Trees
/// returned by `parse` are allocated with `allocator` and are independent of
/// the C-side sentence/linkage objects, which are released before `parse`
/// returns.
pub const Parser = struct {
    dict: c.Dictionary,
    opts: c.Parse_Options,
    allocator: std.mem.Allocator,

    /// Creates a parser from the relative paths `data/4.0.*`, which are
    /// handed unchanged to `dictionary_create`.
    ///
    /// Errors: `DictionaryCreationFailed`, `ParseOptionsCreationFailed`.
    pub fn init(allocator: std.mem.Allocator) !Parser {
        return create(
            allocator,
            @ptrCast(@constCast("data/4.0.dict")),
            @ptrCast(@constCast("data/4.0.knowledge")),
            @ptrCast(@constCast("data/4.0.constituent-knowledge")),
            @ptrCast(@constCast("data/4.0.affix")),
        );
    }

    /// Applies the default parse options used by every parser instance.
    fn setOptions(opts: anytype) void {
        c.parse_options_set_verbosity(opts, 0);
        c.parse_options_set_linkage_limit(opts, 100);
        c.parse_options_set_disjunct_cost(opts, 2);
        c.parse_options_set_min_null_count(opts, 0);
        c.parse_options_set_max_null_count(opts, 0);
    }

    /// Creates a parser whose data files live under `data_dir`.
    ///
    /// Note that `data_dir` is the directory that CONTAINS `data/`: the files
    /// opened are `<data_dir>/data/4.0.dict` and friends. The temporary path
    /// strings are freed before returning.
    ///
    /// Errors: `OutOfMemory`, `DictionaryCreationFailed`,
    /// `ParseOptionsCreationFailed`.
    pub fn initWithDataDir(allocator: std.mem.Allocator, data_dir: []const u8) !Parser {
        // joinZ yields a 0-terminated path directly, so no second copy is needed.
        const dict_path = try std.fs.path.joinZ(allocator, &[_][]const u8{ data_dir, "data/4.0.dict" });
        defer allocator.free(dict_path);
        const knowledge_path = try std.fs.path.joinZ(allocator, &[_][]const u8{ data_dir, "data/4.0.knowledge" });
        defer allocator.free(knowledge_path);
        const constituent_path = try std.fs.path.joinZ(allocator, &[_][]const u8{ data_dir, "data/4.0.constituent-knowledge" });
        defer allocator.free(constituent_path);
        const affix_path = try std.fs.path.joinZ(allocator, &[_][]const u8{ data_dir, "data/4.0.affix" });
        defer allocator.free(affix_path);

        return create(
            allocator,
            dict_path.ptr,
            knowledge_path.ptr,
            constituent_path.ptr,
            affix_path.ptr,
        );
    }

    /// Shared constructor: opens the dictionary, creates the parse options and
    /// applies the defaults. The dictionary is released again if creating the
    /// options fails, so no C resource leaks on the error path.
    fn create(
        allocator: std.mem.Allocator,
        dict_path: [*c]u8,
        knowledge_path: [*c]u8,
        constituent_path: [*c]u8,
        affix_path: [*c]u8,
    ) !Parser {
        const dict = c.dictionary_create(dict_path, knowledge_path, constituent_path, affix_path);
        if (dict == null) return error.DictionaryCreationFailed;
        errdefer _ = c.dictionary_delete(dict);

        const opts = c.parse_options_create();
        if (opts == null) return error.ParseOptionsCreationFailed;

        setOptions(opts);

        return Parser{
            .dict = dict,
            .opts = opts,
            .allocator = allocator,
        };
    }

    /// Releases the parse options and the dictionary.
    pub fn deinit(self: *Parser) void {
        _ = c.parse_options_delete(self.opts);
        _ = c.dictionary_delete(self.dict);
    }

    /// Parses `input` and returns an owned `ParseTree` built from the first
    /// linkage. Caller must call `ParseTree.deinit`.
    ///
    /// A strict parse (no null links) is tried first. If it yields no linkage,
    /// the parse is retried with between 1 and `sentence_length` null links
    /// allowed. The strict settings are restored before returning, so one
    /// difficult sentence does not change how later sentences are parsed.
    ///
    /// Errors: `OutOfMemory`, `SentenceCreationFailed`, `NoLinkagesFound`,
    /// `LinkageCreationFailed`. On error nothing allocated here is leaked.
    pub fn parse(self: *Parser, input: []const u8) !ParseTree {
        const c_input = try self.allocator.dupeZ(u8, input);
        defer self.allocator.free(c_input);

        const sent = c.sentence_create(c_input.ptr, self.dict);
        if (sent == null) return error.SentenceCreationFailed;
        defer c.sentence_delete(sent);

        // Registered before the linkage is created so that it runs after
        // linkage_delete (defers run last-in, first-out).
        var relaxed = false;
        defer {
            if (relaxed) {
                c.parse_options_set_min_null_count(self.opts, 0);
                c.parse_options_set_max_null_count(self.opts, 0);
            }
        }

        var num_linkages = c.sentence_parse(sent, self.opts);

        // If no linkages found, try with null links allowed
        if (num_linkages == 0) {
            relaxed = true;
            c.parse_options_set_min_null_count(self.opts, 1);
            c.parse_options_set_max_null_count(self.opts, @intCast(c.sentence_length(sent)));
            num_linkages = c.sentence_parse(sent, self.opts);
        }

        if (num_linkages == 0) return error.NoLinkagesFound;

        const linkage = c.linkage_create(0, sent, self.opts);
        if (linkage == null) return error.LinkageCreationFailed;
        defer c.linkage_delete(linkage);

        const num_words: usize = @intCast(c.linkage_get_num_words(linkage));
        const num_links: usize = @intCast(c.linkage_get_num_links(linkage));

        // Copy the words. `words_filled` tracks how many entries hold a live
        // allocation so the error path frees exactly those.
        const words = try self.allocator.alloc([]const u8, num_words);
        var words_filled: usize = 0;
        errdefer {
            for (words[0..words_filled]) |word| self.allocator.free(word);
            self.allocator.free(words);
        }
        for (words, 0..) |*slot, i| {
            const word_ptr = c.linkage_get_word(linkage, @intCast(i));
            slot.* = try self.allocator.dupe(u8, std.mem.span(word_ptr));
            words_filled = i + 1;
        }

        // Copy the links, with the same bookkeeping as for the words.
        const links = try self.allocator.alloc(Link, num_links);
        var links_filled: usize = 0;
        errdefer {
            for (links[0..links_filled]) |link| {
                self.allocator.free(link.left_word);
                self.allocator.free(link.right_word);
                self.allocator.free(link.label);
            }
            self.allocator.free(links);
        }
        for (links, 0..) |*slot, i| {
            const left = c.linkage_get_link_lword(linkage, @intCast(i));
            const right = c.linkage_get_link_rword(linkage, @intCast(i));
            const label_ptr = c.linkage_get_link_label(linkage, @intCast(i));

            const left_word_ptr = c.linkage_get_word(linkage, left);
            const right_word_ptr = c.linkage_get_word(linkage, right);

            // These errdefers cover a link that is only partially copied.
            const left_word = try self.allocator.dupe(u8, std.mem.span(left_word_ptr));
            errdefer self.allocator.free(left_word);
            const right_word = try self.allocator.dupe(u8, std.mem.span(right_word_ptr));
            errdefer self.allocator.free(right_word);
            const label = try self.allocator.dupe(u8, std.mem.span(label_ptr));

            slot.* = Link{
                .left_word = left_word,
                .right_word = right_word,
                .label = label,
                .left_index = @intCast(left),
                .right_index = @intCast(right),
            };
            links_filled = i + 1;
        }

        // Extract constituent tree structure
        const constituent_tree = try self.constituentTree(linkage);

        return ParseTree{
            .words = words,
            .links = links,
            .constituent_tree = constituent_tree,
            .allocator = self.allocator,
        };
    }

    /// Copies the linkage's constituent tree into Zig-owned nodes and frees
    /// the C tree. Returns null when the library provides no tree.
    fn constituentTree(self: *Parser, linkage: c.Linkage) !?*ConstituentNode {
        // struct CNode_s {
        //     char * label;
        //     CNode * child;
        //     CNode * next;
        //     int start, end;
        // };
        const constituent_tree = c.linkage_constituent_tree(linkage);
        if (constituent_tree == null) return null;
        defer c.linkage_free_constituent_tree(constituent_tree);

        return try parseCNode(self.allocator, constituent_tree);
    }

    /// Recursively copies `cnode`, its children and its following siblings.
    /// On allocation failure everything copied so far is released.
    fn parseCNode(allocator: std.mem.Allocator, cnode: *c.struct_CNode_s) std.mem.Allocator.Error!*ConstituentNode {
        const node = try allocator.create(ConstituentNode);
        errdefer allocator.destroy(node);
        const label = try allocator.dupe(u8, std.mem.span(cnode.label));
        errdefer allocator.free(label);

        // Build the child subtree first and guard it: if copying the sibling
        // chain fails afterwards, the finished child subtree must be freed.
        const child: ?*ConstituentNode = if (cnode.child != null) try parseCNode(allocator, cnode.child) else null;
        errdefer if (child) |subtree| subtree.deinit();

        const next: ?*ConstituentNode = if (cnode.next != null) try parseCNode(allocator, cnode.next) else null;

        node.* = ConstituentNode{
            .label = label,
            .start = @intCast(cnode.start),
            .end = @intCast(cnode.end),
            .child = child,
            .next = next,
            .allocator = allocator,
        };
        return node;
    }
};
|
|
|
|
test "basic C API functionality" {
    // Smoke test: parse options can be created and released through @cImport.
    const opts = c.parse_options_create();
    // Check the handle before registering cleanup, so a failed creation is
    // reported as a test failure and a null handle is never passed on to
    // parse_options_delete.
    try std.testing.expect(opts != null);
    defer _ = c.parse_options_delete(opts);
}
|
|
|
|
test "parser functionality" {
    var parser = try Parser.init(std.testing.allocator);
    defer parser.deinit();

    var tree = try parser.parse("The cat sat on the mat");
    defer tree.deinit();

    // Expected word list:
    //   LEFT-WALL the cat.n sat.v on the mat.n RIGHT-WALL
    // i.e. the 6 input words plus LEFT_WALL / RIGHT_WALL.
    // expectEqual prints both values on a mismatch, unlike expect(a == b).
    try std.testing.expectEqual(@as(usize, 8), tree.words.len);
}
|
|
test "from website" {
    // Smoke test only: parsing must succeed and the tree must be released
    // without leaks (std.testing.allocator checks that). The sentence,
    // including the doubled "is", is kept exactly as written.
    var parser = try Parser.init(std.testing.allocator);
    defer parser.deinit();

    var tree = try parser.parse("When your back is is against the whiteboard, I'll be back to back you up");
    defer tree.deinit();
}
|
|
test "real usage" {
    var parser = try Parser.init(std.testing.allocator);
    defer parser.deinit();

    var tree = try parser.parse("turn on the bedroom light");
    defer tree.deinit();

    // 5 input words + LEFT_WALL / RIGHT_WALL.
    // expectEqual prints both values on a mismatch, unlike expect(a == b).
    try std.testing.expectEqual(@as(usize, 7), tree.words.len);
    try std.testing.expectEqualStrings("turn", tree.firstVerb().?);

    // The outer slices are owned by the test; the strings belong to `tree`.
    const sentence_action = try tree.sentenceAction();
    defer std.testing.allocator.free(sentence_action);
    try std.testing.expectEqual(@as(usize, 2), sentence_action.len);
    try std.testing.expectEqualStrings("turn", sentence_action[0]);
    try std.testing.expectEqualStrings("on", sentence_action[1]);

    const sentence_object = try tree.sentenceObject();
    defer std.testing.allocator.free(sentence_object);
    try std.testing.expectEqual(@as(usize, 2), sentence_object.len);
    try std.testing.expectEqualStrings("bedroom", sentence_object[0]);
    try std.testing.expectEqualStrings("light", sentence_object[1]);
}
|
|
test "real usage - jack" {
    var parser = try Parser.init(std.testing.allocator);
    defer parser.deinit();

    var tree = try parser.parse("turn on jack bedroom light");
    defer tree.deinit();

    // std.debug.print("{f}\n", .{tree});
    // 5 input words + LEFT_WALL / RIGHT_WALL.
    // expectEqual prints both values on a mismatch, unlike expect(a == b).
    try std.testing.expectEqual(@as(usize, 7), tree.words.len);

    // The outer slices are owned by the test; the strings belong to `tree`.
    const sentence_action = try tree.sentenceAction();
    defer std.testing.allocator.free(sentence_action);
    try std.testing.expectEqual(@as(usize, 2), sentence_action.len);
    try std.testing.expectEqualStrings("turn", sentence_action[0]);
    try std.testing.expectEqualStrings("on", sentence_action[1]);

    const sentence_object = try tree.sentenceObject();
    defer std.testing.allocator.free(sentence_object);
    try std.testing.expectEqual(@as(usize, 3), sentence_object.len);
    try std.testing.expectEqualStrings("jack", sentence_object[0]);
    try std.testing.expectEqualStrings("bedroom", sentence_object[1]);
    try std.testing.expectEqualStrings("light", sentence_object[2]);
}
|