// pos/src/root.zig (441 lines, 17 KiB, Zig)

const std = @import("std");
const c = @cImport({
@cInclude("link-includes.h");
});
/// One node of a constituent tree, stored in first-child / next-sibling form.
///
/// `child` points at the first node one level down, `next` at the following
/// node on the same level. Each node owns its `label` and was itself created
/// with `allocator`.
pub const ConstituentNode = struct {
    /// Label of this node; freed by `deinit`.
    label: []const u8,
    /// index of start for the linkage, does not include LEFT_WALL/RIGHT_WALL
    start: usize,
    /// index of end for the linkage, does not include LEFT_WALL/RIGHT_WALL
    end: usize,
    /// First node one level below this one, if any.
    child: ?*ConstituentNode,
    /// Following node on the same level, if any.
    next: ?*ConstituentNode,
    /// Allocator that owns `label` and the node itself.
    allocator: std.mem.Allocator,

    /// Frees this node, all of its descendants and all of its following
    /// siblings. The pointer must not be used afterwards.
    ///
    /// Siblings are walked in a loop so that stack depth only grows with the
    /// nesting depth of the tree, not with the length of a sibling chain.
    pub fn deinit(self: *ConstituentNode) void {
        var current: ?*ConstituentNode = self;
        while (current) |node| {
            // Read everything we still need before the node is destroyed.
            const following = node.next;
            const allocator = node.allocator;

            allocator.free(node.label);
            if (node.child) |child| child.deinit();
            allocator.destroy(node);

            current = following;
        }
    }
};
/// A single link between two words of a linkage, as copied out by `Parser.parse`.
/// The three strings are separate allocations that `ParseTree.deinit` frees.
pub const Link = struct {
/// Copy of the word found at `left_index`.
left_word: []const u8,
/// Copy of the word found at `right_index`.
right_word: []const u8,
/// Link label as reported by link-grammar (callers match on prefixes such as "MV").
label: []const u8,
/// Word index of the left end of the link; compared against positions in
/// `ParseTree.words`, where the LEFT-WALL entry shifts real words by one.
left_index: u32,
/// Word index of the right end of the link (same numbering as `left_index`).
right_index: u32,
};
pub const ParseTree = struct {
words: [][]const u8,
links: []Link,
constituent_tree: ?*ConstituentNode,
allocator: std.mem.Allocator,
/// Frees everything the tree owns: each word and the word list, the three
/// strings of each link and the link list, and the constituent tree if one
/// was built.
pub fn deinit(self: *ParseTree) void {
    const gpa = self.allocator;

    for (self.words) |owned_word| gpa.free(owned_word);
    gpa.free(self.words);

    for (self.links) |owned_link| {
        gpa.free(owned_link.left_word);
        gpa.free(owned_link.right_word);
        gpa.free(owned_link.label);
    }
    gpa.free(self.links);

    if (self.constituent_tree) |root| root.deinit();
}
/// Writes a human-readable dump of the tree: the indexed word list, every
/// link as `'left' --label--> 'right'`, and the constituent tree (or a
/// placeholder line when there is none).
///
/// The writer type is spelled `std.Io.Writer`, matching
/// `printConstituentNode`, instead of going through the lowercase `std.io`
/// name.
pub fn format(self: ParseTree, writer: *std.Io.Writer) std.Io.Writer.Error!void {
    try writer.writeAll("Words: ");
    for (self.words, 0..) |word, index| {
        try writer.print("{d}: '{s}' ", .{ index, word });
    }

    try writer.print("\n\nLinks ({} total):\n", .{self.links.len});
    for (self.links, 0..) |link, index| {
        try writer.print(" [{d}] '{s}' --{s}--> '{s}'\n", .{ index, link.left_word, link.label, link.right_word });
    }

    try writer.writeAll("\nConstituent Tree:\n");
    const root = self.constituent_tree orelse {
        try writer.writeAll(" (no constituent tree)\n");
        return;
    };
    try self.printConstituentNode(writer, root, 0);
}
/// Returns the first word tagged with a ".v" suffix, with the suffix removed,
/// or null when no word carries that tag. The result is a slice into
/// `self.words` and is not a new allocation.
pub fn firstVerb(self: *ParseTree) ?[]const u8 {
    const verb_suffix = ".v";
    for (self.words) |candidate| {
        if (!std.mem.endsWith(u8, candidate, verb_suffix)) continue;
        return candidate[0 .. candidate.len - verb_suffix.len];
    }
    return null;
}
/// Extracts the action of the sentence as `{ verb, modifier }`, e.g.
/// `{ "turn", "on" }`.
///
/// The verb is the first child of the VP node directly below the S root. The
/// modifier is the right-hand word of the first link that starts at the verb
/// and whose label begins with "MV". When the verb has no such link the
/// returned slice is empty.
/// NOTE(review): returning nothing (not even the verb) in that case looks
/// unintended; confirm with callers before changing it.
///
/// The outer slice is allocated with `self.allocator` and must be freed by
/// the caller. The strings inside are views into this tree and stay valid
/// only until `deinit`.
///
/// A tree that does not have the expected shape is reported as
/// `error.MalformedConstituentTree` instead of panicking, because the shape
/// depends on the sentence that was parsed.
pub fn sentenceAction(self: *ParseTree) ![][]const u8 {
    const root = self.constituent_tree orelse return error.NoConstituentTree;
    // https://www.link.cs.cmu.edu/link/dict/section-S.html
    if (!std.mem.startsWith(u8, root.label, "S"))
        return error.NoSubjectNounFound;

    // I'm not entirely sure this will be universally true, but we'll see
    // in real life testing
    // https://www.link.cs.cmu.edu/link/dict/section-V.html
    // https://www.link.cs.cmu.edu/link/dict/section-P.html
    const verb_phrase = root.child orelse return error.MalformedConstituentTree;
    if (!std.mem.eql(u8, verb_phrase.label, "VP"))
        return error.VerbBeLinkageNotFound;

    // This should be our action
    const verb_node = verb_phrase.child orelse return error.MalformedConstituentTree;
    if (verb_node.start != verb_node.end)
        return error.MultiWordVerbsNotSupported;

    const verb_index = verb_node.start + 1; // +1 due to LEFT_WALL
    if (verb_index >= self.words.len) return error.MalformedConstituentTree;
    const verb = self.words[verb_index];
    if (!std.mem.endsWith(u8, verb, ".v"))
        return error.VerbNotLabeledWithDotV;

    var al: std.ArrayList([]const u8) = .{};
    // Only needed on failure: toOwnedSlice hands the buffer to the caller.
    errdefer al.deinit(self.allocator);

    // From this verb, we can just look at the linkage to find what to append
    for (self.links) |l| {
        // We're looking for a modifying verb, see:
        // https://www.link.cs.cmu.edu/link/dict/section-MV.html
        if (@as(usize, l.left_index) != verb_index) continue;
        if (!std.mem.startsWith(u8, l.label, "MV")) continue;

        try al.append(self.allocator, verb[0 .. verb.len - 2]);
        // Drop everything from the last '.' on (the tag), if there is one.
        const tag_start = std.mem.lastIndexOfScalar(u8, l.right_word, '.') orelse l.right_word.len;
        try al.append(self.allocator, l.right_word[0..tag_start]);
        break; // only the first matching link is used
    }
    return al.toOwnedSlice(self.allocator);
}
/// Extracts the nouns of the sentence object, e.g. `{ "bedroom", "light" }`
/// for "turn on the bedroom light".
///
/// Walks S -> VP -> last child of VP (label must start with "PP") -> second
/// child of that node (label must be "NP"), then collects every word in the
/// NP range that is tagged ".n", with the tag removed.
///
/// The outer slice is allocated with `self.allocator` and must be freed by
/// the caller. The strings inside are views into `self.words` and stay valid
/// only until `deinit`.
///
/// A tree that does not have the expected shape is reported as
/// `error.MalformedConstituentTree` instead of panicking, because the shape
/// depends on the sentence that was parsed.
pub fn sentenceObject(self: *ParseTree) ![][]const u8 {
    const root = self.constituent_tree orelse return error.NoConstituentTree;
    // https://www.link.cs.cmu.edu/link/dict/section-S.html
    if (!std.mem.startsWith(u8, root.label, "S"))
        return error.NoSubjectNounFound;

    // I'm not entirely sure this will be universally true, but we'll see
    // in real life testing
    // https://www.link.cs.cmu.edu/link/dict/section-V.html
    // https://www.link.cs.cmu.edu/link/dict/section-P.html
    const verb_phrase = root.child orelse return error.MalformedConstituentTree;
    if (!std.mem.eql(u8, verb_phrase.label, "VP"))
        return error.VerbBeLinkageNotFound;

    // The VP needs at least two children; the very last one should be the PP node.
    const first_vp_child = verb_phrase.child orelse return error.MalformedConstituentTree;
    if (first_vp_child.next == null) return error.MalformedConstituentTree;
    var last_vp_child = first_vp_child;
    while (last_vp_child.next) |sibling| last_vp_child = sibling;

    // https://opencog.github.io/link-grammar-website/dict/section-PP.html
    // NOTE(review): as a constituent label "PP" presumably means a
    // prepositional phrase rather than the PP link type the error name
    // refers to; the error name is kept so existing callers are unaffected.
    if (!std.mem.startsWith(u8, last_vp_child.label, "PP"))
        return error.PastParticipleLinkageNotFound;

    // The first child should be something like "on" or "off"; the node after
    // it is the one we are after.
    const particle = last_vp_child.child orelse return error.MalformedConstituentTree;
    const noun_phrase = particle.next orelse return error.MalformedConstituentTree;
    if (!std.mem.eql(u8, noun_phrase.label, "NP"))
        return error.NPLinkageNotFound;

    // Node indices skip LEFT-WALL, so shift by one to index into `words`.
    const first_word = noun_phrase.start + 1;
    const last_word = noun_phrase.end + 1;
    if (first_word > last_word or last_word >= self.words.len)
        return error.MalformedConstituentTree;

    var al: std.ArrayList([]const u8) = .{};
    // Only needed on failure: toOwnedSlice hands the buffer to the caller.
    errdefer al.deinit(self.allocator);

    for (self.words[first_word .. last_word + 1]) |word| {
        // we only want the nouns out of this...
        if (!std.mem.endsWith(u8, word, ".n")) continue;
        try al.append(self.allocator, word[0 .. word.len - 2]);
    }
    return al.toOwnedSlice(self.allocator);
}
/// Prints `node`, its subtree and all of its following siblings, one node per
/// line as `[label] (start-end)`. Each line is prefixed with one space per
/// level of `depth`; children are printed one level deeper than their parent.
fn printConstituentNode(self: *const ParseTree, writer: *std.Io.Writer, node: *const ConstituentNode, depth: usize) !void {
    var cursor: ?*const ConstituentNode = node;
    // Siblings share the same depth, so they are handled by this loop.
    while (cursor) |current| : (cursor = current.next) {
        var level: usize = 0;
        while (level < depth) : (level += 1) {
            try writer.writeAll(" ");
        }
        try writer.print("[{s}] ({}-{})\n", .{ current.label, current.start, current.end });
        if (current.child) |first_child| {
            try self.printConstituentNode(writer, first_child, depth + 1);
        }
    }
}
};
pub const Parser = struct {
dict: c.Dictionary,
opts: c.Parse_Options,
allocator: std.mem.Allocator,
/// Creates a parser from the dictionary files under the relative `data/`
/// directory and applies the default parse options.
///
/// Returns `error.DictionaryCreationFailed` or
/// `error.ParseOptionsCreationFailed` when the matching C call returns null.
/// `allocator` is stored and used for everything `parse` allocates. Release
/// the parser with `deinit`.
pub fn init(allocator: std.mem.Allocator) !Parser {
    const dict = c.dictionary_create(
        @ptrCast(@constCast("data/4.0.dict")),
        @ptrCast(@constCast("data/4.0.knowledge")),
        @ptrCast(@constCast("data/4.0.constituent-knowledge")),
        @ptrCast(@constCast("data/4.0.affix")),
    );
    if (dict == null) return error.DictionaryCreationFailed;
    // Do not leak the dictionary if creating the options fails below.
    errdefer _ = c.dictionary_delete(dict);

    const opts = c.parse_options_create();
    if (opts == null) return error.ParseOptionsCreationFailed;
    setOptions(opts);

    return Parser{
        .dict = dict,
        .opts = opts,
        .allocator = allocator,
    };
}
/// Applies the parse-option values this module uses for every parser.
fn setOptions(opts: anytype) void {
    // Null-link range: both bounds are 0 here; `parse` widens them itself
    // when it retries a sentence that produced no linkage.
    c.parse_options_set_min_null_count(opts, 0);
    c.parse_options_set_max_null_count(opts, 0);

    // Remaining settings (exact semantics are defined by link-grammar).
    c.parse_options_set_disjunct_cost(opts, 2);
    c.parse_options_set_linkage_limit(opts, 100);
    c.parse_options_set_verbosity(opts, 0);
}
/// Same as `init`, but looks for the dictionary files under
/// `<data_dir>/data/` instead of the relative `data/` directory.
///
/// The joined paths are temporary and freed before returning. Returns
/// `error.DictionaryCreationFailed` or `error.ParseOptionsCreationFailed`
/// when the matching C call returns null, or an allocation error from
/// building the paths.
pub fn initWithDataDir(allocator: std.mem.Allocator, data_dir: []const u8) !Parser {
    // joinZ yields a NUL-terminated path directly, so no second copy is
    // needed to hand the paths to C.
    const dict_path = try std.fs.path.joinZ(allocator, &[_][]const u8{ data_dir, "data/4.0.dict" });
    defer allocator.free(dict_path);
    const knowledge_path = try std.fs.path.joinZ(allocator, &[_][]const u8{ data_dir, "data/4.0.knowledge" });
    defer allocator.free(knowledge_path);
    const constituent_path = try std.fs.path.joinZ(allocator, &[_][]const u8{ data_dir, "data/4.0.constituent-knowledge" });
    defer allocator.free(constituent_path);
    const affix_path = try std.fs.path.joinZ(allocator, &[_][]const u8{ data_dir, "data/4.0.affix" });
    defer allocator.free(affix_path);

    const dict = c.dictionary_create(
        @ptrCast(@constCast(dict_path.ptr)),
        @ptrCast(@constCast(knowledge_path.ptr)),
        @ptrCast(@constCast(constituent_path.ptr)),
        @ptrCast(@constCast(affix_path.ptr)),
    );
    if (dict == null) return error.DictionaryCreationFailed;
    // Do not leak the dictionary if creating the options fails below.
    errdefer _ = c.dictionary_delete(dict);

    const opts = c.parse_options_create();
    if (opts == null) return error.ParseOptionsCreationFailed;
    setOptions(opts);

    return Parser{
        .dict = dict,
        .opts = opts,
        .allocator = allocator,
    };
}
/// Releases the link-grammar parse options and dictionary held by this parser.
/// The return values of both C calls are discarded. Trees returned earlier by
/// `parse` hold their own copies of all strings and are not touched here.
pub fn deinit(self: *Parser) void {
// NOTE(review): options are deleted before the dictionary; keep this order
// unless the C API is confirmed to be order-independent.
_ = c.parse_options_delete(self.opts);
_ = c.dictionary_delete(self.dict);
}
/// Parses `input` and copies the first linkage into a `ParseTree`.
///
/// If the sentence has no complete linkage, it is parsed a second time with
/// null links allowed (between 1 and the sentence length). The parser's
/// options are put back to the strict 0/0 setting before returning, so the
/// relaxed range does not carry over into later calls.
///
/// Errors: `SentenceCreationFailed`, `NoLinkagesFound` (also returned when
/// link-grammar reports a negative count), `LinkageCreationFailed`, or an
/// allocation failure. Nothing allocated here is leaked on an error path.
///
/// The caller owns the returned tree and must call `ParseTree.deinit`.
pub fn parse(self: *Parser, input: []const u8) !ParseTree {
    const allocator = self.allocator;

    const c_input = try allocator.dupeZ(u8, input);
    defer allocator.free(c_input);

    const sent = c.sentence_create(c_input.ptr, self.dict);
    if (sent == null) return error.SentenceCreationFailed;
    defer c.sentence_delete(sent);

    // The options object is shared by every call, so undo the fallback's
    // changes on the way out (success or error).
    var null_links_enabled = false;
    defer {
        if (null_links_enabled) {
            c.parse_options_set_min_null_count(self.opts, 0);
            c.parse_options_set_max_null_count(self.opts, 0);
        }
    }

    var num_linkages = c.sentence_parse(sent, self.opts);
    // If no linkages found, try with null links allowed
    if (num_linkages == 0) {
        null_links_enabled = true;
        c.parse_options_set_min_null_count(self.opts, 1);
        c.parse_options_set_max_null_count(self.opts, @intCast(c.sentence_length(sent)));
        num_linkages = c.sentence_parse(sent, self.opts);
    }
    if (num_linkages <= 0) return error.NoLinkagesFound;

    const linkage = c.linkage_create(0, sent, self.opts);
    if (linkage == null) return error.LinkageCreationFailed;
    defer c.linkage_delete(linkage);

    const num_words = c.linkage_get_num_words(linkage);
    const num_links = c.linkage_get_num_links(linkage);

    // Copy the words; track how many are filled so a failure part-way
    // through frees exactly what was allocated.
    const words = try allocator.alloc([]const u8, @intCast(num_words));
    var words_filled: usize = 0;
    errdefer {
        for (words[0..words_filled]) |word| allocator.free(word);
        allocator.free(words);
    }
    for (words, 0..) |*slot, i| {
        const word_ptr = c.linkage_get_word(linkage, @intCast(i));
        slot.* = try allocator.dupe(u8, std.mem.span(word_ptr));
        words_filled += 1;
    }

    // Copy the links with the same bookkeeping.
    const links = try allocator.alloc(Link, @intCast(num_links));
    var links_filled: usize = 0;
    errdefer {
        for (links[0..links_filled]) |link| {
            allocator.free(link.left_word);
            allocator.free(link.right_word);
            allocator.free(link.label);
        }
        allocator.free(links);
    }
    for (links, 0..) |*slot, i| {
        const left = c.linkage_get_link_lword(linkage, @intCast(i));
        const right = c.linkage_get_link_rword(linkage, @intCast(i));
        const label_ptr = c.linkage_get_link_label(linkage, @intCast(i));
        const left_word_ptr = c.linkage_get_word(linkage, left);
        const right_word_ptr = c.linkage_get_word(linkage, right);

        // These errdefers only cover the current, not yet counted link.
        const left_word = try allocator.dupe(u8, std.mem.span(left_word_ptr));
        errdefer allocator.free(left_word);
        const right_word = try allocator.dupe(u8, std.mem.span(right_word_ptr));
        errdefer allocator.free(right_word);
        const label = try allocator.dupe(u8, std.mem.span(label_ptr));

        slot.* = Link{
            .left_word = left_word,
            .right_word = right_word,
            .label = label,
            .left_index = @intCast(left),
            .right_index = @intCast(right),
        };
        links_filled += 1;
    }

    // Extract constituent tree structure
    const constituent_tree = try self.constituentTree(linkage);

    return ParseTree{
        .words = words,
        .links = links,
        .constituent_tree = constituent_tree,
        .allocator = allocator,
    };
}
/// Builds a Zig-owned copy of the linkage's constituent tree, or returns null
/// when link-grammar provides none. The C tree is freed before returning, on
/// both the success and the error path.
///
/// Layout of the C node being copied:
///     struct CNode_s {
///         char * label;
///         CNode * child;
///         CNode * next;
///         int start, end;
///     };
fn constituentTree(self: *Parser, linkage: c.Linkage) !?*ConstituentNode {
    const c_root = c.linkage_constituent_tree(linkage);
    if (c_root == null) {
        return null;
    }
    defer c.linkage_free_constituent_tree(c_root);

    const owned_root = try parseCNode(self.allocator, c_root);
    return owned_root;
}
/// Recursively copies `cnode`, its children and its following siblings into
/// nodes allocated with `allocator`. The label is duplicated, so the result
/// does not reference the C tree.
///
/// On failure everything copied so far is released, including a child
/// subtree that was already built when copying the `next` chain fails.
fn parseCNode(allocator: std.mem.Allocator, cnode: *c.struct_CNode_s) !*ConstituentNode {
    const node = try allocator.create(ConstituentNode);
    errdefer allocator.destroy(node);

    const label = try allocator.dupe(u8, std.mem.span(cnode.label));
    errdefer allocator.free(label);

    const child: ?*ConstituentNode = if (cnode.child != null)
        try parseCNode(allocator, cnode.child)
    else
        null;
    // `node` is not initialised yet, so the subtree has to be released
    // directly if the sibling copy below fails.
    errdefer if (child) |subtree| subtree.deinit();

    const next: ?*ConstituentNode = if (cnode.next != null)
        try parseCNode(allocator, cnode.next)
    else
        null;

    node.* = ConstituentNode{
        .label = label,
        .start = @intCast(cnode.start),
        .end = @intCast(cnode.end),
        .child = child,
        .next = next,
        .allocator = allocator,
    };
    return node;
}
};
// Smoke test: the C library is linked and can create a parse-options object.
test "basic C API functionality" {
    const options = c.parse_options_create();
    defer {
        _ = c.parse_options_delete(options);
    }
    try std.testing.expect(options != null);
}
// Parses a simple sentence and checks the number of words in the linkage.
test "parser functionality" {
    var parser = try Parser.init(std.testing.allocator);
    defer parser.deinit();

    var tree = try parser.parse("The cat sat on the mat");
    defer tree.deinit();

    // Expected word list:
    //   LEFT-WALL, the, cat.n, sat.v, on, the, mat.n, RIGHT-WALL
    // i.e. the 6 sentence words plus LEFT_WALL / RIGHT_WALL.
    const expected_words: usize = 8;
    try std.testing.expectEqual(expected_words, tree.words.len);
}
// Only checks that a longer sentence parses and can be released again.
// NOTE(review): the sentence contains "is is"; confirm whether the doubled
// word is intentional before changing it.
test "from website" {
    var parser = try Parser.init(std.testing.allocator);
    defer parser.deinit();

    var tree = try parser.parse("When your back is is against the whiteboard, I'll be back to back you up");
    defer tree.deinit();
}
// End-to-end check of verb, action and object extraction for a command.
test "real usage" {
    const gpa = std.testing.allocator;

    var parser = try Parser.init(gpa);
    defer parser.deinit();
    var tree = try parser.parse("turn on the bedroom light");
    defer tree.deinit();

    // 5 + LEFT_WALL / RIGHT_WALL
    try std.testing.expectEqual(@as(usize, 7), tree.words.len);
    try std.testing.expectEqualStrings("turn", tree.firstVerb().?);

    const action = try tree.sentenceAction();
    defer gpa.free(action);
    try std.testing.expectEqual(@as(usize, 2), action.len);
    try std.testing.expectEqualStrings("turn", action[0]);
    try std.testing.expectEqualStrings("on", action[1]);

    const object = try tree.sentenceObject();
    defer gpa.free(object);
    try std.testing.expectEqual(@as(usize, 2), object.len);
    try std.testing.expectEqualStrings("bedroom", object[0]);
    try std.testing.expectEqualStrings("light", object[1]);
}
// Same as "real usage", but with a name in front of the object nouns.
test "real usage - jack" {
    const gpa = std.testing.allocator;

    var parser = try Parser.init(gpa);
    defer parser.deinit();
    var tree = try parser.parse("turn on jack bedroom light");
    defer tree.deinit();
    // std.debug.print("{f}\n", .{tree});

    // 5 + LEFT_WALL / RIGHT_WALL
    try std.testing.expectEqual(@as(usize, 7), tree.words.len);

    const action = try tree.sentenceAction();
    defer gpa.free(action);
    try std.testing.expectEqual(@as(usize, 2), action.len);
    try std.testing.expectEqualStrings("turn", action[0]);
    try std.testing.expectEqualStrings("on", action[1]);

    const object = try tree.sentenceObject();
    defer gpa.free(object);
    try std.testing.expectEqual(@as(usize, 3), object.len);
    try std.testing.expectEqualStrings("jack", object[0]);
    try std.testing.expectEqualStrings("bedroom", object[1]);
    try std.testing.expectEqualStrings("light", object[2]);
}