const std = @import("std");
const c = @cImport({
    @cInclude("link-includes.h");
});

/// One node of a constituent tree, copied out of the C library's `CNode`.
///
/// Every node is heap-allocated with `allocator`. `child` points at the first
/// nested node and `next` at the following sibling on the same level.
pub const ConstituentNode = struct {
    label: []const u8,
    /// index of start for the linkage, does not include LEFT_WALL/RIGHT_WALL
    start: usize,
    /// index of end for the linkage, does not include LEFT_WALL/RIGHT_WALL
    end: usize,
    child: ?*ConstituentNode,
    next: ?*ConstituentNode,
    allocator: std.mem.Allocator,

    /// Frees the label, every node reachable through `child` and `next`, and
    /// finally this node itself. `self` must not be used afterwards.
    pub fn deinit(self: *ConstituentNode) void {
        self.allocator.free(self.label);
        if (self.child) |child| child.deinit();
        if (self.next) |next| next.deinit();
        self.allocator.destroy(self);
    }
};

/// A single labelled link between two words of a linkage.
///
/// `left_index`/`right_index` are positions in `ParseTree.words`, i.e. they
/// are counted with LEFT-WALL included.
pub const Link = struct {
    left_word: []const u8,
    right_word: []const u8,
    label: []const u8,
    left_index: u32,
    right_index: u32,
};

/// Owned copy of one linkage: its words, its links and (optionally) its
/// constituent tree. Everything is allocated with `allocator` and released by
/// `deinit`.
pub const ParseTree = struct {
    words: [][]const u8,
    links: []Link,
    constituent_tree: ?*ConstituentNode,
    allocator: std.mem.Allocator,

    /// Releases every word, every link string, both slices and the
    /// constituent tree (when present).
    pub fn deinit(self: *ParseTree) void {
        for (self.words) |word| {
            self.allocator.free(word);
        }
        self.allocator.free(self.words);

        for (self.links) |link| {
            self.allocator.free(link.left_word);
            self.allocator.free(link.right_word);
            self.allocator.free(link.label);
        }
        self.allocator.free(self.links);

        if (self.constituent_tree) |tree| {
            tree.deinit();
        }
    }

    /// Writes a human readable dump of the words, the links and the
    /// constituent tree. Usable through the `{f}` format specifier.
    pub fn format(self: ParseTree, writer: *std.Io.Writer) std.Io.Writer.Error!void {
        try writer.writeAll("Words: ");
        for (self.words, 0..) |word, i| {
            try writer.print("{d}: '{s}' ", .{ i, word });
        }

        try writer.print("\n\nLinks ({} total):\n", .{self.links.len});
        for (self.links, 0..) |link, i| {
            try writer.print(" [{d}] '{s}' --{s}--> '{s}'\n", .{ i, link.left_word, link.label, link.right_word });
        }

        try writer.writeAll("\nConstituent Tree:\n");
        if (self.constituent_tree) |tree| {
            try self.printConstituentNode(writer, tree, 0);
        } else {
            try writer.writeAll(" (no constituent tree)\n");
        }
    }

    /// Returns the first word tagged with a `.v` suffix, with the suffix
    /// removed, or null when no word carries that tag. The result borrows
    /// from `self.words`.
    pub fn firstVerb(self: *ParseTree) ?[]const u8 {
        for (self.words) |word| {
            if (std.mem.endsWith(u8, word, ".v")) {
                return word[0 .. word.len - 2];
            }
        }
        return null;
    }

    /// Returns the action of the sentence: the verb found under S -> VP plus,
    /// when an `MV*` link starts at that verb, the word on the right side of
    /// that link with any `.x` suffix removed.
    ///
    /// The returned slice is empty when the verb has no `MV*` link. The
    /// caller owns the returned outer slice and frees it with
    /// `self.allocator`; the strings inside borrow from this tree.
    ///
    /// Errors: `NoConstituentTree`, `NoSubjectNounFound`,
    /// `VerbBeLinkageNotFound`, `MultiWordVerbsNotSupported`,
    /// `WordIndexOutOfRange`, `VerbNotLabeledWithDotV`, `OutOfMemory`.
    /// Panics when the S or VP node has no child.
    pub fn sentenceAction(self: *ParseTree) ![][]const u8 {
        // This should be our action
        const verb_node = try self.verbPhraseHead();
        if (verb_node.start != verb_node.end) return error.MultiWordVerbsNotSupported;

        // Constituent indices do not count LEFT-WALL, `words` does.
        const verb_index = verb_node.start + 1;
        // Guard the lookup: an unexpected tree must not index past `words`.
        if (verb_index >= self.words.len) return error.WordIndexOutOfRange;
        const verb = self.words[verb_index];
        if (!std.mem.endsWith(u8, verb, ".v")) return error.VerbNotLabeledWithDotV;

        var parts: std.ArrayList([]const u8) = .empty;
        defer parts.deinit(self.allocator);

        // From this verb, we can just look at the linkage to find what to append
        for (self.links) |link| {
            // We're looking for a verb modifier, see:
            // https://www.link.cs.cmu.edu/link/dict/section-MV.html
            if (@as(usize, link.left_index) != verb_index) continue;
            if (!std.mem.startsWith(u8, link.label, "MV")) continue;

            // Drop everything from the last '.' on (e.g. a ".r" tag), if any.
            const modifier_len = std.mem.lastIndexOfScalar(u8, link.right_word, '.') orelse link.right_word.len;
            try parts.append(self.allocator, verb[0 .. verb.len - 2]);
            try parts.append(self.allocator, link.right_word[0..modifier_len]);
            break;
        }
        return parts.toOwnedSlice(self.allocator);
    }

    /// Returns the nouns (words tagged `.n`, suffix removed) covered by the
    /// NP node found at S -> VP -> last sibling (PP) -> second child.
    ///
    /// The caller owns the returned outer slice and frees it with
    /// `self.allocator`; the strings inside borrow from this tree.
    ///
    /// Errors: `NoConstituentTree`, `NoSubjectNounFound`,
    /// `VerbBeLinkageNotFound`, `PastParticipleLinkageNotFound`,
    /// `NPLinkageNotFound`, `WordIndexOutOfRange`, `OutOfMemory`.
    /// Panics when one of the S, VP or PP nodes lacks the expected children.
    pub fn sentenceObject(self: *ParseTree) ![][]const u8 {
        var node = try self.verbPhraseHead();

        // We need the next node, which should be our PP node
        if (node.next == null) @panic("VP node must have a child with at least two members");
        // fast forward to the very last node at this level
        while (node.next) |sibling| node = sibling;

        // https://opencog.github.io/link-grammar-website/dict/section-PP.html
        // NOTE(review): that page documents the PP *link* type; a PP label in
        // a constituent tree is presumably a prepositional phrase - confirm.
        // The error name is kept because callers may match on it.
        if (!std.mem.startsWith(u8, node.label, "PP")) return error.PastParticipleLinkageNotFound;
        if (node.child == null) @panic("PP node must have a child");
        node = node.child.?;

        // At this point we should be pointing to something like "on" or "off"
        // We need the next node, which holds the object itself
        if (node.next == null) @panic("PP node must have a child with two members");
        node = node.next.?;

        // The link-type documentation has no NP section (its N section is
        // "No"), so NP here is presumably the constituent-tree noun-phrase
        // label - TODO confirm.
        if (!std.mem.eql(u8, node.label, "NP")) return error.NPLinkageNotFound;

        // The highest index read below is `node.end + 1` (LEFT-WALL offset).
        if (node.start > node.end or node.end + 1 >= self.words.len) return error.WordIndexOutOfRange;

        var nouns: std.ArrayList([]const u8) = .empty;
        defer nouns.deinit(self.allocator);

        // +1 on both bounds skips LEFT-WALL; the range end is exclusive.
        for (self.words[node.start + 1 .. node.end + 2]) |word| {
            // we only want the nouns out of this...
            if (!std.mem.endsWith(u8, word, ".n")) continue;
            try nouns.append(self.allocator, word[0 .. word.len - 2]);
        }
        return nouns.toOwnedSlice(self.allocator);
    }

    /// Walks S -> VP and returns the first child of the VP node.
    fn verbPhraseHead(self: *const ParseTree) !*ConstituentNode {
        const root = self.constituent_tree orelse return error.NoConstituentTree;

        // https://www.link.cs.cmu.edu/link/dict/section-S.html
        if (!std.mem.startsWith(u8, root.label, "S")) return error.NoSubjectNounFound;
        const verb_phrase = root.child orelse @panic("S node must have a child");

        // I'm not entirely sure this will be universally true, but we'll see
        // in real life testing
        // https://www.link.cs.cmu.edu/link/dict/section-V.html
        // https://www.link.cs.cmu.edu/link/dict/section-P.html
        if (!std.mem.eql(u8, verb_phrase.label, "VP")) return error.VerbBeLinkageNotFound;
        return verb_phrase.child orelse @panic("VP node must have a child");
    }

    /// Prints `node`, then its children one level deeper, then its siblings
    /// at the same depth.
    fn printConstituentNode(
        self: *const ParseTree,
        writer: *std.Io.Writer,
        node: *const ConstituentNode,
        depth: usize,
    ) std.Io.Writer.Error!void {
        for (0..depth) |_| {
            try writer.writeAll(" ");
        }
        try writer.print("[{s}] ({}-{})\n", .{ node.label, node.start, node.end });

        if (node.child) |child| {
            try self.printConstituentNode(writer, child, depth + 1);
        }
        if (node.next) |next| {
            try self.printConstituentNode(writer, next, depth);
        }
    }
};

/// Thin wrapper around a Link Grammar dictionary and its parse options.
pub const Parser = struct {
    dict: c.Dictionary,
    opts: c.Parse_Options,
    allocator: std.mem.Allocator,

    /// Creates a parser from the dictionary files under the relative `data/`
    /// directory. `allocator` is stored and used by `parse`.
    ///
    /// Errors: `DictionaryCreationFailed`, `ParseOptionsCreationFailed`.
    pub fn init(allocator: std.mem.Allocator) !Parser {
        const dict = c.dictionary_create(
            @ptrCast(@constCast("data/4.0.dict")),
            @ptrCast(@constCast("data/4.0.knowledge")),
            @ptrCast(@constCast("data/4.0.constituent-knowledge")),
            @ptrCast(@constCast("data/4.0.affix")),
        );
        return fromDictionary(allocator, dict);
    }

    /// Applies the parse options used by this wrapper.
    fn setOptions(opts: anytype) void {
        c.parse_options_set_verbosity(opts, 0);
        c.parse_options_set_linkage_limit(opts, 100);
        c.parse_options_set_disjunct_cost(opts, 2);
        forbidNullLinks(opts);
    }

    /// Sets both the minimum and the maximum null count to zero.
    fn forbidNullLinks(opts: anytype) void {
        c.parse_options_set_min_null_count(opts, 0);
        c.parse_options_set_max_null_count(opts, 0);
    }

    /// Finishes construction once `dictionary_create` has been called.
    /// The dictionary is deleted again when the options cannot be created.
    fn fromDictionary(allocator: std.mem.Allocator, dict: c.Dictionary) !Parser {
        if (dict == null) return error.DictionaryCreationFailed;
        errdefer _ = c.dictionary_delete(dict);

        const opts = c.parse_options_create();
        if (opts == null) return error.ParseOptionsCreationFailed;
        setOptions(opts);

        return Parser{
            .dict = dict,
            .opts = opts,
            .allocator = allocator,
        };
    }

    /// Same as `init`, but looks for the `data/4.0.*` files below `data_dir`.
    /// The temporary path strings are freed before returning.
    ///
    /// Errors: `OutOfMemory`, `DictionaryCreationFailed`,
    /// `ParseOptionsCreationFailed`.
    pub fn initWithDataDir(allocator: std.mem.Allocator, data_dir: []const u8) !Parser {
        const dict_path = try std.fs.path.joinZ(allocator, &.{ data_dir, "data/4.0.dict" });
        defer allocator.free(dict_path);
        const knowledge_path = try std.fs.path.joinZ(allocator, &.{ data_dir, "data/4.0.knowledge" });
        defer allocator.free(knowledge_path);
        const constituent_path = try std.fs.path.joinZ(allocator, &.{ data_dir, "data/4.0.constituent-knowledge" });
        defer allocator.free(constituent_path);
        const affix_path = try std.fs.path.joinZ(allocator, &.{ data_dir, "data/4.0.affix" });
        defer allocator.free(affix_path);

        const dict = c.dictionary_create(
            dict_path.ptr,
            knowledge_path.ptr,
            constituent_path.ptr,
            affix_path.ptr,
        );
        return fromDictionary(allocator, dict);
    }

    /// Deletes the parse options and the dictionary.
    pub fn deinit(self: *Parser) void {
        _ = c.parse_options_delete(self.opts);
        _ = c.dictionary_delete(self.dict);
    }

    /// Parses `input` and returns an owned copy of linkage 0.
    ///
    /// When no linkage is found, parsing is retried with null links allowed
    /// (between 1 and the sentence length). The null-count options are put
    /// back to zero before returning so the retry does not affect later
    /// calls. The caller must call `deinit` on the returned tree.
    ///
    /// Errors: `OutOfMemory`, `SentenceCreationFailed`, `NoLinkagesFound`,
    /// `LinkageCreationFailed`. Nothing is leaked on the error paths.
    pub fn parse(self: *Parser, input: []const u8) !ParseTree {
        const gpa = self.allocator;

        const c_input = try gpa.dupeZ(u8, input);
        defer gpa.free(c_input);

        const sent = c.sentence_create(c_input.ptr, self.dict);
        if (sent == null) return error.SentenceCreationFailed;
        defer c.sentence_delete(sent);

        // `self.opts` is shared between calls: undo the relaxation on exit.
        var relaxed = false;
        defer {
            if (relaxed) forbidNullLinks(self.opts);
        }

        var num_linkages = c.sentence_parse(sent, self.opts);

        // If no linkages found, try with null links allowed
        if (num_linkages == 0) {
            relaxed = true;
            c.parse_options_set_min_null_count(self.opts, 1);
            c.parse_options_set_max_null_count(self.opts, @intCast(c.sentence_length(sent)));
            num_linkages = c.sentence_parse(sent, self.opts);
        }
        if (num_linkages == 0) return error.NoLinkagesFound;

        const linkage = c.linkage_create(0, sent, self.opts);
        if (linkage == null) return error.LinkageCreationFailed;
        defer c.linkage_delete(linkage);

        const word_count: usize = @intCast(c.linkage_get_num_words(linkage));
        const link_count: usize = @intCast(c.linkage_get_num_links(linkage));

        const words = try gpa.alloc([]const u8, word_count);
        // Only the first `words_filled` entries hold owned strings.
        var words_filled: usize = 0;
        errdefer {
            for (words[0..words_filled]) |word| gpa.free(word);
            gpa.free(words);
        }
        for (words, 0..) |*slot, i| {
            const word_ptr = c.linkage_get_word(linkage, @intCast(i));
            slot.* = try gpa.dupe(u8, std.mem.span(word_ptr));
            words_filled += 1;
        }

        const links = try gpa.alloc(Link, link_count);
        // Only the first `links_filled` entries are fully initialised.
        var links_filled: usize = 0;
        errdefer {
            for (links[0..links_filled]) |link| {
                gpa.free(link.left_word);
                gpa.free(link.right_word);
                gpa.free(link.label);
            }
            gpa.free(links);
        }
        for (links, 0..) |*slot, i| {
            const left = c.linkage_get_link_lword(linkage, @intCast(i));
            const right = c.linkage_get_link_rword(linkage, @intCast(i));
            const label_ptr = c.linkage_get_link_label(linkage, @intCast(i));

            const left_word = try gpa.dupe(u8, std.mem.span(c.linkage_get_word(linkage, left)));
            errdefer gpa.free(left_word);
            const right_word = try gpa.dupe(u8, std.mem.span(c.linkage_get_word(linkage, right)));
            errdefer gpa.free(right_word);
            const label = try gpa.dupe(u8, std.mem.span(label_ptr));

            slot.* = Link{
                .left_word = left_word,
                .right_word = right_word,
                .label = label,
                .left_index = @intCast(left),
                .right_index = @intCast(right),
            };
            links_filled += 1;
        }

        // Extract constituent tree structure
        const constituent_tree = try self.constituentTree(linkage);

        return ParseTree{
            .words = words,
            .links = links,
            .constituent_tree = constituent_tree,
            .allocator = gpa,
        };
    }

    /// Copies the constituent tree of `linkage` into Zig-owned nodes and
    /// frees the C tree afterwards. Returns null when the library returns no
    /// tree.
    fn constituentTree(self: *Parser, linkage: c.Linkage) !?*ConstituentNode {
        // struct CNode_s {
        //     char * label;
        //     CNode * child;
        //     CNode * next;
        //     int start, end;
        // };
        const root = c.linkage_constituent_tree(linkage);
        if (root == null) return null;
        defer c.linkage_free_constituent_tree(root);

        return try parseCNode(self.allocator, root);
    }

    /// Recursively copies `cnode` together with its children and siblings.
    /// On allocation failure everything built so far is released again.
    fn parseCNode(allocator: std.mem.Allocator, cnode: *c.struct_CNode_s) std.mem.Allocator.Error!*ConstituentNode {
        const node = try allocator.create(ConstituentNode);
        errdefer allocator.destroy(node);

        const label = try allocator.dupe(u8, std.mem.span(cnode.label));
        errdefer allocator.free(label);

        // Build the child subtree first so it can be torn down again if the
        // sibling chain fails to allocate.
        const child: ?*ConstituentNode = if (cnode.child != null)
            try parseCNode(allocator, cnode.child)
        else
            null;
        errdefer if (child) |subtree| subtree.deinit();

        const next: ?*ConstituentNode = if (cnode.next != null)
            try parseCNode(allocator, cnode.next)
        else
            null;

        node.* = ConstituentNode{
            .label = label,
            .start = @intCast(cnode.start),
            .end = @intCast(cnode.end),
            .child = child,
            .next = next,
            .allocator = allocator,
        };
        return node;
    }
};

test "basic C API functionality" {
    const opts = c.parse_options_create();
    defer _ = c.parse_options_delete(opts);

    try std.testing.expect(opts != null);
}

test "parser functionality" {
    var parser = try Parser.init(std.testing.allocator);
    defer parser.deinit();

    var tree = try parser.parse("The cat sat on the mat");
    defer tree.deinit();

    // LEFT-WALL
    // the
    // cat.n
    // sat.v
    // on
    // the
    // mat.n
    // RIGHT-WALL
    try std.testing.expectEqual(@as(usize, 8), tree.words.len); // 6 + LEFT_WALL / RIGHT_WALL
}

test "from website" {
    const sentence = "When your back is is against the whiteboard, I'll be back to back you up";

    var parser = try Parser.init(std.testing.allocator);
    defer parser.deinit();

    var tree = try parser.parse(sentence);
    defer tree.deinit();
}

test "real usage" {
    var parser = try Parser.init(std.testing.allocator);
    defer parser.deinit();

    var tree = try parser.parse("turn on the bedroom light");
    defer tree.deinit();

    try std.testing.expectEqual(@as(usize, 7), tree.words.len); // 5 + LEFT_WALL / RIGHT_WALL
    try std.testing.expectEqualStrings("turn", tree.firstVerb().?);

    const sentence_action = try tree.sentenceAction();
    defer std.testing.allocator.free(sentence_action);
    try std.testing.expectEqual(@as(usize, 2), sentence_action.len);
    try std.testing.expectEqualStrings("turn", sentence_action[0]);
    try std.testing.expectEqualStrings("on", sentence_action[1]);

    const sentence_object = try tree.sentenceObject();
    defer std.testing.allocator.free(sentence_object);
    try std.testing.expectEqual(@as(usize, 2), sentence_object.len);
    try std.testing.expectEqualStrings("bedroom", sentence_object[0]);
    try std.testing.expectEqualStrings("light", sentence_object[1]);
}

test "real usage - jack" {
    var parser = try Parser.init(std.testing.allocator);
    defer parser.deinit();

    var tree = try parser.parse("turn on jack bedroom light");
    defer tree.deinit();

    // std.debug.print("{f}\n", .{tree});
    try std.testing.expectEqual(@as(usize, 7), tree.words.len); // 5 + LEFT_WALL / RIGHT_WALL

    const sentence_action = try tree.sentenceAction();
    defer std.testing.allocator.free(sentence_action);
    try std.testing.expectEqual(@as(usize, 2), sentence_action.len);
    try std.testing.expectEqualStrings("turn", sentence_action[0]);
    try std.testing.expectEqualStrings("on", sentence_action[1]);

    const sentence_object = try tree.sentenceObject();
    defer std.testing.allocator.free(sentence_object);
    try std.testing.expectEqual(@as(usize, 3), sentence_object.len);
    try std.testing.expectEqualStrings("jack", sentence_object[0]);
    try std.testing.expectEqualStrings("bedroom", sentence_object[1]);
    try std.testing.expectEqualStrings("light", sentence_object[2]);
}