diff --git a/src/root.zig b/src/root.zig index 11c75f6..5f76d4f 100644 --- a/src/root.zig +++ b/src/root.zig @@ -3,6 +3,26 @@ const c = @cImport({ @cInclude("link-includes.h"); }); +pub const ConstituentNode = struct { + label: []const u8, + /// index of start for the linkage, does not include LEFT_WALL/RIGHT_WALL + start: usize, + /// index of end for the linkage, does not include LEFT_WALL/RIGHT_WALL + end: usize, + child: ?*ConstituentNode, + next: ?*ConstituentNode, + allocator: std.mem.Allocator, + + pub fn deinit(self: *ConstituentNode) void { + self.allocator.free(self.label); + if (self.child) |child| + child.deinit(); + if (self.next) |next| + next.deinit(); + self.allocator.destroy(self); + } +}; + pub const Link = struct { left_word: []const u8, right_word: []const u8, @@ -14,6 +34,7 @@ pub const Link = struct { pub const ParseTree = struct { words: [][]const u8, links: []Link, + constituent_tree: ?*ConstituentNode, allocator: std.mem.Allocator, pub fn deinit(self: *ParseTree) void { @@ -27,8 +48,28 @@ pub const ParseTree = struct { self.allocator.free(link.label); } self.allocator.free(self.links); + if (self.constituent_tree) |tree| { + tree.deinit(); + } } + pub fn format(self: ParseTree, writer: *std.io.Writer) std.io.Writer.Error!void { + try writer.writeAll("Words: "); + for (self.words, 0..) |word, i| { + try writer.print("{d}: '{s}' ", .{ i, word }); + } + try writer.print("\n\nLinks ({} total):\n", .{self.links.len}); + for (self.links, 0..) |link, i| { + try writer.print(" [{d}] '{s}' --{s}--> '{s}'\n", .{ i, link.left_word, link.label, link.right_word }); + } + + try writer.writeAll("\nConstituent Tree:\n"); + if (self.constituent_tree) |tree| { + try self.printConstituentNode(writer, tree, 0); + } else { + try writer.writeAll(" (no constituent tree)\n"); + } + } pub fn firstVerb(self: *ParseTree) ?[]const u8 { for (self.words) |word| { if (std.mem.endsWith(u8, word, ".v")) { @@ -72,46 +113,68 @@ pub const ParseTree = struct { pub fn sentenceObject(self: *ParseTree) ![][]const u8 { var al: std.ArrayList([]const u8) = .{}; defer al.deinit(self.allocator); - // var noun: ?usize = null; - for (self.words, 0..) |word, i| { - if (std.mem.endsWith(u8, word, ".n")) { - // adjective or noun - for (self.links) |l| { - if (@as(usize, l.left_index) == i and - std.mem.startsWith(u8, l.label, "AN")) - { - // this is an adjective. We need to add both it and the noun to the list - // https://www.link.cs.cmu.edu/link/dict/section-AN.html - try al.append(self.allocator, word[0 .. word.len - 2]); - try al.append( - self.allocator, - l.right_word[0 .. std.mem.lastIndexOfScalar(u8, l.right_word, '.') orelse l.right_word.len], - ); - return al.toOwnedSlice(self.allocator); - } - } - // no adjective, just this noun - try al.append(self.allocator, word[0 .. word.len - 2]); - return al.toOwnedSlice(self.allocator); - } + if (self.constituent_tree == null) return error.NoConstituentTree; + var node = self.constituent_tree.?; + // https://www.link.cs.cmu.edu/link/dict/section-S.html + if (!std.mem.startsWith(u8, node.label, "S")) + return error.NoSubjectNounFound; + if (node.child == null) @panic("S node must have a child"); + node = node.child.?; + + // I'm not entirely sure this will be universally true, but we'll see + // in real life testing + // https://www.link.cs.cmu.edu/link/dict/section-V.html + // https://www.link.cs.cmu.edu/link/dict/section-P.html + if (!std.mem.eql(u8, node.label, "VP")) + return error.VerbBeLinkageNotFound; + + if (node.child == null) @panic("VP node must have a child"); + node = node.child.?; + + // We need the next node, which should be our PP node + if (node.next == null) @panic("VP node must have a child with two members"); + node = node.next.?; + + // https://opencog.github.io/link-grammar-website/dict/section-PP.html + if (!std.mem.startsWith(u8, node.label, "PP")) + return error.PastParticipleLinkageNotFound; + + if (node.child == null) @panic("PP node must have a child"); + node = node.child.?; + + // At this point we should be pointing to something like "on" or "off" + // We need the next node, which is the money shot + if (node.next == null) @panic("PP node must have a child with two members"); + node = node.next.?; + + // N is "No", which doesn't make sense here, but there is no specific + // NP section, so I'm not sure what we're here + if (!std.mem.eql(u8, node.label, "NP")) + return error.NPLinkageNotFound; + for (node.start..node.end + 1) |i| { + // we need to add 1 to this index so we can avoid LEFT-WALL + const inx = i + 1; + // we only want the nouns out of this... + const word = self.words[inx]; + if (!std.mem.endsWith(u8, word, ".n")) continue; + const trimmed = word[0 .. word.len - 2]; + try al.append(self.allocator, trimmed); } return al.toOwnedSlice(self.allocator); } - // TODO: This should be a format function - pub fn printTree(self: *const ParseTree, writer: *std.Io.Writer) !void { - try writer.writeAll("Parse Tree:\n"); - try writer.writeAll("Words: "); - for (self.words) |word| { - try writer.print("'{s}' ", .{word}); + fn printConstituentNode(self: *const ParseTree, writer: *std.Io.Writer, node: *const ConstituentNode, depth: usize) !void { + for (0..depth) |_| { + try writer.writeAll(" "); } - try writer.print("\n\nLinks ({} total):\n", .{self.links.len}); - - for (self.links, 0..) |link, i| { - try writer.print(" [{d}] '{s}' --{s}--> '{s}'\n", .{ i, link.left_word, link.label, link.right_word }); + try writer.print("[{s}] ({}-{})\n", .{ node.label, node.start, node.end }); + if (node.child) |child| { + try self.printConstituentNode(writer, child, depth + 1); + } + if (node.next) |next| { + try self.printConstituentNode(writer, next, depth); } - try writer.writeAll("\n"); } }; @@ -243,12 +306,45 @@ pub const Parser = struct { }; } + // Extract constituent tree structure + const constituent_tree = try self.constituentTree(linkage); + return ParseTree{ .words = words, .links = links, + .constituent_tree = constituent_tree, .allocator = self.allocator, }; } + + fn constituentTree(self: *Parser, linkage: c.Linkage) !?*ConstituentNode { + // struct CNode_s { + // char * label; + // CNode * child; + // CNode * next; + // int start, end; + // }; + const constituent_tree = c.linkage_constituent_tree(linkage); + if (constituent_tree == null) return null; + defer c.linkage_free_constituent_tree(constituent_tree); + + return try parseCNode(self.allocator, constituent_tree); + } + fn parseCNode(allocator: std.mem.Allocator, cnode: *c.struct_CNode_s) !*ConstituentNode { + const node = try allocator.create(ConstituentNode); + errdefer allocator.destroy(node); + const label = try allocator.dupe(u8, std.mem.span(cnode.label)); + errdefer allocator.free(label); + node.* = ConstituentNode{ + .label = label, + .start = @intCast(cnode.start), + .end = @intCast(cnode.end), + .child = if (cnode.child != null) try parseCNode(allocator, cnode.child) else null, + .next = if (cnode.next != null) try parseCNode(allocator, cnode.next) else null, + .allocator = allocator, + }; + return node; + } }; test "basic C API functionality" { @@ -264,10 +360,6 @@ test "parser functionality" { var tree = try parser.parse("The cat sat on the mat"); defer tree.deinit(); - // for (tree.links) |w| - // std.debug.print("l: '{s}', r: '{s}', label: '{s}'\n", .{ w.left_word, w.right_word, w.label }); - // - // std.debug.print("{any}\n", .{tree.words.len}); // LEFT-WALL // the // cat.n @@ -293,12 +385,6 @@ test "real usage" { var tree = try parser.parse("turn on the bedroom light"); defer tree.deinit(); - // for (tree.links) |w| - // std.debug.print("l: '{s}', r: '{s}', label: '{s}'\n", .{ w.left_word, w.right_word, w.label }); - // - // for (tree.words) |w| - // std.debug.print("{s}\n", .{w}); - // std.debug.print("{any}\n", .{tree.words.len}); try std.testing.expect(tree.words.len == 7); // 5 + LEFT_WALL / RIGHT_WALL try std.testing.expectEqualStrings("turn", tree.firstVerb().?); const sentence_action = try tree.sentenceAction(); @@ -316,20 +402,17 @@ test "real usage - jack" { var parser = try Parser.init(std.testing.allocator); defer parser.deinit(); - if (true) return error.SkipZigTest; var tree = try parser.parse("turn on jack bedroom light"); defer tree.deinit(); - var stderr_writer = std.fs.File.stderr().writer(&.{}); - const stderr = &stderr_writer.interface; - try tree.printTree(stderr); // well, this is stupid. We shall fix later + // std.debug.print("{f}\n", .{tree}); try std.testing.expect(tree.words.len == 7); // 5 + LEFT_WALL / RIGHT_WALL - try std.testing.expectEqualStrings("turn", tree.firstVerb().?); const sentence_action = try tree.sentenceAction(); defer std.testing.allocator.free(sentence_action); try std.testing.expect(sentence_action.len == 2); try std.testing.expectEqualStrings("turn", sentence_action[0]); try std.testing.expectEqualStrings("on", sentence_action[1]); + const sentence_object = try tree.sentenceObject(); defer std.testing.allocator.free(sentence_object); try std.testing.expect(sentence_object.len == 3);