use constituent tree to find the correct nouns

This commit is contained in:
Emil Lerch 2025-09-22 12:57:14 -07:00
parent 87ee0f9580
commit 4e1bf8b4b9
Signed by: lobo
GPG key ID: A7B62D657EF764F8

View file

@ -3,6 +3,26 @@ const c = @cImport({
@cInclude("link-includes.h");
});
/// One node of a constituent tree, stored as a first-child / next-sibling
/// linked structure. Every node owns its `label` and is itself allocated
/// with `allocator`; `deinit` releases the node and everything reachable
/// from it.
pub const ConstituentNode = struct {
    /// Constituent label text; owned by this node and freed in `deinit`.
    label: []const u8,
    /// index of start for the linkage, does not include LEFT_WALL/RIGHT_WALL
    start: usize,
    /// index of end for the linkage, does not include LEFT_WALL/RIGHT_WALL
    end: usize,
    /// First node one level below this one, if any.
    child: ?*ConstituentNode,
    /// Following node on the same level, if any.
    next: ?*ConstituentNode,
    /// Allocator that created this node and its label.
    allocator: std.mem.Allocator,

    /// Frees the label, then the child subtree, then the remaining
    /// siblings, and finally this node itself. The pointer is invalid
    /// once this returns.
    pub fn deinit(self: *ConstituentNode) void {
        const gpa = self.allocator;
        // Runs last, after everything hanging off this node is released.
        defer gpa.destroy(self);

        gpa.free(self.label);
        if (self.child) |first_child| first_child.deinit();
        if (self.next) |sibling| sibling.deinit();
    }
};
pub const Link = struct {
left_word: []const u8,
right_word: []const u8,
@ -14,6 +34,7 @@ pub const Link = struct {
pub const ParseTree = struct {
words: [][]const u8,
links: []Link,
constituent_tree: ?*ConstituentNode,
allocator: std.mem.Allocator,
pub fn deinit(self: *ParseTree) void {
@ -27,8 +48,28 @@ pub const ParseTree = struct {
self.allocator.free(link.label);
}
self.allocator.free(self.links);
if (self.constituent_tree) |tree| {
tree.deinit();
}
}
/// Writes a human-readable dump of this parse to `writer`: the indexed
/// word list, every link as `left --label--> right`, and then the
/// constituent tree (or a placeholder line when there is none).
pub fn format(self: ParseTree, writer: *std.io.Writer) std.io.Writer.Error!void {
    try writer.writeAll("Words: ");
    for (self.words, 0..) |text, position| {
        try writer.print("{d}: '{s}' ", .{ position, text });
    }

    const link_count = self.links.len;
    try writer.print("\n\nLinks ({} total):\n", .{link_count});
    for (self.links, 0..) |entry, position| {
        try writer.print(" [{d}] '{s}' --{s}--> '{s}'\n", .{ position, entry.left_word, entry.label, entry.right_word });
    }

    try writer.writeAll("\nConstituent Tree:\n");
    // Guard: nothing to walk when no tree was stored.
    const root = self.constituent_tree orelse {
        try writer.writeAll(" (no constituent tree)\n");
        return;
    };
    try self.printConstituentNode(writer, root, 0);
}
pub fn firstVerb(self: *ParseTree) ?[]const u8 {
for (self.words) |word| {
if (std.mem.endsWith(u8, word, ".v")) {
@ -72,46 +113,68 @@ pub const ParseTree = struct {
pub fn sentenceObject(self: *ParseTree) ![][]const u8 {
var al: std.ArrayList([]const u8) = .{};
defer al.deinit(self.allocator);
// var noun: ?usize = null;
for (self.words, 0..) |word, i| {
if (std.mem.endsWith(u8, word, ".n")) {
// adjective or noun
for (self.links) |l| {
if (@as(usize, l.left_index) == i and
std.mem.startsWith(u8, l.label, "AN"))
{
// this is an adjective. We need to add both it and the noun to the list
// https://www.link.cs.cmu.edu/link/dict/section-AN.html
try al.append(self.allocator, word[0 .. word.len - 2]);
try al.append(
self.allocator,
l.right_word[0 .. std.mem.lastIndexOfScalar(u8, l.right_word, '.') orelse l.right_word.len],
);
return al.toOwnedSlice(self.allocator);
}
}
// no adjective, just this noun
try al.append(self.allocator, word[0 .. word.len - 2]);
return al.toOwnedSlice(self.allocator);
}
if (self.constituent_tree == null) return error.NoConstituentTree;
var node = self.constituent_tree.?;
// https://www.link.cs.cmu.edu/link/dict/section-S.html
if (!std.mem.startsWith(u8, node.label, "S"))
return error.NoSubjectNounFound;
if (node.child == null) @panic("S node must have a child");
node = node.child.?;
// I'm not entirely sure this will be universally true, but we'll see
// in real life testing
// https://www.link.cs.cmu.edu/link/dict/section-V.html
// https://www.link.cs.cmu.edu/link/dict/section-P.html
if (!std.mem.eql(u8, node.label, "VP"))
return error.VerbBeLinkageNotFound;
if (node.child == null) @panic("VP node must have a child");
node = node.child.?;
// We need the next node, which should be our PP node
if (node.next == null) @panic("VP node must have a child with two members");
node = node.next.?;
// https://opencog.github.io/link-grammar-website/dict/section-PP.html
if (!std.mem.startsWith(u8, node.label, "PP"))
return error.PastParticipleLinkageNotFound;
if (node.child == null) @panic("PP node must have a child");
node = node.child.?;
// At this point we should be pointing to something like "on" or "off"
// We need the next node, which is the money shot
if (node.next == null) @panic("PP node must have a child with two members");
node = node.next.?;
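// NP appears to be a constituent (phrase) label rather than a link type.
// The link dictionary's "N" section is "No", which doesn't make sense here,
// and there is no specific NP section, so I'm not sure which documentation
// page describes what we're matching here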
if (!std.mem.eql(u8, node.label, "NP"))
return error.NPLinkageNotFound;
for (node.start..node.end + 1) |i| {
// we need to add 1 to this index so we can avoid LEFT-WALL
const inx = i + 1;
// we only want the nouns out of this...
const word = self.words[inx];
if (!std.mem.endsWith(u8, word, ".n")) continue;
const trimmed = word[0 .. word.len - 2];
try al.append(self.allocator, trimmed);
}
return al.toOwnedSlice(self.allocator);
}
// TODO: This should be a format function
pub fn printTree(self: *const ParseTree, writer: *std.Io.Writer) !void {
try writer.writeAll("Parse Tree:\n");
try writer.writeAll("Words: ");
for (self.words) |word| {
try writer.print("'{s}' ", .{word});
fn printConstituentNode(self: *const ParseTree, writer: *std.Io.Writer, node: *const ConstituentNode, depth: usize) !void {
for (0..depth) |_| {
try writer.writeAll(" ");
}
try writer.print("\n\nLinks ({} total):\n", .{self.links.len});
for (self.links, 0..) |link, i| {
try writer.print(" [{d}] '{s}' --{s}--> '{s}'\n", .{ i, link.left_word, link.label, link.right_word });
try writer.print("[{s}] ({}-{})\n", .{ node.label, node.start, node.end });
if (node.child) |child| {
try self.printConstituentNode(writer, child, depth + 1);
}
if (node.next) |next| {
try self.printConstituentNode(writer, next, depth);
}
try writer.writeAll("\n");
}
};
@ -243,12 +306,45 @@ pub const Parser = struct {
};
}
// Extract constituent tree structure
const constituent_tree = try self.constituentTree(linkage);
return ParseTree{
.words = words,
.links = links,
.constituent_tree = constituent_tree,
.allocator = self.allocator,
};
}
/// Asks Link Grammar for the constituent tree of `linkage` and returns a
/// copy of it built with `self.allocator`, or null when the library
/// returns no tree. The C tree is released before returning, whether or
/// not the copy succeeds.
fn constituentTree(self: *Parser, linkage: c.Linkage) !?*ConstituentNode {
    // Layout of the C node being copied:
    // struct CNode_s {
    // char * label;
    // CNode * child;
    // CNode * next;
    // int start, end;
    // };
    const c_root = c.linkage_constituent_tree(linkage);
    if (c_root == null) return null;
    // Only needed while copying; free it on every exit path from here on.
    defer c.linkage_free_constituent_tree(c_root);

    const copied = try parseCNode(self.allocator, c_root);
    return copied;
}
/// Recursively copies a Link Grammar `CNode` and everything reachable
/// through its `child` / `next` pointers into `ConstituentNode`s allocated
/// with `allocator`.
///
/// The label bytes are duplicated and start/end are copied by value, so
/// the result holds no pointers into `cnode`. The caller owns the returned
/// node and releases the whole tree with `ConstituentNode.deinit`.
///
/// On failure, everything allocated by this call is released before the
/// error propagates, including an already-copied child subtree.
fn parseCNode(allocator: std.mem.Allocator, cnode: *c.struct_CNode_s) !*ConstituentNode {
    const node = try allocator.create(ConstituentNode);
    errdefer allocator.destroy(node);

    const label = try allocator.dupe(u8, std.mem.span(cnode.label));
    errdefer allocator.free(label);

    // Copy the subtrees *before* filling in `node`, so a finished child copy
    // can be torn down if copying the siblings fails afterwards. Building
    // both inside the struct literal would leak the child in that case.
    const child: ?*ConstituentNode = if (cnode.child != null) try parseCNode(allocator, cnode.child) else null;
    errdefer if (child) |subtree| subtree.deinit();

    const next: ?*ConstituentNode = if (cnode.next != null) try parseCNode(allocator, cnode.next) else null;

    node.* = ConstituentNode{
        .label = label,
        .start = @intCast(cnode.start),
        .end = @intCast(cnode.end),
        .child = child,
        .next = next,
        .allocator = allocator,
    };
    return node;
}
};
test "basic C API functionality" {
@ -264,10 +360,6 @@ test "parser functionality" {
var tree = try parser.parse("The cat sat on the mat");
defer tree.deinit();
// for (tree.links) |w|
// std.debug.print("l: '{s}', r: '{s}', label: '{s}'\n", .{ w.left_word, w.right_word, w.label });
//
// std.debug.print("{any}\n", .{tree.words.len});
// LEFT-WALL
// the
// cat.n
@ -293,12 +385,6 @@ test "real usage" {
var tree = try parser.parse("turn on the bedroom light");
defer tree.deinit();
// for (tree.links) |w|
// std.debug.print("l: '{s}', r: '{s}', label: '{s}'\n", .{ w.left_word, w.right_word, w.label });
//
// for (tree.words) |w|
// std.debug.print("{s}\n", .{w});
// std.debug.print("{any}\n", .{tree.words.len});
try std.testing.expect(tree.words.len == 7); // 5 + LEFT_WALL / RIGHT_WALL
try std.testing.expectEqualStrings("turn", tree.firstVerb().?);
const sentence_action = try tree.sentenceAction();
@ -316,20 +402,17 @@ test "real usage - jack" {
var parser = try Parser.init(std.testing.allocator);
defer parser.deinit();
if (true) return error.SkipZigTest;
var tree = try parser.parse("turn on jack bedroom light");
defer tree.deinit();
var stderr_writer = std.fs.File.stderr().writer(&.{});
const stderr = &stderr_writer.interface;
try tree.printTree(stderr); // well, this is stupid. We shall fix later
// std.debug.print("{f}\n", .{tree});
try std.testing.expect(tree.words.len == 7); // 5 + LEFT_WALL / RIGHT_WALL
try std.testing.expectEqualStrings("turn", tree.firstVerb().?);
const sentence_action = try tree.sentenceAction();
defer std.testing.allocator.free(sentence_action);
try std.testing.expect(sentence_action.len == 2);
try std.testing.expectEqualStrings("turn", sentence_action[0]);
try std.testing.expectEqualStrings("on", sentence_action[1]);
const sentence_object = try tree.sentenceObject();
defer std.testing.allocator.free(sentence_object);
try std.testing.expect(sentence_object.len == 3);