fontfinder/src/unicode.zig

113 lines
4.0 KiB
Zig
Raw Normal View History

2023-07-30 03:10:04 +00:00
const std = @import("std");
// Pulled from: https://www.unicodepedia.com/groups/
// The raw text of ranges.txt, embedded into the binary at compile time.
const ranges = @embedFile("ranges.txt");
// Baseline comptime branch quota for the @setEvalBranchQuota calls below.
// NOTE(review): presumably needs raising if ranges.txt grows - confirm.
const eval_branch_quota_base = 18500;
/// Number of entries in the embedded ranges file, computed at compile time.
///
/// Counts non-empty lines rather than newline bytes so that the total always
/// matches the number of entries `parseRanges` stores: a missing trailing
/// newline no longer under-counts, and blank lines no longer over-count.
const range_count = blk: {
    // This should be related to the number of characters in our embedded file above.
    // Doubled because iterating line by line costs a little more than a plain
    // substring count.
    @setEvalBranchQuota(eval_branch_quota_base * 2);
    var count: usize = 0;
    var lines = std.mem.splitSequence(u8, ranges, "\n");
    while (lines.next()) |line| {
        if (line.len > 0) count += 1;
    }
    break :blk count;
};
/// Table of Unicode groups stored as three parallel arrays, plus a cursor
/// that lets the table be walked like an iterator.
const Ranges = struct {
    /// Group names; entry `i` pairs with entry `i` of the codepoint arrays.
    names: [range_count][]const u8 = undefined,
    /// Starting codepoint of each group.
    starting_codepoints: [range_count]u21 = undefined,
    /// Ending codepoint of each group.
    ending_codepoints: [range_count]u21 = undefined,
    /// Index of the entry the next call to `next` will return.
    current_inx: usize = 0,
    /// Length in bytes of the longest name in `names`.
    longest_name_len: usize = 0,

    const Self = @This();

    /// Rewinds the cursor and returns the first group, or null when the
    /// table has no entries.
    pub fn first(self: *Self) ?UnicodeGroup {
        self.current_inx = 0;
        return self.next();
    }

    /// Moves the cursor back to the first entry.
    pub fn reset(self: *Self) void {
        self.current_inx = 0;
    }

    /// Returns the group under the cursor and advances it; returns null once
    /// the cursor has reached `range_count`.
    pub fn next(self: *Self) ?UnicodeGroup {
        const position = self.current_inx;
        if (position == range_count) return null;
        self.current_inx = position + 1;
        return self.item(position);
    }

    /// Assembles the group stored at `index` from the three parallel arrays.
    pub fn item(self: Self, index: usize) UnicodeGroup {
        const group: UnicodeGroup = .{
            .name = self.names[index],
            .starting_codepoint = self.starting_codepoints[index],
            .ending_codepoint = self.ending_codepoints[index],
        };
        return group;
    }
};
// The table parsed from the embedded ranges.txt, built at compile time.
// A parse failure is reported as a compile error instead of a runtime error.
const _all_ranges = blk: {
// Doubles the baseline branch quota for the parse.
@setEvalBranchQuota(eval_branch_quota_base * 2);
break :blk parseRanges(ranges) catch @compileError("Could not parse ranges.txt");
};
/// Returns a copy of the compile-time parsed table with its cursor at the
/// first entry, so each caller can iterate independently.
pub fn all_ranges() Ranges {
    // Start from the defaults (cursor at 0) and copy the parsed data over.
    var table = Ranges{};
    table.names = _all_ranges.names;
    table.starting_codepoints = _all_ranges.starting_codepoints;
    table.ending_codepoints = _all_ranges.ending_codepoints;
    table.longest_name_len = _all_ranges.longest_name_len;
    return table;
}
/// One named Unicode group as listed on a single line of ranges.txt.
pub const UnicodeGroup = struct {
/// Group name with trailing spaces removed, e.g. "Basic Latin".
name: []const u8,
/// First codepoint listed for the group (parsed from hexadecimal).
starting_codepoint: u21,
/// Last codepoint listed for the group (parsed from hexadecimal).
/// NOTE(review): presumably inclusive (Basic Latin is U+0 - U+7F) - confirm.
ending_codepoint: u21,
};
/// Parses `text` line by line into a `Ranges` table.
///
/// Empty lines are skipped; every other line is handed to `parseGroup` and
/// stored at the next free index. The longest name length seen is tracked in
/// `longest_name_len`.
///
/// Errors:
/// - `error.TooManyRanges` when `text` has more non-empty lines than the
///   table has slots (`range_count`), instead of writing out of bounds.
/// - any error returned by `parseGroup`.
///
/// Slots past the last parsed line are left `undefined`.
fn parseRanges(text: []const u8) !Ranges {
    var rc = Ranges{};
    var iterator = std.mem.splitSequence(u8, text, "\n");
    var inx: usize = 0;
    while (iterator.next()) |group| {
        if (group.len == 0) continue;
        // The arrays are sized independently of `text`; refuse to overrun them.
        if (inx >= range_count) return error.TooManyRanges;
        const uc = try parseGroup(group);
        rc.names[inx] = uc.name;
        rc.starting_codepoints[inx] = uc.starting_codepoint;
        rc.ending_codepoints[inx] = uc.ending_codepoint;
        rc.longest_name_len = @max(rc.longest_name_len, uc.name.len);
        inx += 1;
    }
    return rc;
}
/// Parses one line of the ranges file into a `UnicodeGroup`.
///
/// Expected shape: `<name><TAB><start> - <end>`, where each codepoint is a
/// two-byte prefix followed by hexadecimal digits, e.g.
/// `Basic Latin<TAB>U+0 - U+7F`. Trailing spaces are trimmed from the name.
/// The prefix bytes are skipped, not validated.
///
/// Errors:
/// - `error.NoRangeSpecifiedInGroup` when the line has no tab-separated range.
/// - `error.NoEndingCodepointInGroup` when the range has no " - " separator.
/// - `error.InvalidCodepoint` when a codepoint token is too short to hold
///   the two-byte prefix.
/// - any error from `std.fmt.parseUnsigned` for the hexadecimal digits.
fn parseGroup(group_text: []const u8) !UnicodeGroup {
    var iterator = std.mem.splitSequence(u8, group_text, "\t");
    const name = std.mem.trimRight(u8, iterator.first(), " ");
    const range_text = iterator.next() orelse {
        std.log.err("failed parsing on group '{s}'", .{group_text});
        return error.NoRangeSpecifiedInGroup;
    };
    var range_iterator = std.mem.splitSequence(u8, range_text, " - ");
    const start_text = range_iterator.first();
    const end_text = range_iterator.next() orelse return error.NoEndingCodepointInGroup;

    // Length of the "U+" prefix that precedes the hex digits.
    const prefix_len = 2;
    // Slicing past the prefix would be out of bounds on a shorter token.
    if (start_text.len < prefix_len or end_text.len < prefix_len) {
        return error.InvalidCodepoint;
    }

    return UnicodeGroup{
        .name = name,
        .starting_codepoint = try std.fmt.parseUnsigned(u21, start_text[prefix_len..], 16),
        .ending_codepoint = try std.fmt.parseUnsigned(u21, end_text[prefix_len..], 16),
    };
}
test "check ranges" {
    const expectEqual = std.testing.expectEqual;
    const expectEqualStrings = std.testing.expectEqualStrings;

    var parsed_ranges = all_ranges();

    // Direct array access. Entry 8 should be:
    // Cyrillic U+400 - U+4FF
    try expectEqual(@as(u21, 0x400), parsed_ranges.starting_codepoints[8]);
    try expectEqual(@as(u21, 0x4ff), parsed_ranges.ending_codepoints[8]);
    try expectEqualStrings("Cyrillic", parsed_ranges.names[8]);

    // Iterator access: the first two groups, in file order.
    const basic_latin = parsed_ranges.first().?;
    try expectEqualStrings("Basic Latin", basic_latin.name);
    try expectEqual(@as(u21, 0x0), basic_latin.starting_codepoint);
    try expectEqual(@as(u21, 0x7f), basic_latin.ending_codepoint);

    const latin_1_supplement = parsed_ranges.next().?;
    try expectEqualStrings("Latin-1 Supplement", latin_1_supplement.name);
    try expectEqual(@as(u21, 0x80), latin_1_supplement.starting_codepoint);
    try expectEqual(@as(u21, 0xff), latin_1_supplement.ending_codepoint);
}