112 lines
4 KiB
Zig
112 lines
4 KiB
Zig
const std = @import("std");
|
|
|
|
// Raw text of the range table, baked into the binary at compile time.
// Pulled from: https://www.unicodepedia.com/groups/
const ranges = @embedFile("ranges.txt");

// Baseline comptime branch budget used when scanning `ranges`.
// NOTE(review): the value appears to be tuned to the size of ranges.txt;
// revisit it if that file grows.
const eval_branch_quota_base = 18500;
|
|
// Number of non-empty lines in `ranges`, i.e. the number of entries that
// `parseRanges` stores (it skips lines whose length is zero).
//
// Counting non-empty lines instead of "\n" bytes keeps the array sizes exact
// when the file has blank lines or lacks a trailing newline.
const range_count = blk: {
    // One loop iteration per byte of the embedded file, so the quota should
    // be related to the number of characters in that file.
    @setEvalBranchQuota(eval_branch_quota_base);

    var non_empty_lines: usize = 0;
    var line_has_content = false;
    for (ranges) |byte| {
        if (byte == '\n') {
            if (line_has_content) non_empty_lines += 1;
            line_has_content = false;
        } else {
            line_has_content = true;
        }
    }
    // A final line that is not terminated by "\n" still counts.
    if (line_has_content) non_empty_lines += 1;

    break :blk non_empty_lines;
};
|
|
/// Table of Unicode groups stored as parallel arrays, plus a cursor that lets
/// the table be walked with `first` / `next`.
const Ranges = struct {
    /// Group names; entry `i` belongs with entry `i` of both codepoint arrays.
    names: [range_count][]const u8 = undefined,
    /// First codepoint of each group.
    starting_codepoints: [range_count]u21 = undefined,
    /// Last codepoint of each group.
    ending_codepoints: [range_count]u21 = undefined,
    /// Index of the entry that the next call to `next` will return.
    current_inx: usize = 0,
    /// Largest `name.len` among the stored names.
    longest_name_len: usize = 0,

    const Self = @This();

    /// Rewinds the cursor and returns the first group, or null when the
    /// table holds no entries.
    pub fn first(self: *Self) ?UnicodeGroup {
        self.reset();
        return self.next();
    }

    /// Moves the cursor back to the start of the table.
    pub fn reset(self: *Self) void {
        self.current_inx = 0;
    }

    /// Returns the group under the cursor and advances it; returns null once
    /// the cursor has reached `range_count`.
    pub fn next(self: *Self) ?UnicodeGroup {
        const index = self.current_inx;
        if (index == range_count) return null;
        self.current_inx = index + 1;
        return self.item(index);
    }

    /// Assembles the group stored at `index` from the three parallel arrays.
    pub fn item(self: Self, index: usize) UnicodeGroup {
        const group = UnicodeGroup{
            .name = self.names[index],
            .starting_codepoint = self.starting_codepoints[index],
            .ending_codepoint = self.ending_codepoints[index],
        };
        return group;
    }
};
|
|
|
|
// The embedded range table, parsed once at compile time. A parse failure
// aborts compilation with the message below.
const _all_ranges = parsed: {
    // Parsing does more work per character than the plain line count, so it
    // gets twice the base budget.
    @setEvalBranchQuota(2 * eval_branch_quota_base);
    const table = parseRanges(ranges) catch @compileError("Could not parse ranges.txt");
    break :parsed table;
};
|
|
|
|
/// Returns a fresh copy of the compile-time parsed range table with its
/// iteration cursor at the start, so each caller can iterate independently.
pub fn all_ranges() Ranges {
    var table = _all_ranges;
    // Every returned copy starts iterating from the first entry.
    table.current_inx = 0;
    return table;
}
|
|
|
|
/// One named group of Unicode codepoints, e.g. "Basic Latin" U+0 - U+7F.
pub const UnicodeGroup = struct {
    /// Human-readable group name.
    name: []const u8,
    /// First codepoint listed for the group.
    starting_codepoint: u21,
    /// Last codepoint listed for the group (presumably inclusive).
    ending_codepoint: u21,
};
|
|
|
|
/// Parses newline-separated group descriptions into a `Ranges` table.
///
/// Lines of length zero are skipped; every other line is handed to
/// `parseGroup` and stored at the next free index. `longest_name_len` ends up
/// as the largest name length seen.
///
/// Errors:
/// - `error.TooManyRanges` when `text` has more non-empty lines than the
///   table has slots (`range_count`).
/// - any error returned by `parseGroup`.
///
/// If `text` has fewer non-empty lines than `range_count`, the remaining
/// array slots are left `undefined`.
fn parseRanges(text: []const u8) !Ranges {
    var rc = Ranges{};
    var iterator = std.mem.splitSequence(u8, text, "\n");
    var inx: usize = 0;
    while (iterator.next()) |group| {
        if (group.len == 0) continue;

        // The arrays are fixed-size; report overflow as an error instead of
        // writing past their end.
        if (inx >= range_count) return error.TooManyRanges;

        const uc = try parseGroup(group);
        rc.names[inx] = uc.name;
        rc.starting_codepoints[inx] = uc.starting_codepoint;
        rc.ending_codepoints[inx] = uc.ending_codepoint;
        rc.longest_name_len = @max(rc.longest_name_len, uc.name.len);
        inx += 1;
    }
    return rc;
}
|
|
|
|
/// Parses a single line of the range table into a `UnicodeGroup`.
///
/// Expected layout is `<name>\t<start> - <end>`, where each codepoint is
/// written as "U+" followed by hexadecimal digits, for example:
///
///     Basic Latin	U+0 - U+7F
///
/// Trailing spaces are trimmed from the name. The returned `name` is a slice
/// into `group_text`, not a copy.
///
/// Errors:
/// - `error.NoRangeSpecifiedInGroup` when nothing follows the name after a tab.
/// - `error.NoEndingCodepointInGroup` when the range has no " - " separator.
/// - `error.InvalidCodepointPrefix` when a codepoint does not start with "U+"
///   (this also covers codepoints shorter than two bytes, which previously
///   sliced out of bounds).
/// - the errors of `std.fmt.parseUnsigned` for invalid digits or values that
///   do not fit in a `u21`.
fn parseGroup(group_text: []const u8) !UnicodeGroup {
    var iterator = std.mem.splitSequence(u8, group_text, "\t");
    const name = std.mem.trimRight(u8, iterator.first(), " ");
    const range_text = iterator.next() orelse {
        std.log.err("failed parsing on group '{s}'", .{group_text});
        return error.NoRangeSpecifiedInGroup;
    };

    var range_iterator = std.mem.splitSequence(u8, range_text, " - ");
    const start_text = range_iterator.first();
    const end_text = range_iterator.next() orelse return error.NoEndingCodepointInGroup;

    // Validate the "U+" prefix before slicing it off. Plain comparisons are
    // used (no helper calls) so the comptime branch cost stays the same.
    if (start_text.len < 2 or start_text[0] != 'U' or start_text[1] != '+') {
        return error.InvalidCodepointPrefix;
    }
    if (end_text.len < 2 or end_text[0] != 'U' or end_text[1] != '+') {
        return error.InvalidCodepointPrefix;
    }

    return UnicodeGroup{
        .name = name,
        .starting_codepoint = try std.fmt.parseUnsigned(u21, start_text[2..], 16),
        .ending_codepoint = try std.fmt.parseUnsigned(u21, end_text[2..], 16),
    };
}
|
|
|
|
test "check ranges" {
    const testing = std.testing;
    var table = all_ranges();

    // Direct array access. Entry 8 should be:
    //   Cyrillic    U+400 - U+4FF
    try testing.expectEqual(@as(u21, 0x400), table.starting_codepoints[8]);
    try testing.expectEqual(@as(u21, 0x4ff), table.ending_codepoints[8]);
    try testing.expectEqualStrings("Cyrillic", table.names[8]);

    // Iterator access: the first entry is Basic Latin.
    const basic_latin = table.first().?;
    try testing.expectEqualStrings("Basic Latin", basic_latin.name);
    try testing.expectEqual(@as(u21, 0x0), basic_latin.starting_codepoint);
    try testing.expectEqual(@as(u21, 0x7f), basic_latin.ending_codepoint);

    // The entry after it is Latin-1 Supplement.
    const latin_1_supplement = table.next().?;
    try testing.expectEqualStrings("Latin-1 Supplement", latin_1_supplement.name);
    try testing.expectEqual(@as(u21, 0x80), latin_1_supplement.starting_codepoint);
    try testing.expectEqual(@as(u21, 0xff), latin_1_supplement.ending_codepoint);
}
|