first thing that actually works

This commit is contained in:
Emil Lerch 2021-04-27 11:24:01 -07:00
parent de09f48f8d
commit d3efa21a41
Signed by: lobo
GPG Key ID: A7B62D657EF764F8
12 changed files with 2384 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
.cache
zig-cache

104
Dockerfile Normal file
View File

@ -0,0 +1,104 @@
# Stage "base": Alpine 3.13 with the clang/lld toolchain that every build
# stage below starts FROM.
FROM alpine:3.13 AS base
# gcc gets us libgcc.a, even though the build should be using clang
# After the install, /usr/bin/ld is replaced by a symlink to ld.lld and the
# gcc binary is deleted (presumably so nothing can fall back to the GNU tools).
RUN apk add --no-cache clang git cmake make lld musl-dev gcc && \
rm /usr/bin/ld && \
ln -s /usr/bin/ld.lld /usr/bin/ld && rm /usr/bin/gcc # just to be sure
# Stage "common": clones aws-c-common v0.5.2, builds it out-of-tree with
# cmake, runs its tests and installs it.
FROM base AS common
# d5f9398d48d9c318563db08100e2e87b24ea3656
# RUN git clone --depth 1 -b pthread-np https://github.com/r-burns/aws-c-common && \
RUN git clone --depth 1 -b v0.5.2 https://github.com/awslabs/aws-c-common && \
mkdir aws-c-common-build && cd aws-c-common-build && \
cmake ../aws-c-common && \
make -j12 && make test && make install
# Archive everything under /usr/local so later stages can COPY and unpack it.
RUN tar -czf aws-c-common-clang.tgz /usr/local/*
# Stage "openssl": builds and installs OpenSSL 1.1.1i with the
# linux-x86_64-clang configuration (perl and linux-headers are added for it).
FROM base AS openssl
RUN apk add --no-cache perl linux-headers && \
git clone --depth 1 -b OpenSSL_1_1_1i https://github.com/openssl/openssl && \
cd openssl && ./Configure linux-x86_64-clang && make && make install
# Archive everything under /usr/local so later stages can COPY and unpack it.
RUN tar -czf openssl-clang.tgz /usr/local/*
# Stage "s2n": unpacks the OpenSSL archive, then builds and installs
# s2n v0.10.26 (no `make test` is run in this stage).
FROM base AS s2n
ENV S2N_LIBCRYPTO=openssl-1.1.1
COPY --from=openssl /openssl-clang.tgz /
RUN git clone --depth 1 -b v0.10.26 https://github.com/awslabs/s2n && \
tar -xzf openssl-clang.tgz && \
mkdir s2n-build && cd s2n-build && \
cmake ../s2n && \
make -j12 && make install
# The archive now holds both OpenSSL and s2n, since both live under /usr/local.
RUN tar -czf s2n-clang.tgz /usr/local/*
# Stage "cal": unpacks aws-c-common and OpenSSL, then builds and installs
# aws-c-cal from the elerch fork (default branch, unpinned) instead of the
# upstream v0.4.5 tag that is commented out below.
FROM base AS cal
COPY --from=openssl /openssl-clang.tgz /
COPY --from=common /aws-c-common-clang.tgz /
# environment not used - just busting docker's cache
ENV COMMIT=d1a4d
# RUN git clone --depth 1 -b v0.4.5 https://github.com/awslabs/aws-c-cal && \
RUN git clone --depth 1 https://github.com/elerch/aws-c-cal && \
tar -xzf aws-c-common-clang.tgz && \
tar -xzf openssl-clang.tgz && \
mkdir cal-build && cd cal-build && \
cmake -DCMAKE_MODULE_PATH=/usr/local/lib64/cmake ../aws-c-cal && \
make -j12 && make install
# No make test:
# 40 - ecdsa_p384_test_key_gen_export (Failed)
# Archive /usr/local (common + openssl + cal) for the io stage.
RUN tar -czf aws-c-cal-clang.tgz /usr/local/*
# Stage "compression": unpacks aws-c-common, then builds, tests and installs
# aws-c-compression v0.2.10.
FROM base AS compression
COPY --from=common /aws-c-common-clang.tgz /
RUN git clone --depth 1 -b v0.2.10 https://github.com/awslabs/aws-c-compression && \
tar -xzf aws-c-common-clang.tgz && \
mkdir compression-build && cd compression-build && \
cmake -DCMAKE_MODULE_PATH=/usr/local/lib64/cmake ../aws-c-compression && \
make -j12 && make test && make install
# Archive /usr/local (common + compression) for the http stage.
RUN tar -czf aws-c-compression-clang.tgz /usr/local/*
# Stage "io": unpacks the s2n and cal archives, then builds and installs
# aws-c-io v0.9.1 (no `make test` is run in this stage).
FROM base AS io
# Cal includes common and openssl
COPY --from=cal /aws-c-cal-clang.tgz /
COPY --from=s2n /s2n-clang.tgz /
RUN git clone --depth 1 -b v0.9.1 https://github.com/awslabs/aws-c-io && \
tar -xzf s2n-clang.tgz && \
tar -xzf aws-c-cal-clang.tgz && \
mkdir io-build && cd io-build && \
cmake -DCMAKE_MODULE_PATH=/usr/local/lib64/cmake ../aws-c-io && \
make -j12 && make install
# Archive the accumulated /usr/local tree for the http stage.
RUN tar -czf aws-c-io-clang.tgz /usr/local/*
# Stage "http": unpacks the io and compression archives, then builds and
# installs aws-c-http v0.6.1 (tests are not run; see the musl note below).
FROM base AS http
# Cal includes common and openssl
# 2 test failures on musl - both "download medium file"
COPY --from=io /aws-c-io-clang.tgz /
COPY --from=compression /aws-c-compression-clang.tgz /
# RUN git clone --depth 1 -b v0.5.19 https://github.com/awslabs/aws-c-http && \
RUN git clone --depth 1 -b v0.6.1 https://github.com/awslabs/aws-c-http && \
tar -xzf aws-c-io-clang.tgz && \
tar -xzf aws-c-compression-clang.tgz && \
mkdir http-build && cd http-build && \
cmake -DCMAKE_MODULE_PATH=/usr/local/lib64/cmake ../aws-c-http && \
make -j12 && make install
# Archive the accumulated /usr/local tree for the auth stage.
RUN tar -czf aws-c-http-clang.tgz /usr/local/*
# Stage "auth": unpacks the http archive, then builds and installs
# aws-c-auth v0.5.0 (tests are not run; see the trailing note on the RUN line).
FROM base AS auth
# http should have all other dependencies
COPY --from=http /aws-c-http-clang.tgz /
RUN git clone --depth 1 -b v0.5.0 https://github.com/awslabs/aws-c-auth && \
tar -xzf aws-c-http-clang.tgz && \
mkdir auth-build && cd auth-build && \
cmake -DCMAKE_MODULE_PATH=/usr/local/lib64/cmake ../aws-c-auth && \
make -j12 && make install # chunked_signing_test fails
# Archive the complete /usr/local tree; this is what the final image unpacks.
RUN tar -czf aws-c-auth-clang.tgz /usr/local/*
# Final image: a fresh Alpine 3.13 (not the "base" toolchain stage) holding
# the libraries archived by the auth stage plus the Zig 0.7.1 release.
FROM alpine:3.13 as final
COPY --from=auth /aws-c-auth-clang.tgz /
# The Zig tarball is fetched by URL at build time and unpacked by the RUN below.
ADD https://ziglang.org/download/0.7.1/zig-linux-x86_64-0.7.1.tar.xz /
# Unpack the libraries, create /src, unpack Zig under /usr/local and
# symlink the compiler to /usr/local/bin/zig.
RUN tar -xzf /aws-c-auth-clang.tgz && mkdir /src && tar -C /usr/local -xf zig-linux* && \
ln -s /usr/local/zig-linux*/zig /usr/local/bin/zig

21
LICENSE Normal file
View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2021 Emil Lerch
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

15
Makefile Normal file
View File

@ -0,0 +1,15 @@
# Builds the statically linked, stripped `start-hand-test` executable from
# src/main.zig and the C bitfield shim, linking the prebuilt static libraries
# installed under /usr/local.
# The shim source and header are prerequisites because the recipe compiles
# src/bitfield-workaround.c; without them, edits to the shim would not
# trigger a rebuild.
start-hand-test: src/main.zig src/aws.zig src/xml.zig src/bitfield-workaround.c src/bitfield-workaround.h
	@zig build-exe -static -I/usr/local/include -Isrc/ -lc --strip \
		--name start-hand-test src/main.zig src/bitfield-workaround.c \
		/usr/local/lib64/libaws-c-*.a \
		/usr/local/lib64/libs2n.a \
		/usr/local/lib/libcrypto.a \
		/usr/local/lib/libssl.a
# Builds a static, stripped `elasticurl` executable from curl.c with the same
# flags and static libraries as start-hand-test.
# NOTE(review): curl.c is not among the files added by this commit - confirm
# where it is expected to come from.
elasticurl: curl.c
@zig build-exe -static -I/usr/local/include -Isrc/ -lc --strip \
--name elasticurl curl.c \
/usr/local/lib64/libaws-c-*.a \
/usr/local/lib64/libs2n.a \
/usr/local/lib/libcrypto.a \
/usr/local/lib/libssl.a

103
README.md Normal file
View File

@ -0,0 +1,103 @@
# AWS SDK for Zig
Ok, so it's not actually an SDK (yet). Right now this SDK supports the sts
get-caller-identity action only. Why? Because it's one of the easiest to
support, so I started there. From here, the next major step is to codegen
the types necessary to support the various services. Currently this code is
dynamically generating the sts types so we are somewhat codegen ready, but
current comptime limitations might trip us up. The advantage of comptime is
that only types actually used would be generated vs the whole surface area
of AWS. That said, with most of the heavy lifting now coded, the addition
of the request/response types, even if all of them are added, should not
balloon the size beyond "reasonable". Of course this still remains to be seen.
This is my first serious zig effort, so please issue a PR if the code isn't
"ziggy" or if there's a better way.
This is designed to be built statically using the `aws_c_*` libraries, so
we inherit a lot of the goodness of the work going on there. Implementing
get-caller-identity with all dependencies statically linked gives us a stripped
executable size of 5.3M for x86_linux (which is all that's tested at the moment).
## Building
I am assuming here that if you're playing with zig, you pretty much know
what you're doing, so I will stay brief.
First, the dependencies are required. Use the Dockerfile to build these.
A `docker build` will do, but be prepared for it to run a while. OpenSSL in
particular takes a long time to build. Without any particular inside knowledge,
I'm also hoping/expecting AWS to factor out that library sometime in
the future.
Once that's done, you'll have an alpine image with all dependencies ready
to go and zig 0.7.1 installed. The build.zig currently relies on
[this PR to allow stripping -static](https://github.com/ziglang/zig/pull/8248),
so either:
* Modify build.zig, then strip (or not) after the fact
* Install make and use the included Makefile
## Running
This library uses the aws c libraries for its work, so it operates like most
other 'AWS things'. Note that I tested by setting the appropriate environment
variables, so config files haven't gotten a run through.
main.zig gives you a program to call sts GetCallerIdentity.
For local testing or alternative endpoints, there's no real standard, so
there is code to look for the `AWS_ENDPOINT_URL` environment variable, which will
supersede all other configuration.
## Dependencies
Full dependency tree:
aws-c-auth
* s2n
* openssl
* aws-c-common
* aws-c-compression
* aws-c-common
* aws-c-http
* s2n
* aws-c-common
* aws-c-io
* aws-c-common
* s2n
* openssl
* aws-c-cal
* aws-c-compression
* aws-c-common
* aws-c-cal
* aws-c-common
Build order based on above:
1. aws-c-common
1. openssl
2. s2n
2. aws-c-cal
2. aws-c-compression
3. aws-c-io
4. aws-c-http
5. aws-c-auth
Dockerfile in this repo will manage this
TODO List:
* Implement jitter/exponential backoff. This appears to be configuration of `aws_c_io` and should therefore be trivial
* Implement timeouts and other TODO's in the code
* Implement error handling for 4xx, 5xx and other unexpected return values
* Implement generic response body -> Response type handling (right now, this is hard-coded)
* Implement codegen for services with xml structures (using Smithy models)
* Implement codegen for others (using Smithy models)
* Issue PR in c libraries for full static musl build support (see Dockerfile)
* Remove compiler 0.7.1 shims when 0.8.0 is released
Compiler wishlist/watchlist:
* Fix the weirdness we see with comptime type generation (see aws.zig around line 251)
* [Allow declarations for comptime type generation](https://github.com/ziglang/zig/issues/6709)
* [Merge PR to allow stripping -static](https://github.com/ziglang/zig/pull/8248)
* [comptime allocations](https://github.com/ziglang/zig/issues/1291) so we can read files, etc (or is there another way)

51
build.zig Normal file
View File

@ -0,0 +1,51 @@
// const std = @import("std");
const Builder = @import("std").build.Builder;
/// Build script entry point: declares the `start-hand-test` executable, the
/// C shim it needs, the prebuilt static libraries it links against, and a
/// `run` step that executes the installed binary.
pub fn build(b: *Builder) void {
    // Target and build mode are left to the `zig build` command line; the
    // defaults are the native target and Debug mode.
    const target = b.standardTargetOptions(.{});
    const mode = b.standardReleaseOptions();

    const exe = b.addExecutable("start-hand-test", "src/main.zig");
    exe.addCSourceFile("src/bitfield-workaround.c", &[_][]const u8{"-std=c99"});

    const include_dirs = [_][]const u8{
        "./src/",
        "/usr/local/include",
    };
    for (include_dirs) |dir| exe.addIncludeDir(dir);

    // Static archives, added in the same order as before so the link line
    // is unchanged.
    const static_archives = [_][]const u8{
        "/usr/local/lib64/libs2n.a",
        "/usr/local/lib/libcrypto.a",
        "/usr/local/lib/libssl.a",
        "/usr/local/lib64/libaws-c-auth.a",
        "/usr/local/lib64/libaws-c-cal.a",
        "/usr/local/lib64/libaws-c-common.a",
        "/usr/local/lib64/libaws-c-compression.a",
        "/usr/local/lib64/libaws-c-http.a",
        "/usr/local/lib64/libaws-c-io.a",
    };
    for (static_archives) |archive| exe.addObjectFile(archive);

    exe.linkSystemLibrary("c");
    exe.setTarget(target);
    exe.setBuildMode(mode);
    exe.override_dest_dir = .{ .Custom = ".." };

    // TODO: Figure out -static
    // Neither of these two work
    // exe.addCompileFlags([][]const u8{
    // "-static",
    // "--strip",
    // });
    exe.is_static = true;
    exe.strip = true;
    exe.install();

    // `zig build run [-- args]`: install first, then run with any extra args.
    const run_cmd = exe.run();
    run_cmd.step.dependOn(b.getInstallStep());
    if (b.args) |extra_args| run_cmd.addArgs(extra_args);

    const run_step = b.step("run", "Run the app");
    run_step.dependOn(&run_cmd.step);
}

1145
src/aws.zig Normal file

File diff suppressed because it is too large Load Diff

34
src/bitfield-workaround.c Normal file
View File

@ -0,0 +1,34 @@
#include <aws/auth/signing_config.h>
#include <aws/common/date_time.h>
#include "bitfield-workaround.h"
/**
 * Allocates a real `struct aws_signing_config_aws` from `allocator` and fills
 * it from the bitfield-free mirror struct `config`.
 *
 * The mirror's uint32_t flag fields are assigned into the destination's flag
 * members, and the signing date (absent from the mirror) is set to the
 * current time.
 *
 * The memory is zero-initialised so that anything not copied below starts in
 * a defined state.
 *
 * Returns the new config, or NULL if the allocation failed. The caller owns
 * the returned memory.
 */
extern void *new_aws_signing_config(
    struct aws_allocator *allocator,
    const struct bitfield_workaround_aws_signing_config_aws *config) {
    struct aws_signing_config_aws *new_config =
        aws_mem_calloc(allocator, 1, sizeof(struct aws_signing_config_aws));
    if (new_config == NULL) {
        /* Allocation failed: report it instead of dereferencing NULL. */
        return NULL;
    }

    new_config->algorithm = config->algorithm;
    new_config->config_type = config->config_type;
    new_config->signature_type = config->signature_type;
    new_config->region = config->region;
    new_config->service = config->service;
    new_config->should_sign_header = config->should_sign_header;
    new_config->should_sign_header_ud = config->should_sign_header_ud;
    new_config->flags.use_double_uri_encode = config->flags.use_double_uri_encode;
    new_config->flags.should_normalize_uri_path = config->flags.should_normalize_uri_path;
    new_config->flags.omit_session_token = config->flags.omit_session_token;
    new_config->signed_body_value = config->signed_body_value;
    new_config->signed_body_header = config->signed_body_header;
    new_config->credentials = config->credentials;
    new_config->credentials_provider = config->credentials_provider;
    new_config->expiration_in_seconds = config->expiration_in_seconds;

    /* The mirror struct carries no date; stamp the config with "now". */
    aws_date_time_init_now(&new_config->date);
    return new_config;
}
/**
 * Returns the C runtime's `stderr` stream through a function call.
 * Declared with an explicit `(void)` parameter list so the definition is a
 * proper prototype rather than an old-style unspecified-argument function.
 */
extern FILE *get_std_err(void) {
    return stderr;
}

142
src/bitfield-workaround.h Normal file
View File

@ -0,0 +1,142 @@
#ifndef ZIG_AWS_BITFIELD_WORKAROUND_H
#define ZIG_AWS_BITFIELD_WORKAROUND_H
#include <aws/auth/auth.h>
#include <aws/auth/signing_config.h>
// Copied from https://github.com/awslabs/aws-c-auth/blob/main/include/aws/auth/signing_config.h#L127-L241
// with two changes: the flags have been changed to plain uint32_t fields
// without bitfield annotations, as Zig does not support bitfields yet
// (see https://github.com/ziglang/zig/issues/1499), and the structs have been
// renamed to make clear what's going on.
//
// The signing date is also somewhat problematic, so it has been removed from
// the struct and is set by the C code instead.
/*
 * Put all flags in here at the end. If this grows, stay aware of bit-space overflow and ABI compatibility.
 *
 * Bitfield-free mirror of the signing config flags: each flag is a whole
 * uint32_t here, and new_aws_signing_config() assigns them one by one into
 * the real struct's flags members.
 */
struct bitfield_workaround_aws_signing_config_aws_flags {
/**
* We assume the uri will be encoded once in preparation for transmission. Certain services
* do not decode before checking signature, requiring us to actually double-encode the uri in the canonical
* request in order to pass a signature check.
*/
uint32_t use_double_uri_encode;
/**
* Controls whether or not the uri paths should be normalized when building the canonical request
*/
uint32_t should_normalize_uri_path;
/**
* Controls whether "X-Amz-Security-Token" is omitted from the canonical request.
* "X-Amz-Security-Token" is added during signing, as a header or
* query param, when credentials have a session token.
* If false (the default), this parameter is included in the canonical request.
* If true, this parameter is still added, but omitted from the canonical request.
*/
uint32_t omit_session_token;
};
/**
* A configuration structure for use in AWS-related signing. Currently covers sigv4 only, but is not required to.
*
* This is the bitfield-free mirror that is handed to new_aws_signing_config(),
* which copies every field below into a real struct aws_signing_config_aws.
* It has no date field; the C shim sets the date itself.
*/
struct bitfield_workaround_aws_signing_config_aws {
/**
* What kind of config structure is this?
*/
enum aws_signing_config_type config_type;
/**
* What signing algorithm to use.
*/
enum aws_signing_algorithm algorithm;
/**
* What sort of signature should be computed?
*/
enum aws_signature_type signature_type;
/**
* The region to sign against
*/
struct aws_byte_cursor region;
/**
* name of service to sign a request for
*/
struct aws_byte_cursor service;
/**
* Raw date to use during the signing process.
* Intentionally left out of this mirror struct: new_aws_signing_config()
* initialises the real config's date to the current time.
*/
// struct aws_date_time date;
/**
* Optional function to control which headers are a part of the canonical request.
* Skipping auth-required headers will result in an unusable signature. Headers injected by the signing process
* are not skippable.
*
* This function does not override the internal check function (x-amzn-trace-id, user-agent), but rather
* supplements it. In particular, a header will get signed if and only if it returns true to both
* the internal check (skips x-amzn-trace-id, user-agent) and this function (if defined).
*/
aws_should_sign_header_fn *should_sign_header;
void *should_sign_header_ud;
/*
* Put all flags in here at the end. If this grows, stay aware of bit-space overflow and ABI compatibility.
*/
struct bitfield_workaround_aws_signing_config_aws_flags flags;
/**
* Optional string to use as the canonical request's body value.
* If string is empty, a value will be calculated from the payload during signing.
* Typically, this is the SHA-256 of the (request/chunk/event) payload, written as lowercase hex.
* If this has been precalculated, it can be set here. Special values used by certain services can also be set
* (e.g. "UNSIGNED-PAYLOAD" "STREAMING-AWS4-HMAC-SHA256-PAYLOAD" "STREAMING-AWS4-HMAC-SHA256-EVENTS").
*/
struct aws_byte_cursor signed_body_value;
/**
* Controls what body "hash" header, if any, should be added to the canonical request and the signed request:
* AWS_SBHT_NONE - no header should be added
* AWS_SBHT_X_AMZ_CONTENT_SHA256 - the body "hash" should be added in the X-Amz-Content-Sha256 header
*/
enum aws_signed_body_header_type signed_body_header;
/*
* Signing key control:
*
* (1) If "credentials" is valid, use it
* (2) Else if "credentials_provider" is valid, query credentials from the provider and use the result
* (3) Else fail
*
*/
/**
* AWS Credentials to sign with.
*/
const struct aws_credentials *credentials;
/**
* AWS credentials provider to fetch credentials from.
*/
struct aws_credentials_provider *credentials_provider;
/**
* If non-zero and the signing transform is query param, then signing will add X-Amz-Expires to the query
* string, equal to the value specified here. If this value is zero or if header signing is being used then
* this parameter has no effect.
*/
uint64_t expiration_in_seconds;
};
// Allocates a struct aws_signing_config_aws from `allocator`, copies the
// fields of `config` into it and sets its date to the current time
// (implemented in bitfield-workaround.c). Returned as void* to the caller.
extern void *new_aws_signing_config(struct aws_allocator *allocator, const struct bitfield_workaround_aws_signing_config_aws *config);
// Returns the C runtime's stderr stream.
extern FILE *get_std_err();
#endif

55
src/bool.zig Normal file
View File

@ -0,0 +1,55 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2015-2021 Zig Contributors
// This file is part of [zig](https://ziglang.org/), which is MIT licensed.
// The MIT license requires this copyright notice to be included in all copies
// and substantial portions of the software.
const std = @import("std");
const builtin = std.builtin;
const testing = std.testing;
/// Thread-safe, lock-free boolean.
///
/// Wraps a single `bool` and exposes it only through atomic builtins. Each
/// operation validates its memory ordering at compile time and rejects
/// orderings that are meaningless for that kind of access.
pub const Bool = extern struct {
    /// The raw value. Access it only through `xchg`, `load` and `store`.
    unprotected_value: bool,

    pub const Self = @This();

    /// Creates a `Bool` holding `init_val`.
    pub fn init(init_val: bool) Self {
        return Self{ .unprotected_value = init_val };
    }

    // xchg is only valid rmw operation for a bool
    /// Atomically modifies memory and then returns the previous value.
    /// `.Unordered` is rejected at compile time.
    pub fn xchg(self: *Self, operand: bool, comptime ordering: std.builtin.AtomicOrder) bool {
        switch (ordering) {
            .Monotonic, .Acquire, .Release, .AcqRel, .SeqCst => {},
            else => @compileError("Invalid ordering '" ++ @tagName(ordering) ++ "' for a RMW operation"),
        }
        return @atomicRmw(bool, &self.unprotected_value, .Xchg, operand, ordering);
    }

    /// Atomically reads the current value.
    /// Takes `*const Self` because a load never mutates, so a `Bool` that is
    /// only reachable through a const pointer can still be read.
    /// `.Release` and `.AcqRel` are rejected at compile time.
    pub fn load(self: *const Self, comptime ordering: std.builtin.AtomicOrder) bool {
        switch (ordering) {
            .Unordered, .Monotonic, .Acquire, .SeqCst => {},
            else => @compileError("Invalid ordering '" ++ @tagName(ordering) ++ "' for a load operation"),
        }
        return @atomicLoad(bool, &self.unprotected_value, ordering);
    }

    /// Atomically writes `value`.
    /// `.Acquire` and `.AcqRel` are rejected at compile time.
    pub fn store(self: *Self, value: bool, comptime ordering: std.builtin.AtomicOrder) void {
        switch (ordering) {
            .Unordered, .Monotonic, .Release, .SeqCst => {},
            else => @compileError("Invalid ordering '" ++ @tagName(ordering) ++ "' for a store operation"),
        }
        @atomicStore(bool, &self.unprotected_value, value, ordering);
    }
};
// Exercises xchg/load/store with sequentially consistent ordering.
test "std.atomic.Bool" {
var a = Bool.init(false);
// xchg returns the previous value, so swapping false for false reads false.
testing.expectEqual(false, a.xchg(false, .SeqCst));
testing.expectEqual(false, a.load(.SeqCst));
a.store(true, .SeqCst);
// The stored `true` comes back from xchg, which leaves `false` behind.
testing.expectEqual(true, a.xchg(false, .SeqCst));
testing.expectEqual(false, a.load(.SeqCst));
}

63
src/main.zig Normal file
View File

@ -0,0 +1,63 @@
const std = @import("std");
const aws = @import("aws.zig");
/// Root log handler. Writes "[level] (scope): message" lines to stderr,
/// dropping `.awshttp` messages whose level is debug or numerically higher.
/// Write errors are silently ignored.
pub fn log(
    comptime level: std.log.Level,
    comptime scope: @TypeOf(.EnumLiteral),
    comptime format: []const u8,
    args: anytype,
) void {
    // Ignore awshttp messages
    const at_or_past_debug = @enumToInt(level) >= @enumToInt(std.log.Level.debug);
    if (scope == .awshttp and at_or_past_debug) return;

    const line_format = "[" ++ @tagName(level) ++ "] " ++ "(" ++ @tagName(scope) ++ "): " ++ format ++ "\n";

    // Hold the stderr mutex while printing.
    const stderr_lock = std.debug.getStderrMutex().acquire();
    defer stderr_lock.release();

    const writer = std.io.getStdErr().writer();
    nosuspend writer.print(line_format, args) catch return;
}
/// Calls sts GetCallerIdentity in us-west-2 through the aws client and logs
/// the returned arn, user id, account and request id.
pub fn main() anyerror!void {
    // Uncomment if you want to log allocations
    // const file = try std.fs.cwd().createFile("/tmp/allocations.log", .{ .truncate = true });
    // defer file.close();
    // var child_allocator = std.heap.c_allocator;
    // const allocator = &std.heap.loggingAllocator(child_allocator, file.writer()).allocator;
    const allocator = std.heap.c_allocator;
    const call_options = aws.Options{ .region = "us-west-2" };

    std.log.info("Start", .{});

    var sts_client = aws.Aws.init(allocator);
    defer sts_client.deinit();

    const identity = try sts_client.call(aws.services.sts.get_caller_identity.Request{}, call_options);
    // TODO: This is a bit wonky. Root cause is lack of declarations in
    // comptime-generated types
    defer aws.Aws.responseDeinit(identity.raw_response, identity.response_metadata);

    // Flip to true to run a second time. This will help debug
    // allocation/deallocation issues
    const run_second_request = false;
    if (run_second_request) {
        std.time.sleep(1000 * std.time.ns_per_ms);
        std.log.info("second request", .{});

        var second_client = aws.Aws.init(allocator);
        defer second_client.deinit();

        const second_identity = try second_client.call(aws.services.sts.get_caller_identity.Request{}, call_options); // catch here and try alloc?
        defer aws.Aws.responseDeinit(second_identity.raw_response, second_identity.response_metadata);
    }

    std.log.info("arn: {s}", .{identity.arn});
    std.log.info("id: {s}", .{identity.user_id});
    std.log.info("account: {s}", .{identity.account});
    std.log.info("requestId: {s}", .{identity.response_metadata.request_id});
    std.log.info("Departing main", .{});
}

649
src/xml.zig Normal file
View File

@ -0,0 +1,649 @@
const std = @import("std");
const mem = std.mem;
const testing = std.testing;
const Allocator = mem.Allocator;
const ArenaAllocator = std.heap.ArenaAllocator;
const ArrayList = std.ArrayList;
/// A single `name="value"` attribute of an element.
pub const Attribute = struct {
    /// Attribute name as written in the source.
    name: []const u8,
    /// Attribute value with the surrounding quotes removed.
    value: []const u8,
};
/// One child item of an element: text, a comment, or a nested element.
pub const Content = union(enum) {
    /// Character data found between tags.
    CharData: []const u8,
    /// Contents of a comment.
    Comment: []const u8,
    /// A nested child element.
    Element: *Element,
};
/// An XML element: its tag name, its attributes and its ordered children.
pub const Element = struct {
    pub const AttributeList = ArrayList(*Attribute);
    pub const ContentList = ArrayList(Content);

    tag: []const u8,
    attributes: AttributeList,
    children: ContentList,

    /// Creates an element named `tag` with empty attribute and child lists
    /// backed by `alloc`.
    fn init(tag: []const u8, alloc: *Allocator) Element {
        return Element{
            .tag = tag,
            .attributes = AttributeList.init(alloc),
            .children = ContentList.init(alloc),
        };
    }

    /// Returns the value of the first attribute named `attrib_name`, or
    /// null when the element has no such attribute.
    pub fn getAttribute(self: *Element, attrib_name: []const u8) ?[]const u8 {
        for (self.attributes.items) |attr| {
            if (mem.eql(u8, attr.name, attrib_name)) return attr.value;
        }
        return null;
    }

    /// Returns the text of the first child element tagged `child_tag`, but
    /// only when that child holds exactly one item and it is character data.
    pub fn getCharData(self: *Element, child_tag: []const u8) ?[]const u8 {
        const child = self.findChildByTag(child_tag) orelse return null;
        const contents = child.children.items;
        if (contents.len != 1) return null;
        switch (contents[0]) {
            .CharData => |text| return text,
            else => return null,
        }
    }

    /// Iterates over every child item (text, comments and elements).
    pub fn iterator(self: *Element) ChildIterator {
        return ChildIterator{ .items = self.children.items, .i = 0 };
    }

    /// Iterates over child elements only.
    pub fn elements(self: *Element) ChildElementIterator {
        return ChildElementIterator{ .inner = self.iterator() };
    }

    /// Returns the first child element tagged `tag`, or null.
    pub fn findChildByTag(self: *Element, tag: []const u8) ?*Element {
        var matches = self.findChildrenByTag(tag);
        return matches.next();
    }

    /// Iterates over the child elements tagged `tag`.
    pub fn findChildrenByTag(self: *Element, tag: []const u8) FindChildrenByTagIterator {
        return FindChildrenByTagIterator{ .inner = self.elements(), .tag = tag };
    }

    /// Walks a slice of child items in order.
    pub const ChildIterator = struct {
        items: []Content,
        i: usize,

        pub fn next(self: *ChildIterator) ?*Content {
            if (self.i >= self.items.len) return null;
            const current = &self.items[self.i];
            self.i += 1;
            return current;
        }
    };

    /// Walks child items, yielding only those that are elements.
    pub const ChildElementIterator = struct {
        inner: ChildIterator,

        pub fn next(self: *ChildElementIterator) ?*Element {
            while (self.inner.next()) |child| {
                switch (child.*) {
                    .Element => |elem| return elem,
                    else => {},
                }
            }
            return null;
        }
    };

    /// Walks child elements, yielding only those whose tag equals `tag`.
    pub const FindChildrenByTagIterator = struct {
        inner: ChildElementIterator,
        tag: []const u8,

        pub fn next(self: *FindChildrenByTagIterator) ?*Element {
            while (self.inner.next()) |child| {
                if (mem.eql(u8, child.tag, self.tag)) return child;
            }
            return null;
        }
    };
};
/// Fields of an XML declaration; `encoding` and `standalone` are optional.
pub const XmlDecl = struct {
    version: []const u8,
    encoding: ?[]const u8,
    standalone: ?bool,
};
/// A parsed document: the arena backing its nodes, the optional XML
/// declaration and the root element.
pub const Document = struct {
    arena: ArenaAllocator,
    xml_decl: ?*XmlDecl,
    root: *Element,

    /// Releases the arena held by this document.
    pub fn deinit(self: Document) void {
        // `self` is taken by value, so work on a mutable copy of the arena.
        var owned_arena = self.arena;
        owned_arena.deinit();
    }
};
/// Cursor over the source text. Tracks the byte offset plus a zero-based
/// line and column that are updated on every consumed byte.
const ParseContext = struct {
    source: []const u8,
    offset: usize,
    line: usize,
    column: usize,

    /// Starts a cursor at the beginning of `source`.
    fn init(source: []const u8) ParseContext {
        return ParseContext{
            .source = source,
            .offset = 0,
            .line = 0,
            .column = 0,
        };
    }

    /// Returns the next byte without consuming it, or null at end of input.
    fn peek(self: *ParseContext) ?u8 {
        if (self.offset >= self.source.len) return null;
        return self.source[self.offset];
    }

    /// Consumes and returns the next byte; fails at end of input.
    fn consume(self: *ParseContext) !u8 {
        if (self.offset >= self.source.len) return error.UnexpectedEof;
        return self.consumeNoEof();
    }

    /// Consumes and returns the next byte. The caller must know the input
    /// is not exhausted.
    fn consumeNoEof(self: *ParseContext) u8 {
        std.debug.assert(self.offset < self.source.len);
        const byte = self.source[self.offset];
        self.offset += 1;
        if (byte != '\n') {
            self.column += 1;
        } else {
            self.line += 1;
            self.column = 0;
        }
        return byte;
    }

    /// Consumes `char` if it is next; reports whether it did.
    fn eat(self: *ParseContext, char: u8) bool {
        self.expect(char) catch return false;
        return true;
    }

    /// Consumes `expected` or fails without consuming anything.
    fn expect(self: *ParseContext, expected: u8) !void {
        const actual = self.peek() orelse return error.UnexpectedEof;
        if (actual != expected) return error.UnexpectedCharacter;
        _ = self.consumeNoEof();
    }

    /// Consumes `text` if the input continues with it; reports whether it did.
    fn eatStr(self: *ParseContext, text: []const u8) bool {
        self.expectStr(text) catch return false;
        return true;
    }

    /// Consumes `text` or fails without consuming anything.
    fn expectStr(self: *ParseContext, text: []const u8) !void {
        if (self.source.len < self.offset + text.len) return error.UnexpectedEof;
        if (!std.mem.startsWith(u8, self.source[self.offset..], text)) {
            return error.UnexpectedCharacter;
        }
        // Consume byte by byte so line/column stay in sync.
        for (text) |_| {
            _ = self.consumeNoEof();
        }
    }

    /// Skips spaces, tabs, newlines and carriage returns; reports whether
    /// anything was skipped.
    fn eatWs(self: *ParseContext) bool {
        const start = self.offset;
        while (self.peek()) |ch| {
            switch (ch) {
                ' ', '\t', '\n', '\r' => _ = self.consumeNoEof(),
                else => break,
            }
        }
        return self.offset != start;
    }

    /// Requires at least one whitespace byte at the cursor.
    fn expectWs(self: *ParseContext) !void {
        if (!self.eatWs()) return error.UnexpectedCharacter;
    }

    /// Returns the source line containing the cursor, without its newline.
    fn currentLine(self: ParseContext) []const u8 {
        const begin: usize = if (mem.lastIndexOfScalar(u8, self.source[0..self.offset], '\n')) |prev_nl|
            prev_nl + 1
        else
            0;
        const end = mem.indexOfScalarPos(u8, self.source, self.offset, '\n') orelse self.source.len;
        return self.source[begin..end];
    }
};
// Walks a ParseContext over a short string and checks that the eat*/expect*
// helpers consume input only on success, then checks empty-input behaviour.
test "ParseContext" {
{
var ctx = ParseContext.init("I like pythons");
testing.expectEqual(@as(?u8, 'I'), ctx.peek());
testing.expectEqual(@as(u8, 'I'), ctx.consumeNoEof());
testing.expectEqual(@as(?u8, ' '), ctx.peek());
testing.expectEqual(@as(u8, ' '), try ctx.consume());
testing.expect(ctx.eat('l'));
testing.expectEqual(@as(?u8, 'i'), ctx.peek());
// A failed eat/expect must leave the cursor where it was.
testing.expectEqual(false, ctx.eat('a'));
testing.expectEqual(@as(?u8, 'i'), ctx.peek());
try ctx.expect('i');
testing.expectEqual(@as(?u8, 'k'), ctx.peek());
testing.expectError(error.UnexpectedCharacter, ctx.expect('a'));
testing.expectEqual(@as(?u8, 'k'), ctx.peek());
testing.expect(ctx.eatStr("ke"));
testing.expectEqual(@as(?u8, ' '), ctx.peek());
testing.expect(ctx.eatWs());
testing.expectEqual(@as(?u8, 'p'), ctx.peek());
testing.expectEqual(false, ctx.eatWs());
testing.expectEqual(@as(?u8, 'p'), ctx.peek());
// Text longer than the remaining input reports EOF, not a mismatch.
testing.expectEqual(false, ctx.eatStr("aaaaaaaaa"));
testing.expectEqual(@as(?u8, 'p'), ctx.peek());
testing.expectError(error.UnexpectedEof, ctx.expectStr("aaaaaaaaa"));
testing.expectEqual(@as(?u8, 'p'), ctx.peek());
testing.expectError(error.UnexpectedCharacter, ctx.expectStr("pytn"));
testing.expectEqual(@as(?u8, 'p'), ctx.peek());
try ctx.expectStr("python");
testing.expectEqual(@as(?u8, 's'), ctx.peek());
}
{
// Empty input: every operation reports end of input.
var ctx = ParseContext.init("");
testing.expectEqual(ctx.peek(), null);
testing.expectError(error.UnexpectedEof, ctx.consume());
testing.expectEqual(ctx.eat('p'), false);
testing.expectError(error.UnexpectedEof, ctx.expect('p'));
}
}
/// Explicit error set used by `parseContent`.
pub const ParseError = error{
    IllegalCharacter,
    UnexpectedEof,
    UnexpectedCharacter,
    UnclosedValue,
    UnclosedComment,
    InvalidName,
    InvalidEntity,
    InvalidStandaloneValue,
    NonMatchingClosingTag,
    InvalidDocument,
    OutOfMemory,
};
/// Parses `source` into a `Document` whose arena is backed by
/// `backing_allocator`. Call `Document.deinit` on the result to free it.
pub fn parse(backing_allocator: *Allocator, source: []const u8) !Document {
    var ctx = ParseContext.init(source);
    const doc = try parseDocument(&ctx, backing_allocator);
    return doc;
}
/// Parses a whole document: optional comments and prolog, exactly one root
/// element, then optional trailing whitespace and comments. Anything left
/// over is `error.InvalidDocument`. The arena is freed on any error.
fn parseDocument(ctx: *ParseContext, backing_allocator: *Allocator) !Document {
    var doc = Document{
        .arena = ArenaAllocator.init(backing_allocator),
        .xml_decl = null,
        .root = undefined,
    };
    errdefer doc.deinit();

    // All nodes are allocated from the document's own arena.
    const alloc = &doc.arena.allocator;

    try trySkipComments(ctx, alloc);
    doc.xml_decl = try tryParseProlog(ctx, alloc);
    _ = ctx.eatWs();
    try trySkipComments(ctx, alloc);

    const maybe_root = try tryParseElement(ctx, alloc);
    doc.root = maybe_root orelse return error.InvalidDocument;

    _ = ctx.eatWs();
    try trySkipComments(ctx, alloc);

    // Nothing may follow the root element.
    if (ctx.peek()) |_| return error.InvalidDocument;
    return doc;
}
/// Parses a quoted attribute value. The opening quote may be `"` or `'`,
/// and the value runs to the next occurrence of that same quote. Returns
/// the unescaped contents.
fn parseAttrValue(ctx: *ParseContext, alloc: *Allocator) ![]const u8 {
    const quote = try ctx.consume();
    switch (quote) {
        '"', '\'' => {},
        else => return error.UnexpectedCharacter,
    }

    const begin = ctx.offset;
    // Scan up to and including the matching closing quote.
    while ((ctx.consume() catch return error.UnclosedValue) != quote) {}
    const end = ctx.offset - 1; // exclude the closing quote

    return try dupeAndUnescape(alloc, ctx.source[begin..end]);
}
/// Parses `= "value"` (whitespace allowed around the `=`) and returns the
/// unescaped value.
fn parseEqAttrValue(ctx: *ParseContext, alloc: *Allocator) ![]const u8 {
    _ = ctx.eatWs();
    try ctx.expect('=');
    _ = ctx.eatWs();
    const value = try parseAttrValue(ctx, alloc);
    return value;
}
/// Parses a name and returns it as a slice of the source (no copy is made).
/// Fails with `error.InvalidName` when no name byte is present.
fn parseNameNoDupe(ctx: *ParseContext) ![]const u8 {
    // XML's spec on names is very long, so to make this easier
    // we just take any character that is not special and not whitespace
    const begin = ctx.offset;
    while (ctx.peek()) |ch| {
        const ends_name = switch (ch) {
            ' ', '\t', '\n', '\r', '&', '"', '\'', '<', '>', '?', '=', '/' => true,
            else => false,
        };
        if (ends_name) break;
        _ = ctx.consumeNoEof();
    }
    if (ctx.offset == begin) return error.InvalidName;
    return ctx.source[begin..ctx.offset];
}
/// Consumes character data up to the next `<` or `>` (or end of input).
/// Returns null when there is none, otherwise the unescaped text.
fn tryParseCharData(ctx: *ParseContext, alloc: *Allocator) !?[]const u8 {
    const begin = ctx.offset;
    while (ctx.peek()) |ch| {
        if (ch == '<' or ch == '>') break;
        _ = ctx.consumeNoEof();
    }
    if (ctx.offset == begin) return null;
    return try dupeAndUnescape(alloc, ctx.source[begin..ctx.offset]);
}
/// Parses one content item, trying character data, then a comment, then a
/// nested element. Fails with `error.UnexpectedCharacter` if none matches.
fn parseContent(ctx: *ParseContext, alloc: *Allocator) ParseError!Content {
    if (try tryParseCharData(ctx, alloc)) |cd| {
        return Content{ .CharData = cd };
    }
    if (try tryParseComment(ctx, alloc)) |comment| {
        return Content{ .Comment = comment };
    }
    if (try tryParseElement(ctx, alloc)) |elem| {
        return Content{ .Element = elem };
    }
    return error.UnexpectedCharacter;
}
/// Parses one `name = "value"` attribute. Returns null when no name is
/// present at the cursor; once a name has been read, a malformed remainder
/// is an error.
fn tryParseAttr(ctx: *ParseContext, alloc: *Allocator) !?*Attribute {
    const name = parseNameNoDupe(ctx) catch return null;
    // The `= value` part is handled by the shared helper.
    const value = try parseEqAttrValue(ctx, alloc);

    const attr = try alloc.create(Attribute);
    attr.* = Attribute{
        .name = try mem.dupe(alloc, u8, name),
        .value = value,
    };
    return attr;
}
/// Parses one element, either `<tag attr="v" ... />` or
/// `<tag ...>content</tag>`, allocating the node and its data from `alloc`.
///
/// Returns null, with the context restored to where it started, when the
/// input does not begin with `<` followed by a name. Malformed input after
/// a valid `<name` is an error, including `error.NonMatchingClosingTag`
/// when the closing tag differs and `error.UnexpectedEof` when the element
/// is never closed.
fn tryParseElement(ctx: *ParseContext, alloc: *Allocator) !?*Element {
    // Snapshot the whole cursor, not just the offset, so that backtracking
    // also rewinds the line/column bookkeeping advanced by consuming '<'.
    const saved = ctx.*;
    if (!ctx.eat('<')) return null;
    const tag = parseNameNoDupe(ctx) catch {
        ctx.* = saved;
        return null;
    };

    const element = try alloc.create(Element);
    element.* = Element.init(try std.mem.dupe(alloc, u8, tag), alloc);

    // Attributes must each be preceded by whitespace.
    while (ctx.eatWs()) {
        const attr = (try tryParseAttr(ctx, alloc)) orelse break;
        try element.attributes.append(attr);
    }

    // Self-closing element: no children, no closing tag.
    if (ctx.eatStr("/>")) {
        return element;
    }
    try ctx.expect('>');

    // Collect children until the closing tag opener is reached.
    while (true) {
        if (ctx.peek() == null) {
            return error.UnexpectedEof;
        } else if (ctx.eatStr("</")) {
            break;
        }
        const content = try parseContent(ctx, alloc);
        try element.children.append(content);
    }

    const closing_tag = try parseNameNoDupe(ctx);
    if (!std.mem.eql(u8, tag, closing_tag)) {
        return error.NonMatchingClosingTag;
    }
    _ = ctx.eatWs();
    try ctx.expect('>');
    return element;
}
test "tryParseElement" {
    var arena = std.heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const allocator = &arena.allocator;

    // '<' not followed by a name is not an element, and the cursor must be
    // left where it started.
    {
        var ctx = ParseContext.init("<= a='b'/>");
        const result = try tryParseElement(&ctx, allocator);
        testing.expectEqual(@as(?*Element, null), result);
        testing.expectEqual(@as(?u8, '<'), ctx.peek());
    }

    // Self-closing element with attributes, both quote styles, and
    // whitespace around '='.
    {
        var ctx = ParseContext.init("<python size='15' color = \"green\"/>");
        const python = (try tryParseElement(&ctx, allocator)).?;
        testing.expectEqualSlices(u8, python.tag, "python");

        const attrs = python.attributes.items;
        testing.expectEqualSlices(u8, attrs[0].name, "size");
        testing.expectEqualSlices(u8, attrs[0].value, "15");
        testing.expectEqualSlices(u8, attrs[1].name, "color");
        testing.expectEqualSlices(u8, attrs[1].value, "green");
    }

    // Element with a single text child.
    {
        var ctx = ParseContext.init("<python>test</python>");
        const python = (try tryParseElement(&ctx, allocator)).?;
        testing.expectEqualSlices(u8, python.tag, "python");
        testing.expectEqualSlices(u8, python.children.items[0].CharData, "test");
    }

    // Mixed content: text, child elements and a comment, in document order.
    {
        var ctx = ParseContext.init("<a>b<c/>d<e/>f<!--g--></a>");
        const root = (try tryParseElement(&ctx, allocator)).?;
        testing.expectEqualSlices(u8, root.tag, "a");

        const kids = root.children.items;
        testing.expectEqualSlices(u8, kids[0].CharData, "b");
        testing.expectEqualSlices(u8, kids[1].Element.tag, "c");
        testing.expectEqualSlices(u8, kids[2].CharData, "d");
        testing.expectEqualSlices(u8, kids[3].Element.tag, "e");
        testing.expectEqualSlices(u8, kids[4].CharData, "f");
        testing.expectEqualSlices(u8, kids[5].Comment, "g");
    }
}
/// Attempts to parse an XML declaration (`<?xml version=... ?>`) at the
/// current position.
///
/// Returns null, with `ctx.offset` restored, when the input does not start
/// with `<?` followed by exactly the name `xml`. That includes `<?` followed
/// by something that is not a name at all, which is handled the same way
/// `tryParseElement` backs out of a failed name parse.
///
/// After `<?xml`, `version` is mandatory; `encoding` and `standalone` are
/// optional and are only recognised in that order. The declaration must end
/// with `?>`.
///
/// Errors: whatever `expectWs`, `expectStr` and `parseEqAttrValue` return,
/// allocation failure, and error.InvalidStandaloneValue when the standalone
/// value is neither "yes" nor "no".
fn tryParseProlog(ctx: *ParseContext, alloc: *Allocator) !?*XmlDecl {
    const start = ctx.offset;
    const is_xml_decl = blk: {
        if (!ctx.eatStr("<?")) break :blk false;
        // A failed name parse means "not a declaration", not a hard error.
        const target = parseNameNoDupe(ctx) catch break :blk false;
        break :blk mem.eql(u8, target, "xml");
    };
    if (!is_xml_decl) {
        ctx.offset = start;
        return null;
    }

    const decl = try alloc.create(XmlDecl);
    decl.encoding = null;
    decl.standalone = null;

    // Version info is mandatory
    try ctx.expectWs();
    try ctx.expectStr("version");
    decl.version = try parseEqAttrValue(ctx, alloc);

    if (ctx.eatWs()) {
        // Optional encoding and standalone info
        var require_ws = false;
        if (ctx.eatStr("encoding")) {
            decl.encoding = try parseEqAttrValue(ctx, alloc);
            require_ws = true;
        }

        // If an encoding was read, whitespace must separate it from
        // "standalone". If not, the whitespace was already consumed by the
        // enclosing `if`, so eatWs is expected to find none here.
        if (require_ws == ctx.eatWs() and ctx.eatStr("standalone")) {
            const standalone = try parseEqAttrValue(ctx, alloc);
            if (std.mem.eql(u8, standalone, "yes")) {
                decl.standalone = true;
            } else if (std.mem.eql(u8, standalone, "no")) {
                decl.standalone = false;
            } else {
                return error.InvalidStandaloneValue;
            }
        }

        _ = ctx.eatWs();
    }

    try ctx.expectStr("?>");
    return decl;
}
test "tryParseProlog" {
    var arena = std.heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const allocator = &arena.allocator;

    // A target that merely starts with "xml" is not a declaration, and the
    // cursor must be left at the '<'.
    {
        var ctx = ParseContext.init("<?xmla version='aa'?>");
        const result = try tryParseProlog(&ctx, allocator);
        testing.expectEqual(@as(?*XmlDecl, null), result);
        testing.expectEqual(@as(?u8, '<'), ctx.peek());
    }

    // Only the mandatory version is present.
    {
        var ctx = ParseContext.init("<?xml version='aa'?>");
        const prolog = (try tryParseProlog(&ctx, allocator)).?;
        testing.expectEqualSlices(u8, "aa", prolog.version);
        testing.expectEqual(@as(?[]const u8, null), prolog.encoding);
        testing.expectEqual(@as(?bool, null), prolog.standalone);
    }

    // All three fields, with assorted whitespace around '='.
    {
        var ctx = ParseContext.init("<?xml version=\"aa\" encoding = 'bbb' standalone \t = 'yes'?>");
        const prolog = (try tryParseProlog(&ctx, allocator)).?;
        testing.expectEqualSlices(u8, "aa", prolog.version);
        testing.expectEqualSlices(u8, "bbb", prolog.encoding.?);
        testing.expectEqual(@as(?bool, true), prolog.standalone.?);
    }
}
/// Skips zero or more consecutive comments, together with any whitespace
/// that follows each one.
///
/// `tryParseComment` returns a copy of the comment text allocated from
/// `alloc`; since the text is not wanted here it is released immediately
/// rather than left allocated.
/// Errors from `tryParseComment` (e.g. an unclosed comment) propagate.
fn trySkipComments(ctx: *ParseContext, alloc: *Allocator) !void {
    while (try tryParseComment(ctx, alloc)) |comment| {
        alloc.free(comment);
        _ = ctx.eatWs();
    }
}
/// Attempts to parse a comment (`<!-- ... -->`) at the current position.
///
/// Returns null when the input does not start with `<!--`. Otherwise
/// consumes through the first `-->` and returns a copy, allocated with
/// `alloc`, of the text between the two delimiters.
/// Returns error.UnclosedComment when input ends before `-->` is found.
fn tryParseComment(ctx: *ParseContext, alloc: *Allocator) !?[]const u8 {
    const closer = "-->";

    if (!ctx.eatStr("<!--")) return null;
    const body_start = ctx.offset;

    // Advance one character at a time until the closing delimiter has been
    // consumed; running out of input first means the comment never ended.
    while (true) {
        if (ctx.eatStr(closer)) break;
        _ = ctx.consume() catch return error.UnclosedComment;
    }

    // The cursor now sits just past "-->", which is not part of the text.
    const body = ctx.source[body_start .. ctx.offset - closer.len];
    return try mem.dupe(alloc, u8, body);
}
/// Maps one complete predefined entity reference - including its leading
/// '&' and trailing ';' - to the single byte it stands for.
///
/// Recognised: `&lt;`, `&gt;`, `&amp;`, `&apos;`, `&quot;`.
/// Any other text yields error.InvalidEntity.
fn unescapeEntity(text: []const u8) !u8 {
    // Parallel tables: names[i] is replaced by chars[i].
    const names = [_][]const u8{ "&lt;", "&gt;", "&amp;", "&apos;", "&quot;" };
    const chars = [names.len]u8{ '<', '>', '&', '\'', '"' };

    var idx: usize = 0;
    while (idx < names.len) : (idx += 1) {
        if (std.mem.eql(u8, text, names[idx])) return chars[idx];
    }
    return error.InvalidEntity;
}
/// Returns a copy of `text`, allocated with `alloc`, in which every entity
/// reference accepted by `unescapeEntity` has been replaced by the byte it
/// denotes.
///
/// Returns error.InvalidEntity when a '&' has no terminating ';' anywhere
/// after it, or when the text from '&' through ';' is not a recognised
/// entity. The working buffer is released on those error paths.
fn dupeAndUnescape(alloc: *Allocator, text: []const u8) ![]const u8 {
    // Each entity reference collapses to a single byte, so the output can
    // never be longer than the input.
    const str = try alloc.alloc(u8, text.len);
    // Without this the buffer would be leaked on every InvalidEntity return.
    errdefer alloc.free(str);

    var j: usize = 0; // write position in `str`
    var i: usize = 0; // read position in `text`
    while (i < text.len) : (j += 1) {
        if (text[i] == '&') {
            // The entity spans from '&' through the next ';' inclusive.
            const entity_end = 1 + (mem.indexOfScalarPos(u8, text, i, ';') orelse return error.InvalidEntity);
            str[j] = try unescapeEntity(text[i..entity_end]);
            i = entity_end;
        } else {
            str[j] = text[i];
            i += 1;
        }
    }

    // Trim the buffer down to the bytes actually written.
    return alloc.shrink(str, j);
}
test "dupeAndUnescape" {
    var arena = std.heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const allocator = &arena.allocator;

    // Text without entities is copied unchanged.
    testing.expectEqualSlices(u8, "test", try dupeAndUnescape(allocator, "test"));

    // Every supported entity, including one at the very end of the input.
    const escaped = "a&lt;b&amp;c&gt;d&quot;e&apos;f&lt;";
    testing.expectEqualSlices(u8, "a<b&c>d\"e'f<", try dupeAndUnescape(allocator, escaped));

    // Unterminated or unknown entities are rejected.
    const malformed = [_][]const u8{
        "python&",
        "python&&",
        "python&test;",
        "python&boa",
    };
    for (malformed) |input| {
        testing.expectError(error.InvalidEntity, dupeAndUnescape(allocator, input));
    }
}
test "Top level comments" {
    var arena = std.heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const allocator = &arena.allocator;

    // Comments both before and after the root element must not prevent the
    // root from being found.
    const source = "<?xml version='aa'?><!--comment--><python color='green'/><!--another comment-->";
    const doc = try parse(allocator, source);
    testing.expectEqualSlices(u8, "python", doc.root.tag);
}