From c7bff8a5e7453f4e91b22b9d2aecb5ddaca5bdf1 Mon Sep 17 00:00:00 2001
From: Emil Lerch <emil@lerch.org>
Date: Fri, 23 Jul 2021 14:04:12 -0700
Subject: [PATCH] correct remaining to_snake issues in service manifest

---
 codegen/src/main.zig  |  22 ++++++-
 codegen/src/snake.zig | 141 ++++++++++++++++++++++++++++++------------
 2 files changed, 122 insertions(+), 41 deletions(-)

diff --git a/codegen/src/main.zig b/codegen/src/main.zig
index d47de74..2d6bc4c 100644
--- a/codegen/src/main.zig
+++ b/codegen/src/main.zig
@@ -106,7 +106,7 @@ fn generateServices(allocator: *std.mem.Allocator, comptime _: []const u8, file:
         // Service struct
         // name of the field will be snake_case of whatever comes in from
         // sdk_id. Not sure this will simple...
-        const constant_name = try snake.fromPascalCase(allocator, sdk_id);
+        const constant_name = try constantName(allocator, sdk_id);
         try constant_names.append(constant_name);
         try writer.print("const Self = @This();\n", .{});
         try writer.print("pub const version: []const u8 = \"{s}\";\n", .{version});
@@ -132,6 +132,26 @@ fn generateServices(allocator: *std.mem.Allocator, comptime _: []const u8, file:
     }
     return constant_names.toOwnedSlice();
 }
+fn constantName(allocator: *std.mem.Allocator, id: []const u8) ![]const u8 {
+    // There are some ids that don't follow consistent rules, so we'll
+    // look for the exceptions and, if not found, revert to the snake case
+    // algorithm
+
+    // This one might be a bug in snake, but it's the only example so HPDL
+    if (std.mem.eql(u8, id, "SESv2")) return try std.fmt.allocPrint(allocator, "ses_v2", .{});
+    // IoT is an acronym, but snake wouldn't know that. Interestingly not all
+    // iot services are capitalizing that way.
+    if (std.mem.eql(u8, id, "IoTSiteWise")) return try std.fmt.allocPrint(allocator, "iot_site_wise", .{}); //sitewise?
+    if (std.mem.eql(u8, id, "IoTFleetHub")) return try std.fmt.allocPrint(allocator, "iot_fleet_hub", .{});
+    if (std.mem.eql(u8, id, "IoTSecureTunneling")) return try std.fmt.allocPrint(allocator, "iot_secure_tunneling", .{});
+    if (std.mem.eql(u8, id, "IoTThingsGraph")) return try std.fmt.allocPrint(allocator, "iot_things_graph", .{});
+    // snake turns this into dev_ops, which is a little weird
+    if (std.mem.eql(u8, id, "DevOps Guru")) return try std.fmt.allocPrint(allocator, "devops_guru", .{});
+    if (std.mem.eql(u8, id, "FSx")) return try std.fmt.allocPrint(allocator, "fsx", .{});
+
+    // Not a special case - just snake it
+    return try snake.fromPascalCase(allocator, id);
+}
 fn generateOperation(allocator: *std.mem.Allocator, operation: smithy.ShapeInfo, shapes: anytype, writer: anytype, service: []const u8) !void {
     const snake_case_name = try snake.fromPascalCase(allocator, operation.name);
     defer allocator.free(snake_case_name);
diff --git a/codegen/src/snake.zig b/codegen/src/snake.zig
index 600ff9e..2747793 100644
--- a/codegen/src/snake.zig
+++ b/codegen/src/snake.zig
@@ -2,59 +2,97 @@ const std = @import("std");
 const expectEqualStrings = std.testing.expectEqualStrings;
 
 pub fn fromPascalCase(allocator: *std.mem.Allocator, name: []const u8) ![]u8 {
+    const rc = try allocator.alloc(u8, name.len * 2); // This is overkill, but is > the maximum length possibly needed
+    errdefer allocator.free(rc);
     var utf8_name = (std.unicode.Utf8View.init(name) catch unreachable).iterator();
     var target_inx: u64 = 0;
-    var previous_codepoint: ?u21 = null;
-    var cp = utf8_name.nextCodepoint();
-    if (cp == null) {
-        return try allocator.dupeZ(u8, name);
-    } // TODO: fix bug if single letter uppercase
-    var codepoint = cp.?;
-    const rc = try allocator.alloc(u8, name.len * 2); // This is overkill, but is > the maximum length possibly needed
-    while (utf8_name.nextCodepoint()) |next_codepoint| {
-        if (codepoint > 0xff) return error{UnicodeNotSupported}.UnicodeNotSupported;
-        if (next_codepoint > 0xff) return error{UnicodeNotSupported}.UnicodeNotSupported;
-        const ascii_char = @truncate(u8, codepoint);
-        if (next_codepoint == ' ') continue; // ignore all spaces in name
-        if (ascii_char >= 'A' and ascii_char < 'Z') {
-            const lowercase_char = ascii_char + ('a' - 'A');
-            if (previous_codepoint == null) {
-                rc[target_inx] = lowercase_char;
-                target_inx = target_inx + 1;
+    var curr_char = (try isAscii(utf8_name.nextCodepoint())).?;
+    target_inx = setNext(lowercase(curr_char), rc, target_inx);
+    var prev_char = curr_char;
+    if (try isAscii(utf8_name.nextCodepoint())) |ch| {
+        curr_char = ch;
+    } else {
+        // Single character only - we're done here
+        _ = setNext(0, rc, target_inx);
+        return rc[0..target_inx];
+    }
+    while (try isAscii(utf8_name.nextCodepoint())) |next_char| {
+        if (next_char == ' ') {
+            // a space shouldn't be happening. But if it does, it clues us
+            // in pretty well:
+            //
+            // MyStuff Is Awesome
+            //       |^
+            //       |next_char
+            //       ^
+            //       curr_char (and target_inx)
+            target_inx = setNext(lowercase(curr_char), rc, target_inx);
+            target_inx = setNext('_', rc, target_inx);
+            curr_char = (try isAscii(utf8_name.nextCodepoint())).?;
+            target_inx = setNext(lowercase(curr_char), rc, target_inx);
+            prev_char = curr_char;
+            curr_char = (try isAscii(utf8_name.nextCodepoint())).?;
+            continue;
+        }
+        if (between(curr_char, 'A', 'Z')) {
+            if (isAcronym(curr_char, next_char)) {
+                // We could be in an acronym at the start of a word. This
+                // is the only case where we actually need to look back at the
+                // previous character, and if that's the case, throw in an
+                // underscore
+                // e.g. "SAMLMySAMLAcronymThing"
+                if (between(prev_char, 'a', 'z'))
+                    target_inx = setNext('_', rc, target_inx);
+
+                //we are in an acronym - don't snake, just lower
+                target_inx = setNext(lowercase(curr_char), rc, target_inx);
             } else {
-                if (next_codepoint >= 'A' and next_codepoint <= 'Z' and previous_codepoint.? >= 'A' and previous_codepoint.? <= 'Z') {
-                    //we are in an acronym - don't snake, just lower
-                    rc[target_inx] = lowercase_char;
-                    target_inx = target_inx + 1;
-                } else {
-                    rc[target_inx] = '_';
-                    rc[target_inx + 1] = lowercase_char;
-                    target_inx = target_inx + 2;
-                }
+                target_inx = setNext('_', rc, target_inx);
+                target_inx = setNext(lowercase(curr_char), rc, target_inx);
             }
         } else {
-            // if (ascii_char == ' ') {
-            //     rc[target_inx] = '_';
-            // } else {
-            rc[target_inx] = ascii_char;
-            // }
-            target_inx = target_inx + 1;
+            target_inx = setNext(curr_char, rc, target_inx);
         }
-        previous_codepoint = codepoint;
-        codepoint = next_codepoint;
+        prev_char = curr_char;
+        curr_char = next_char;
     }
     // work in the last codepoint - force lowercase
-    rc[target_inx] = @truncate(u8, codepoint);
-    if (rc[target_inx] >= 'A' and rc[target_inx] <= 'Z') {
-        const lowercase_char = rc[target_inx] + ('a' - 'A');
-        rc[target_inx] = lowercase_char;
-    }
-    target_inx = target_inx + 1;
+    target_inx = setNext(lowercase(curr_char), rc, target_inx);
 
     rc[target_inx] = 0;
     return rc[0..target_inx];
 }
 
+fn isAcronym(char1: u8, char2: u8) bool {
+    return isAcronymChar(char1) and isAcronymChar(char2);
+}
+fn isAcronymChar(char: u8) bool {
+    return between(char, 'A', 'Z') or between(char, '0', '9');
+}
+fn isAscii(codepoint: ?u21) !?u8 {
+    if (codepoint) |cp| {
+        if (cp > 0xff) return error.UnicodeNotSupported;
+        return @truncate(u8, cp);
+    }
+    return null;
+}
+
+fn setNext(ascii: u8, slice: []u8, inx: u64) u64 {
+    slice[inx] = ascii;
+    return inx + 1;
+}
+
+fn lowercase(ascii: u8) u8 {
+    var lowercase_char = ascii;
+    if (between(ascii, 'A', 'Z'))
+        lowercase_char = ascii + ('a' - 'A');
+    return lowercase_char;
+}
+
+fn between(char: u8, from: u8, to: u8) bool {
+    return char >= from and char <= to;
+}
+
 test "converts from PascalCase to snake_case" {
     const allocator = std.testing.allocator;
     const snake_case = try fromPascalCase(allocator, "MyPascalCaseThing");
@@ -73,3 +111,26 @@ test "spaces in the name" {
     defer allocator.free(snake_case);
     try expectEqualStrings("api_gateway", snake_case);
 }
+
+test "S3" {
+    const allocator = std.testing.allocator;
+    const snake_case = try fromPascalCase(allocator, "S3");
+    defer allocator.free(snake_case);
+    try expectEqualStrings("s3", snake_case);
+}
+
+test "ec2" {
+    const allocator = std.testing.allocator;
+    const snake_case = try fromPascalCase(allocator, "EC2");
+    defer allocator.free(snake_case);
+    try expectEqualStrings("ec2", snake_case);
+}
+
+test "IoT 1Click Devices Service" {
+    const allocator = std.testing.allocator;
+    const snake_case = try fromPascalCase(allocator, "IoT 1Click Devices Service");
+    defer allocator.free(snake_case);
+    // NOTE: There is some debate among humans about what this should
+    // turn into. Should it be iot_1click_... or iot_1_click...?
+    try expectEqualStrings("iot_1_click_devices_service", snake_case);
+}