From 9cbb03256bc4dc846232c37b710ead004ec4555f Mon Sep 17 00:00:00 2001
From: David Gonzalez Martin <davidgmbb@gmail.com>
Date: Wed, 19 Feb 2025 19:13:05 -0600
Subject: [PATCH] Parse simple file with null storage and result

---
 build.zig      | 182 +++++++++++++---------
 src/LLVM.zig   |   7 +-
 src/main.zig   |   7 +-
 src/parser.zig | 401 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 517 insertions(+), 80 deletions(-)
 create mode 100644 src/parser.zig

diff --git a/build.zig b/build.zig
index 50a0d2b..8daab98 100644
--- a/build.zig
+++ b/build.zig
@@ -49,80 +49,83 @@ const LLVM = struct {
     module: *std.Build.Module,
 
     fn setup(b: *std.Build, path: []const u8, target: std.Build.ResolvedTarget, optimize: std.builtin.OptimizeMode) !LLVM {
-        var llvm_libs = std.ArrayList([]const u8).init(b.allocator);
-        var flags = std.ArrayList([]const u8).init(b.allocator);
-        const llvm_config_path = if (b.option([]const u8, "llvm_prefix", "LLVM prefix")) |llvm_prefix| blk: {
-            const full_path = try std.mem.concat(b.allocator, u8, &.{ llvm_prefix, "/bin/llvm-config" });
-            const f = std.fs.cwd().openFile(full_path, .{}) catch return error.llvm_not_found;
-            f.close();
-            break :blk full_path;
-        } else executable_find_in_path(b.allocator, "llvm-config", path) orelse return error.llvm_not_found;
-        const llvm_components_result = try run_process_and_capture_stdout(b, &.{ llvm_config_path, "--components" });
-        var it = std.mem.splitScalar(u8, llvm_components_result, ' ');
-        var args = std.ArrayList([]const u8).init(b.allocator);
-        try args.append(llvm_config_path);
-        try args.append("--libs");
-        while (it.next()) |component| {
-            try args.append(std.mem.trimRight(u8, component, "\n"));
+        if (enable_llvm) {
+            var llvm_libs = std.ArrayList([]const u8).init(b.allocator);
+            var flags = std.ArrayList([]const u8).init(b.allocator);
+            const llvm_config_path = if (b.option([]const u8, "llvm_prefix", "LLVM prefix")) |llvm_prefix| blk: {
+                const full_path = try std.mem.concat(b.allocator, u8, &.{ llvm_prefix, "/bin/llvm-config" });
+                const f = std.fs.cwd().openFile(full_path, .{}) catch return error.llvm_not_found;
+                f.close();
+                break :blk full_path;
+            } else executable_find_in_path(b.allocator, "llvm-config", path) orelse return error.llvm_not_found;
+            const llvm_components_result = try run_process_and_capture_stdout(b, &.{ llvm_config_path, "--components" });
+            var it = std.mem.splitScalar(u8, llvm_components_result, ' ');
+            var args = std.ArrayList([]const u8).init(b.allocator);
+            try args.append(llvm_config_path);
+            try args.append("--libs");
+            while (it.next()) |component| {
+                try args.append(std.mem.trimRight(u8, component, "\n"));
+            }
+            const llvm_libs_result = try run_process_and_capture_stdout(b, args.items);
+            it = std.mem.splitScalar(u8, llvm_libs_result, ' ');
+
+            while (it.next()) |lib| {
+                const llvm_lib = std.mem.trimLeft(u8, std.mem.trimRight(u8, lib, "\n"), "-l");
+                try llvm_libs.append(llvm_lib);
+            }
+
+            const llvm_cxx_flags_result = try run_process_and_capture_stdout(b, &.{ llvm_config_path, "--cxxflags" });
+            it = std.mem.splitScalar(u8, llvm_cxx_flags_result, ' ');
+            while (it.next()) |flag| {
+                const llvm_cxx_flag = std.mem.trimRight(u8, flag, "\n");
+                try flags.append(llvm_cxx_flag);
+            }
+
+            const llvm_lib_dir = std.mem.trimRight(u8, try run_process_and_capture_stdout(b, &.{ llvm_config_path, "--libdir" }), "\n");
+
+            if (optimize != .ReleaseSmall) {
+                try flags.append("-g");
+            }
+
+            try flags.append("-fno-rtti");
+
+            const llvm = b.createModule(.{
+                .target = target,
+                .optimize = optimize,
+            });
+
+            llvm.addLibraryPath(.{ .cwd_relative = llvm_lib_dir });
+
+            llvm.addCSourceFiles(.{
+                .files = &.{"src/llvm.cpp"},
+                .flags = flags.items,
+            });
+            llvm.addIncludePath(.{ .cwd_relative = "/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1" });
+            llvm.addIncludePath(.{ .cwd_relative = "/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1/x86_64-pc-linux-gnu" });
+            llvm.addObjectFile(.{ .cwd_relative = "/usr/lib/libstdc++.so.6" });
+
+            const needed_libraries: []const []const u8 = &.{ "unwind", "z" };
+
+            const lld_libs: []const []const u8 = &.{ "lldCommon", "lldCOFF", "lldELF", "lldMachO", "lldMinGW", "lldWasm" };
+
+            for (needed_libraries) |lib| {
+                llvm.linkSystemLibrary(lib, .{});
+            }
+
+            for (llvm_libs.items) |lib| {
+                llvm.linkSystemLibrary(lib, .{});
+            }
+
+            for (lld_libs) |lib| {
+                llvm.linkSystemLibrary(lib, .{});
+            }
+
+            return LLVM{
+                .module = llvm,
+            };
+        } else {
+            return undefined;
         }
-        const llvm_libs_result = try run_process_and_capture_stdout(b, args.items);
-        it = std.mem.splitScalar(u8, llvm_libs_result, ' ');
-
-        while (it.next()) |lib| {
-            const llvm_lib = std.mem.trimLeft(u8, std.mem.trimRight(u8, lib, "\n"), "-l");
-            try llvm_libs.append(llvm_lib);
-        }
-
-        const llvm_cxx_flags_result = try run_process_and_capture_stdout(b, &.{ llvm_config_path, "--cxxflags" });
-        it = std.mem.splitScalar(u8, llvm_cxx_flags_result, ' ');
-        while (it.next()) |flag| {
-            const llvm_cxx_flag = std.mem.trimRight(u8, flag, "\n");
-            try flags.append(llvm_cxx_flag);
-        }
-
-        const llvm_lib_dir = std.mem.trimRight(u8, try run_process_and_capture_stdout(b, &.{ llvm_config_path, "--libdir" }), "\n");
-
-        if (optimize != .ReleaseSmall) {
-            try flags.append("-g");
-        }
-
-        try flags.append("-fno-rtti");
-
-        const llvm = b.createModule(.{
-            .target = target,
-            .optimize = optimize,
-        });
-
-        llvm.addLibraryPath(.{ .cwd_relative = llvm_lib_dir });
-
-        llvm.addCSourceFiles(.{
-            .files = &.{"src/llvm.cpp"},
-            .flags = flags.items,
-        });
-        llvm.addIncludePath(.{ .cwd_relative = "/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1" });
-        llvm.addIncludePath(.{ .cwd_relative = "/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1/x86_64-pc-linux-gnu" });
-
-        const needed_libraries: []const []const u8 = &.{ "unwind", "z" };
-
-        llvm.addObjectFile(.{ .cwd_relative = "/usr/lib/libstdc++.so.6" });
-
-        const lld_libs: []const []const u8 = &.{ "lldCommon", "lldCOFF", "lldELF", "lldMachO", "lldMinGW", "lldWasm" };
-
-        for (needed_libraries) |lib| {
-            llvm.linkSystemLibrary(lib, .{});
-        }
-
-        for (llvm_libs.items) |lib| {
-            llvm.linkSystemLibrary(lib, .{});
-        }
-
-        for (lld_libs) |lib| {
-            llvm.linkSystemLibrary(lib, .{});
-        }
-
-        return LLVM{
-            .module = llvm,
-        };
     }
 
     fn link(llvm: LLVM, target: *std.Build.Step.Compile) void {
@@ -135,17 +138,34 @@ const LLVM = struct {
     }
 };
 
+/// Creates a run step named "debug <exe name>" that launches `gdb -ex r` with the
+/// built artifact as its final argument.
+fn debug_binary(b: *std.Build, exe: *std.Build.Step.Compile) *std.Build.Step.Run {
+    const step_name = b.fmt("debug {s}", .{exe.name});
+    const gdb = std.Build.Step.Run.create(b, step_name);
+    gdb.addArgs(&.{ "gdb", "-ex", "r" });
+    gdb.addArtifactArg(exe);
+    return gdb;
+}
+
+var enable_llvm: bool = undefined;
+
 pub fn build(b: *std.Build) !void {
     const target = b.standardTargetOptions(.{});
     const optimize = b.standardOptimizeOption(.{});
+    enable_llvm = b.option(bool, "enable_llvm", "Enable LLVM") orelse false;
     const env = try std.process.getEnvMap(b.allocator);
     const path = env.get("PATH") orelse unreachable;
 
+    const configuration = b.addOptions();
+    configuration.addOption(bool, "enable_llvm", enable_llvm);
+
     const exe_mod = b.createModule(.{
         .root_source_file = b.path("src/main.zig"),
         .target = target,
         .optimize = optimize,
     });
+    exe_mod.addOptions("configuration", configuration);
 
     const llvm = try LLVM.setup(b, path, target, optimize);
 
@@ -153,8 +173,11 @@ pub fn build(b: *std.Build) !void {
         .name = "bloat-buster",
         .root_module = exe_mod,
     });
+    exe.linkLibC();
 
-    llvm.link(exe);
+    if (enable_llvm) {
+        llvm.link(exe);
+    }
 
     b.installArtifact(exe);
 
@@ -166,10 +189,19 @@ pub fn build(b: *std.Build) !void {
     const run_step = b.step("run", "Run the app");
     run_step.dependOn(&run_cmd.step);
 
+    const debug_cmd = debug_binary(b, exe);
+    const debug_step = b.step("debug", "Debug the app");
+    debug_step.dependOn(&debug_cmd.step);
+
     const exe_unit_tests = b.addTest(.{
         .root_module = exe_mod,
     });
-    llvm.link(exe_unit_tests);
+    exe_unit_tests.linkLibC();
+
+    if (enable_llvm) {
+        llvm.link(exe_unit_tests); // the test binary needs LLVM too; `exe` is linked where it is created
+    }
+
     const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests);
 
     const test_step = b.step("test", "Run unit tests");
diff --git a/src/LLVM.zig b/src/LLVM.zig
index a894a30..24d9c21 100644
--- a/src/LLVM.zig
+++ b/src/LLVM.zig
@@ -2,6 +2,7 @@ const lib = @import("lib.zig");
 const Arena = lib.Arena;
 const assert = lib.assert;
 const api = @import("llvm_api.zig");
+const configuration = @import("configuration");
 
 /// This is a String which ABI-compatible with C++
 pub const String = extern struct {
@@ -757,7 +758,11 @@ const LldArgvBuilder = struct {
     }
 };
 
-pub fn experiment() void {
+test "experiment" {
+    if (!configuration.enable_llvm) {
+        return error.SkipZigTest;
+    }
+
     const thread = &global.threads[0];
     thread.initialize();
     const module = thread.context.create_module("first_module");
diff --git a/src/main.zig b/src/main.zig
index 44d8658..cf1ec89 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -1,5 +1,6 @@
 const lib = @import("lib.zig");
 const llvm = @import("LLVM.zig");
+const parser = @import("parser.zig");
 const Arena = lib.Arena;
 
 pub const panic = struct {
@@ -138,10 +139,7 @@ pub const panic = struct {
 var global_persistent_arena: *Arena = undefined;
 
 pub fn main() callconv(.C) c_int {
-    lib.GlobalState.initialize();
-
-    llvm.initialize_all();
-    llvm.experiment();
+    parser.parser_experiment();
     return 0;
 }
 
@@ -156,4 +154,5 @@ comptime {
 test {
     _ = lib;
     _ = llvm;
+    _ = parser;
 }
diff --git a/src/parser.zig b/src/parser.zig
new file mode 100644
index 0000000..6eb196e
--- /dev/null
+++ b/src/parser.zig
@@ -0,0 +1,401 @@
+const lib = @import("lib.zig");
+const assert = lib.assert;
+
+const LexerResult = struct {
+    token: Token,
+    offset: u32,
+    character_count: u32,
+};
+
+const Token = enum {};
+
+const left_bracket = '[';
+const right_bracket = ']';
+const left_brace = '{';
+const right_brace = '}';
+const left_parenthesis = '(';
+const right_parenthesis = ')';
+
+fn is_identifier_start_ch(ch: u8) bool {
+    return (ch >= 'a' and ch <= 'z') or (ch >= 'A' and ch <= 'Z') or ch == '_';
+}
+
+fn is_decimal_ch(ch: u8) bool {
+    return ch >= '0' and ch <= '9';
+}
+
+fn is_identifier_ch(ch: u8) bool {
+    return is_identifier_start_ch(ch) or is_decimal_ch(ch);
+}
+
+fn string_to_enum(comptime E: type, string: []const u8) ?E {
+    inline for (@typeInfo(E).@"enum".fields) |e| {
+        if (lib.string.equal(e.name, string)) {
+            return @field(E, e.name);
+        }
+    } else return null;
+}
+
+/// Returns the identifier that begins at `start`, as a slice of `content`.
+/// The result is empty when `start` is at the end of `content` or the byte there cannot start
+/// an identifier.
+pub fn parse_identifier(content: []const u8, start: u32) []const u8 {
+    var offset = start;
+
+    // Check the bounds first so that callers probing at the end of the input get an empty slice.
+    if (offset < content.len and is_identifier_start_ch(content[offset])) {
+        offset += 1;
+
+        while (offset < content.len and is_identifier_ch(content[offset])) {
+            offset += 1;
+        }
+    }
+
+    return content[start..offset];
+}
+
+const GlobalKeyword = enum {
+    @"export",
+    @"extern",
+};
+
+const GlobalKind = enum {
+    @"fn",
+    foo,
+};
+
+const FunctionKeyword = enum {
+    cc,
+    foo,
+};
+
+const CallingConvention = enum {
+    unknown,
+    c,
+};
+
+fn report_error() noreturn {
+    lib.os.abort();
+}
+
+fn is_space(ch: u8) bool {
+    return ((ch == ' ') or (ch == '\n')) or ((ch == '\t' or ch == '\r'));
+}
+
+/// Returns the offset of the first byte at or after `start` for which `is_space` is false;
+/// the scan stops at the end of `content`.
+fn skip_space(content: []const u8, start: u32) u32 {
+    var cursor = start;
+    while (cursor < content.len and is_space(content[cursor])) : (cursor += 1) {}
+    return cursor;
+}
+
+const StatementStartKeyword = enum {
+    @"return",
+    foooooooooo,
+};
+
+/// Parses the integer literal that begins at `content[start]`.
+/// Returns the offset of the first byte after the literal.
+/// Only the zero literal is implemented; hexadecimal, octal, binary and non-zero decimal
+/// literals currently abort through `report_error`.
+fn parse_integer(content: []const u8, start: u32) u32 {
+    const integer_start_ch = content[start];
+    assert(!is_space(integer_start_ch));
+    assert(is_decimal_ch(integer_start_ch));
+
+    var offset = start;
+
+    switch (integer_start_ch) {
+        '0' => {
+            offset += 1;
+
+            // A `0` that ends the input is still a zero literal: use a NUL lookahead rather than
+            // reading one byte past the end of `content`.
+            const next_ch: u8 = if (offset < content.len) content[offset] else 0;
+            switch (next_ch) {
+                // TODO: parse hexadecimal
+                'x' => report_error(),
+                // TODO: parse octal
+                'o' => report_error(),
+                // TODO: parse binary
+                'b' => report_error(),
+                // A zero followed by another decimal digit is rejected
+                '0'...'9' => report_error(),
+                // Zero literal
+                else => {},
+            }
+        },
+        // TODO: decimal number
+        '1'...'9' => report_error(),
+        else => unreachable,
+    }
+
+    return offset;
+}
+
+/// Parses the value that follows any whitespace at `start` and returns the offset just past it.
+/// Only integer literals are supported so far; an identifier, any other character, or the end
+/// of the input aborts through `report_error`.
+fn parse_value(content: []const u8, start: u32) u32 {
+    const value_start = skip_space(content, start);
+
+    // Check the bounds before indexing: `return` at the very end of the input has no value to read.
+    if (value_start == content.len) {
+        report_error();
+    }
+    // TODO: identifier values; for now only a decimal digit may start a value.
+    if (!is_decimal_ch(content[value_start])) report_error();
+
+    return parse_integer(content, value_start);
+}
+
+/// Parses a brace-delimited block, skipping any whitespace before the opening `{`.
+/// Returns the offset just past the closing `}`. Malformed or truncated input aborts through
+/// `report_error`; `return <value>;` is the only statement recognized so far.
+fn parse_block(content: []const u8, start: u32) u32 {
+    var offset = skip_space(content, start);
+
+    // Check the bounds before indexing so that input ending here is reported as an error.
+    const is_left_brace = offset < content.len and content[offset] == left_brace;
+    offset += @intFromBool(is_left_brace);
+    if (!is_left_brace) {
+        report_error();
+    }
+
+    while (true) {
+        offset = skip_space(content, offset);
+
+        // Running out of input before the closing brace is a syntax error.
+        if (offset == content.len) {
+            report_error();
+        }
+
+        if (content[offset] == right_brace) {
+            break;
+        }
+
+        const statement_start_ch = content[offset];
+        if (!is_identifier_start_ch(statement_start_ch)) {
+            report_error();
+        }
+
+        // The start character was just validated, so the identifier has at least one character.
+        const statement_start_identifier = parse_identifier(content, offset);
+        offset += @intCast(statement_start_identifier.len);
+
+        const statement_start_keyword = string_to_enum(StatementStartKeyword, statement_start_identifier) orelse report_error();
+
+        // Keywords come from the source text, so an unimplemented one is an error, not `unreachable`.
+        switch (statement_start_keyword) {
+            .@"return" => {
+                offset = parse_value(content, offset);
+            },
+            else => report_error(),
+        }
+
+        const require_semicolon = switch (statement_start_keyword) {
+            .@"return" => true,
+            else => report_error(),
+        };
+
+        const is_semicolon = offset < content.len and content[offset] == ';';
+        offset += @intFromBool(is_semicolon);
+
+        if (require_semicolon and !is_semicolon) {
+            report_error();
+        }
+    }
+
+    // The loop only exits on the closing brace, so step past it.
+    offset += 1;
+
+    return offset;
+}
+
+pub noinline fn parse_file(content: []const u8) void {
+    var offset: u32 = 0;
+
+    while (true) {
+        offset = skip_space(content, offset);
+
+        if (offset == content.len) {
+            break;
+        }
+
+        var is_export = false;
+
+        if (content[offset] == left_bracket) {
+            offset += 1;
+
+            while (offset < content.len) {
+                const global_keyword_string = parse_identifier(content, offset);
+                offset += @intCast(global_keyword_string.len);
+
+                if (global_keyword_string.len == 0) {
+                    break;
+                }
+
+                const global_keyword = string_to_enum(GlobalKeyword, global_keyword_string) orelse report_error();
+                switch (global_keyword) {
+                    .@"export" => is_export = true, // record that the `export` keyword was seen
+                    else => report_error(),
+                }
+
+                switch (content[offset]) {
+                    right_bracket => {},
+                    else => report_error(),
+                }
+            }
+
+            const is_right_bracket = content[offset] == right_bracket;
+            offset += @intFromBool(is_right_bracket);
+
+            if (!is_right_bracket) {
+                report_error();
+            }
+
+            offset = skip_space(content, offset);
+        }
+
+        const global_name = parse_identifier(content, offset);
+        offset += @intCast(global_name.len);
+
+        if (global_name.len == 0) {
+            report_error();
+        }
+
+        offset = skip_space(content, offset);
+
+        const is_equal_token = content[offset] == '=';
+        offset += @intFromBool(is_equal_token);
+
+        if (!is_equal_token) {
+            report_error();
+        }
+
+        offset = skip_space(content, offset);
+
+        const global_kind_string = parse_identifier(content, offset);
+        offset += @intCast(global_kind_string.len);
+
+        offset = skip_space(content, offset);
+
+        if (global_kind_string.len == 0) {
+            report_error();
+        }
+
+        const global_kind = string_to_enum(GlobalKind, global_kind_string) orelse report_error();
+
+        switch (global_kind) {
+            .@"fn" => {
+                var calling_convention = CallingConvention.unknown;
+
+                if (content[offset] == left_bracket) {
+                    offset += 1;
+
+                    while (offset < content.len) {
+                        const function_identifier = parse_identifier(content, offset);
+                        offset += @intCast(function_identifier.len);
+
+                        if (function_identifier.len == 0) {
+                            break;
+                        }
+
+                        const function_keyword = string_to_enum(FunctionKeyword, function_identifier) orelse report_error();
+
+                        offset = skip_space(content, offset);
+
+                        switch (function_keyword) {
+                            .cc => {
+                                const is_left_parenthesis = content[offset] == left_parenthesis;
+                                offset += @intFromBool(is_left_parenthesis);
+
+                                if (!is_left_parenthesis) {
+                                    report_error();
+                                }
+
+                                offset = skip_space(content, offset);
+
+                                const calling_convention_string = parse_identifier(content, offset);
+                                offset += @intCast(calling_convention_string.len);
+
+                                if (calling_convention_string.len == 0) {
+                                    report_error();
+                                }
+
+                                calling_convention = string_to_enum(CallingConvention, calling_convention_string) orelse report_error();
+
+                                offset = skip_space(content, offset);
+
+                                const is_right_parenthesis = content[offset] == right_parenthesis;
+                                offset += @intFromBool(is_right_parenthesis);
+
+                                if (!is_right_parenthesis) {
+                                    report_error();
+                                }
+
+                                offset = skip_space(content, offset);
+                            },
+                            else => report_error(),
+                        }
+
+                        switch (content[offset]) {
+                            right_bracket => {},
+                            else => report_error(),
+                        }
+                    }
+
+                    const is_right_bracket = content[offset] == right_bracket;
+                    offset += @intFromBool(is_right_bracket);
+
+                    if (!is_right_bracket) {
+                        report_error();
+                    }
+                }
+
+                offset = skip_space(content, offset);
+
+                const is_left_parenthesis = content[offset] == left_parenthesis;
+                offset += @intFromBool(is_left_parenthesis);
+
+                if (!is_left_parenthesis) {
+                    report_error();
+                }
+
+                while (offset < content.len and content[offset] != right_parenthesis) {
+                    // TODO: arguments
+                    report_error();
+                }
+
+                // TODO: handle it in a better way
+                assert(content[offset] == right_parenthesis);
+                offset += 1;
+
+                offset = skip_space(content, offset);
+
+                const return_type = parse_identifier(content, offset);
+                offset += @intCast(return_type.len);
+
+                if (return_type.len == 0) {
+                    report_error();
+                }
+
+                offset = parse_block(content, offset);
+            },
+            else => report_error(),
+        }
+    }
+}
+
+pub fn parser_experiment() void {
+    const strlit =
+        \\[export] main = fn [cc(c)] () s32
+        \\{
+        \\    return 0;
+        \\}
+    ;
+    parse_file(strlit);
+}
+
+test "parse" {}