// nativity/bootstrap/frontend/lexical_analyzer.zig

const std = @import("std");
const Allocator = std.mem.Allocator;
const assert = std.debug.assert;
const log = std.log;

const equal = std.mem.eql;

const data_structures = @import("../data_structures.zig");
const ArrayList = data_structures.ArrayList;
const enumFromString = data_structures.enumFromString;

const Compilation = @import("../Compilation.zig");
const File = Compilation.File;
const logln = Compilation.logln;
const fs = @import("../fs.zig");

/// A single lexed token, packed into exactly 64 bits (32 + 24 + 8).
///
/// A token does not own any text: it only records where it sits in the
/// source buffer that was handed to `analyze`, so the lexeme is recovered
/// with `text[token.start..][0..token.len]`.
pub const Token = packed struct(u64) {
    /// Byte offset of the first character of the token in the source text.
    start: u32,
    /// Length of the token in bytes.
    len: u24,
    /// What kind of token this is.
    id: Id,

    /// Token kinds.
    ///
    /// The tag values are laid out in three groups:
    ///  * 0x00-0x06: end of file, sized-integer keywords and the literal,
    ///    identifier and discard classes;
    ///  * punctuation: each tag value is the ASCII code of the character it
    ///    names, which lets `analyze` build the tag directly from the source
    ///    byte with `@enumFromInt`;
    ///  * 0x7f and up: fixed keywords, one `fixed_keyword_<name>` tag for
    ///    every member of `FixedKeyword`.
    pub const Id = enum(u8) {
        eof = 0x00,
        // `u` followed only by decimal digits (see `analyze`).
        keyword_unsigned_integer = 0x01,
        // `s` followed only by decimal digits (see `analyze`).
        keyword_signed_integer = 0x02,
        identifier = 0x03,
        number_literal = 0x04,
        string_literal = 0x05,
        // The lone `_` identifier.
        discard = 0x06,
        bang = '!', // 0x21
        hash = '#', // 0x23
        dollar_sign = '$', // 0x24
        modulus = '%', // 0x25
        ampersand = '&', // 0x26
        left_parenthesis = '(', // 0x28
        right_parenthesis = ')', // 0x29
        asterisk = '*', // 0x2a
        plus = '+', // 0x2b
        comma = ',', // 0x2c
        minus = '-', // 0x2d
        period = '.', // 0x2e
        slash = '/', // 0x2f
        colon = ':', // 0x3a
        semicolon = ';', // 0x3b
        less = '<', // 0x3c
        equal = '=', // 0x3d
        greater = '>', // 0x3e
        question_mark = '?', // 0x3f
        at = '@', // 0x40
        left_bracket = '[', // 0x5b
        // NOTE(review): the name reads like a typo of "backslash"; it is the
        // tag for the 0x5c character. Renaming would affect any code that
        // refers to this tag outside this file.
        backlash = '\\', // 0x5c
        right_bracket = ']', // 0x5d
        caret = '^', // 0x5e
        underscore = '_', // 0x5f
        grave = '`', // 0x60
        left_brace = '{', // 0x7b
        vertical_bar = '|', // 0x7c
        right_brace = '}', // 0x7d
        tilde = '~', // 0x7e
        fixed_keyword_function = 0x7f,
        fixed_keyword_const = 0x80,
        fixed_keyword_var = 0x81,
        fixed_keyword_void = 0x82,
        fixed_keyword_noreturn = 0x83,
        fixed_keyword_comptime = 0x84,
        fixed_keyword_while = 0x85,
        fixed_keyword_bool = 0x86,
        fixed_keyword_true = 0x87,
        fixed_keyword_false = 0x88,
        fixed_keyword_fn = 0x89,
        fixed_keyword_unreachable = 0x8a,
        fixed_keyword_return = 0x8b,
        fixed_keyword_ssize = 0x8c,
        fixed_keyword_usize = 0x8d,
        fixed_keyword_switch = 0x8e,
        fixed_keyword_if = 0x8f,
        fixed_keyword_else = 0x90,
        fixed_keyword_struct = 0x91,
        fixed_keyword_enum = 0x92,
        fixed_keyword_union = 0x93,
        fixed_keyword_extern = 0x94,
        fixed_keyword_null = 0x95,
        fixed_keyword_align = 0x96,
        fixed_keyword_export = 0x97,
        fixed_keyword_cc = 0x98,
        fixed_keyword_for = 0x99,
        fixed_keyword_undefined = 0x9a,
    };

    /// Integer type used to refer to a token by position.
    /// NOTE(review): presumably an index into the token list produced by
    /// `analyze`; confirm against the users of this type.
    pub const Index = u32;
};

/// The reserved words of the language.
///
/// Each member is spelled exactly like the keyword it stands for: `analyze`
/// looks an identifier up by name in this enum and, on a match, produces the
/// `Token.Id` tag called `fixed_keyword_<member name>`. Every member added
/// here therefore needs a matching `fixed_keyword_*` tag in `Token.Id`.
///
/// Members that collide with Zig's own keywords are written with the
/// `@"..."` quoted-identifier syntax.
pub const FixedKeyword = enum {
    @"comptime",
    @"const",
    @"var",
    void,
    noreturn,
    function,
    @"while",
    bool,
    true,
    false,
    @"fn",
    @"unreachable",
    @"return",
    ssize,
    usize,
    @"switch",
    @"if",
    @"else",
    @"struct",
    @"enum",
    @"union",
    @"extern",
    null,
    @"align",
    @"export",
    cc,
    @"for",
    undefined,
};

/// What `analyze` returns for one source text.
pub const Result = struct {
    /// Every token found, in source order.
    tokens: ArrayList(Token),
    /// Time spent inside `analyze`, as reported by `std.time.Instant.since`.
    time: u64,
};

/// Log categories used by the lexer's `logln` calls.
pub const Logger = enum {
    main,
    new_token,
    number_literals,

    /// Set of categories selected by default: only `number_literals`
    /// (`new_token` is commented out, `main` is not listed).
    /// NOTE(review): presumably consulted by `Compilation.logln` to decide
    /// which messages are printed; confirm there. Declared `var`, so it can
    /// be modified at runtime.
    pub var bitset = std.EnumSet(Logger).initMany(&.{
        // .new_token,
        .number_literals,
    });
};

/// Splits `text` into tokens.
///
/// Parameters:
///  * `allocator`: used for the returned token list.
///  * `text`: the source bytes to lex. Tokens refer to it by offset only.
///  * `file_index`: currently unused.
///
/// Returns the tokens in source order together with the time spent lexing
/// (see `std.time.Instant.since`). Whitespace (space, tab, CR, LF) is skipped
/// and produces no token.
///
/// Token rules:
///  * `[A-Za-z_][A-Za-z0-9_]*` is an identifier, unless it is
///    - `u` or `s` followed by one or more decimal digits and nothing else
///      (sized integer keyword),
///    - a member of `FixedKeyword` (the matching `fixed_keyword_*` tag),
///    - exactly `_` (discard).
///  * A number literal starts with a digit, may carry a `0x`, `0b` or `0o`
///    prefix, and extends over every following hexadecimal digit. Digits are
///    not validated against the base here.
///  * A string literal runs from `"` to the next `"` that is not preceded by
///    an escaping backslash; a backslash always skips the byte after it.
///  * Each supported punctuation character is a one-byte token whose tag is
///    the character's ASCII code.
///
/// Errors: only allocation failures are returned. Input the lexer does not
/// support (character literals, unterminated strings, unknown bytes) panics
/// with a descriptive message.
pub fn analyze(allocator: Allocator, text: []const u8, file_index: File.Index) !Result {
    _ = file_index;
    const time_start = std.time.Instant.now() catch unreachable;
    var tokens = try ArrayList(Token).initCapacity(allocator, text.len / 8);
    var index: usize = 0;

    while (index < text.len) {
        const start_index = index;
        const start_character = text[index];
        const token_id: Token.Id = switch (start_character) {
            'a'...'z', 'A'...'Z', '_' => blk: {
                // Bounded by `text.len` so that an identifier which ends the
                // input does not read past the buffer.
                while (index < text.len and (std.ascii.isAlphanumeric(text[index]) or text[index] == '_')) {
                    index += 1;
                }

                if (start_character == 'u' or start_character == 's') {
                    // Everything in [start_index, index) is an identifier
                    // character, so scanning digits up to `index` is enough.
                    var index_integer = start_index + 1;
                    while (index_integer < index and std.ascii.isDigit(text[index_integer])) {
                        index_integer += 1;
                    }

                    // A sized integer keyword needs at least one digit: a
                    // bare `u` or `s` is an ordinary identifier.
                    const has_digits = index_integer > start_index + 1;
                    if (has_digits and index_integer == index) {
                        const id: Token.Id = switch (start_character) {
                            'u' => .keyword_unsigned_integer,
                            's' => .keyword_signed_integer,
                            else => unreachable,
                        };

                        break :blk id;
                    }
                }

                const string = text[start_index..][0 .. index - start_index];
                break :blk if (enumFromString(FixedKeyword, string)) |fixed_keyword| switch (fixed_keyword) {
                    // Maps e.g. `.@"while"` to `.fixed_keyword_while` at compile time.
                    inline else => |comptime_fixed_keyword| @field(Token.Id, "fixed_keyword_" ++ @tagName(comptime_fixed_keyword)),
                } else if (equal(u8, string, "_")) .discard else .identifier;
            },
            '0'...'9' => blk: {
                // Skip a base prefix, if any. The digits that follow are
                // consumed by the common loop below.
                if (start_character == '0' and index + 1 < text.len) {
                    logln(.lexer, .number_literals, "Number starts with 0. Checking for non-decimal literals...", .{});
                    if (text[index + 1] == 'x') {
                        logln(.lexer, .number_literals, "Hex", .{});
                        index += 2;
                    } else if (text[index + 1] == 'b') {
                        logln(.lexer, .number_literals, "Bin", .{});
                        index += 2;
                    } else if (text[index + 1] == 'o') {
                        logln(.lexer, .number_literals, "Octal", .{});
                        index += 2;
                    }
                }

                // Bounded by `text.len` so that a number which ends the input
                // (including a bare prefix such as `0x`) stays in range.
                while (index < text.len and std.ascii.isHex(text[index])) {
                    index += 1;
                }

                break :blk .number_literal;
            },
            // Character literals are not implemented. Panic explicitly rather
            // than hitting `unreachable`, which is undefined behavior in
            // unchecked builds when triggered by user input.
            '\'' => std.debug.panic("NI: character literal at byte offset {}", .{start_index}),
            '"' => blk: {
                // Step over the opening quote.
                index += 1;

                while (true) {
                    if (index >= text.len) {
                        std.debug.panic("Unterminated string literal starting at byte offset {}", .{start_index});
                    }

                    switch (text[index]) {
                        // Closing quote; this also terminates the empty literal `""`.
                        '"' => break,
                        // Skip the escaped byte so `\"` and `\\` cannot end the literal early.
                        '\\' => index += 2,
                        else => index += 1,
                    }
                }

                // Step over the closing quote.
                index += 1;

                break :blk .string_literal;
            },
            ' ', '\n', '\r', '\t' => {
                index += 1;
                continue;
            },
            '(', ')', '{', '}', '[', ']', '=', ';', '#', '@', ',', '.', ':', '>', '<', '!', '+', '-', '*', '\\', '/', '&', '|', '^', '?', '$' => |operator| blk: {
                index += 1;
                // Punctuation tags in `Token.Id` are defined as the ASCII code
                // of the character, so the byte converts directly.
                break :blk @enumFromInt(operator);
            },
            else => |ch| {
                std.debug.panic("NI: '{c}'", .{ch});
            },
        };

        const end_index = index;
        const token = Token{
            .start = @intCast(start_index),
            .len = @intCast(end_index - start_index),
            .id = token_id,
        };

        logln(.lexer, .new_token, "New token {s} added: {s}", .{ @tagName(token.id), text[token.start..][0..token.len] });

        try tokens.append(allocator, token);
    }

    for (tokens.items, 0..) |token, i| {
        logln(.lexer, .main, "#{} {s}\n", .{ i, @tagName(token.id) });
    }

    const time_end = std.time.Instant.now() catch unreachable;
    const time = time_end.since(time_start);

    return .{
        .tokens = tokens,
        .time = time,
    };
}