const std = @import("std");
const Allocator = std.mem.Allocator;
const assert = std.debug.assert;
const log = std.log;

const data_structures = @import("../library.zig");
const enumFromString = data_structures.enumFromString;
const MyAllocator = data_structures.MyAllocator;

const Compilation = @import("../Compilation.zig");
const File = Compilation.File;
const logln = Compilation.logln;
const Token = Compilation.Token;
const fs = @import("../fs.zig");

// Needed information
// Token: u8
// line: u32
// column: u16
// offset: u32
// len: u24

/// Describes one file's slice of the shared token buffer: where its tokens and
/// line offsets start, how many of each were produced, and how long lexing took.
pub const Result = struct {
    offset: Token.Index,
    count: u32,
    line_offset: u32,
    line_count: u32,
    time: u64 = 0,
};

pub const Logger = enum {
    start,
    end,
    new_token,
    number_literals,

    pub var bitset = std.EnumSet(Logger).initMany(&.{
        .new_token,
        .start,
        .end,
        .number_literals,
    });
};

/// Tokenizes `text`, appending tokens and line-start offsets to `token_buffer`,
/// and returns the offsets and counts describing what this call produced.
pub fn analyze(allocator: *MyAllocator, text: []const u8, token_buffer: *Token.Buffer) !Result {
    assert(text.len <= std.math.maxInt(u32));
    const len: u32 = @intCast(text.len);

    var lexer = Result{
        .offset = token_buffer.getOffset(),
        .line_offset = token_buffer.getLineOffset(),
        .count = 0,
        .line_count = 0,
    };

    const time_start = std.time.Instant.now() catch unreachable;

    // Pre-scan the file and record the byte offset of every line start so later
    // stages can map a token's byte offset back to a line/column cheaply.
    try token_buffer.line_offsets.append(allocator, 0);

    for (text, 0..) |byte, index| {
        if (byte == '\n') {
            try token_buffer.line_offsets.append(allocator, @intCast(index + 1));
        }
    }

    var index: u32 = 0;
    var line_index: u32 = lexer.line_offset;

    // Reserve an estimated one token per four source bytes up front, so the hot
    // loop below can use append_with_capacity.
    try token_buffer.ensure_with_capacity(allocator, len / 4);

    // logln(.lexer, .end, "START LEXER - TOKEN OFFSET: {} - LINE OFFSET: {}", .{ Token.unwrap(lexer.offset), lexer.line_offset });

    while (index < len) {
        const start_index = index;
        const start_character = text[index];
        const token_id: Token.Id = switch (start_character) {
            'a'...'z', 'A'...'Z', '_' => blk: {
                // Consume the whole identifier: [A-Za-z_][A-Za-z0-9_]*
                while (true) {
                    const ch = text[index];
                    if ((ch >= 'a' and ch <= 'z') or (ch >= 'A' and ch <= 'Z') or ch == '_' or (ch >= '0' and ch <= '9')) {
                        index += 1;
                        continue;
                    }

                    break;
                }

                // const identifier = text[start_index..][0 .. index - start_index];
                // logln("Identifier: {s}", .{identifier});

                // `and` binds tighter than `or`, so the 'u' and 's' comparisons
                // must be parenthesized together for the digit check to apply to
                // both prefixes.
                if ((start_character == 'u' or start_character == 's') and text[start_index + 1] >= '0' and text[start_index + 1] <= '9') {
                    var index_integer = start_index + 1;
                    while (text[index_integer] >= '0' and text[index_integer] <= '9') {
                        index_integer += 1;
                    }

                    // Only a pure "u<digits>"/"s<digits>" name is an integer type keyword.
                    if (index_integer == index) {
                        const id: Token.Id = switch (start_character) {
                            'u' => .keyword_unsigned_integer,
                            's' => .keyword_signed_integer,
                            else => unreachable,
                        };

                        break :blk id;
                    }
                }
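                // For example, "u32" and "s64" lex as the integer type keywords,
                // while "u32x" or a bare "s" falls through and is resolved below
                // as a fixed keyword, the "_" discard, or a plain identifier.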
                const string = text[start_index..][0 .. index - start_index];

                break :blk if (enumFromString(Compilation.FixedKeyword, string)) |fixed_keyword| switch (fixed_keyword) {
                    inline else => |comptime_fixed_keyword| @field(Token.Id, "fixed_keyword_" ++ @tagName(comptime_fixed_keyword)),
                } else if (data_structures.byte_equal(string, "_")) .discard else .identifier;
            },
            '0'...'9' => blk: {
                // Detect other non-decimal literals
                if (text[index] == '0' and index + 1 < text.len) {
                    if (text[index + 1] == 'x') {
                        index += 2;
                    } else if (text[index + 1] == 'b') {
                        index += 2;
                    } else if (text[index + 1] == 'o') {
                        index += 2;
                    }
                }

                // Consume the digit run. Hex letters are accepted regardless of
                // base here; per-base digit validation happens in later stages.
                while ((text[index] >= '0' and text[index] <= '9') or (text[index] >= 'a' and text[index] <= 'f') or (text[index] >= 'A' and text[index] <= 'F')) {
                    index += 1;
                }

                break :blk .number_literal;
            },
            '\'' => blk: {
                index += 1;
                // A backslash escapes exactly one character ('\n', '\''): skip it.
                index += @intFromBool(text[index] == '\\');
                index += 1;
                const is_end_char_literal = text[index] == '\'';
                index += @intFromBool(is_end_char_literal);
                if (!is_end_char_literal) unreachable;
                break :blk .character_literal;
            },
            '"' => blk: {
                index += 1;

                while (true) {
                    // Stop at a closing quote that is not escaped by a backslash.
                    if (text[index] == '"' and text[index - 1] != '\\') {
                        break;
                    }

                    index += 1;
                }

                index += 1;

                break :blk .string_literal;
            },
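            // Both quoted forms above treat a backslash as escaping exactly one
            // character; multi-character escapes, if the language defines any,
            // would have to be validated in a later stage.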
            '#' => blk: {
                index += 1;
                // const start_intrinsic = index;

                // Consume the intrinsic name: [A-Za-z_]*
                while (true) {
                    const ch = text[index];
                    if ((ch >= 'a' and ch <= 'z') or (ch >= 'A' and ch <= 'Z') or ch == '_') {
                        index += 1;
                    } else break;
                }

                // const end_intrinsic = index;
                // const intrinsic_identifier = text[start_intrinsic..][0 .. end_intrinsic - start_intrinsic];
                // _ = intrinsic_identifier;

                break :blk .intrinsic;
            },
            '\n' => {
                index += 1;
                line_index += 1;
                continue;
            },
            ' ', '\r', '\t' => {
                index += 1;
                continue;
            },
            '(' => blk: {
                index += 1;
                break :blk .operator_left_parenthesis;
            },
            ')' => blk: {
                index += 1;
                break :blk .operator_right_parenthesis;
            },
            '{' => blk: {
                index += 1;
                break :blk .operator_left_brace;
            },
            '}' => blk: {
                index += 1;
                break :blk .operator_right_brace;
            },
            '[' => blk: {
                index += 1;
                break :blk .operator_left_bracket;
            },
            ']' => blk: {
                index += 1;
                break :blk .operator_right_bracket;
            },
            '<' => blk: {
                index += 1;

                switch (text[index]) {
                    '<' => {
                        index += 1;
                        break :blk switch (text[index]) {
                            '=' => b: {
                                index += 1;
                                break :b .operator_shift_left_assign;
                            },
                            else => .operator_shift_left,
                        };
                    },
                    '=' => {
                        index += 1;
                        break :blk .operator_compare_less_equal;
                    },
                    else => break :blk .operator_compare_less,
                }
            },
            '>' => blk: {
                index += 1;

                switch (text[index]) {
                    '>' => {
                        index += 1;
                        break :blk switch (text[index]) {
                            '=' => b: {
                                index += 1;
                                break :b .operator_shift_right_assign;
                            },
                            else => .operator_shift_right,
                        };
                    },
                    '=' => {
                        index += 1;
                        break :blk .operator_compare_greater_equal;
                    },
                    else => break :blk .operator_compare_greater,
                }
            },
            ';' => blk: {
                index += 1;
                break :blk .operator_semicolon;
            },
            '@' => blk: {
                index += 1;
                break :blk .operator_at;
            },
            ',' => blk: {
                index += 1;
                break :blk .operator_comma;
            },
            '.' => blk: {
                index += 1;
                break :blk .operator_dot;
            },
            ':' => blk: {
                index += 1;
                break :blk .operator_colon;
            },
            '!' => blk: {
                index += 1;

                switch (text[index]) {
                    '=' => {
                        index += 1;
                        break :blk .operator_compare_not_equal;
                    },
                    else => break :blk .operator_bang,
                }
            },
            '=' => blk: {
                index += 1;
                const id: Token.Id = switch (text[index]) {
                    '=' => b: {
                        index += 1;
                        break :b .operator_compare_equal;
                    },
                    '>' => b: {
                        index += 1;
                        break :b .operator_switch_case;
                    },
                    else => .operator_assign,
                };

                break :blk id;
            },
            '+' => blk: {
                index += 1;
                const id: Token.Id = switch (text[index]) {
                    '=' => b: {
                        index += 1;
                        break :b .operator_add_assign;
                    },
                    else => .operator_add,
                };

                break :blk id;
            },
            '-' => blk: {
                index += 1;
                const id: Token.Id = switch (text[index]) {
                    '=' => b: {
                        index += 1;
                        break :b .operator_sub_assign;
                    },
                    else => .operator_minus,
                };

                break :blk id;
            },
            '*' => blk: {
                index += 1;
                const id: Token.Id = switch (text[index]) {
                    '=' => b: {
                        index += 1;
                        break :b .operator_mul_assign;
                    },
                    else => .operator_asterisk,
                };

                break :blk id;
            },
            '/' => blk: {
                index += 1;
                const id: Token.Id = switch (text[index]) {
                    '=' => b: {
                        index += 1;
                        break :b .operator_div_assign;
                    },
                    else => .operator_div,
                };

                break :blk id;
            },
            '%' => blk: {
                index += 1;
                const id: Token.Id = switch (text[index]) {
                    '=' => b: {
                        index += 1;
                        break :b .operator_mod_assign;
                    },
                    else => .operator_mod,
                };

                break :blk id;
            },
            '|' => blk: {
                index += 1;
                const id: Token.Id = switch (text[index]) {
                    '=' => b: {
                        index += 1;
                        break :b .operator_or_assign;
                    },
                    else => .operator_bar,
                };

                break :blk id;
            },
            '&' => blk: {
                index += 1;
                const id: Token.Id = switch (text[index]) {
                    '=' => b: {
                        index += 1;
                        break :b .operator_and_assign;
                    },
                    else => .operator_ampersand,
                };

                break :blk id;
            },
            '^' => blk: {
                index += 1;
                const id: Token.Id = switch (text[index]) {
                    '=' => b: {
                        index += 1;
                        break :b .operator_xor_assign;
                    },
                    else => .operator_xor,
                };

                break :blk id;
            },
            '?' => blk: {
                index += 1;
                break :blk .operator_optional;
            },
            '$' => blk: {
                index += 1;
                break :blk .operator_dollar;
            },
            else => |ch| {
                // Unknown byte: panic with the offending character as the message.
                const ch_arr = [1]u8{ch};
                @panic(&ch_arr);
            },
        };

        const end_index = index;
        const token_length = end_index - start_index;

        token_buffer.append_with_capacity(.{
            .id = token_id,
            .offset = start_index,
            .length = token_length,
            .line = line_index,
        });

        // const line_offset = token_buffer.line_offsets.pointer[line_index];
        // const column = start_index - line_offset;
        // logln(.lexer, .new_token, "T at line {}, column {}, byte offset {}, with length {} -line offset: {}- ({s})", .{ line_index, column, start_index, token_length, line_offset, @tagName(token_id) });
    }

    // logln(.lexer, .end, "END LEXER - TOKEN OFFSET: {} - LINE OFFSET: {}", .{ Token.unwrap(lexer.offset), lexer.line_offset });

    lexer.count = Token.sub(token_buffer.getOffset(), lexer.offset);
    lexer.line_count = token_buffer.getLineOffset() - lexer.line_offset;

    const time_end = std.time.Instant.now() catch unreachable;
    lexer.time = time_end.since(time_start);

    return lexer;
}
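
// Example call site (a sketch: how `MyAllocator` and `Token.Buffer` are
// constructed is defined in library.zig / Compilation.zig and only assumed here):
//
//     var token_buffer = Token.Buffer{};
//     const result = try analyze(&my_allocator, source_bytes, &token_buffer);
//     // result.offset/result.count delimit this file's tokens in the shared
//     // buffer; result.time is the elapsed lexing time in nanoseconds.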