const std = @import("std");
const Allocator = std.mem.Allocator;
const assert = std.debug.assert;
const log = std.log;
const data_structures = @import("../library.zig");
const enumFromString = data_structures.enumFromString;
const MyAllocator = data_structures.MyAllocator;
const UnpinnedArray = data_structures.UnpinnedArray;
const Compilation = @import("../Compilation.zig");
const File = Compilation.File;
const logln = Compilation.logln;
const Token = Compilation.Token;
const fs = @import("../fs.zig");
// Information stored per token, and the bit width of each field:
// token id: u8
// line: u32
// column: u16
// offset: u32
// len: u24
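//
// A minimal sketch of how these sizes could pack into one record (hypothetical;
// the real definition lives in Compilation.zig and may differ):
//
// const PackedToken = packed struct { id: u8, line: u32, column: u16, offset: u32, length: u24 };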
pub const Result = struct {
offset: Token.Index,
count: u32,
line_offset: u32,
line_count: u32,
time: u64 = 0,
};
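// Debug log channels for the lexer; `bitset` selects which channels are enabled.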
pub const Logger = enum {
start,
end,
new_token,
number_literals,
pub var bitset = std.EnumSet(Logger).initMany(&.{
.new_token,
.start,
.end,
.number_literals,
});
};
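/// Lexes `text`, appending the resulting tokens and line offsets to `token_buffer`,
/// and returns the token/line counts for this file plus the elapsed lexing time
/// in nanoseconds. Note that most scanning loops below read `text[index]` without
/// a bounds check, so the input is expected to be terminated by a newline.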
pub fn analyze(allocator: *MyAllocator, text: []const u8, token_buffer: *Token.Buffer) !Result {
assert(text.len <= std.math.maxInt(u32));
const len: u32 = @intCast(text.len);
var lexer = Result{
.offset = token_buffer.getOffset(),
.line_offset = token_buffer.getLineOffset(),
.count = 0,
.line_count = 0,
};
const time_start = std.time.Instant.now() catch unreachable;
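// Pre-pass: record where every line starts (offset 0 for the first line, then
// the byte after each '\n') so tokens can later be mapped to line/column positions.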
try token_buffer.line_offsets.append(allocator, 0);
for (text, 0..) |byte, index| {
if (byte == '\n') {
try token_buffer.line_offsets.append(allocator, @intCast(index + 1));
}
}
var index: u32 = 0;
var line_index: u32 = lexer.line_offset;
try token_buffer.ensure_with_capacity(allocator, len / 3);
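// Heuristic pre-allocation: assume roughly one token per three bytes of source.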
// logln(.lexer, .end, "START LEXER - TOKEN OFFSET: {} - LINE OFFSET: {}", .{ Token.unwrap(lexer.offset), lexer.line_offset });
while (index < len) {
const start_index = index;
const start_character = text[index];
const token_id: Token.Id = switch (start_character) {
'a'...'z', 'A'...'Z', '_' => blk: {
while (true) {
const ch = text[index];
if ((ch >= 'a' and ch <= 'z') or (ch >= 'A' and ch <= 'Z') or ch == '_' or (ch >= '0' and ch <= '9')) {
index += 1;
continue;
}
break;
}
// const identifier = text[start_index..][0 .. index - start_index];
// logln("Identifier: {s}", .{identifier});
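// Identifiers of the form `u<digits>` or `s<digits>` are integer type keywords
// (e.g. u8, s32) rather than plain identifiers.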
if ((start_character == 'u' or start_character == 's') and text[start_index + 1] >= '0' and text[start_index + 1] <= '9') {
var index_integer = start_index + 1;
while (text[index_integer] >= '0' and text[index_integer] <= '9') {
index_integer += 1;
}
if (index_integer == index) {
const id: Token.Id = switch (start_character) {
'u' => .keyword_unsigned_integer,
's' => .keyword_signed_integer,
else => unreachable,
};
break :blk id;
}
}
const string = text[start_index..][0 .. index - start_index];
break :blk if (enumFromString(Compilation.FixedKeyword, string)) |fixed_keyword| switch (fixed_keyword) {
inline else => |comptime_fixed_keyword| @field(Token.Id, "fixed_keyword_" ++ @tagName(comptime_fixed_keyword)),
} else if (data_structures.byte_equal(string, "_")) .discard else .identifier;
},
'0'...'9' => blk: {
// Consume an optional base prefix (0x, 0b or 0o), then the digits. Note that
// the scan below accepts hex digits regardless of the declared base.
if (text[index] == '0' and index + 1 < text.len) {
if (text[index + 1] == 'x') {
index += 2;
} else if (text[index + 1] == 'b') {
index += 2;
} else if (text[index + 1] == 'o') {
index += 2;
}
}
while ((text[index] >= '0' and text[index] <= '9') or (text[index] >= 'a' and text[index] <= 'f') or (text[index] >= 'A' and text[index] <= 'F')) {
index += 1;
}
break :blk .number_literal;
},
'\'' => blk: {
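// Character literal: skip the opening quote, an optional single-character
// backslash escape, the character itself, and the closing quote.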
index += 1;
index += @intFromBool(text[index] == '\\');
index += 1;
const is_end_char_literal = text[index] == '\'';
index += @intFromBool(is_end_char_literal);
if (!is_end_char_literal) unreachable;
break :blk .character_literal;
},
'"' => blk: {
index += 1;
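// Scan forward until the closing quote, skipping backslash-escaped quotes.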
while (true) {
if (text[index] == '"' and text[index - 1] != '\\') {
break;
}
index += 1;
}
index += 1;
break :blk .string_literal;
},
'#' => blk: {
index += 1;
// const start_intrinsic = index;
while (true) {
const ch = text[index];
if ((ch >= 'a' and ch <= 'z') or (ch >= 'A' and ch <= 'Z') or ch == '_') {
index += 1;
} else break;
}
// const end_intrinsic = index;
// const intrinsic_identifier = text[start_intrinsic..][0 .. end_intrinsic - start_intrinsic];
// _ = intrinsic_identifier;
break :blk .intrinsic;
},
'\n' => {
index += 1;
line_index += 1;
continue;
},
' ', '\r', '\t' => {
index += 1;
continue;
},
'(' => blk: {
index += 1;
break :blk .operator_left_parenthesis;
},
')' => blk: {
index += 1;
break :blk .operator_right_parenthesis;
},
'{' => blk: {
index += 1;
break :blk .operator_left_brace;
},
'}' => blk: {
index += 1;
break :blk .operator_right_brace;
},
'[' => blk: {
index += 1;
break :blk .operator_left_bracket;
},
']' => blk: {
index += 1;
break :blk .operator_right_bracket;
},
'<' => blk: {
index += 1;
switch (text[index]) {
'<' => {
index += 1;
break :blk switch (text[index]) {
'=' => b: {
index += 1;
break :b .operator_shift_left_assign;
},
else => .operator_shift_left,
};
},
'=' => {
index += 1;
break :blk .operator_compare_less_equal;
},
else => break :blk .operator_compare_less,
}
},
'>' => blk: {
index += 1;
switch (text[index]) {
'>' => {
index += 1;
break :blk switch (text[index]) {
'=' => b: {
index += 1;
break :b .operator_shift_right_assign;
},
else => .operator_shift_right,
};
},
'=' => {
index += 1;
break :blk .operator_compare_greater_equal;
},
else => break :blk .operator_compare_greater,
}
},
';' => blk: {
index += 1;
break :blk .operator_semicolon;
},
'@' => blk: {
index += 1;
break :blk .operator_at;
},
',' => blk: {
index += 1;
break :blk .operator_comma;
},
'.' => blk: {
index += 1;
if (text[index] == '.') {
index += 1;
if (text[index] == '.') {
index += 1;
break :blk .operator_triple_dot;
} else {
break :blk .operator_double_dot;
}
} else {
break :blk .operator_dot;
}
},
':' => blk: {
index += 1;
break :blk .operator_colon;
},
'~' => blk: {
index += 1;
break :blk .operator_tilde;
},
'!' => blk: {
index += 1;
switch (text[index]) {
'=' => {
index += 1;
break :blk .operator_compare_not_equal;
},
else => break :blk .operator_bang,
}
},
'=' => blk: {
index += 1;
const token_id: Token.Id = switch (text[index]) {
'=' => b: {
index += 1;
break :b .operator_compare_equal;
},
'>' => b: {
index += 1;
break :b .operator_switch_case;
},
else => .operator_assign,
};
break :blk token_id;
},
'+' => blk: {
index += 1;
const token_id: Token.Id = switch (text[index]) {
'=' => b: {
index += 1;
break :b .operator_add_assign;
},
'|' => b: {
index += 1;
break :b switch (text[index]) {
'=' => assign: {
index += 1;
break :assign .operator_saturated_add_assign;
},
else => .operator_saturated_add,
};
},
'%' => b: {
index += 1;
break :b switch (text[index]) {
'=' => assign: {
index += 1;
break :assign .operator_wrapping_add_assign;
},
else => .operator_wrapping_add,
};
},
else => .operator_add,
};
break :blk token_id;
},
'-' => blk: {
index += 1;
const token_id: Token.Id = switch (text[index]) {
'=' => b: {
index += 1;
break :b .operator_sub_assign;
},
'|' => b: {
index += 1;
break :b switch (text[index]) {
'=' => assign: {
index += 1;
break :assign .operator_saturated_sub_assign;
},
else => .operator_saturated_sub,
};
},
'%' => b: {
index += 1;
break :b switch (text[index]) {
'=' => assign: {
index += 1;
break :assign .operator_wrapping_sub_assign;
},
else => .operator_wrapping_sub,
};
},
else => .operator_minus,
};
break :blk token_id;
},
'*' => blk: {
index += 1;
const token_id: Token.Id = switch (text[index]) {
'=' => b: {
index += 1;
break :b .operator_mul_assign;
},
'|' => b: {
index += 1;
break :b switch (text[index]) {
'=' => assign: {
index += 1;
break :assign .operator_saturated_mul_assign;
},
else => .operator_saturated_mul,
};
},
'%' => b: {
index += 1;
break :b switch (text[index]) {
'=' => assign: {
index += 1;
break :assign .operator_wrapping_mul_assign;
},
else => .operator_wrapping_mul,
};
},
else => .operator_asterisk,
};
break :blk token_id;
},
'/' => blk: {
index += 1;
const token_id: Token.Id = switch (text[index]) {
'=' => b: {
index += 1;
break :b .operator_div_assign;
},
'/' => {
while (index < len and text[index] != '\n') {
index += 1;
}
continue;
},
else => .operator_div,
};
break :blk token_id;
},
'%' => blk: {
index += 1;
const token_id: Token.Id = switch (text[index]) {
'=' => b: {
index += 1;
break :b .operator_mod_assign;
},
else => .operator_mod,
};
break :blk token_id;
},
'|' => blk: {
index += 1;
const token_id: Token.Id = switch (text[index]) {
'=' => b: {
index += 1;
break :b .operator_or_assign;
},
else => .operator_bar,
};
break :blk token_id;
},
'&' => blk: {
index += 1;
const token_id: Token.Id = switch (text[index]) {
'=' => b: {
index += 1;
break :b .operator_and_assign;
},
else => .operator_ampersand,
};
break :blk token_id;
},
'^' => blk: {
index += 1;
const token_id: Token.Id = switch (text[index]) {
'=' => b: {
index += 1;
break :b .operator_xor_assign;
},
else => .operator_xor,
};
break :blk token_id;
},
'?' => blk: {
index += 1;
break :blk .operator_optional;
},
'$' => blk: {
index += 1;
break :blk .operator_dollar;
},
// Asm statements between backticks get special treatment: the body is
// tokenized here with a reduced grammar (identifiers, numbers and a few
// single-character operators).
'`' => {
token_buffer.append_with_capacity(.{
.id = .operator_backtick,
.line = line_index,
.offset = start_index,
.length = 1,
});
index += 1;
while (text[index] != '`') {
const start_i = index;
const start_ch = text[start_i];
switch (start_ch) {
'\n' => {
index += 1;
line_index += 1;
},
' ' => index += 1,
'A'...'Z', 'a'...'z' => {
while (true) {
switch (text[index]) {
'A'...'Z', 'a'...'z' => index += 1,
else => break,
}
}
token_buffer.append_with_capacity(.{
.id = .identifier,
.offset = start_i,
.length = index - start_i,
.line = line_index,
});
},
',' => {
token_buffer.append_with_capacity(.{
.id = .operator_comma,
.line = line_index,
.offset = start_i,
.length = 1,
});
index += 1;
},
';' => {
token_buffer.append_with_capacity(.{
.id = .operator_semicolon,
.line = line_index,
.offset = start_i,
.length = 1,
});
index += 1;
},
'{' => {
token_buffer.append_with_capacity(.{
.id = .operator_left_brace,
.line = line_index,
.offset = start_i,
.length = 1,
});
index += 1;
},
'}' => {
token_buffer.append_with_capacity(.{
.id = .operator_right_brace,
.line = line_index,
.offset = start_i,
.length = 1,
});
index += 1;
},
'0' => {
index += 1;
const Representation = enum {
hex,
bin,
octal,
};
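// Only hexadecimal (0x) literals are currently recognized inside asm blocks;
// any other base prefix hits `unreachable`.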
const representation: Representation = switch (text[index]) {
'x' => .hex,
else => unreachable,
};
index += 1;
switch (representation) {
.hex => {
while (true) {
switch (text[index]) {
'a'...'f', 'A'...'F', '0'...'9' => index += 1,
else => break,
}
}
token_buffer.append_with_capacity(.{
.id = .number_literal,
.line = line_index,
.offset = start_i,
.length = index - start_i,
});
},
else => unreachable,
}
},
else => unreachable,
}
}
token_buffer.append_with_capacity(.{
.id = .operator_backtick,
.line = line_index,
.length = 1,
.offset = index,
});
index += 1;
continue;
},
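// Any byte not handled above is a lexing error: panic with the offending
// character as the message.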
else => |ch| {
const ch_arr = [1]u8{ch};
@panic(&ch_arr);
},
};
const end_index = index;
const token_length = end_index - start_index;
token_buffer.append_with_capacity(.{
.id = token_id,
.offset = start_index,
.length = token_length,
.line = line_index,
});
// const line_offset = token_buffer.line_offsets.pointer[line_index];
// const column = start_index - line_offset;
// logln(.lexer, .new_token, "T at line {}, column {}, byte offset {}, with length {} -line offset: {}- ({s})", .{ line_index, column, start_index, token_length, line_offset, @tagName(token_id) });
}
// logln(.lexer, .end, "END LEXER - TOKEN OFFSET: {} - LINE OFFSET: {}", .{ Token.unwrap(lexer.offset), lexer.line_offset });
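// The token and line counts produced by this call are the deltas between the
// buffer offsets now and at entry.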
lexer.count = Token.sub(token_buffer.getOffset(), lexer.offset);
lexer.line_count = token_buffer.getLineOffset() - lexer.line_offset;
const time_end = std.time.Instant.now() catch unreachable;
lexer.time = time_end.since(time_start);
return lexer;
}
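
// Minimal usage sketch (hypothetical: how `allocator` and `token_buffer` are
// obtained depends on the surrounding Compilation code):
//
//   var token_buffer: Token.Buffer = ...; // owned by the Compilation
//   const result = try analyze(allocator, source_bytes, &token_buffer);
//   // result.offset / result.count delimit this file's tokens in the buffer;
//   // result.time is the elapsed lexing time in nanoseconds.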