const std = @import("std");
const Allocator = std.mem.Allocator;
const assert = std.debug.assert;
const log = std.log;
const equal = std.mem.eql;
const data_structures = @import("../data_structures.zig");
const ArrayList = data_structures.ArrayList;
const enumFromString = data_structures.enumFromString;
const Compilation = @import("../Compilation.zig");
const File = Compilation.File;
const logln = Compilation.logln;
const Token = Compilation.Token;
const fs = @import("../fs.zig");
// Information needed per token, with the integer widths used to store it:
// id (token kind): u8
// line: u32
// column: u16
// offset: u32
// len: u24
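/// Result of lexing a single file: a window into the shared token buffer
/// (`offset`/`count` for tokens, `line_offset`/`line_count` for line starts)
/// plus the time spent lexing, in nanoseconds.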
pub const Result = struct {
offset: Token.Index,
count: u32,
line_offset: u32,
line_count: u32,
// ids: ArrayList(Token.Id) = .{},
// token_lines: ArrayList(u32) = .{},
// file_line_offsets: ArrayList(u32) = .{},
// token_offsets: ArrayList(u32) = .{},
// token_lengths: ArrayList(u32) = .{},
time: u64 = 0,
};
pub const Logger = enum {
start,
end,
new_token,
number_literals,
pub var bitset = std.EnumSet(Logger).initMany(&.{
.new_token,
.start,
.end,
.number_literals,
});
};
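/// Appends the tokens and line offsets of `text` to `token_buffer` and returns
/// the window of the buffer that belongs to this file.
///
/// Usage sketch (assuming the token buffer is owned by the Compilation):
///     const result = try analyze(allocator, source, &token_buffer);
///     // result.offset / result.count address this file's tokens.
///
/// Several scanning loops below index `text` without bounds checks, so the
/// input is assumed to end in a byte that terminates any token (for source
/// files, a trailing newline).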
pub fn analyze(allocator: Allocator, text: []const u8, token_buffer: *Token.Buffer) !Result {
assert(text.len <= std.math.maxInt(u32));
const len: u32 = @intCast(text.len);
var lexer = Result{
.offset = token_buffer.getOffset(),
.line_offset = token_buffer.getLineOffset(),
.count = 0,
.line_count = 0,
};
const time_start = std.time.Instant.now() catch unreachable;
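    // Pre-compute the byte offset of every line start so that tokens can be
    // mapped to line/column positions without rescanning the source.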
try token_buffer.line_offsets.append(allocator, 0);
for (text, 0..) |byte, index| {
if (byte == '\n') {
try token_buffer.line_offsets.append(allocator, @intCast(index + 1));
}
}
var index: u32 = 0;
var line_index: u32 = lexer.line_offset;
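    // Capacity heuristic: assume tokens average at least four source bytes.
    // The appendAssumeCapacity call at the end of the loop relies on this
    // reservation, so token-dense input (long runs of one-byte operators)
    // could outgrow it.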
try token_buffer.tokens.ensureUnusedCapacity(allocator, text.len / 4);
    logln(.lexer, .start, "START LEXER - TOKEN OFFSET: {} - LINE OFFSET: {}", .{Token.unwrap(lexer.offset), lexer.line_offset});
while (index < len) {
const start_index = index;
const start_character = text[index];
const token_id: Token.Id = switch (start_character) {
'a'...'z', 'A'...'Z', '_' => blk: {
                while (index < len) : (index += 1) {
                    const ch = text[index];
                    if (!((ch >= 'a' and ch <= 'z') or (ch >= 'A' and ch <= 'Z') or ch == '_' or (ch >= '0' and ch <= '9'))) {
                        break;
                    }
                }
// const identifier = text[start_index..][0 .. index - start_index];
// logln("Identifier: {s}", .{identifier});
                // Recognize fixed-width integer type keywords such as `u8` or
                // `s32`: a `u`/`s` prefix followed entirely by digits. In Zig,
                // `and` binds tighter than `or`, so the two prefixes must be
                // parenthesized for the digit check to apply to both of them.
                if ((start_character == 'u' or start_character == 's') and start_index + 1 < len and text[start_index + 1] >= '0' and text[start_index + 1] <= '9') {
                    var index_integer = start_index + 1;
                    while (index_integer < index and text[index_integer] >= '0' and text[index_integer] <= '9') {
index_integer += 1;
}
if (index_integer == index) {
const id: Token.Id = switch (start_character) {
'u' => .keyword_unsigned_integer,
's' => .keyword_signed_integer,
else => unreachable,
};
break :blk id;
}
}
const string = text[start_index..][0 .. index - start_index];
break :blk if (enumFromString(Compilation.FixedKeyword, string)) |fixed_keyword| switch (fixed_keyword) {
inline else => |comptime_fixed_keyword| @field(Token.Id, "fixed_keyword_" ++ @tagName(comptime_fixed_keyword)),
} else if (equal(u8, string, "_")) .discard else .identifier;
},
'0'...'9' => blk: {
                // Consume an optional 0x/0b/0o base prefix, then a run of
                // decimal or hexadecimal digits. The scan is deliberately
                // lenient (hex letters are accepted after any prefix); literal
                // validation is assumed to happen in a later stage.
                if (text[index] == '0' and index + 1 < len) {
                    switch (text[index + 1]) {
                        'x', 'b', 'o' => index += 2,
                        else => {},
                    }
                }
                while (index < len) : (index += 1) {
                    const ch = text[index];
                    if (!((ch >= '0' and ch <= '9') or (ch >= 'a' and ch <= 'f') or (ch >= 'A' and ch <= 'F'))) {
                        break;
                    }
                }
break :blk .number_literal;
},
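            // Character literal: opening quote, an optional escape backslash,
            // the payload byte, and the closing quote.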
            '\'' => blk: {
                index += 1;
                // Skip the backslash of a single-character escape such as '\n'
                // or '\'' (a backslash check is the assumed intent here;
                // multi-byte escapes like '\x41' are not handled).
                index += @intFromBool(text[index] == '\\');
                index += 1;
                const is_end_char_literal = text[index] == '\'';
                index += @intFromBool(is_end_char_literal);
                if (!is_end_char_literal) unreachable;
                break :blk .character_literal;
            },
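            // String literal: scan to the next quote that is not preceded by a
            // backslash (backslash escapes are the assumed convention; note a
            // literal ending in an escaped backslash, "...\\", still fools this
            // check).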
'"' => blk: {
index += 1;
while (true) {
if (text[index] == '"' and text[index - 1] != '"') {
break;
}
index += 1;
}
index += 1;
break :blk .string_literal;
},
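            // `#identifier` lexes as a single intrinsic token; the name itself
            // is recovered later from the token's offset and length.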
'#' => blk: {
index += 1;
// const start_intrinsic = index;
while (true) {
const ch = text[index];
if ((ch >= 'a' and ch <= 'z') or (ch >= 'A' and ch <= 'Z') or ch == '_') {
index += 1;
} else break;
}
// const end_intrinsic = index;
// const intrinsic_identifier = text[start_intrinsic..][0 .. end_intrinsic - start_intrinsic];
// _ = intrinsic_identifier;
break :blk .intrinsic;
},
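            // Whitespace produces no token; a newline additionally advances the
            // current line index.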
'\n' => {
index += 1;
line_index += 1;
continue;
},
' ', '\r', '\t' => {
index += 1;
continue;
},
'(' => blk: {
index += 1;
break :blk .operator_left_parenthesis;
},
')' => blk: {
index += 1;
break :blk .operator_right_parenthesis;
},
'{' => blk: {
index += 1;
break :blk .operator_left_brace;
},
'}' => blk: {
index += 1;
break :blk .operator_right_brace;
},
'[' => blk: {
index += 1;
break :blk .operator_left_bracket;
},
']' => blk: {
index += 1;
break :blk .operator_right_bracket;
},
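            // Multi-byte operators are disambiguated by single-byte lookahead:
            // '<' may extend to "<=", "<<", or "<<=", and similarly below.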
'<' => blk: {
index += 1;
switch (text[index]) {
'<' => {
index += 1;
break :blk switch (text[index]) {
'=' => b: {
index += 1;
break :b .operator_shift_left_assign;
},
else => .operator_shift_left,
};
},
'=' => {
index += 1;
break :blk .operator_compare_less_equal;
},
                    else => break :blk .operator_compare_less,
}
},
'>' => blk: {
index += 1;
switch (text[index]) {
'>' => {
index += 1;
break :blk switch (text[index]) {
'=' => b: {
index += 1;
break :b .operator_shift_right_assign;
},
else => .operator_shift_right,
};
},
'=' => {
index += 1;
break :blk .operator_compare_greater_equal;
},
                    else => break :blk .operator_compare_greater,
}
},
';' => blk: {
index += 1;
break :blk .operator_semicolon;
},
'@' => blk: {
index += 1;
break :blk .operator_at;
},
',' => blk: {
index += 1;
break :blk .operator_comma;
},
'.' => blk: {
index += 1;
break :blk .operator_dot;
},
':' => blk: {
index += 1;
break :blk .operator_colon;
},
'!' => blk: {
index += 1;
switch (text[index]) {
'=' => {
index += 1;
break :blk .operator_compare_not_equal;
},
else => break :blk .operator_bang,
}
},
'=' => blk: {
index += 1;
const token_id: Token.Id = switch (text[index]) {
'=' => b: {
index += 1;
break :b .operator_compare_equal;
},
'>' => b: {
index += 1;
break :b .operator_switch_case;
},
else => .operator_assign,
};
break :blk token_id;
},
'+' => blk: {
index += 1;
const token_id: Token.Id = switch (text[index]) {
'=' => b: {
index += 1;
break :b .operator_add_assign;
},
else => .operator_add,
};
break :blk token_id;
},
'-' => blk: {
index += 1;
const token_id: Token.Id = switch (text[index]) {
'=' => b: {
index += 1;
break :b .operator_sub_assign;
},
else => .operator_minus,
};
break :blk token_id;
},
'*' => blk: {
index += 1;
const token_id: Token.Id = switch (text[index]) {
'=' => b: {
index += 1;
break :b .operator_mul_assign;
},
else => .operator_asterisk,
};
break :blk token_id;
},
'/' => blk: {
index += 1;
const token_id: Token.Id = switch (text[index]) {
'=' => b: {
index += 1;
break :b .operator_div_assign;
},
else => .operator_div,
};
break :blk token_id;
},
'%' => blk: {
index += 1;
const token_id: Token.Id = switch (text[index]) {
'=' => b: {
index += 1;
break :b .operator_mod_assign;
},
else => .operator_mod,
};
break :blk token_id;
},
'|' => blk: {
index += 1;
const token_id: Token.Id = switch (text[index]) {
'=' => b: {
index += 1;
break :b .operator_or_assign;
},
else => .operator_bar,
};
break :blk token_id;
},
'&' => blk: {
index += 1;
const token_id: Token.Id = switch (text[index]) {
'=' => b: {
index += 1;
break :b .operator_and_assign;
},
else => .operator_ampersand,
};
break :blk token_id;
},
'^' => blk: {
index += 1;
const token_id: Token.Id = switch (text[index]) {
'=' => b: {
index += 1;
break :b .operator_xor_assign;
},
else => .operator_xor,
};
break :blk token_id;
},
'?' => blk: {
index += 1;
break :blk .operator_optional;
},
'$' => blk: {
index += 1;
break :blk .operator_dollar;
},
else => |ch| {
                std.debug.panic("Lexer: unhandled character '{c}'", .{ch});
},
};
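        // Emit the token: record kind, byte offset, length, and line, then log
        // it with a column computed relative to the start of its line.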
const end_index = index;
const token_length = end_index - start_index;
token_buffer.tokens.appendAssumeCapacity(.{
.id = token_id,
.offset = start_index,
.length = token_length,
.line = line_index,
});
const line_offset = token_buffer.line_offsets.items[line_index];
const column = start_index - line_offset;
        logln(.lexer, .new_token, "Token at line {}, column {}, byte offset {}, length {}, line offset {} ({s})", .{line_index, column, start_index, token_length, line_offset, @tagName(token_id)});
}
logln(.lexer, .end, "END LEXER - TOKEN OFFSET: {} - LINE OFFSET: {}", .{Token.unwrap(lexer.offset), lexer.line_offset});
lexer.count = Token.sub(token_buffer.getOffset(), lexer.offset);
lexer.line_count = token_buffer.getLineOffset() - lexer.line_offset;
const time_end = std.time.Instant.now() catch unreachable;
lexer.time = time_end.since(time_start);
return lexer;
}