feat: basic tokenizer
This commit is contained in:
parent
94ccd1999d
commit
662c38b360
2 changed files with 589 additions and 0 deletions
|
|
@ -1,4 +1,5 @@
|
|||
const std = @import("std");
|
||||
const tokenizer = @import("tokenizer.zig");
|
||||
|
||||
pub const Story = struct {
|
||||
pub const LoadOptions = struct {};
|
||||
|
|
@ -14,3 +15,6 @@ pub const Story = struct {
|
|||
pub fn deinit(_: *Story) void {}
|
||||
};
|
||||
|
||||
test {
|
||||
_ = tokenizer;
|
||||
}
|
||||
|
|
|
|||
585
src/tokenizer.zig
Normal file
585
src/tokenizer.zig
Normal file
|
|
@ -0,0 +1,585 @@
|
|||
const std = @import("std");
|
||||
const assert = std.debug.assert;
|
||||
|
||||
/// Reports whether `ch` may appear in an identifier: an ASCII letter,
/// digit, or underscore.
/// NOTE(review): currently unused — the `.identifier` tokenizer state
/// inlines the same character ranges.
fn isIdentifier(ch: u8) bool {
    return std.ascii.isAlphanumeric(ch) or ch == '_';
}
|
||||
|
||||
/// A single lexical token: a tag plus its byte range in the source buffer.
pub const Token = struct {
    tag: Tag,
    loc: Location,

    /// Byte offsets into the tokenizer's buffer; `start` is inclusive and
    /// `end` is exclusive (the tokenizer sets `end` to the index one past
    /// the last consumed byte).
    pub const Location = struct {
        start: usize,
        end: usize,
    };

    pub const Tag = enum {
        eof,
        newline,
        // Operators and punctuation. NOTE(review): several of these
        // (ampersand, caret, pound, many_equal, plus/minus compound forms,
        // exclaimation_question, ...) are declared but not yet produced by
        // the tokenizer — presumably reserved for upcoming states; confirm.
        ampersand,
        ampersand_ampersand,
        exclaimation_mark,
        exclaimation_question,
        caret,
        colon,
        comma,
        dot,
        double_quote,
        equal,
        equal_equal,
        many_equal,
        not_equal,
        glue, // produced for `<>`
        greater_than_equal,
        greater_than,
        identifier,
        left_arrow, // produced for `<-`
        left_brace,
        left_bracket,
        left_paren,
        less_than_equal,
        less_than,
        minus,
        minus_equal,
        minus_minus,
        number_literal,
        percentage,
        pipe,
        pipe_pipe,
        plus,
        plus_equal,
        plus_plus,
        pound,
        question_mark,
        right_arrow, // produced for `->`
        right_brace,
        right_bracket,
        right_paren,
        slash,
        star,
        string,
        tilde,
        whitespace,
        // Keywords (see `keywords` below for their spellings).
        keyword_and,
        keyword_const,
        keyword_else,
        keyword_false,
        keyword_function,
        keyword_list,
        keyword_mod,
        keyword_not,
        keyword_or,
        keyword_ref,
        keyword_return,
        keyword_temp,
        keyword_true,
        keyword_var,
        invalid,
    };

    /// Comptime map from keyword spelling to tag. The lookup is
    /// case-sensitive. NOTE(review): the mixed casing (uppercase CONST /
    /// LIST / VAR, lowercase everything else) appears intentional — these
    /// match the canonical spellings of the source language's declaration
    /// keywords; confirm against the language spec.
    pub const keywords = std.StaticStringMap(Tag).initComptime(.{
        .{ "and", .keyword_and },
        .{ "CONST", .keyword_const },
        .{ "else", .keyword_else },
        .{ "false", .keyword_false },
        .{ "function", .keyword_function },
        .{ "LIST", .keyword_list },
        .{ "mod", .keyword_mod },
        .{ "not", .keyword_not },
        .{ "or", .keyword_or },
        .{ "ref", .keyword_ref },
        .{ "return", .keyword_return },
        .{ "temp", .keyword_temp },
        .{ "true", .keyword_true },
        .{ "VAR", .keyword_var },
    });

    /// Returns the keyword tag for `bytes`, or null when it is not a keyword.
    pub fn getKeyword(bytes: []const u8) ?Tag {
        return keywords.get(bytes);
    }
};
|
||||
|
||||
/// State-machine tokenizer over a null-terminated source buffer.
/// The sentinel 0 byte doubles as the end-of-input marker; a 0 byte before
/// `buffer.len` is an embedded NUL and is reported as `.invalid`.
pub const Tokenizer = struct {
    buffer: [:0]const u8,
    /// True while no token has been emitted on the current line; used to
    /// swallow leading whitespace and collapse blank lines.
    is_line_start: bool = true,
    index: usize,

    /// Which sub-grammar the caller is lexing: free-form `content` text or
    /// a logic `expression` (where identifiers, numbers, and `==` apply).
    pub const Grammar = enum {
        content,
        expression,
    };

    pub const State = enum {
        start,
        minus,
        slash,
        equal,
        bang,
        less_than,
        greater_than,
        word,
        identifier,
        number,
        number_dot,
        number_decimal,
        whitespace,
        newline,
        line_comment,
        block_comment,
        block_comment_end,
        invalid,
    };

    /// Create a tokenizer positioned at the start of `buffer`,
    /// skipping a leading UTF-8 byte-order mark if present.
    pub fn init(buffer: [:0]const u8) Tokenizer {
        const tokenizer: Tokenizer = .{
            .buffer = buffer,
            // Skip the UTF-8 BOM if present.
            .index = if (std.mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else 0,
        };
        return tokenizer;
    }

    /// Process and return the next token.
    ///
    /// After this returns invalid, it will reset on the next newline,
    /// returning tokens starting from there. An eof token will always be
    /// returned at the end.
    pub fn next(self: *Tokenizer, grammar: Grammar) Token {
        var result: Token = .{
            .tag = undefined,
            .loc = .{
                .start = self.index,
                .end = undefined,
            },
        };

        state: switch (State.start) {
            .start => switch (self.buffer[self.index]) {
                0 => {
                    // A 0 byte is eof only at the very end of the buffer;
                    // anywhere earlier it is an embedded NUL.
                    if (self.index != self.buffer.len) {
                        continue :state .invalid;
                    } else return .{
                        .tag = .eof,
                        .loc = .{
                            .start = self.index,
                            .end = self.index,
                        },
                    };
                },
                '\n' => continue :state .newline,
                ' ', '\t' => {
                    result.tag = .whitespace;
                    continue :state .whitespace;
                },
                '~' => {
                    self.index += 1;
                    result.tag = .tilde;
                },
                ':' => {
                    self.index += 1;
                    result.tag = .colon;
                },
                '!' => {
                    self.index += 1;
                    continue :state .bang;
                },
                '"' => {
                    self.index += 1;
                    result.tag = .double_quote;
                },
                '=' => {
                    self.index += 1;
                    continue :state .equal;
                },
                '+' => {
                    self.index += 1;
                    result.tag = .plus;
                },
                '-' => {
                    self.index += 1;
                    continue :state .minus;
                },
                '*' => {
                    self.index += 1;
                    result.tag = .star;
                },
                '/' => {
                    self.index += 1;
                    continue :state .slash;
                },
                ',' => {
                    self.index += 1;
                    result.tag = .comma;
                },
                '%' => {
                    self.index += 1;
                    result.tag = .percentage;
                },
                '?' => {
                    self.index += 1;
                    result.tag = .question_mark;
                },
                '|' => {
                    self.index += 1;
                    result.tag = .pipe;
                },
                '(' => {
                    self.index += 1;
                    result.tag = .left_paren;
                },
                ')' => {
                    self.index += 1;
                    result.tag = .right_paren;
                },
                '<' => {
                    self.index += 1;
                    continue :state .less_than;
                },
                '>' => {
                    self.index += 1;
                    continue :state .greater_than;
                },
                '.' => {
                    self.index += 1;
                    result.tag = .dot;
                },
                '[' => {
                    self.index += 1;
                    result.tag = .left_bracket;
                },
                ']' => {
                    self.index += 1;
                    result.tag = .right_bracket;
                },
                '{' => {
                    self.index += 1;
                    result.tag = .left_brace;
                },
                '}' => {
                    self.index += 1;
                    result.tag = .right_brace;
                },
                else => {
                    result.loc.start = self.index;
                    switch (grammar) {
                        // Expressions only admit identifiers and numbers here.
                        .expression => switch (self.buffer[self.index]) {
                            'a'...'z', 'A'...'Z' => continue :state .identifier,
                            '0'...'9' => continue :state .number,
                            else => continue :state .invalid,
                        },
                        // Content accepts any printable ASCII as word text.
                        .content => {
                            if (std.ascii.isPrint(self.buffer[self.index])) {
                                continue :state .word;
                            } else {
                                continue :state .invalid;
                            }
                        },
                    }
                },
            },
            .minus => switch (self.buffer[self.index]) {
                '>' => {
                    self.index += 1;
                    result.tag = .right_arrow;
                },
                else => result.tag = .minus,
            },
            .slash => switch (self.buffer[self.index]) {
                '/' => continue :state .line_comment,
                '*' => continue :state .block_comment,
                else => result.tag = .slash,
            },
            // `==` is only an operator inside expressions; content always
            // produces a bare `.equal`.
            .equal => switch (grammar) {
                .expression => switch (self.buffer[self.index]) {
                    '=' => {
                        self.index += 1;
                        result.tag = .equal_equal;
                    },
                    else => result.tag = .equal,
                },
                else => result.tag = .equal,
            },
            .bang => switch (self.buffer[self.index]) {
                '=' => {
                    self.index += 1;
                    // Fixed: `!=` is the inequality operator; the previous
                    // code tagged it `.equal`, inverting its meaning.
                    result.tag = .not_equal;
                },
                else => result.tag = .exclaimation_mark,
            },
            .less_than => switch (self.buffer[self.index]) {
                '=' => {
                    self.index += 1;
                    result.tag = .less_than_equal;
                },
                '-' => {
                    self.index += 1;
                    result.tag = .left_arrow;
                },
                '>' => {
                    self.index += 1;
                    result.tag = .glue;
                },
                else => result.tag = .less_than,
            },
            .greater_than => switch (self.buffer[self.index]) {
                '=' => {
                    self.index += 1;
                    result.tag = .greater_than_equal;
                },
                else => result.tag = .greater_than,
            },
            // A run of ASCII alphanumerics in content; keywords still apply.
            .word => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '0'...'9', 'a'...'z', 'A'...'Z' => continue :state .word,
                    else => {
                        const ident = self.buffer[result.loc.start..self.index];
                        if (Token.getKeyword(ident)) |tag| {
                            result.tag = tag;
                        } else {
                            result.tag = .string;
                        }
                    },
                }
            },
            .number => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '.' => continue :state .number_dot,
                    // A trailing letter turns the token into an identifier.
                    'a'...'z', 'A'...'Z', '_' => continue :state .identifier,
                    '0'...'9' => continue :state .number,
                    else => result.tag = .number_literal,
                }
            },
            .number_dot => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '0'...'9' => continue :state .number_decimal,
                    else => {
                        // `1.x` — back up so the dot is not consumed twice.
                        self.index -= 1;
                        continue :state .invalid;
                    },
                }
            },
            .number_decimal => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '0'...'9' => continue :state .number_decimal,
                    else => result.tag = .number_literal,
                }
            },
            .identifier => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    'a'...'z', 'A'...'Z', '0'...'9', '_' => continue :state .identifier,
                    else => switch (grammar) {
                        .expression => {
                            const ident = self.buffer[result.loc.start..self.index];
                            if (Token.getKeyword(ident)) |tag| {
                                result.tag = tag;
                            } else {
                                result.tag = .identifier;
                            }
                        },
                        .content => result.tag = .identifier,
                    },
                }
            },
            .whitespace => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    ' ', '\t' => continue :state .whitespace,
                    else => {
                        // Leading whitespace (and all expression whitespace)
                        // is insignificant: skip it and restart.
                        if (self.is_line_start or grammar == .expression) {
                            result.loc.start = self.index;
                            continue :state .start;
                        }
                        result.tag = .whitespace;
                    },
                }
            },
            .newline => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    // Collapse runs of blank lines into one newline token.
                    '\n' => continue :state .newline,
                    else => {
                        if (self.is_line_start) {
                            self.is_line_start = false;
                            result.loc.start = self.index;
                            continue :state .start;
                        } else {
                            result.tag = .newline;
                        }
                    },
                }
            },
            .line_comment => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    0 => {
                        if (self.index != self.buffer.len) {
                            continue :state .invalid;
                        } else return .{
                            .tag = .eof,
                            .loc = .{
                                .start = self.index,
                                .end = self.index,
                            },
                        };
                    },
                    '\n' => {
                        // The comment produced no token; restart after it.
                        self.is_line_start = true;
                        result.loc.start = self.index;
                        continue :state .start;
                    },
                    // Control characters inside a comment are invalid.
                    0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => continue :state .invalid,
                    else => continue :state .line_comment,
                }
            },
            .block_comment => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    0 => {
                        if (self.index != self.buffer.len) {
                            continue :state .invalid;
                        } else return .{
                            .tag = .eof,
                            .loc = .{
                                .start = self.index,
                                .end = self.index,
                            },
                        };
                    },
                    '*' => {
                        self.index += 1;
                        continue :state .block_comment_end;
                    },
                    else => {
                        continue :state .block_comment;
                    },
                }
            },
            .block_comment_end => switch (self.buffer[self.index]) {
                0 => {
                    if (self.index != self.buffer.len) {
                        continue :state .invalid;
                    } else return .{
                        .tag = .eof,
                        .loc = .{
                            .start = self.index,
                            .end = self.index,
                        },
                    };
                },
                '/' => {
                    self.index += 1;
                    // Fixed: like the line-comment path, reset the token
                    // start so the next token does not include the comment.
                    result.loc.start = self.index;
                    continue :state .start;
                },
                '*' => {
                    // Fixed: stay in this state on repeated stars so that
                    // comments ending in `**/` (e.g. `/***/`) still close.
                    // Previously this fell back to .block_comment, which
                    // consumed the `*` before re-checking and skipped the
                    // closing `*/` entirely.
                    self.index += 1;
                    continue :state .block_comment_end;
                },
                else => continue :state .block_comment,
            },
            // Consume everything up to the next newline (or eof) and report
            // one `.invalid` token for the whole run.
            .invalid => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    0 => if (self.index == self.buffer.len) {
                        result.tag = .invalid;
                    } else {
                        continue :state .invalid;
                    },
                    '\n' => result.tag = .invalid,
                    else => continue :state .invalid,
                }
            },
        }

        if (self.is_line_start) self.is_line_start = false;
        result.loc.end = self.index;
        return result;
    }
};
|
||||
|
||||
test "UTF-8 BOM is recognized and skipped" {
    // A byte-order mark before the first word must not leak into the tokens.
    const bom = "\xEF\xBB\xBF";
    try testTokenize(bom ++ "Hello\n", &.{ .string, .newline });
}
|
||||
test "line comments" {
    // Line comments are swallowed entirely: each source yields no tokens
    // before eof.
    const sources = [_][:0]const u8{
        "//",
        "// a / b",
        "// /",
        "/// a",
        "////",
    };
    for (sources) |source| {
        try testTokenize(source, &.{});
    }
}
|
||||
|
||||
test "null byte before eof" {
    // An embedded NUL (0 byte before buffer.len) must tokenize as invalid,
    // never as eof.
    const cases = [_]struct {
        source: [:0]const u8,
        expected: []const Token.Tag,
    }{
        .{ .source = "123 \x00 456", .expected = &.{ .string, .whitespace, .invalid } },
        .{ .source = "//\x00", .expected = &.{.invalid} },
        .{ .source = "\x00", .expected = &.{.invalid} },
        .{ .source = "// NUL\x00\n", .expected = &.{ .invalid, .newline } },
    };
    for (cases) |case| {
        try testTokenize(case.source, case.expected);
    }
}
|
||||
|
||||
test "content" {
    // Content mode splits prose into words, punctuation, and whitespace.
    const expected = [_]Token.Tag{
        .string,
        .comma,
        .whitespace,
        .string,
        .exclaimation_mark,
    };
    try testTokenize("Hello, world!", &expected);
}
|
||||
|
||||
test "number literals" {
    // Every single digit lexes as one number literal.
    const digits = [_][:0]const u8{ "0", "1", "2", "3", "4", "5", "6", "7", "8", "9" };
    for (digits) |digit| {
        try testTokenizeWithMode(digit, &.{.number_literal}, .expression);
    }

    // Decimal forms are a single literal as well.
    const decimals = [_][:0]const u8{ "0.0", "1.0", "10.0" };
    for (decimals) |decimal| {
        try testTokenizeWithMode(decimal, &.{.number_literal}, .expression);
    }
}
|
||||
|
||||
test "expressions" {
    // Arithmetic: whitespace between operands is skipped in expression mode.
    const arithmetic = [_]Token.Tag{
        .number_literal,
        .plus,
        .number_literal,
        .star,
        .number_literal,
        .minus,
        .number_literal,
    };
    try testTokenizeWithMode("1 + 2 * 3 - 4", &arithmetic, .expression);

    // Assignment: identifier, equal sign, literal.
    const assignment = [_]Token.Tag{ .identifier, .equal, .number_literal };
    try testTokenizeWithMode("a = 123", &assignment, .expression);
}
|
||||
|
||||
/// Tokenize `source` under `grammar` and assert the resulting tag stream
/// equals `expected_token_tags`, followed by an eof token whose location
/// sits exactly at the end of the buffer.
fn testTokenizeWithMode(
    source: [:0]const u8,
    expected_token_tags: []const Token.Tag,
    grammar: Tokenizer.Grammar,
) !void {
    var tokenizer = Tokenizer.init(source);
    for (expected_token_tags) |expected| {
        try std.testing.expectEqual(expected, tokenizer.next(grammar).tag);
    }

    // The stream must terminate with eof pinned to the buffer's end.
    const eof_token = tokenizer.next(grammar);
    try std.testing.expectEqual(Token.Tag.eof, eof_token.tag);
    try std.testing.expectEqual(source.len, eof_token.loc.start);
    try std.testing.expectEqual(source.len, eof_token.loc.end);
}
|
||||
|
||||
/// Shorthand for `testTokenizeWithMode` using the `.content` grammar.
fn testTokenize(source: [:0]const u8, expected_token_tags: []const Token.Tag) !void {
    try testTokenizeWithMode(source, expected_token_tags, .content);
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue