feat: basic tokenizer

Brett Broadhurst 2026-02-26 18:10:18 -07:00
parent 94ccd1999d
commit 662c38b360
2 changed files with 589 additions and 0 deletions


@@ -1,4 +1,5 @@
const std = @import("std");
const tokenizer = @import("tokenizer.zig");
pub const Story = struct {
pub const LoadOptions = struct {};
@@ -14,3 +15,6 @@ pub const Story = struct {
pub fn deinit(_: *Story) void {}
};
test {
_ = tokenizer;
}

src/tokenizer.zig (new file, 585 additions)

@@ -0,0 +1,585 @@
const std = @import("std");
const assert = std.debug.assert;
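/// Reports whether `ch` may appear in an identifier: ASCII letters, digits, or underscore.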
fn isIdentifier(ch: u8) bool {
return switch (ch) {
'a'...'z', 'A'...'Z', '0'...'9', '_' => true,
else => false,
};
}
pub const Token = struct {
tag: Tag,
loc: Location,
pub const Location = struct {
start: usize,
end: usize,
};
pub const Tag = enum {
eof,
newline,
ampersand,
ampersand_ampersand,
exclaimation_mark,
exclaimation_question,
caret,
colon,
comma,
dot,
double_quote,
equal,
equal_equal,
many_equal,
not_equal,
glue,
greater_than_equal,
greater_than,
identifier,
left_arrow,
left_brace,
left_bracket,
left_paren,
less_than_equal,
less_than,
minus,
minus_equal,
minus_minus,
number_literal,
percentage,
pipe,
pipe_pipe,
plus,
plus_equal,
plus_plus,
pound,
question_mark,
right_arrow,
right_brace,
right_bracket,
right_paren,
slash,
star,
string,
tilde,
whitespace,
keyword_and,
keyword_const,
keyword_else,
keyword_false,
keyword_function,
keyword_list,
keyword_mod,
keyword_not,
keyword_or,
keyword_ref,
keyword_return,
keyword_temp,
keyword_true,
keyword_var,
invalid,
};
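/// Keyword lookup table. Matching is exact and case-sensitive, so CONST, LIST, and VAR must be uppercase.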
pub const keywords = std.StaticStringMap(Tag).initComptime(.{
.{ "and", .keyword_and },
.{ "CONST", .keyword_const },
.{ "else", .keyword_else },
.{ "false", .keyword_false },
.{ "function", .keyword_function },
.{ "LIST", .keyword_list },
.{ "mod", .keyword_mod },
.{ "not", .keyword_not },
.{ "or", .keyword_or },
.{ "ref", .keyword_ref },
.{ "return", .keyword_return },
.{ "temp", .keyword_temp },
.{ "true", .keyword_true },
.{ "VAR", .keyword_var },
});
pub fn getKeyword(bytes: []const u8) ?Tag {
return keywords.get(bytes);
}
};
pub const Tokenizer = struct {
buffer: [:0]const u8,
is_line_start: bool = true,
index: usize,
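/// Selects how ambiguous bytes are scanned: `.content` treats printable runs as
/// prose words (emitted as `.string`), while `.expression` scans identifiers,
/// numbers, and keywords.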
pub const Grammar = enum {
content,
expression,
};
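/// Scanner states; `next` walks these with a labeled switch.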
pub const State = enum {
start,
minus,
slash,
equal,
bang,
less_than,
greater_than,
word,
identifier,
number,
number_dot,
number_decimal,
whitespace,
newline,
line_comment,
block_comment,
block_comment_end,
invalid,
};
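/// Initializes a tokenizer over a null-terminated buffer, skipping a leading
/// UTF-8 byte-order mark if present.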
pub fn init(buffer: [:0]const u8) Tokenizer {
const tokenizer: Tokenizer = .{
.buffer = buffer,
.index = if (std.mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else 0,
};
return tokenizer;
}
/// Process and return the next token.
///
/// After this returns invalid, it will reset on the next newline,
/// returning tokens starting from there. An eof token will always be
/// returned at the end.
pub fn next(self: *Tokenizer, grammar: Grammar) Token {
var result: Token = .{
.tag = undefined,
.loc = .{
.start = self.index,
.end = undefined,
},
};
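// State machine via labeled switch: each state inspects the byte at `self.index`
// and either assigns the token's tag and falls through, or continues to another state.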
state: switch (State.start) {
.start => switch (self.buffer[self.index]) {
0 => {
if (self.index != self.buffer.len) {
continue :state .invalid;
} else return .{
.tag = .eof,
.loc = .{
.start = self.index,
.end = self.index,
},
};
},
'\n' => continue :state .newline,
' ', '\t' => {
result.tag = .whitespace;
continue :state .whitespace;
},
'~' => {
self.index += 1;
result.tag = .tilde;
},
':' => {
self.index += 1;
result.tag = .colon;
},
'!' => {
self.index += 1;
continue :state .bang;
},
'"' => {
self.index += 1;
result.tag = .double_quote;
},
'=' => {
self.index += 1;
continue :state .equal;
},
'+' => {
self.index += 1;
result.tag = .plus;
},
'-' => {
self.index += 1;
continue :state .minus;
},
'*' => {
self.index += 1;
result.tag = .star;
},
'/' => {
self.index += 1;
continue :state .slash;
},
',' => {
self.index += 1;
result.tag = .comma;
},
'%' => {
self.index += 1;
result.tag = .percentage;
},
'?' => {
self.index += 1;
result.tag = .question_mark;
},
'|' => {
self.index += 1;
result.tag = .pipe;
},
'(' => {
self.index += 1;
result.tag = .left_paren;
},
')' => {
self.index += 1;
result.tag = .right_paren;
},
'<' => {
self.index += 1;
continue :state .less_than;
},
'>' => {
self.index += 1;
continue :state .greater_than;
},
'.' => {
self.index += 1;
result.tag = .dot;
},
'[' => {
self.index += 1;
result.tag = .left_bracket;
},
']' => {
self.index += 1;
result.tag = .right_bracket;
},
'{' => {
self.index += 1;
result.tag = .left_brace;
},
'}' => {
self.index += 1;
result.tag = .right_brace;
},
else => {
result.loc.start = self.index;
switch (grammar) {
.expression => switch (self.buffer[self.index]) {
'a'...'z', 'A'...'Z' => continue :state .identifier,
'0'...'9' => continue :state .number,
else => continue :state .invalid,
},
.content => {
if (std.ascii.isPrint(self.buffer[self.index])) {
continue :state .word;
} else {
continue :state .invalid;
}
},
}
},
},
.minus => switch (self.buffer[self.index]) {
'>' => {
self.index += 1;
result.tag = .right_arrow;
},
else => result.tag = .minus,
},
.slash => switch (self.buffer[self.index]) {
'/' => continue :state .line_comment,
'*' => continue :state .block_comment,
else => result.tag = .slash,
},
.equal => switch (grammar) {
.expression => switch (self.buffer[self.index]) {
'=' => {
self.index += 1;
result.tag = .equal_equal;
},
else => result.tag = .equal,
},
else => result.tag = .equal,
},
.bang => switch (self.buffer[self.index]) {
'=' => {
self.index += 1;
result.tag = .not_equal;
},
else => result.tag = .exclaimation_mark,
},
.less_than => switch (self.buffer[self.index]) {
'=' => {
self.index += 1;
result.tag = .less_than_equal;
},
'-' => {
self.index += 1;
result.tag = .left_arrow;
},
'>' => {
self.index += 1;
result.tag = .glue;
},
else => result.tag = .less_than,
},
.greater_than => switch (self.buffer[self.index]) {
'=' => {
self.index += 1;
result.tag = .greater_than_equal;
},
else => result.tag = .greater_than,
},
.word => {
self.index += 1;
switch (self.buffer[self.index]) {
'0'...'9', 'a'...'z', 'A'...'Z' => continue :state .word,
else => {
const ident = self.buffer[result.loc.start..self.index];
if (Token.getKeyword(ident)) |tag| {
result.tag = tag;
} else {
result.tag = .string;
}
},
}
},
.number => {
self.index += 1;
switch (self.buffer[self.index]) {
'.' => continue :state .number_dot,
'a'...'z', 'A'...'Z', '_' => continue :state .identifier,
'0'...'9' => continue :state .number,
else => result.tag = .number_literal,
}
},
.number_dot => {
self.index += 1;
switch (self.buffer[self.index]) {
'0'...'9' => continue :state .number_decimal,
else => {
self.index -= 1;
continue :state .invalid;
},
}
},
.number_decimal => {
self.index += 1;
switch (self.buffer[self.index]) {
'0'...'9' => continue :state .number_decimal,
else => result.tag = .number_literal,
}
},
.identifier => {
self.index += 1;
switch (self.buffer[self.index]) {
'a'...'z', 'A'...'Z', '0'...'9', '_' => continue :state .identifier,
else => switch (grammar) {
.expression => {
const ident = self.buffer[result.loc.start..self.index];
if (Token.getKeyword(ident)) |tag| {
result.tag = tag;
} else {
result.tag = .identifier;
}
},
.content => result.tag = .identifier,
},
}
},
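// Runs of spaces/tabs are skipped (not emitted) when `is_line_start` is set
// or when scanning expressions; otherwise they produce a whitespace token.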
.whitespace => {
self.index += 1;
switch (self.buffer[self.index]) {
' ', '\t' => continue :state .whitespace,
else => {
if (self.is_line_start or grammar == .expression) {
result.loc.start = self.index;
continue :state .start;
}
result.tag = .whitespace;
},
}
},
.newline => {
self.index += 1;
switch (self.buffer[self.index]) {
'\n' => continue :state .newline,
else => {
if (self.is_line_start) {
self.is_line_start = false;
result.loc.start = self.index;
continue :state .start;
} else {
result.tag = .newline;
}
},
}
},
.line_comment => {
self.index += 1;
switch (self.buffer[self.index]) {
0 => {
if (self.index != self.buffer.len) {
continue :state .invalid;
} else return .{
.tag = .eof,
.loc = .{
.start = self.index,
.end = self.index,
},
};
},
'\n' => {
self.is_line_start = true;
result.loc.start = self.index;
continue :state .start;
},
0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => continue :state .invalid,
else => continue :state .line_comment,
}
},
.block_comment => {
self.index += 1;
switch (self.buffer[self.index]) {
0 => {
if (self.index != self.buffer.len) {
continue :state .invalid;
} else return .{
.tag = .eof,
.loc = .{
.start = self.index,
.end = self.index,
},
};
},
'*' => {
self.index += 1;
continue :state .block_comment_end;
},
else => {
continue :state .block_comment;
},
}
},
.block_comment_end => switch (self.buffer[self.index]) {
0 => {
if (self.index != self.buffer.len) {
continue :state .invalid;
} else return .{
.tag = .eof,
.loc = .{
.start = self.index,
.end = self.index,
},
};
},
'/' => {
self.index += 1;
continue :state .start;
},
else => continue :state .block_comment,
},
.invalid => {
self.index += 1;
switch (self.buffer[self.index]) {
0 => if (self.index == self.buffer.len) {
result.tag = .invalid;
} else {
continue :state .invalid;
},
'\n' => result.tag = .invalid,
else => continue :state .invalid,
}
},
}
if (self.is_line_start) self.is_line_start = false;
result.loc.end = self.index;
return result;
}
};
test "UTF-8 BOM is recognized and skipped" {
try testTokenize("\xEF\xBB\xBFHello\n", &.{ .string, .newline });
}
test "line comments" {
try testTokenize("//", &.{});
try testTokenize("// a / b", &.{});
try testTokenize("// /", &.{});
try testTokenize("/// a", &.{});
try testTokenize("////", &.{});
}
test "null byte before eof" {
try testTokenize("123 \x00 456", &.{ .string, .whitespace, .invalid });
try testTokenize("//\x00", &.{.invalid});
try testTokenize("\x00", &.{.invalid});
try testTokenize("// NUL\x00\n", &.{ .invalid, .newline });
}
test "content" {
try testTokenize("Hello, world!", &.{
.string,
.comma,
.whitespace,
.string,
.exclaimation_mark,
});
}
test "number literals" {
try testTokenizeWithMode("0", &.{.number_literal}, .expression);
try testTokenizeWithMode("1", &.{.number_literal}, .expression);
try testTokenizeWithMode("2", &.{.number_literal}, .expression);
try testTokenizeWithMode("3", &.{.number_literal}, .expression);
try testTokenizeWithMode("4", &.{.number_literal}, .expression);
try testTokenizeWithMode("5", &.{.number_literal}, .expression);
try testTokenizeWithMode("6", &.{.number_literal}, .expression);
try testTokenizeWithMode("7", &.{.number_literal}, .expression);
try testTokenizeWithMode("8", &.{.number_literal}, .expression);
try testTokenizeWithMode("9", &.{.number_literal}, .expression);
try testTokenizeWithMode("0.0", &.{.number_literal}, .expression);
try testTokenizeWithMode("1.0", &.{.number_literal}, .expression);
try testTokenizeWithMode("10.0", &.{.number_literal}, .expression);
}
test "expressions" {
try testTokenizeWithMode("1 + 2 * 3 - 4", &.{
.number_literal,
.plus,
.number_literal,
.star,
.number_literal,
.minus,
.number_literal,
}, .expression);
try testTokenizeWithMode("a = 123", &.{
.identifier,
.equal,
.number_literal,
}, .expression);
}
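/// Tokenizes `source` with the given grammar, checks each expected tag in order,
/// then expects a final eof token positioned at the end of the buffer.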
fn testTokenizeWithMode(
source: [:0]const u8,
expected_token_tags: []const Token.Tag,
grammar: Tokenizer.Grammar,
) !void {
var tokenizer = Tokenizer.init(source);
for (expected_token_tags) |expected_token_tag| {
const token = tokenizer.next(grammar);
try std.testing.expectEqual(expected_token_tag, token.tag);
}
const last_token = tokenizer.next(grammar);
try std.testing.expectEqual(Token.Tag.eof, last_token.tag);
try std.testing.expectEqual(source.len, last_token.loc.start);
try std.testing.expectEqual(source.len, last_token.loc.end);
}
fn testTokenize(source: [:0]const u8, expected_token_tags: []const Token.Tag) !void {
return testTokenizeWithMode(source, expected_token_tags, .content);
}