const std = @import("std");
const assert = std.debug.assert;

/// Returns true if `ch` may appear inside an identifier:
/// ASCII letters, digits, and underscore.
fn isIdentifier(ch: u8) bool {
    return switch (ch) {
        'a'...'z', 'A'...'Z', '0'...'9', '_' => true,
        else => false,
    };
}

pub const Token = struct {
    tag: Tag,
    loc: Location,

    /// Byte offsets into the source buffer; `start` inclusive, `end` exclusive.
    pub const Location = struct {
        start: usize,
        end: usize,
    };

    pub const Tag = enum {
        eof,
        newline,
        ampersand,
        ampersand_ampersand,
        exclaimation_mark,
        exclaimation_question,
        caret,
        colon,
        comma,
        dot,
        double_quote,
        equal,
        equal_equal,
        many_equal,
        not_equal,
        glue,
        greater_than_equal,
        greater_than,
        identifier,
        left_arrow,
        left_brace,
        left_bracket,
        left_paren,
        less_than_equal,
        less_than,
        minus,
        minus_equal,
        minus_minus,
        number_literal,
        percentage,
        pipe,
        pipe_pipe,
        plus,
        plus_equal,
        plus_plus,
        pound,
        question_mark,
        right_arrow,
        right_brace,
        right_bracket,
        right_paren,
        slash,
        star,
        string,
        tilde,
        whitespace,
        keyword_and,
        keyword_const,
        keyword_else,
        keyword_false,
        keyword_function,
        keyword_list,
        keyword_mod,
        keyword_not,
        keyword_or,
        keyword_ref,
        keyword_return,
        keyword_temp,
        keyword_true,
        keyword_var,
        invalid,
    };

    /// Reserved words. Casing is significant: CONST/LIST/VAR are
    /// upper-case only, the rest lower-case only.
    pub const keywords = std.StaticStringMap(Tag).initComptime(.{
        .{ "and", .keyword_and },
        .{ "CONST", .keyword_const },
        .{ "else", .keyword_else },
        .{ "false", .keyword_false },
        .{ "function", .keyword_function },
        .{ "LIST", .keyword_list },
        .{ "mod", .keyword_mod },
        .{ "not", .keyword_not },
        .{ "or", .keyword_or },
        .{ "ref", .keyword_ref },
        .{ "return", .keyword_return },
        .{ "temp", .keyword_temp },
        .{ "true", .keyword_true },
        .{ "VAR", .keyword_var },
    });

    /// Returns the keyword tag for `bytes`, or null if it is not a keyword.
    pub fn getKeyword(bytes: []const u8) ?Tag {
        return keywords.get(bytes);
    }
};

pub const Tokenizer = struct {
    /// Null-terminated source; the 0 sentinel doubles as the EOF marker.
    buffer: [:0]const u8,
    /// True while no token has been produced on the current line; used to
    /// swallow leading whitespace/newlines instead of emitting tokens for them.
    is_line_start: bool = true,
    index: usize,

    /// Which sub-language is being scanned. `content` treats arbitrary
    /// printable runs as words; `expression` expects identifiers, numbers,
    /// and multi-character operators such as `==`.
    pub const Grammar = enum {
        content,
        expression,
    };

    pub const State = enum {
        start,
        minus,
        slash,
        equal,
        bang,
        less_than,
        greater_than,
        word,
        identifier,
        number,
        number_dot,
        number_decimal,
        whitespace,
        newline,
        line_comment,
        block_comment,
        block_comment_end,
        invalid,
    };

    /// Starts scanning `buffer`, skipping a leading UTF-8 BOM if present.
    pub fn init(buffer: [:0]const u8) Tokenizer {
        const tokenizer: Tokenizer = .{
            .buffer = buffer,
            .index = if (std.mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else 0,
        };
        return tokenizer;
    }

    /// Process and return the next token.
    ///
    /// After this returns invalid, it will reset on the next newline,
    /// returning tokens starting from there. An eof token will always be
    /// returned at the end.
    pub fn next(self: *Tokenizer, grammar: Grammar) Token {
        var result: Token = .{
            .tag = undefined,
            .loc = .{
                .start = self.index,
                .end = undefined,
            },
        };

        state: switch (State.start) {
            .start => switch (self.buffer[self.index]) {
                0 => {
                    // A 0 byte is EOF only at the true end of the buffer;
                    // an embedded NUL is invalid input.
                    if (self.index != self.buffer.len) {
                        continue :state .invalid;
                    } else return .{
                        .tag = .eof,
                        .loc = .{
                            .start = self.index,
                            .end = self.index,
                        },
                    };
                },
                '\n' => continue :state .newline,
                ' ', '\t' => {
                    result.tag = .whitespace;
                    continue :state .whitespace;
                },
                '~' => {
                    self.index += 1;
                    result.tag = .tilde;
                },
                ':' => {
                    self.index += 1;
                    result.tag = .colon;
                },
                '!' => {
                    self.index += 1;
                    continue :state .bang;
                },
                '"' => {
                    self.index += 1;
                    result.tag = .double_quote;
                },
                '=' => {
                    self.index += 1;
                    continue :state .equal;
                },
                '+' => {
                    self.index += 1;
                    result.tag = .plus;
                },
                '-' => {
                    self.index += 1;
                    continue :state .minus;
                },
                '*' => {
                    self.index += 1;
                    result.tag = .star;
                },
                '/' => {
                    self.index += 1;
                    continue :state .slash;
                },
                ',' => {
                    self.index += 1;
                    result.tag = .comma;
                },
                '%' => {
                    self.index += 1;
                    result.tag = .percentage;
                },
                '?' => {
                    self.index += 1;
                    result.tag = .question_mark;
                },
                '|' => {
                    self.index += 1;
                    result.tag = .pipe;
                },
                '(' => {
                    self.index += 1;
                    result.tag = .left_paren;
                },
                ')' => {
                    self.index += 1;
                    result.tag = .right_paren;
                },
                '<' => {
                    self.index += 1;
                    continue :state .less_than;
                },
                '>' => {
                    self.index += 1;
                    continue :state .greater_than;
                },
                '.' => {
                    self.index += 1;
                    result.tag = .dot;
                },
                '[' => {
                    self.index += 1;
                    result.tag = .left_bracket;
                },
                ']' => {
                    self.index += 1;
                    result.tag = .right_bracket;
                },
                '{' => {
                    self.index += 1;
                    result.tag = .left_brace;
                },
                '}' => {
                    self.index += 1;
                    result.tag = .right_brace;
                },
                else => {
                    // Not a structural character: grammar decides whether
                    // this starts an identifier/number or a free-form word.
                    result.loc.start = self.index;
                    switch (grammar) {
                        .expression => switch (self.buffer[self.index]) {
                            'a'...'z', 'A'...'Z' => continue :state .identifier,
                            '0'...'9' => continue :state .number,
                            else => continue :state .invalid,
                        },
                        .content => {
                            if (std.ascii.isPrint(self.buffer[self.index])) {
                                continue :state .word;
                            } else {
                                continue :state .invalid;
                            }
                        },
                    }
                },
            },
            .minus => switch (self.buffer[self.index]) {
                '>' => {
                    self.index += 1;
                    result.tag = .right_arrow;
                },
                else => result.tag = .minus,
            },
            .slash => switch (self.buffer[self.index]) {
                '/' => continue :state .line_comment,
                '*' => continue :state .block_comment,
                else => result.tag = .slash,
            },
            .equal => switch (grammar) {
                // `==` is only an operator inside expressions; in content
                // a lone `=` is returned and the next `=` scans separately.
                .expression => switch (self.buffer[self.index]) {
                    '=' => {
                        self.index += 1;
                        result.tag = .equal_equal;
                    },
                    else => result.tag = .equal,
                },
                else => result.tag = .equal,
            },
            .bang => switch (self.buffer[self.index]) {
                '=' => {
                    self.index += 1;
                    // Fixed: `!=` previously mis-tagged as .equal.
                    result.tag = .not_equal;
                },
                else => result.tag = .exclaimation_mark,
            },
            .less_than => switch (self.buffer[self.index]) {
                '=' => {
                    self.index += 1;
                    result.tag = .less_than_equal;
                },
                '-' => {
                    self.index += 1;
                    result.tag = .left_arrow;
                },
                '>' => {
                    self.index += 1;
                    result.tag = .glue;
                },
                else => result.tag = .less_than,
            },
            .greater_than => switch (self.buffer[self.index]) {
                '=' => {
                    self.index += 1;
                    result.tag = .greater_than_equal;
                },
                else => result.tag = .greater_than,
            },
            .word => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '0'...'9', 'a'...'z', 'A'...'Z' => continue :state .word,
                    else => {
                        const ident = self.buffer[result.loc.start..self.index];
                        if (Token.getKeyword(ident)) |tag| {
                            result.tag = tag;
                        } else {
                            result.tag = .string;
                        }
                    },
                }
            },
            .number => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '.' => continue :state .number_dot,
                    // A trailing alpha run turns `123abc` into an identifier.
                    'a'...'z', 'A'...'Z', '_' => continue :state .identifier,
                    '0'...'9' => continue :state .number,
                    else => result.tag = .number_literal,
                }
            },
            .number_dot => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '0'...'9' => continue :state .number_decimal,
                    else => {
                        // `1.` with no fraction digits: back up so .invalid
                        // re-examines the dot, then poison to end of line.
                        self.index -= 1;
                        continue :state .invalid;
                    },
                }
            },
            .number_decimal => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '0'...'9' => continue :state .number_decimal,
                    else => result.tag = .number_literal,
                }
            },
            .identifier => {
                self.index += 1;
                if (isIdentifier(self.buffer[self.index])) {
                    continue :state .identifier;
                }
                switch (grammar) {
                    .expression => {
                        const ident = self.buffer[result.loc.start..self.index];
                        if (Token.getKeyword(ident)) |tag| {
                            result.tag = tag;
                        } else {
                            result.tag = .identifier;
                        }
                    },
                    .content => result.tag = .identifier,
                }
            },
            .whitespace => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    ' ', '\t' => continue :state .whitespace,
                    else => {
                        // Indentation and expression-internal spacing are
                        // skipped; only mid-line content whitespace is a token.
                        if (self.is_line_start or grammar == .expression) {
                            result.loc.start = self.index;
                            continue :state .start;
                        }
                        result.tag = .whitespace;
                    },
                }
            },
            .newline => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '\n' => continue :state .newline,
                    else => {
                        // Blank lines before any token on a line are skipped.
                        if (self.is_line_start) {
                            self.is_line_start = false;
                            result.loc.start = self.index;
                            continue :state .start;
                        } else {
                            result.tag = .newline;
                        }
                    },
                }
            },
            .line_comment => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    0 => {
                        if (self.index != self.buffer.len) {
                            continue :state .invalid;
                        } else return .{
                            .tag = .eof,
                            .loc = .{
                                .start = self.index,
                                .end = self.index,
                            },
                        };
                    },
                    '\n' => {
                        self.is_line_start = true;
                        result.loc.start = self.index;
                        continue :state .start;
                    },
                    // Control characters are not allowed inside comments.
                    0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => continue :state .invalid,
                    else => continue :state .line_comment,
                }
            },
            .block_comment => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    0 => {
                        if (self.index != self.buffer.len) {
                            continue :state .invalid;
                        } else return .{
                            .tag = .eof,
                            .loc = .{
                                .start = self.index,
                                .end = self.index,
                            },
                        };
                    },
                    '*' => {
                        self.index += 1;
                        continue :state .block_comment_end;
                    },
                    else => {
                        continue :state .block_comment;
                    },
                }
            },
            .block_comment_end => switch (self.buffer[self.index]) {
                0 => {
                    if (self.index != self.buffer.len) {
                        continue :state .invalid;
                    } else return .{
                        .tag = .eof,
                        .loc = .{
                            .start = self.index,
                            .end = self.index,
                        },
                    };
                },
                '/' => {
                    self.index += 1;
                    continue :state .start;
                },
                '*' => {
                    // Fixed: consecutive stars (`/***/`) must keep us in the
                    // end state; falling back to .block_comment consumed the
                    // closing `/` and swallowed the rest of the buffer.
                    self.index += 1;
                    continue :state .block_comment_end;
                },
                else => continue :state .block_comment,
            },
            .invalid => {
                // Consume everything up to the next newline (or EOF) and
                // report the whole run as a single invalid token.
                self.index += 1;
                switch (self.buffer[self.index]) {
                    0 => if (self.index == self.buffer.len) {
                        result.tag = .invalid;
                    } else {
                        continue :state .invalid;
                    },
                    '\n' => result.tag = .invalid,
                    else => continue :state .invalid,
                }
            },
        }

        if (self.is_line_start) self.is_line_start = false;
        result.loc.end = self.index;
        return result;
    }
};

test "UTF-8 BOM is recognized and skipped" {
    try testTokenize("\xEF\xBB\xBFHello\n", &.{ .string, .newline });
}

test "line comments" {
    try testTokenize("//", &.{});
    try testTokenize("// a / b", &.{});
    try testTokenize("// /", &.{});
    try testTokenize("/// a", &.{});
    try testTokenize("////", &.{});
}

test "block comments" {
    try testTokenize("/**/", &.{});
    try testTokenize("/* a */", &.{});
    try testTokenize("/***/", &.{});
    try testTokenize("/* ** */", &.{});
}

test "null byte before eof" {
    try testTokenize("123 \x00 456", &.{ .string, .whitespace, .invalid });
    try testTokenize("//\x00", &.{.invalid});
    try testTokenize("\x00", &.{.invalid});
    try testTokenize("// NUL\x00\n", &.{ .invalid, .newline });
}

test "content" {
    try testTokenize("Hello, world!", &.{
        .string,
        .comma,
        .whitespace,
        .string,
        .exclaimation_mark,
    });
}

test "number literals" {
    try testTokenizeWithMode("0", &.{.number_literal}, .expression);
    try testTokenizeWithMode("1", &.{.number_literal}, .expression);
    try testTokenizeWithMode("2", &.{.number_literal}, .expression);
    try testTokenizeWithMode("3", &.{.number_literal}, .expression);
    try testTokenizeWithMode("4", &.{.number_literal}, .expression);
    try testTokenizeWithMode("5", &.{.number_literal}, .expression);
    try testTokenizeWithMode("6", &.{.number_literal}, .expression);
    try testTokenizeWithMode("7", &.{.number_literal}, .expression);
    try testTokenizeWithMode("8", &.{.number_literal}, .expression);
    try testTokenizeWithMode("9", &.{.number_literal}, .expression);

    try testTokenizeWithMode("0.0", &.{.number_literal}, .expression);
    try testTokenizeWithMode("1.0", &.{.number_literal}, .expression);
    try testTokenizeWithMode("10.0", &.{.number_literal}, .expression);
}

test "expressions" {
    try testTokenizeWithMode("1 + 2 * 3 - 4", &.{
        .number_literal,
        .plus,
        .number_literal,
        .star,
        .number_literal,
        .minus,
        .number_literal,
    }, .expression);
    try testTokenizeWithMode("a = 123", &.{
        .identifier,
        .equal,
        .number_literal,
    }, .expression);
    try testTokenizeWithMode("a != b", &.{
        .identifier,
        .not_equal,
        .identifier,
    }, .expression);
}

fn testTokenizeWithMode(
    source: [:0]const u8,
    expected_token_tags: []const Token.Tag,
    grammar: Tokenizer.Grammar,
) !void {
    var tokenizer = Tokenizer.init(source);

    for (expected_token_tags) |expected_token_tag| {
        const token = tokenizer.next(grammar);
        try std.testing.expectEqual(expected_token_tag, token.tag);
    }

    const last_token = tokenizer.next(grammar);
    try std.testing.expectEqual(Token.Tag.eof, last_token.tag);
    try std.testing.expectEqual(source.len, last_token.loc.start);
    try std.testing.expectEqual(source.len, last_token.loc.end);
}

fn testTokenize(source: [:0]const u8, expected_token_tags: []const Token.Tag) !void {
    return testTokenizeWithMode(source, expected_token_tags, .content);
}