<?php

declare(strict_types=1);

/*
 * Provenance: src/Lexer/Lexer.php as introduced by commit
 * 02864c8c29aee80d59cbd7251046f77a3e8e4093 ("lexer"),
 * Daniel Weipert, Tue, 31 Dec 2024 15:17:23 +0100.
 *
 * NOTE(review): the text this file was recovered from had lost everything
 * between the opening PHP tag and the first constructor statement. The class
 * name, the property declarations and the constructor signature below are
 * reconstructed from how the surviving code uses them ($this->input,
 * $this->position, $this->line, $this->column, $this->errors, $input).
 * A namespace declaration, if there was one, could not be recovered.
 * Confirm against the repository before relying on visibility or names.
 */

/**
 * Hand-written lexer that turns a source string into a flat list of Tokens.
 *
 * Recognised lexemes: identifiers ([A-Za-z_]+), numbers (digits, "_" and "."),
 * double-quoted strings, "//" and "/* ... *​/" comments, the two-character
 * operators ==, =>, **, <=, >= and a fixed set of single-character tokens.
 * Whitespace separates tokens and is not emitted. Comments ARE emitted as
 * tokens.
 *
 * Positions are byte based: a multi-byte UTF-8 character advances the column
 * once per byte.
 */
class Lexer
{
    /** Two-character operators, matched before any single-character token. */
    private const TWO_CHAR_OPERATORS = ['==', '=>', '**', '<=', '>='];

    /** Characters that form a token on their own. */
    private const SINGLE_CHAR_TOKENS = [
        ':', ',',
        '+', '-',
        '(', ')',
        '[', ']',
        '{', '}',
        '$',
        '=', '/', '*', '<', '>',
    ];

    /** Characters skipped between tokens. */
    private const WHITESPACE = [' ', "\t", "\r", "\n"];

    /**
     * Messages collected while lexing (each one is also echoed).
     *
     * NOTE(review): visibility reconstructed; public is the backward-compatible choice.
     *
     * @var list<string>
     */
    public array $errors = [];

    private string $input;

    /** Byte offset of the current character in $input. */
    private int $position;

    /** 1-based line of the current character. */
    private int $line;

    /** 1-based column of the current character. */
    private int $column;

    /**
     * @param string $input Complete source text to tokenise.
     */
    public function __construct(string $input)
    {
        $this->input = $input;
        $this->position = 0;
        $this->line = 1;
        $this->column = 1;
    }

    /**
     * Tokenises the remaining input.
     *
     * Unknown characters are reported through $errors (and echoed), skipped,
     * and lexing continues with the next byte.
     *
     * @return list<Token>
     */
    public function lex(): array
    {
        $output = [];
        $length = strlen($this->input);

        // Run to the real end of input; stopping one byte early dropped the last token.
        while ($this->position < $length) {
            $lastPosition = $this->position;
            $currentChar = $this->getCurrentChar();
            $pair = $currentChar . ($this->getNextChar() ?? '');

            if ($this->isIdentifierChar($currentChar)) {
                $output[] = $this->lexIdentifier();
            } elseif ($this->isNumberChar($currentChar)) {
                $output[] = $this->lexNumber();
            } elseif ($currentChar === '"') {
                $output[] = $this->lexString();
            } elseif ($pair === '//') {
                $output[] = $this->lexSingleLineComment();
            } elseif ($pair === '/*') {
                $output[] = $this->lexMultiLineComment();
            } elseif (in_array($pair, self::TWO_CHAR_OPERATORS, true)) {
                // Longest match first: "==" must not be read as two "=".
                $output[] = $this->lexFixedToken($pair);
            } elseif (in_array($currentChar, self::SINGLE_CHAR_TOKENS, true)) {
                $output[] = $this->lexFixedToken($currentChar);
            } else {
                $this->consumeWhitespace();
            }

            // Nothing above consumed input, so the character is not part of the language.
            if ($this->position === $lastPosition) {
                $this->reportError(sprintf(
                    "Unknown character %s at position %d,%d" . PHP_EOL,
                    $this->getCurrentChar(),
                    $this->line,
                    $this->column,
                ));
                $this->advance(1);
            }
        }

        return $output;
    }

    /**
     * Returns the byte at the current position, or "" once the end of input is reached.
     */
    public function getCurrentChar(): string
    {
        return $this->input[$this->position] ?? '';
    }

    /**
     * Returns the byte after the current one, or null when there is none.
     */
    public function getNextChar(): ?string
    {
        return $this->input[$this->position + 1] ?? null;
    }

    /**
     * Moves the cursor forward by $steps bytes on the current line.
     *
     * Does not look at the skipped bytes, so it never changes the line number.
     */
    public function advance(int $steps): void
    {
        $this->position += $steps;
        $this->column += $steps;
    }

    /**
     * Skips spaces, tabs, carriage returns and newlines, keeping line/column in sync.
     */
    public function consumeWhitespace(): void
    {
        while (in_array($this->getCurrentChar(), self::WHITESPACE, true)) {
            $this->advanceTrackingLines();
        }
    }

    /**
     * Tells whether $char may appear in an identifier (ASCII letter or underscore).
     *
     * NOTE(review): digits are not identifier characters, so "foo1" lexes as
     * identifier "foo" followed by number "1" — confirm this is intended.
     */
    public function isIdentifierChar(string $char): bool
    {
        return ('a' <= $char && $char <= 'z')
            || ('A' <= $char && $char <= 'Z')
            || $char === '_';
    }

    /**
     * Consumes a run of identifier characters starting at the current position.
     */
    public function lexIdentifier(): Token
    {
        $startColumn = $this->column;
        $startPosition = $this->position;

        while ($this->isIdentifierChar($this->getCurrentChar())) {
            $this->advance(1);
        }

        $value = substr($this->input, $startPosition, $this->position - $startPosition);

        return new Token($value, $value, $this->line, $startColumn);
    }

    /**
     * Tells whether $char is an ASCII digit.
     */
    private function isNumberChar(string $char): bool
    {
        return '0' <= $char && $char <= '9';
    }

    /**
     * Consumes a run of digits, underscores and dots.
     *
     * The run is not validated: "1..2" or "3__" are returned as a single token.
     */
    private function lexNumber(): Token
    {
        $startColumn = $this->column;
        $startPosition = $this->position;

        while (
            $this->isNumberChar($this->getCurrentChar())
            || $this->getCurrentChar() === '_'
            || $this->getCurrentChar() === '.'
        ) {
            $this->advance(1);
        }

        $value = substr($this->input, $startPosition, $this->position - $startPosition);

        return new Token($value, $value, $this->line, $startColumn);
    }

    /**
     * Consumes a double-quoted string, which may span several lines.
     *
     * There is no escape handling: the first '"' after the opening one ends
     * the string. An unterminated string is reported and ends at end of input.
     * The token literal includes the quotes, the value does not.
     */
    private function lexString(): Token
    {
        $startLine = $this->line;
        $startColumn = $this->column;
        $startPosition = $this->position;

        // skip opening "
        $this->advance(1);

        while (!$this->isAtEnd() && $this->getCurrentChar() !== '"') {
            $this->advanceTrackingLines();
        }

        $contentEnd = $this->position;

        if ($this->isAtEnd()) {
            $this->reportError(sprintf(
                "Unterminated string starting at position %d,%d" . PHP_EOL,
                $startLine,
                $startColumn,
            ));
        } else {
            // skip closing "
            $this->advance(1);
        }

        return new Token(
            substr($this->input, $startPosition, $this->position - $startPosition),
            substr($this->input, $startPosition + 1, $contentEnd - $startPosition - 1),
            $startLine,
            $startColumn,
        );
    }

    /**
     * Consumes a "//" comment up to, but not including, the next newline or end of input.
     *
     * The token value is the comment text without the slashes, trimmed.
     */
    private function lexSingleLineComment(): Token
    {
        $startColumn = $this->column;
        $startPosition = $this->position;

        // A comment on the last line has no newline after it, so stop at end of input too.
        while (!$this->isAtEnd() && $this->getCurrentChar() !== "\n") {
            $this->advance(1);
        }

        $rawValue = substr($this->input, $startPosition, $this->position - $startPosition);

        return new Token(
            $rawValue,
            trim(substr($rawValue, 2)),
            $this->line,
            $startColumn,
        );
    }

    /**
     * Consumes a block comment from its opening marker through the closing marker.
     *
     * An unterminated comment is reported and ends at end of input. The token
     * literal includes both markers, the value is the trimmed text between them.
     */
    private function lexMultiLineComment(): Token
    {
        $startLine = $this->line;
        $startColumn = $this->column;
        $startPosition = $this->position;

        // skip opening marker
        $this->advance(2);

        while (
            !$this->isAtEnd()
            && !($this->getCurrentChar() === '*' && $this->getNextChar() === '/')
        ) {
            $this->advanceTrackingLines();
        }

        $contentEnd = $this->position;

        if ($this->isAtEnd()) {
            $this->reportError(sprintf(
                "Unterminated comment starting at position %d,%d" . PHP_EOL,
                $startLine,
                $startColumn,
            ));
        } else {
            // skip closing marker
            $this->advance(2);
        }

        // Slice by cursor positions so an empty comment still yields the full literal.
        return new Token(
            substr($this->input, $startPosition, $this->position - $startPosition),
            trim(substr($this->input, $startPosition + 2, $contentEnd - $startPosition - 2)),
            $startLine,
            $startColumn,
        );
    }

    /**
     * Emits a token for a fixed lexeme located at the current position and steps over it.
     */
    private function lexFixedToken(string $literal): Token
    {
        $token = new Token($literal, $literal, $this->line, $this->column);
        $this->advance(strlen($literal));

        return $token;
    }

    /**
     * Steps over one byte; a newline bumps the line and restarts the column at 1.
     */
    private function advanceTrackingLines(): void
    {
        if ($this->getCurrentChar() === "\n") {
            $this->line += 1;
            // advance() below adds 1, so the next character lands on column 1.
            $this->column = 0;
        }

        $this->advance(1);
    }

    /**
     * Tells whether the cursor has moved past the last byte of the input.
     */
    private function isAtEnd(): bool
    {
        return $this->position >= strlen($this->input);
    }

    /**
     * Records an error message and echoes it, as the lexer has always done.
     */
    private function reportError(string $error): void
    {
        $this->errors[] = $error;

        echo $error;
    }
}

/**
 * A single lexeme together with where it started in the input.
 */
class Token
{
    /**
     * @param string $literal Exact source text of the token.
     * @param string $value   Processed text (e.g. a string without its quotes).
     * @param int    $line    1-based line on which the token starts.
     * @param int    $column  1-based column at which the token starts.
     */
    public function __construct(
        public string $literal,
        public string $value,
        public int $line,
        public int $column,
    ) {}
}