summary | refs | log | tree | commit | diff
path: root/src/Lexer
diff options
context:
space:
mode:
author: Daniel Weipert <git@mail.dweipert.de> 2024-12-31 15:17:23 +0100
committer: Daniel Weipert <git@mail.dweipert.de> 2024-12-31 15:17:23 +0100
commit: 02864c8c29aee80d59cbd7251046f77a3e8e4093 (patch)
tree: 514f17bfdfa14538f50031e3ed0cb71e39e8f454 /src/Lexer
lexer
Diffstat (limited to 'src/Lexer')
-rw-r--r-- src/Lexer/Lexer.php 432
1 files changed, 432 insertions, 0 deletions
diff --git a/src/Lexer/Lexer.php b/src/Lexer/Lexer.php
new file mode 100644
index 0000000..747b0c9
--- /dev/null
+++ b/src/Lexer/Lexer.php
@@ -0,0 +1,432 @@
+<?php
+
+namespace Mnml\Lexer;
+
+/**
+ * Hand-written lexer that turns a source string into a flat list of tokens.
+ *
+ * The input is processed byte by byte. Every emitted Token carries the line
+ * and column (both starting at 1) at which it began. Comments are emitted as
+ * tokens too; whitespace is skipped. Problems are collected in $errors and
+ * additionally echoed to the output.
+ */
+class Lexer
+{
+    /** Characters that always form a complete token on their own. */
+    private const SINGLE_CHAR_TOKENS = [
+        ':', ',',
+        '+', '-',
+        '(', ')',
+        '[', ']',
+        '{', '}',
+        '$',
+    ];
+
+    /**
+     * Operators that may be extended by one following character.
+     *
+     * Maps the first character to the second characters that turn it into a
+     * two-character token: "==", "=>", "**", "<=" and ">=".
+     */
+    private const COMPOUND_OPERATORS = [
+        '=' => ['=', '>'],
+        '*' => ['*'],
+        '<' => ['='],
+        '>' => ['='],
+    ];
+
+    /** Complete source text being tokenized. */
+    public string $input;
+
+    /** Byte offset of the character that is read next. */
+    public int $position;
+
+    /** Line of the current position (starts at 1). */
+    public int $line;
+
+    /** Column of the current position (starts at 1). */
+    public int $column;
+
+    /** @var list<string> One message per problem found while lexing. */
+    public array $errors;
+
+    /**
+     * @param string $input Source text to tokenize.
+     */
+    public function __construct(string $input)
+    {
+        $this->input = $input;
+        $this->position = 0;
+        $this->line = 1;
+        $this->column = 1;
+        // A typed property must be assigned before it can be read, so start
+        // with an empty list instead of leaving it uninitialised.
+        $this->errors = [];
+    }
+
+    /**
+     * Tokenizes the input from the current position up to its very end.
+     *
+     * @return list<Token> Tokens in source order, comments included.
+     */
+    public function lex(): array
+    {
+        $output = [];
+
+        // Run until the position is past the final byte; stopping one byte
+        // early would silently drop the last character of the input.
+        while (!$this->isAtEnd()) {
+            $lastPosition = $this->position;
+            $currentChar = $this->getCurrentChar();
+
+            if ($this->isIdentifierChar($currentChar)) {
+                $output[] = $this->lexIdentifier();
+            } elseif ($this->isNumberChar($currentChar)) {
+                $output[] = $this->lexNumber();
+            } elseif ($currentChar === '"') {
+                $output[] = $this->lexString();
+            } elseif ($currentChar === '/') {
+                // "//" and "/*" start comments, a lone "/" is the divide operator
+                $output[] = match ($this->getNextChar()) {
+                    '/' => $this->lexSingleLineComment(),
+                    '*' => $this->lexMultiLineComment(),
+                    default => $this->lexOperator(),
+                };
+            } elseif (
+                array_key_exists($currentChar, self::COMPOUND_OPERATORS)
+                || in_array($currentChar, self::SINGLE_CHAR_TOKENS, true)
+            ) {
+                $output[] = $this->lexOperator();
+            } else {
+                $this->consumeWhitespace();
+            }
+
+            // Nothing was consumed, so the character is neither a token
+            // start nor whitespace: report it and skip it.
+            if ($this->position === $lastPosition) {
+                $this->reportError(sprintf(
+                    "Unknown character %s at position %d,%d" . PHP_EOL,
+                    $this->getCurrentChar(),
+                    $this->line,
+                    $this->column,
+                ));
+                $this->advance(1);
+            }
+        }
+
+        return $output;
+    }
+
+    /**
+     * Returns the character at the current position.
+     *
+     * @return string A single byte, or an empty string at the end of input.
+     */
+    public function getCurrentChar(): string
+    {
+        return $this->input[$this->position] ?? '';
+    }
+
+    /**
+     * Returns the character after the current one without consuming anything.
+     *
+     * @return string|null A single byte, or null when there is none.
+     */
+    public function getNextChar(): ?string
+    {
+        return $this->input[$this->position + 1] ?? null;
+    }
+
+    /**
+     * Moves the position and the column forward by the given number of bytes.
+     */
+    public function advance(int $steps): void
+    {
+        $this->position += $steps;
+        $this->column += $steps;
+    }
+
+    /**
+     * Skips spaces, tabs, carriage returns and line feeds.
+     *
+     * Each line feed increases the line counter and restarts the column.
+     */
+    public function consumeWhitespace(): void
+    {
+        while (true) {
+            $char = $this->getCurrentChar();
+
+            // Compare against "\n" rather than PHP_EOL: PHP_EOL is "\r\n" on
+            // some platforms and could then never equal a single byte.
+            if ($char === "\n") {
+                $this->line += 1;
+                // advance() below moves the column to 1 for the new line
+                $this->column = 0;
+            } elseif ($char !== ' ' && $char !== "\t" && $char !== "\r") {
+                return;
+            }
+
+            $this->advance(1);
+        }
+    }
+
+    /**
+     * Tells whether the character may be part of an identifier (a-z, A-Z, _).
+     */
+    public function isIdentifierChar(string $char): bool
+    {
+        return ('a' <= $char && $char <= 'z')
+            || ('A' <= $char && $char <= 'Z')
+            || $char === '_';
+    }
+
+    /**
+     * Consumes a run of identifier characters starting at the current position.
+     *
+     * @return Token Token whose literal and value are the identifier text.
+     */
+    public function lexIdentifier(): Token
+    {
+        $startColumn = $this->column;
+        $startPosition = $this->position;
+
+        while ($this->isIdentifierChar($this->getCurrentChar())) {
+            $this->advance(1);
+        }
+
+        $value = substr($this->input, $startPosition, $this->position - $startPosition);
+
+        return new Token(
+            $value,
+            $value,
+            $this->line,
+            $startColumn,
+        );
+    }
+
+    /**
+     * Tells whether the character is a decimal digit.
+     */
+    private function isNumberChar(string $char): bool
+    {
+        return '0' <= $char && $char <= '9';
+    }
+
+    /**
+     * Consumes a number: a run of digits, underscores and dots.
+     *
+     * The text is taken over as-is; it is not validated or converted here.
+     */
+    private function lexNumber(): Token
+    {
+        $startColumn = $this->column;
+        $startPosition = $this->position;
+
+        while (true) {
+            $char = $this->getCurrentChar();
+            if (!$this->isNumberChar($char) && $char !== '_' && $char !== '.') {
+                break;
+            }
+            $this->advance(1);
+        }
+
+        $value = substr($this->input, $startPosition, $this->position - $startPosition);
+
+        return new Token(
+            $value,
+            $value,
+            $this->line,
+            $startColumn,
+        );
+    }
+
+    /**
+     * Consumes a double-quoted string, which may span several lines.
+     *
+     * The literal includes the quotes, the value does not. When the closing
+     * quote is missing an error is reported and the token covers the rest
+     * of the input.
+     */
+    private function lexString(): Token
+    {
+        $startLine = $this->line;
+        $startColumn = $this->column;
+        $startPosition = $this->position;
+
+        // skip opening "
+        $this->advance(1);
+
+        // The end-of-input check keeps an unterminated string from looping forever
+        while (!$this->isAtEnd() && $this->getCurrentChar() !== '"') {
+            if ($this->getCurrentChar() === "\n") {
+                $this->line += 1;
+                $this->column = 0;
+            }
+            $this->advance(1);
+        }
+
+        $contentEnd = $this->position;
+
+        if ($this->isAtEnd()) {
+            $this->reportError(sprintf(
+                "Unterminated string starting at position %d,%d" . PHP_EOL,
+                $startLine,
+                $startColumn,
+            ));
+        } else {
+            // skip closing "
+            $this->advance(1);
+        }
+
+        return new Token(
+            substr($this->input, $startPosition, $this->position - $startPosition),
+            substr($this->input, $startPosition + 1, $contentEnd - $startPosition - 1),
+            $startLine,
+            $startColumn,
+        );
+    }
+
+    /**
+     * Consumes a "//" comment up to, but not including, the line feed.
+     *
+     * A comment on the last line of the input ends at the end of input.
+     * The value is the comment text without the slashes, trimmed.
+     */
+    private function lexSingleLineComment(): Token
+    {
+        $startColumn = $this->column;
+        $startPosition = $this->position;
+
+        while (!$this->isAtEnd() && $this->getCurrentChar() !== "\n") {
+            $this->advance(1);
+        }
+
+        $rawValue = substr($this->input, $startPosition, $this->position - $startPosition);
+
+        return new Token(
+            $rawValue,
+            trim(substr($rawValue, 2)),
+            $this->line,
+            $startColumn,
+        );
+    }
+
+    /**
+     * Consumes a block comment from its opening to its closing delimiter.
+     *
+     * The literal includes both delimiters, the value is the trimmed text
+     * between them. When the closing delimiter is missing an error is
+     * reported and the token covers the rest of the input.
+     */
+    private function lexMultiLineComment(): Token
+    {
+        $startLine = $this->line;
+        $startColumn = $this->column;
+        $startPosition = $this->position;
+
+        // skip opening delimiter
+        $this->advance(2);
+
+        while (
+            !$this->isAtEnd()
+            && !($this->getCurrentChar() === '*' && $this->getNextChar() === '/')
+        ) {
+            if ($this->getCurrentChar() === "\n") {
+                $this->line += 1;
+                $this->column = 0;
+            }
+            $this->advance(1);
+        }
+
+        $contentEnd = $this->position;
+
+        if ($this->isAtEnd()) {
+            $this->reportError(sprintf(
+                "Unterminated comment starting at position %d,%d" . PHP_EOL,
+                $startLine,
+                $startColumn,
+            ));
+        } else {
+            // skip closing delimiter
+            $this->advance(2);
+        }
+
+        return new Token(
+            substr($this->input, $startPosition, $this->position - $startPosition),
+            trim(substr($this->input, $startPosition + 2, $contentEnd - $startPosition - 2)),
+            $startLine,
+            $startColumn,
+        );
+    }
+
+    /**
+     * Consumes a one- or two-character operator / punctuation token.
+     *
+     * The current character is extended by the next one when the pair is
+     * listed in COMPOUND_OPERATORS; otherwise it forms the token on its own.
+     */
+    private function lexOperator(): Token
+    {
+        $startColumn = $this->column;
+        $literal = $this->getCurrentChar();
+        $nextChar = $this->getNextChar();
+
+        if (
+            $nextChar !== null
+            && in_array($nextChar, self::COMPOUND_OPERATORS[$literal] ?? [], true)
+        ) {
+            $literal .= $nextChar;
+        }
+
+        $this->advance(strlen($literal));
+
+        return new Token(
+            $literal,
+            $literal,
+            $this->line,
+            $startColumn,
+        );
+    }
+
+    /**
+     * Tells whether every byte of the input has been consumed.
+     */
+    private function isAtEnd(): bool
+    {
+        return $this->position >= strlen($this->input);
+    }
+
+    /**
+     * Records a problem in $errors and echoes it.
+     */
+    private function reportError(string $message): void
+    {
+        $this->errors[] = $message;
+
+        echo $message;
+    }
+}
+
+/**
+ * A single lexical unit together with the place it was found at.
+ *
+ * All four fields are declared as strings; the line and column are therefore
+ * held as strings as well.
+ */
+class Token
+{
+    /** Text of the token as given by its creator. */
+    public string $literal;
+
+    /** Value associated with the literal. */
+    public string $value;
+
+    /** Line the token is associated with. */
+    public string $line;
+
+    /** Column the token is associated with. */
+    public string $column;
+
+    /**
+     * @param string $literal Text of the token.
+     * @param string $value   Value associated with the literal.
+     * @param string $line    Line the token is associated with.
+     * @param string $column  Column the token is associated with.
+     */
+    public function __construct(string $literal, string $value, string $line, string $column)
+    {
+        $this->literal = $literal;
+        $this->value = $value;
+        $this->line = $line;
+        $this->column = $column;
+    }
+}