| author | 2024-02-28 11:37:54 +0800 |
|---|---|
| committer | 2024-02-28 11:37:54 +0800 |
| commit | 672e0d03d3ffdba740d166f2c4a7243f142cec5f (patch) |
| tree | 46cdb4ea015877e7828710a47c945aa2e834beb0 /nivis-python/lexer.py |
| parent | fc6fc64ca3ff7df62dd270fc021f674e56b2dedf (diff) |
| download | TRPGNivis-672e0d03d3ffdba740d166f2c4a7243f142cec5f.tar.gz TRPGNivis-672e0d03d3ffdba740d166f2c4a7243f142cec5f.zip |
refactor!: built in rust
Diffstat (limited to 'nivis-python/lexer.py')
| -rw-r--r-- | nivis-python/lexer.py | 276 |
|---|---|---|

1 file changed, 276 insertions, 0 deletions
```
diff --git a/nivis-python/lexer.py b/nivis-python/lexer.py
new file mode 100644
index 0000000..7ba94e3
--- /dev/null
+++ b/nivis-python/lexer.py
@@ -0,0 +1,276 @@
```

````python
"""
Token and Lexer Documentation
=============================

This module provides the `Token` and `Lexer` classes for tokenizing input strings.

Token Class
-----------

The `Token` class represents a token with a type, value, and position in the
input string. It is a subclass of the built-in `dict` class.

Attributes:
- `type` (str): The type of the token.
- `value` (str or int): The value of the token.
- `position` (int): The position of the token in the input string.

Methods:
- `__getattr__(self, name)`: Retrieves the value of an attribute by name.
  Raises an `AttributeError` if the attribute does not exist.

Lexer Class
-----------

The `Lexer` class tokenizes an input string using a set of rules.

Attributes:
- `input` (str): The input string to tokenize.
- `position` (int): The current position in the input string.
- `tokens` (list): The list of tokens generated by the lexer.

Methods:
- `get_next_token(self)`: Retrieves the next token from the input string.
- `__iter__(self)`: Returns an iterator over the tokens.
- `__getitem__(self, index)`: Retrieves a token by index.
- `__len__(self)`: Returns the number of tokens.

Usage Example
-------------

```python
lexer = Lexer('''
@newMessage: {
    ? message == 1: reply: hi
    ! reply: no
}
''')

token = lexer.get_next_token()
while token['type'] != 'EOF':
    print(f'Type: {token["type"]}, Value: {token["value"]}, Position: {token["position"]}')
    token = lexer.get_next_token()

print("\nAll tokens:")
print([t['type'] for t in lexer])
```
"""
from .exception import ValueError  # project-local ValueError; shadows the builtin

__all__ = ["Token", "Lexer"]


class Token(dict):
    """
    A class representing a token in the lexer.

    Args:
        type: The type of the token.
        value: The value of the token.
        position: The position of the token.

    Example:
        ```python
        token = Token("identifier", "x", (1, 5))
        ```
    """

    def __init__(self, type, value, position):
        """
        Initializes a Token object.

        Args:
            type: The type of the token.
            value: The value of the token.
            position: The position of the token.
        """
        super().__init__(type=type, value=value, position=position)

    def __getattr__(self, name):
        """
        Retrieves the value of an attribute from the Token object.

        Args:
            name: The name of the attribute.

        Returns:
            The value of the attribute.

        Raises:
            AttributeError: Raised when the attribute does not exist.
        """
        try:
            return self[name]
        except KeyError as e:
            raise AttributeError(f"'Token' object has no attribute '{name}'") from e


class Lexer:
    """
    A class representing a lexer for Psi code.

    Args:
        input: The input code to be lexed.

    Example:
        ```python
        lexer = Lexer("x = 10")
        for token in lexer:
            print(token)
        ```
    """

    def __init__(self, input):
        """
        Initializes a Lexer object.

        Args:
            input: The input code to be lexed.
        """
        self.input = input
        self.position = 0
        self.tokens = []

    def get_next_token(self):
        """
        Retrieves the next token from the input code.

        Returns:
            The next token.

        Raises:
            ValueError: Raised when an unknown character is encountered.
        """
        while self.position < len(self.input):
            current_char = self.input[self.position]

            # Skip whitespace.
            if current_char.isspace():
                self.position += 1
                continue

            # Skip "#" line comments up to (but not including) the newline.
            if current_char == "#":
                self.position += 1
                while (
                    self.position < len(self.input)
                    and self.input[self.position] != "\n"
                ):
                    self.position += 1
                continue

            # Skip "/* ... */" block comments.
            if (
                current_char == "/"
                and self.position + 1 < len(self.input)
                and self.input[self.position + 1] == "*"
            ):
                self.position += 2
                while self.position < len(self.input) - 1 and (
                    self.input[self.position] != "*"
                    or self.input[self.position + 1] != "/"
                ):
                    self.position += 1
                if self.position < len(self.input) - 1:
                    self.position += 2
                continue

            # Identifiers: a letter followed by a run of alphanumerics.
            if current_char.isalpha():
                start_position = self.position
                while (
                    self.position < len(self.input)
                    and self.input[self.position].isalnum()
                ):
                    self.position += 1
                token = Token(
                    "IDENTIFIER",
                    self.input[start_position : self.position],
                    start_position,
                )
                self.tokens.append(token)
                return token

            # Integer literals.
            if current_char.isdigit():
                start_position = self.position
                while (
                    self.position < len(self.input)
                    and self.input[self.position].isdigit()
                ):
                    self.position += 1
                token = Token(
                    "INTEGER",
                    int(self.input[start_position : self.position]),
                    start_position,
                )
                self.tokens.append(token)
                return token

            # Operators, with a one-character lookahead for two-character
            # forms (e.g. "==", "&&", "||"). Note this branch runs before the
            # CONTROL branch below, so "!" and "|" always lex as OPERATOR.
            if current_char in {"<", ">", "=", "!", "&", "|", "@"}:
                if self.position + 1 < len(self.input) and self.input[
                    self.position + 1
                ] in {"=", "&", "|"}:
                    token = Token(
                        "OPERATOR",
                        current_char + self.input[self.position + 1],
                        self.position,
                    )
                    self.position += 2
                else:
                    token = Token("OPERATOR", current_char, self.position)
                    self.position += 1
                self.tokens.append(token)
                return token

            if current_char in {"{", "}", "(", ")", "[", "]", ";", ",", ".", ":"}:
                return self._single_char_token("SEPARATOR", current_char)
            if current_char in {"?", "!", "|"}:
                return self._single_char_token("CONTROL", current_char)

            # Unknown character: step past it so lexing can resume, then raise.
            self.position += 1
            raise ValueError(f"Unknown character: {current_char}")

        token = Token("EOF", None, self.position)
        self.tokens.append(token)
        return token

    def _single_char_token(self, type, value):
        """Emits and records a single-character token of the given type."""
        token = Token(type, value, self.position)
        self.position += 1
        self.tokens.append(token)
        return token

    def __iter__(self):
        """
        Returns an iterator over the tokens.

        Returns:
            An iterator over the tokens.
        """
        return iter(self.tokens)

    def __getitem__(self, index):
        """
        Retrieves the token at the specified index.

        Args:
            index: The index of the token.

        Returns:
            The token at the specified index.
        """
        return self.tokens[index]

    def __len__(self):
        """
        Returns the number of tokens.

        Returns:
            The number of tokens.
        """
        return len(self.tokens)
````
