aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/psi/lexer.py
diff options
context:
space:
mode:
Diffstat (limited to 'psi/lexer.py')
-rw-r--r--psi/lexer.py249
1 files changed, 249 insertions, 0 deletions
diff --git a/psi/lexer.py b/psi/lexer.py
new file mode 100644
index 0000000..d2c6f68
--- /dev/null
+++ b/psi/lexer.py
@@ -0,0 +1,249 @@
+"""
+Token and Lexer Documentation
+=============================
+
+This module provides the `Token` and `Lexer` classes for tokenizing input strings.
+
+Token Class
+-----------
+
+The `Token` class represents a token with a type, value, and position in the input string. It is a subclass of the built-in `dict` class.
+
+Attributes:
+- `type` (str): The type of the token.
+- `value` (str or int): The value of the token.
+- `position` (int): The position of the token in the input string.
+
+Methods:
+- `__getattr__(self, name)`: Retrieves the value of an attribute by name. Raises an `AttributeError` if the attribute does not exist.
+
+Lexer Class
+-----------
+
+The `Lexer` class tokenizes an input string using a set of rules.
+
+Attributes:
+- `input` (str): The input string to tokenize.
+- `position` (int): The current position in the input string.
+- `tokens` (list): The list of tokens generated by the lexer.
+
+Methods:
+- `get_next_token(self)`: Retrieves the next token from the input string.
+- `__iter__(self)`: Returns an iterator over the tokens.
+- `__getitem__(self, index)`: Retrieves a token by index.
+- `__len__(self)`: Returns the number of tokens.
+
+Usage Example
+-------------
+
+```python
+lexer = Lexer('''
+@newMessage: {
+ ? message == 1: reply: hi
+ ! reply: no
+}
+''')
+
+token = lexer.get_next_token()
+while token['type'] != 'EOF':
+ print(f'Type: {token["type"]}, Value: {token["value"]}, Position: {token["position"]}')
+ token = lexer.get_next_token()
+
+print("\nAll tokens:")
+print([t['type'] for t in lexer])
+"""
+
+__all__ = ['Token', 'Lexer']
+
+class Token(dict):
+ """
+ A class representing a token in the lexer.
+
+ Args:
+ type: The type of the token.
+ value: The value of the token.
+ position: The position of the token.
+
+ Returns:
+ None
+
+ Example:
+ ```python
+ token = Token("identifier", "x", (1, 5))
+ ```
+ """
+
+ def __init__(self, type, value, position):
+ """
+ Initializes a Token object.
+
+ Args:
+ type: The type of the token.
+ value: The value of the token.
+ position: The position of the token.
+
+ Returns:
+ None
+ """
+ super().__init__(type=type, value=value, position=position)
+
+ def __getattr__(self, name):
+ """
+ Retrieves the value of an attribute from the Token object.
+
+ Args:
+ name: The name of the attribute.
+
+ Returns:
+ The value of the attribute.
+
+ Raises:
+ AttributeError: Raised when the attribute does not exist.
+ """
+ try:
+ return self[name]
+ except KeyError:
+ raise AttributeError(f"'Token' object has no attribute '{name}'")
+
+
+class Lexer:
+ """
+ A class representing a lexer for Psi code.
+
+ Args:
+ input: The input code to be lexed.
+
+ Returns:
+ None
+
+ Example:
+ ```python
+ lexer = Lexer("x = 10")
+ for token in lexer:
+ print(token)
+ ```
+ """
+ def __init__(self, input):
+ """
+ Initializes a Lexer object.
+
+ Args:
+ input: The input code to be lexed.
+
+ Returns:
+ None
+ """
+ self.input = input
+ self.position = 0
+ self.tokens = []
+
+ def get_next_token(self):
+ """
+ Retrieves the next token from the input code.
+
+ Returns:
+ The next token.
+
+ Raises:
+ Exception: Raised when an unknown character is encountered.
+ """
+ while self.position < len(self.input):
+ current_char = self.input[self.position]
+
+ if current_char.isspace():
+ self.position += 1
+ continue
+
+ if current_char == '#':
+ self.position += 1
+ while (self.position < len(self.input) and
+ self.input[self.position] != '\n'):
+ self.position += 1
+ continue
+
+ if current_char == '/' and self.position + 1 < len(self.input) and self.input[self.position + 1] == '*':
+ self.position += 2
+ while (self.position < len(self.input) - 1 and
+ (self.input[self.position] != '*' or self.input[self.position + 1] != '/')):
+ self.position += 1
+ if self.position < len(self.input) - 1:
+ self.position += 2
+ continue
+
+ if current_char.isalpha():
+ start_position = self.position
+ while (self.position < len(self.input) and
+ self.input[self.position].isalnum()):
+ self.position += 1
+ token = Token('IDENTIFIER', self.input[start_position:self.position], start_position)
+ self.tokens.append(token)
+ return token
+
+ if current_char.isdigit():
+ start_position = self.position
+ while (self.position < len(self.input) and
+ self.input[self.position].isdigit()):
+ self.position += 1
+ token = Token('INTEGER', int(self.input[start_position:self.position]), start_position)
+ self.tokens.append(token)
+ return token
+
+ if current_char in {'<', '>', '=', '!', '&', '|', '@'}:
+ if (self.position + 1 < len(self.input) and
+ self.input[self.position + 1] in {'=', '&', '|'}):
+ token = Token('OPERATOR', current_char + self.input[self.position + 1], self.position)
+ self.position += 2
+ else:
+ token = Token('OPERATOR', current_char, self.position)
+ self.position += 1
+ self.tokens.append(token)
+ return token
+
+ if current_char in {'{', '}', '(', ')', '[', ']', ';', ',', '.', ':'}:
+ token = Token('SEPARATOR', current_char, self.position)
+ self.position += 1
+ self.tokens.append(token)
+ return token
+
+ if current_char in {'?', '!', '|'}:
+ token = Token('CONTROL', current_char, self.position)
+ self.position += 1
+ self.tokens.append(token)
+ return token
+
+ self.position += 1
+ raise Exception(f'Unknown character: {current_char}')
+
+ token = Token('EOF', None, self.position)
+ self.tokens.append(token)
+ return token
+
+ def __iter__(self):
+ """
+ Returns an iterator over the tokens.
+
+ Returns:
+ An iterator over the tokens.
+ """
+ return iter(self.tokens)
+
+ def __getitem__(self, index):
+ """
+ Retrieves the token at the specified index.
+
+ Args:
+ index: The index of the token.
+
+ Returns:
+ The token at the specified index.
+ """
+ return self.tokens[index]
+
+ def __len__(self):
+ """
+ Returns the number of tokens.
+
+ Returns:
+ The number of tokens.
+ """
+ return len(self.tokens) \ No newline at end of file