diff options
| author | 2023-09-27 17:31:16 +0800 | |
|---|---|---|
| committer | 2023-09-27 17:31:16 +0800 | |
| commit | ba4129933cdb6d91e695b2de900b8753652ec385 (patch) | |
| tree | c520d508bf50cd22ea3123840f4aff77f148256b /src/psi/lexer.py | |
| parent | 3ad303968524f6dc57b7d5900e33963c77342552 (diff) | |
| download | TRPGNivis-ba4129933cdb6d91e695b2de900b8753652ec385.tar.gz TRPGNivis-ba4129933cdb6d91e695b2de900b8753652ec385.zip | |
feat(pyproject): 优化python包管理结构
Diffstat (limited to 'src/psi/lexer.py')
| -rw-r--r-- | src/psi/lexer.py | 153 |
1 files changed, 153 insertions, 0 deletions
r"""
Token and Lexer Documentation
=============================

This module provides the `Token` and `Lexer` classes for tokenizing input strings.

Token Class
-----------

The `Token` class represents a token with a type, value, and position in the
input string. It is a subclass of the built-in `dict` class.

Attributes:
- `type` (str): The type of the token.
- `value` (str, int or None): The value of the token (`None` for `EOF`).
- `position` (int): The position of the token in the input string.

Methods:
- `__getattr__(self, name)`: Retrieves the value of an attribute by name.
  Raises an `AttributeError` if the attribute does not exist.

Lexer Class
-----------

The `Lexer` class tokenizes an input string using a set of rules.

Attributes:
- `input` (str): The input string to tokenize.
- `position` (int): The current position in the input string.
- `tokens` (list): The list of tokens generated by the lexer.

Methods:
- `get_next_token(self)`: Retrieves the next token from the input string.
- `__iter__(self)`: Returns an iterator over the tokens.
- `__getitem__(self, index)`: Retrieves a token by index.
- `__len__(self)`: Returns the number of tokens.

Usage Example
-------------

```python
lexer = Lexer('''
@newMessage: {
    ? message == 1: reply: hi
    ! reply: no
}
''')

token = lexer.get_next_token()
while token['type'] != 'EOF':
    print(f'Type: {token["type"]}, Value: {token["value"]}, Position: {token["position"]}')
    token = lexer.get_next_token()

print("\nAll tokens:")
print([t['type'] for t in lexer])
```
"""

__all__ = ['Token', 'Lexer']


class Token(dict):
    """A lexical token stored as a dict with `type`, `value` and `position` keys.

    The three keys can also be read as attributes (`token.type`), which is
    implemented by `__getattr__` falling back to the dict contents.
    """

    def __init__(self, type, value, position):
        """Create a token.

        Args:
            type: Token category, e.g. 'IDENTIFIER', 'INTEGER', 'OPERATOR',
                'SEPARATOR', 'CONTROL' or 'EOF'.
            value: The matched text, the parsed int for 'INTEGER', or None
                for 'EOF'.
            position: Index in the input string where the token starts.
        """
        super().__init__(type=type, value=value, position=position)

    def __getattr__(self, name):
        """Return the dict entry called `name`.

        Only invoked when normal attribute lookup fails.

        Raises:
            AttributeError: If `name` is not a key of this token.
        """
        try:
            return self[name]
        except KeyError:
            # `from None` hides the internal KeyError; callers only care that
            # the attribute is missing.
            raise AttributeError(f"'Token' object has no attribute '{name}'") from None


class Lexer:
    """Hand-written scanner that turns an input string into `Token` objects.

    Tokens are produced lazily by `get_next_token`; every token returned is
    also appended to `self.tokens`, which is what iteration, indexing and
    `len()` operate on.
    """

    # Characters that start an OPERATOR token.
    _OPERATOR_STARTS = frozenset('<>=!&|@')
    # If the character after an operator start is one of these, both
    # characters form a single two-character OPERATOR (e.g. '==', '&&', '<=').
    _OPERATOR_FOLLOWERS = frozenset('=&|')
    _SEPARATORS = frozenset('{}()[];,.:')
    # NOTE: '!' and '|' are also operator starts, and the operator branch is
    # tested first, so in practice only '?' is ever emitted as CONTROL.
    _CONTROLS = frozenset('?!|')

    def __init__(self, input):
        """Prepare to tokenize `input`, starting at position 0 with no tokens."""
        self.input = input
        self.position = 0
        self.tokens = []

    def _emit(self, type_, value, position):
        """Build a token, record it in `self.tokens`, and return it."""
        token = Token(type_, value, position)
        self.tokens.append(token)
        return token

    def get_next_token(self):
        """Scan forward from `self.position` and return the next token.

        Whitespace, `#` line comments and `/* ... */` block comments are
        skipped. An unterminated block comment extends to the end of the
        input. Once the input is exhausted an 'EOF' token is returned (and
        appended to `self.tokens`) on every call.

        Returns:
            Token: The token that was recognised.

        Raises:
            Exception: If the current character does not start any known
                token. `self.position` is advanced past the offending
                character before raising.
        """
        text = self.input
        length = len(text)

        while self.position < length:
            start = self.position
            current_char = text[start]

            if current_char.isspace():
                self.position += 1
                continue

            if current_char == '#':
                # Line comment: stop *at* the newline (it is skipped as
                # whitespace on the next iteration) or at end of input.
                newline = text.find('\n', start + 1)
                self.position = length if newline == -1 else newline
                continue

            if text.startswith('/*', start):
                # Block comment: resume right after the closing '*/'. When
                # there is no terminator the whole remainder is comment, so
                # no trailing character may leak out as a token.
                close = text.find('*/', start + 2)
                self.position = length if close == -1 else close + 2
                continue

            if current_char.isalpha():
                end = start + 1
                while end < length and text[end].isalnum():
                    end += 1
                self.position = end
                return self._emit('IDENTIFIER', text[start:end], start)

            # isdecimal(), not isdigit(): isdigit() also accepts characters
            # such as superscript digits that int() cannot convert.
            if current_char.isdecimal():
                end = start + 1
                while end < length and text[end].isdecimal():
                    end += 1
                self.position = end
                return self._emit('INTEGER', int(text[start:end]), start)

            if current_char in self._OPERATOR_STARTS:
                # Slicing yields '' at end of input, which is never a follower.
                follower = text[start + 1:start + 2]
                if follower in self._OPERATOR_FOLLOWERS:
                    value = current_char + follower
                else:
                    value = current_char
                self.position = start + len(value)
                return self._emit('OPERATOR', value, start)

            if current_char in self._SEPARATORS:
                self.position = start + 1
                return self._emit('SEPARATOR', current_char, start)

            if current_char in self._CONTROLS:
                self.position = start + 1
                return self._emit('CONTROL', current_char, start)

            # Step over the bad character so a caller that catches the error
            # can keep scanning from the next one.
            self.position += 1
            raise Exception(f'Unknown character: {current_char}')

        return self._emit('EOF', None, self.position)

    def __iter__(self):
        """Iterate over the tokens produced so far (does not scan further)."""
        return iter(self.tokens)

    def __getitem__(self, index):
        """Return the already-produced token(s) at `index`."""
        return self.tokens[index]

    def __len__(self):
        """Return how many tokens have been produced so far."""
        return len(self.tokens)
\ No newline at end of file |
