1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
"""
Token and Lexer Documentation
=============================
This module provides the `Token` and `Lexer` classes for tokenizing input strings.
Token Class
-----------
The `Token` class represents a token with a type, value, and position in the input string. It is a subclass of the built-in `dict` class.
Attributes:
- `type` (str): The type of the token.
- `value` (str or int): The value of the token.
- `position` (int): The position of the token in the input string.
Methods:
- `__getattr__(self, name)`: Retrieves the value of an attribute by name. Raises an `AttributeError` if the attribute does not exist.
Lexer Class
-----------
The `Lexer` class tokenizes an input string using a set of rules.
Attributes:
- `input` (str): The input string to tokenize.
- `position` (int): The current position in the input string.
- `tokens` (list): The list of tokens generated by the lexer.
Methods:
- `get_next_token(self)`: Retrieves the next token from the input string.
- `__iter__(self)`: Returns an iterator over the tokens.
- `__getitem__(self, index)`: Retrieves a token by index.
- `__len__(self)`: Returns the number of tokens.
Usage Example
-------------
```python
lexer = Lexer('''
@newMessage: {
? message == 1: reply: hi
! reply: no
}
''')
token = lexer.get_next_token()
while token['type'] != 'EOF':
print(f'Type: {token["type"]}, Value: {token["value"]}, Position: {token["position"]}')
token = lexer.get_next_token()
print("\nAll tokens:")
print([t['type'] for t in lexer])
"""
# Public API of this module: only Token and Lexer are exported via `import *`.
__all__ = ['Token', 'Lexer']
class Token(dict):
    """A lexical token stored as a dict with attribute-style read access.

    Entries (readable both as ``tok['type']`` and ``tok.type``):
        type (str): token category, e.g. 'IDENTIFIER', 'OPERATOR', 'EOF'.
        value: the matched text (an ``int`` for 'INTEGER', ``None`` for 'EOF').
        position (int): index of the token's first character in the input.
    """

    def __init__(self, type, value, position):
        """Store the three entries in the underlying dict."""
        super().__init__(type=type, value=value, position=position)

    def __getattr__(self, name):
        """Mirror dict keys as attributes; unknown names raise AttributeError."""
        if name in self:
            return self[name]
        raise AttributeError(f"'Token' object has no attribute '{name}'")
class Lexer:
    """Tokenizer for the rule DSL described in the module docstring.

    Attributes:
        input (str): the source text being tokenized.
        position (int): index of the next unread character.
        tokens (list): every Token produced so far, in order of emission.
    """

    # Recognized two-character operators.  Restricting combination to this
    # explicit set (instead of pairing any operator-start character with
    # '=', '&' or '|') fixes two defects in the previous implementation:
    #   * a bare '!' or '|' always matched the OPERATOR branch first, so the
    #     CONTROL branch for them was unreachable and the documented
    #     '! reply: no' syntax never produced a CONTROL token;
    #   * nonsense pairs such as '=&', '<|' or '@=' were emitted as operators.
    _TWO_CHAR_OPERATORS = {'==', '!=', '<=', '>=', '&&', '||'}
    _OPERATOR_CHARS = {'<', '>', '=', '&', '@'}
    _SEPARATOR_CHARS = {'{', '}', '(', ')', '[', ']', ';', ',', '.', ':'}
    _CONTROL_CHARS = {'?', '!', '|'}

    def __init__(self, input):
        """Prepare to tokenize *input* from position 0."""
        self.input = input
        self.position = 0
        self.tokens = []

    def get_next_token(self):
        """Return the next Token, appending it to ``self.tokens``.

        Skips whitespace, '#' line comments and '/* ... */' block comments.
        Returns an EOF token (value ``None``) once the input is exhausted.

        Raises:
            Exception: on a character no rule matches.  The position is
                advanced past the offending character first, so a caller
                that catches the error can resume lexing.
        """
        src = self.input
        n = len(src)
        while self.position < n:
            ch = src[self.position]
            if ch.isspace():
                self.position += 1
                continue
            if ch == '#':
                # Line comment: skip to (but not past) the newline.
                while self.position < n and src[self.position] != '\n':
                    self.position += 1
                continue
            if ch == '/' and src.startswith('/*', self.position):
                # Block comment: skip just past the closing '*/'.  An
                # unterminated comment now swallows the rest of the input
                # (previously the final character was left to be re-lexed).
                end = src.find('*/', self.position + 2)
                self.position = n if end == -1 else end + 2
                continue
            if ch.isalpha():
                # Identifier: a letter followed by letters/digits.
                start = self.position
                while self.position < n and src[self.position].isalnum():
                    self.position += 1
                return self._emit(
                    Token('IDENTIFIER', src[start:self.position], start))
            if ch.isdigit():
                # Integer literal: maximal run of digits, converted to int.
                start = self.position
                while self.position < n and src[self.position].isdigit():
                    self.position += 1
                return self._emit(
                    Token('INTEGER', int(src[start:self.position]), start))
            # Two-char operators take priority over single-char tokens so
            # that '!=' and '||' are not split into CONTROL + OPERATOR.
            pair = src[self.position:self.position + 2]
            if pair in self._TWO_CHAR_OPERATORS:
                token = Token('OPERATOR', pair, self.position)
                self.position += 2
                return self._emit(token)
            if ch in self._OPERATOR_CHARS:
                token = Token('OPERATOR', ch, self.position)
                self.position += 1
                return self._emit(token)
            if ch in self._SEPARATOR_CHARS:
                token = Token('SEPARATOR', ch, self.position)
                self.position += 1
                return self._emit(token)
            if ch in self._CONTROL_CHARS:
                token = Token('CONTROL', ch, self.position)
                self.position += 1
                return self._emit(token)
            # Unknown character: advance past it before raising so the
            # caller can catch the error and continue with the next char.
            self.position += 1
            raise Exception(f'Unknown character: {ch}')
        return self._emit(Token('EOF', None, self.position))

    def _emit(self, token):
        """Record *token* in ``self.tokens`` and return it."""
        self.tokens.append(token)
        return token

    def __iter__(self):
        """Iterate over the tokens emitted so far."""
        return iter(self.tokens)

    def __getitem__(self, index):
        """Return the *index*-th emitted token."""
        return self.tokens[index]

    def __len__(self):
        """Return the number of tokens emitted so far."""
        return len(self.tokens)