import re
def lexical_analyzer(source):
    """Lazily split *source* into ``(kind, text)`` token tuples.

    Recognised kinds are ``'NUMBER'`` (digits with an optional decimal
    part), ``'ID'`` (identifier) and ``'OP'`` (one of ``+ - * / =``).
    Runs of spaces and tabs are consumed without producing a token.

    This is a generator: nothing is scanned until it is iterated, and
    errors surface only when iteration reaches the offending character.

    Raises:
        RuntimeError: when a character matches none of the rules above.
            Characters that ``.`` cannot match (newlines, since the
            pattern is compiled without ``re.DOTALL``) are not reported;
            ``re.finditer`` simply passes over them.
    """
    # Order matters: alternatives are tried left to right, so the
    # catch-all MISMATCH rule has to stay last.
    rules = (
        ('NUMBER', r'\d+(\.\d*)?'),
        ('ID', r'[A-Za-z_][A-Za-z0-9_]*'),
        ('OP', r'[+\-*/=]'),
        ('SKIP', r'[ \t]+'),
        ('MISMATCH', r'.'),
    )
    # One alternation of named groups; the name of the group that matched
    # tells us which rule fired.
    pattern = '|'.join(f'(?P<{name}>{regex})' for name, regex in rules)
    emitted_kinds = ('NUMBER', 'ID', 'OP')

    for match in re.finditer(pattern, source):
        label = match.lastgroup
        text = match.group()
        if label == 'SKIP':
            continue
        if label not in emitted_kinds:
            raise RuntimeError(f'Unexpected character: {text}')
        yield (label, text)
# Demo: tokenize a sample assignment and print one token tuple per line.
# NOTE: this runs at import time as well as when executed as a script.
source_code = 'var1 = 23 + 42'
for token in lexical_analyzer(source_code):
    # Each token is a (kind, text) tuple produced lazily by the generator.
    print(token)