Last active
December 26, 2025 02:24
-
-
Save nitori/d4494de103c31e9d0872b20488d95dea to your computer and use it in GitHub Desktop.
simple dice expression parser using PEG like left-recursive parser (for left associativity)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from dataclasses import dataclass | |
| from ast import BinOp, Add, Sub, Mult, Div, Mod, Constant, Tuple | |
| import ast | |
| import re | |
| class Dice(Constant): | |
| pass | |
| @dataclass | |
| class Tok: | |
| name: str | |
| value: str | |
| col_offset: int | |
| end_col_offset: int | |
| @property | |
| def extra(self) -> dict: | |
| return { | |
| 'col_offset': self.col_offset, | |
| 'end_col_offset': self.end_col_offset, | |
| } | |
| def tokenize(text: str) -> list[Tok]: | |
| lexemes_raw = [ | |
| ('DICE', r''' | |
| (\d+)? | |
| (d\d+) | |
| (k[hl]\d+)? | |
| (\!\d*)? | |
| '''), | |
| ('OP', r'(\+|\-|\*|/|%)'), | |
| ('PAR_OPEN', r'\('), | |
| ('PAR_CLOSE', r'\)'), | |
| ('SEP', r','), | |
| ('NUM', r'\d+'), | |
| ('WS', r'\s+'), | |
| ('ERR', r'.'), # any other character = invalid | |
| ] | |
| lexemes = [ | |
| (name, re.compile(rf'^({pattern})', re.X | re.DOTALL)) for name, pattern in lexemes_raw | |
| ] | |
| orig_text = text | |
| pos = 0 | |
| tokens = [] | |
| while text: | |
| for name, pattern in lexemes: | |
| if (m := pattern.match(text)) is None: | |
| continue | |
| if name == 'ERR': | |
| raise SyntaxError( | |
| f'Invalid token at pos {pos} in\n' | |
| f' {orig_text}\n' | |
| f' {" " * pos}^' | |
| ) | |
| value = m[0] | |
| length = len(value) | |
| text = text[length:] | |
| col_offset = pos | |
| pos += length | |
| end_col_offset = pos | |
| if name == 'WS': | |
| # ignore whitespaces | |
| break | |
| tokens.append(Tok(name, value, col_offset, end_col_offset)) | |
| break | |
| tokens.append(Tok('ENDTOKEN', '', pos, pos)) | |
| return tokens | |
| def memoize(func): | |
| def wrapper(parser: 'Parser', *args): | |
| """ | |
| simple caching | |
| memo = (func, args, start_pos) --> (result, new_pos) | |
| """ | |
| mark = parser.mark() | |
| key = func, args, mark | |
| if key in parser.memo: | |
| res, pos = parser.memo[key] | |
| parser.reset(pos) | |
| return res | |
| res = func(parser, *args) | |
| parser.memo[key] = res, parser.mark() | |
| return res | |
| return wrapper | |
| def memoize_left_rec(func): | |
| def wrapper(parser: 'Parser', *args): | |
| """ | |
| not simple caching. | |
| - prime with failure to allow left-recursive call. | |
| - repeat calling the function until longest match found. | |
| """ | |
| start_pos = parser.mark() | |
| key = func, args, start_pos | |
| if key in parser.memo: | |
| res, pos = parser.memo[key] | |
| parser.reset(pos) | |
| return res | |
| # prime with failure case | |
| best_res, best_pos = None, start_pos | |
| parser.memo[key] = best_res, best_pos | |
| # try until we find the longest match | |
| while True: | |
| parser.reset(start_pos) | |
| new_res = func(parser, *args) | |
| new_pos = parser.mark() | |
| if new_pos <= best_pos: | |
| break | |
| parser.memo[key] = new_res, new_pos | |
| best_res, best_pos = new_res, new_pos | |
| parser.reset(best_pos) | |
| return best_res | |
| return wrapper | |
| class Parser: | |
| def __init__(self, tokens: list[Tok]): | |
| self.tokens = tokens | |
| self.pos = 0 | |
| self.memo = {} | |
| """ | |
| Helpers | |
| """ | |
| def mark(self) -> int: | |
| return self.pos | |
| def reset(self, pos): | |
| self.pos = pos | |
| def peek(self, name_or_value: str) -> Tok | None: | |
| tok = self.tokens[self.pos] | |
| if tok.name == name_or_value or tok.value == name_or_value: | |
| return tok | |
| return None | |
| def expect(self, name_or_value: str) -> Tok | None: | |
| tok = self.peek(name_or_value) | |
| if tok: | |
| self.pos += 1 | |
| return tok | |
| def parse(self): | |
| """start is entry point""" | |
| return self.start() | |
| """ | |
| Parsing nodes. | |
| """ | |
| def start(self): | |
| """ | |
| start: | |
| | dice-rolls ENDTOKEN | |
| """ | |
| start_pos = self.mark() | |
| if ( | |
| (dice_rolls := self.dice_rolls()) | |
| and (_ := self.expect('ENDTOKEN')) | |
| ): | |
| return dice_rolls | |
| self.reset(start_pos) | |
| return None | |
| @memoize | |
| def dice_rolls(self): | |
| """ | |
| dice-rolls: | |
| | dice-roll [',' dice-roll]* | |
| """ | |
| start_pos = self.mark() | |
| if roll := self.dice_roll(): | |
| col_offset = roll.col_offset | |
| end_col_offset = roll.end_col_offset | |
| result = [roll] | |
| while ( | |
| (_ := self.expect(',')) | |
| and (next_roll := self.dice_roll()) | |
| ): | |
| end_col_offset = next_roll.end_col_offset | |
| result.append(next_roll) | |
| return Tuple(result, col_offset=col_offset, end_col_offset=end_col_offset) | |
| self.reset(start_pos) | |
| return None | |
| @memoize | |
| def dice_roll(self): | |
| """ | |
| We start with the lowest precedence. | |
| dice-roll: | |
| | sum | |
| """ | |
| start_pos = self.mark() | |
| if result := self.sum(): | |
| return result | |
| self.reset(start_pos) | |
| return None | |
| @memoize_left_rec | |
| def sum(self): | |
| """ | |
| sum: | |
| | sum '+' product | |
| | sum '-' product | |
| | product | |
| """ | |
| start_pos = self.mark() | |
| if ( | |
| (a := self.sum()) | |
| and (_ := self.expect('+')) | |
| and (b := self.product()) | |
| ): | |
| return BinOp(a, Add(), b, col_offset=a.col_offset, end_col_offset=b.end_col_offset) | |
| self.reset(start_pos) | |
| if ( | |
| (a := self.sum()) | |
| and (_ := self.expect('-')) | |
| and (b := self.product()) | |
| ): | |
| return BinOp(a, Sub(), b, col_offset=a.col_offset, end_col_offset=b.end_col_offset) | |
| self.reset(start_pos) | |
| if a := self.product(): | |
| return a | |
| self.reset(start_pos) | |
| return None | |
| @memoize_left_rec | |
| def product(self): | |
| """ | |
| product: | |
| | product '*' atom | |
| | product '/' atom | |
| | product '%' atom | |
| | atom | |
| """ | |
| start_pos = self.mark() | |
| if ( | |
| (a := self.product()) | |
| and (_ := self.expect('*')) | |
| and (b := self.atom()) | |
| ): | |
| return BinOp(a, Mult(), b, col_offset=a.col_offset, end_col_offset=b.end_col_offset) | |
| self.reset(start_pos) | |
| if ( | |
| (a := self.product()) | |
| and (_ := self.expect('/')) | |
| and (b := self.atom()) | |
| ): | |
| return BinOp(a, Div(), b, col_offset=a.col_offset, end_col_offset=b.end_col_offset) | |
| self.reset(start_pos) | |
| if ( | |
| (a := self.product()) | |
| and (_ := self.expect('%')) | |
| and (b := self.atom()) | |
| ): | |
| return BinOp(a, Mod(), b, col_offset=a.col_offset, end_col_offset=b.end_col_offset) | |
| self.reset(start_pos) | |
| if a := self.atom(): | |
| return a | |
| self.reset(start_pos) | |
| return None | |
| @memoize | |
| def atom(self): | |
| """ | |
| atom: | |
| | '(' a=dice-roll ')' | |
| | dice=DICE | |
| | number=NUM | |
| """ | |
| start_pos = self.mark() | |
| if ( | |
| (_ := self.expect('(')) | |
| and (dice_roll := self.dice_roll()) | |
| and (_ := self.expect(')')) | |
| ): | |
| return dice_roll | |
| self.reset(start_pos) | |
| if dice := self.expect('DICE'): | |
| return Dice(value=dice.value, **dice.extra) | |
| self.reset(start_pos) | |
| if number := self.expect('NUM'): | |
| return Constant(value=int(number.value), **number.extra) | |
| self.reset(start_pos) | |
| return None | |
| def unparse(ast_obj): | |
| """like ast.unparse, just with extended class.""" | |
| from ast import _Unparser # noqa | |
| class _UnparserExt(_Unparser): | |
| def visit_Dice(self, node): # noqa | |
| self.write(node.value) | |
| unparser = _UnparserExt() | |
| return unparser.visit(ast_obj) | |
| def main(): | |
| sample = "3d4kh1 + d6, 5d6kl3 / (d10 + 2), 5d6kl3 / d10 + 2, 5 * (d20! / d10)" | |
| tokens = tokenize(sample) | |
| parser = Parser(tokens) | |
| tree = parser.parse() | |
| print('Unparsed:', unparse(tree)) | |
| print(ast.dump(tree, indent=4)) | |
| # output: | |
| """ | |
| Unparsed: (3d4kh1 + d6, 5d6kl3 / (d10 + 2), 5 * (d20! / d10)) | |
| Tuple( | |
| elts=[ | |
| BinOp( | |
| left=Dice(value='3d4kh1'), | |
| op=Add(), | |
| right=Dice(value='d6')), | |
| BinOp( | |
| left=Dice(value='5d6kl3'), | |
| op=Div(), | |
| right=BinOp( | |
| left=Dice(value='d10'), | |
| op=Add(), | |
| right=Constant(value=2))), | |
| BinOp( | |
| left=Constant(value=5), | |
| op=Mult(), | |
| right=BinOp( | |
| left=Dice(value='d20!'), | |
| op=Div(), | |
| right=Dice(value='d10')))], | |
| ctx=Load()) | |
| """ | |
| if __name__ == '__main__': | |
| main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment