Skip to content

Instantly share code, notes, and snippets.

@nitori
Last active December 26, 2025 02:24
Show Gist options
  • Select an option

  • Save nitori/d4494de103c31e9d0872b20488d95dea to your computer and use it in GitHub Desktop.

Select an option

Save nitori/d4494de103c31e9d0872b20488d95dea to your computer and use it in GitHub Desktop.
simple dice expression parser using PEG like left-recursive parser (for left associativity)
from dataclasses import dataclass
from ast import BinOp, Add, Sub, Mult, Div, Mod, Constant, Tuple
import ast
import re
class Dice(Constant):
pass
@dataclass
class Tok:
name: str
value: str
col_offset: int
end_col_offset: int
@property
def extra(self) -> dict:
return {
'col_offset': self.col_offset,
'end_col_offset': self.end_col_offset,
}
def tokenize(text: str) -> list[Tok]:
lexemes_raw = [
('DICE', r'''
(\d+)?
(d\d+)
(k[hl]\d+)?
(\!\d*)?
'''),
('OP', r'(\+|\-|\*|/|%)'),
('PAR_OPEN', r'\('),
('PAR_CLOSE', r'\)'),
('SEP', r','),
('NUM', r'\d+'),
('WS', r'\s+'),
('ERR', r'.'), # any other character = invalid
]
lexemes = [
(name, re.compile(rf'^({pattern})', re.X | re.DOTALL)) for name, pattern in lexemes_raw
]
orig_text = text
pos = 0
tokens = []
while text:
for name, pattern in lexemes:
if (m := pattern.match(text)) is None:
continue
if name == 'ERR':
raise SyntaxError(
f'Invalid token at pos {pos} in\n'
f' {orig_text}\n'
f' {" " * pos}^'
)
value = m[0]
length = len(value)
text = text[length:]
col_offset = pos
pos += length
end_col_offset = pos
if name == 'WS':
# ignore whitespaces
break
tokens.append(Tok(name, value, col_offset, end_col_offset))
break
tokens.append(Tok('ENDTOKEN', '', pos, pos))
return tokens
def memoize(func):
def wrapper(parser: 'Parser', *args):
"""
simple caching
memo = (func, args, start_pos) --> (result, new_pos)
"""
mark = parser.mark()
key = func, args, mark
if key in parser.memo:
res, pos = parser.memo[key]
parser.reset(pos)
return res
res = func(parser, *args)
parser.memo[key] = res, parser.mark()
return res
return wrapper
def memoize_left_rec(func):
def wrapper(parser: 'Parser', *args):
"""
not simple caching.
- prime with failure to allow left-recursive call.
- repeat calling the function until longest match found.
"""
start_pos = parser.mark()
key = func, args, start_pos
if key in parser.memo:
res, pos = parser.memo[key]
parser.reset(pos)
return res
# prime with failure case
best_res, best_pos = None, start_pos
parser.memo[key] = best_res, best_pos
# try until we find the longest match
while True:
parser.reset(start_pos)
new_res = func(parser, *args)
new_pos = parser.mark()
if new_pos <= best_pos:
break
parser.memo[key] = new_res, new_pos
best_res, best_pos = new_res, new_pos
parser.reset(best_pos)
return best_res
return wrapper
class Parser:
def __init__(self, tokens: list[Tok]):
self.tokens = tokens
self.pos = 0
self.memo = {}
"""
Helpers
"""
def mark(self) -> int:
return self.pos
def reset(self, pos):
self.pos = pos
def peek(self, name_or_value: str) -> Tok | None:
tok = self.tokens[self.pos]
if tok.name == name_or_value or tok.value == name_or_value:
return tok
return None
def expect(self, name_or_value: str) -> Tok | None:
tok = self.peek(name_or_value)
if tok:
self.pos += 1
return tok
def parse(self):
"""start is entry point"""
return self.start()
"""
Parsing nodes.
"""
def start(self):
"""
start:
| dice-rolls ENDTOKEN
"""
start_pos = self.mark()
if (
(dice_rolls := self.dice_rolls())
and (_ := self.expect('ENDTOKEN'))
):
return dice_rolls
self.reset(start_pos)
return None
@memoize
def dice_rolls(self):
"""
dice-rolls:
| dice-roll [',' dice-roll]*
"""
start_pos = self.mark()
if roll := self.dice_roll():
col_offset = roll.col_offset
end_col_offset = roll.end_col_offset
result = [roll]
while (
(_ := self.expect(','))
and (next_roll := self.dice_roll())
):
end_col_offset = next_roll.end_col_offset
result.append(next_roll)
return Tuple(result, col_offset=col_offset, end_col_offset=end_col_offset)
self.reset(start_pos)
return None
@memoize
def dice_roll(self):
"""
We start with the lowest precedence.
dice-roll:
| sum
"""
start_pos = self.mark()
if result := self.sum():
return result
self.reset(start_pos)
return None
@memoize_left_rec
def sum(self):
"""
sum:
| sum '+' product
| sum '-' product
| product
"""
start_pos = self.mark()
if (
(a := self.sum())
and (_ := self.expect('+'))
and (b := self.product())
):
return BinOp(a, Add(), b, col_offset=a.col_offset, end_col_offset=b.end_col_offset)
self.reset(start_pos)
if (
(a := self.sum())
and (_ := self.expect('-'))
and (b := self.product())
):
return BinOp(a, Sub(), b, col_offset=a.col_offset, end_col_offset=b.end_col_offset)
self.reset(start_pos)
if a := self.product():
return a
self.reset(start_pos)
return None
@memoize_left_rec
def product(self):
"""
product:
| product '*' atom
| product '/' atom
| product '%' atom
| atom
"""
start_pos = self.mark()
if (
(a := self.product())
and (_ := self.expect('*'))
and (b := self.atom())
):
return BinOp(a, Mult(), b, col_offset=a.col_offset, end_col_offset=b.end_col_offset)
self.reset(start_pos)
if (
(a := self.product())
and (_ := self.expect('/'))
and (b := self.atom())
):
return BinOp(a, Div(), b, col_offset=a.col_offset, end_col_offset=b.end_col_offset)
self.reset(start_pos)
if (
(a := self.product())
and (_ := self.expect('%'))
and (b := self.atom())
):
return BinOp(a, Mod(), b, col_offset=a.col_offset, end_col_offset=b.end_col_offset)
self.reset(start_pos)
if a := self.atom():
return a
self.reset(start_pos)
return None
@memoize
def atom(self):
"""
atom:
| '(' a=dice-roll ')'
| dice=DICE
| number=NUM
"""
start_pos = self.mark()
if (
(_ := self.expect('('))
and (dice_roll := self.dice_roll())
and (_ := self.expect(')'))
):
return dice_roll
self.reset(start_pos)
if dice := self.expect('DICE'):
return Dice(value=dice.value, **dice.extra)
self.reset(start_pos)
if number := self.expect('NUM'):
return Constant(value=int(number.value), **number.extra)
self.reset(start_pos)
return None
def unparse(ast_obj):
"""like ast.unparse, just with extended class."""
from ast import _Unparser # noqa
class _UnparserExt(_Unparser):
def visit_Dice(self, node): # noqa
self.write(node.value)
unparser = _UnparserExt()
return unparser.visit(ast_obj)
def main():
sample = "3d4kh1 + d6, 5d6kl3 / (d10 + 2), 5d6kl3 / d10 + 2, 5 * (d20! / d10)"
tokens = tokenize(sample)
parser = Parser(tokens)
tree = parser.parse()
print('Unparsed:', unparse(tree))
print(ast.dump(tree, indent=4))
# output:
"""
Unparsed: (3d4kh1 + d6, 5d6kl3 / (d10 + 2), 5 * (d20! / d10))
Tuple(
elts=[
BinOp(
left=Dice(value='3d4kh1'),
op=Add(),
right=Dice(value='d6')),
BinOp(
left=Dice(value='5d6kl3'),
op=Div(),
right=BinOp(
left=Dice(value='d10'),
op=Add(),
right=Constant(value=2))),
BinOp(
left=Constant(value=5),
op=Mult(),
right=BinOp(
left=Dice(value='d20!'),
op=Div(),
right=Dice(value='d10')))],
ctx=Load())
"""
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment