Mr0grog · February 9, 2026 06:56
diff --git a/README.md b/README.md
diff --git a/benchmark.py b/benchmark.py
 #!/usr/bin/env python
 # vim:fileencoding=utf-8
 # License: Apache 2.0, Copyright: 2026, Rob Brackett, based on an original
 # script licensed under Apache 2.0 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>

 from __future__ import absolute_import, division, print_function, unicode_literals

 import argparse
 import textwrap
 from functools import partial

 import html5_parser
 import html5lib
 from bs4 import BeautifulSoup
 from selectolax.lexbor import LexborHTMLParser, LexborNode
 import markupever

 try:
    from time import monotonic
 except ImportError:
    from time import time as monotonic

 # Path of the locally cached HTML document used as the benchmark input.
 TF = 'test/large.html'
 try:
    # A context manager closes the handle deterministically instead of
    # leaving that to the garbage collector.
    with open(TF, 'rb') as cached:
        raw = cached.read()
 except (IOError, OSError):
    # No readable local copy: download the document once and cache it so
    # later runs work offline.
    import os
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen
    print('Downloading large HTML file...')
    response = urlopen('https://www.w3.org/TR/html5/single-page.html')
    try:
        raw = response.read()
    finally:
        response.close()
    # Create the cache directory if it is missing; otherwise the write
    # below would fail after the download has already been paid for.
    cache_dir = os.path.dirname(TF)
    if cache_dir and not os.path.isdir(cache_dir):
        os.makedirs(cache_dir)
    with open(TF, 'wb') as cached:
        cached.write(raw)

 print('Testing with HTML file of', '{:,}'.format(len(raw)), 'bytes')


 def timeit(func, number=1):
    """Return the mean number of seconds one call of ``func`` takes.

    ``func`` is called ``number`` times with no arguments. Every call is
    timed separately with ``monotonic()`` and the individual durations are
    averaged. The value returned by ``func`` is discarded only after the
    clock has been read, so releasing it is not part of the measurement.
    """
    elapsed = 0
    for _ in range(number):
        started = monotonic()
        result = func()
        finished = monotonic()
        elapsed += finished - started
        # Drop the result outside the timed window.
        del result
    return elapsed / number


 def doit(name, func, num=20):
    """Benchmark ``func`` and report its average run time and throughput.

    Prints a progress line, times ``num`` calls of ``func`` through
    ``timeit()``, then prints the mean seconds per call together with the
    rate at which the module-level ``raw`` input was processed.

    Returns the mean seconds per call.
    """
    print('Parsing', num, 'times with', name)
    average = timeit(func, num)
    # Throughput is based on the byte length of the shared input document.
    kb_per_second = len(raw) / average / 1000
    print(name, f'took an average of: {average:,.3f} seconds to parse it @ {kb_per_second:.1f} KB/s')
    return average


 # Command line: an optional positional tree builder name (default 'lxml')
 # and a repetition count (--num / -n, default 10).
 p = argparse.ArgumentParser(description='Benchmark html5-parser')
 p.add_argument('treebuilder', nargs='?', default='lxml', choices='lxml soup dom etree'.split())
 p.add_argument(
    '--num',
    '-n',
    default=10,
    type=int,
    help='Number of repetitions for html5lib (html5-parser will use 10x as many reps)')
 args = p.parse_args()

 # Baseline: html5-parser building the tree type chosen on the command line.
 # It runs 10x as many repetitions as --num.
 base_time = doit(
    'html5-parser',
    partial(
        html5_parser.parse,
        raw,
        transport_encoding="utf-8",
        namespace_elements=True,
        treebuilder=args.treebuilder),
    num=args.num * 10)

 # html5-parser building a BeautifulSoup tree; this is the baseline for the
 # two BeautifulSoup rows of the results table.
 soup_time = doit(
    'html5-parser-to-soup',
    partial(html5_parser.parse, raw, transport_encoding="utf-8", treebuilder='soup'),
    num=args.num)

 # html5lib is given the same tree builder name as the baseline run.
 h5time = doit(
    'html5lib',
    partial(html5lib.parse, raw, transport_encoding="utf-8", treebuilder=args.treebuilder),
    num=args.num)
 soup5_time = doit(
    'BeautifulSoup-with-html5lib', partial(BeautifulSoup, raw, 'html5lib'), num=args.num)
 soup4_time = doit('BeautifulSoup-with-lxml', partial(BeautifulSoup, raw, 'lxml'), num=args.num)

 # selectolax and markupever also run 10x as many repetitions as --num,
 # the same count used for the html5-parser baseline.
 selectolax_time = doit(
    'selectolax',
    partial(LexborHTMLParser, raw),
    num=args.num * 10,
 )

 markupever_time = doit(
    'markupever',
    lambda: markupever.parse(raw, markupever.HtmlOptions()),
    num=args.num * 10,
 )


 # Test actually walking through the tree --------------------------------------


 def walk_h5(node):
    """Walk the tree under ``node`` and return text from its ``p`` elements.

    If ``node`` itself has the tag ``'p'`` its ``text`` attribute is returned
    unchanged (which may be ``None``) and its children are not visited.
    Otherwise every child is walked recursively and the last truthy result
    is returned, or ``None`` when no child produced one.
    """
    if node.tag == 'p':
        return node.text

    found = None
    for child in node:
        text = walk_h5(child)
        # Empty or missing text must not overwrite an earlier hit.
        if text:
            found = text
    return found


 def parse_and_walk_html5parser():
    """Parse ``raw`` with html5_parser and traverse the resulting tree.

    The document is parsed with the ``'lxml'`` tree builder and
    ``namespace_elements=False``, then handed to ``walk_h5()``. The walk
    result is discarded; the function exists only to be timed.
    """
    options = dict(
        transport_encoding="utf-8",
        namespace_elements=False,
        treebuilder='lxml',
    )
    walk_h5(html5_parser.parse(raw, **options))


 def walk_lexbor(node: LexborNode):
    """Walk a selectolax tree and return text from its ``p`` nodes.

    If ``node`` has the tag ``'p'`` the result of ``node.text(deep=False)``
    is returned and the node's children are not visited. Otherwise every
    node yielded by ``node.iter()`` (presumably the direct children; confirm
    against the selectolax docs) is walked recursively and the last truthy
    result is returned, or ``None`` when there is none.
    """
    if node.tag == 'p':
        return node.text(deep=False)

    found = None
    for child in node.iter():
        text = walk_lexbor(child)
        # Keep the previous hit when this subtree yields nothing.
        if text:
            found = text
    return found


 def parse_and_walk_selectolax():
    """Parse ``raw`` with selectolax's Lexbor parser and traverse the tree.

    The parser's ``root`` node is handed to ``walk_lexbor()``. The walk
    result is discarded; the function exists only to be timed.
    """
    walk_lexbor(LexborHTMLParser(raw).root)


 def walk_ever(node: markupever.dom.BaseNode):
    """Walk a markupever tree and return text from its ``p`` elements.

    Nodes that are not ``markupever.dom.Element`` instances yield ``None``.
    For an element named ``'p'`` the ``content`` of its direct ``Text``
    children is concatenated and returned, without descending further.
    Any other element has its children walked recursively and the last
    truthy result is returned, or ``None`` when there is none.
    """
    if not isinstance(node, markupever.dom.Element):
        return None

    if node.name == 'p':
        # Markupever does not appear to offer a method that returns only the
        # immediate text children the way the other libraries do, so the
        # pieces are joined by hand. (This was also measured to be a little
        # faster than `node.text()` in the benchmark.)
        pieces = [
            child.content
            for child in node.children()
            if isinstance(child, markupever.dom.Text)
        ]
        return ''.join(pieces)

    found = None
    for child in node.children():
        text = walk_ever(child)
        # Keep the previous hit when this subtree yields nothing.
        if text:
            found = text
    return found


 def parse_and_walk_markupever():
    """Parse ``raw`` with markupever and traverse the resulting tree.

    The document is parsed with default ``HtmlOptions`` and the walk starts
    at the last child of the document root. The walk result is discarded;
    the function exists only to be timed.
    """
    options = markupever.HtmlOptions()
    document = markupever.parse(raw, options)
    # NOTE(review): the root's last child is presumably the <html> element;
    # confirm against the markupever docs.
    start = document.root().last_child
    walk_ever(start)

 # Time parse-plus-walk for each library, every one with --num repetitions.
 # The html5-parser figure is the baseline for the "walk-*" rows of the
 # results table. Note that doit() still words its report as "to parse it"
 # even though each call here also walks the tree.
 base_walk_time = doit(
    'html5_parser',
    parse_and_walk_html5parser,
    num=args.num,
 )
 selectolax_walk_time = doit(
    'selectolax',
    parse_and_walk_selectolax,
    num=args.num,
 )
 markupever_walk_time = doit(
    'markupever',
    parse_and_walk_markupever,
    num=args.num,
 )


 def row(*args):
    """Print one line of the results table.

    Each argument is converted with ``str()``, left-aligned and padded to
    at least 18 characters, and followed by a ``|`` separator. Longer values
    are not truncated. The line ends with a newline.
    """
    cells = ['{:18s}'.format(str(value)) + '|' for value in args]
    print(''.join(cells))


 # Print an explanatory paragraph followed by the results table. Every
 # figure in the table is a ratio of two mean times returned by doit().
 print()
 print(textwrap.fill(
    'Results are below. They show how much faster html5-parser is than'
    ' each specified parser. Note that there are two additional considerations:'
    ' what the final tree is and whether the parsing supports the HTML 5'
    ' parsing algorithm. The most apples-to-apples comparison is when the'
    ' final tree is lxml and HTML 5 parsing is supported by the parser being compared to.'
    ' Note that in this case, we have the largest speedup. In all other cases,'
    ' speedup is less because of the overhead of building the final tree'
    ' in python instead of C or because the compared parser does not use'
    ' the HTML 5 parsing algorithm or both.'))
 print()
 row('Parser', 'Tree', 'Supports HTML 5', 'Speedup (factor)')
 print('=' * 79)
 # html5lib is compared with the html5-parser baseline; the two
 # BeautifulSoup rows are compared with html5-parser building a soup tree.
 # These three ratios are rounded to whole numbers.
 row('html5lib', 'lxml', 'yes', round(h5time / base_time))
 row('soup+html5lib', 'BeautifulSoup', 'yes', round(soup5_time / soup_time))
 row('soup+lxml.html', 'BeautifulSoup', 'no', round(soup4_time / soup_time))
 # The remaining ratios keep two decimals.
 row('selectolax', 'lexbor', 'yes', round(selectolax_time / base_time, 2))
 row('markupever', 'html5ever', 'yes', round(markupever_time / base_time, 2))

 # Parse-plus-walk ratios, relative to html5-parser's parse-plus-walk time.
 row('walk-selectolax', 'lexbor', 'yes', round(selectolax_walk_time / base_walk_time, 2))
 row('walk-markupever', 'html5ever', 'yes', round(markupever_walk_time / base_walk_time, 2))
diff --git a/requirements.txt b/requirements.txt
 beautifulsoup4 ==4.14.3
 html5-parser ==0.4.12 --no-binary lxml
 html5lib ==1.1
 lxml ==6.0.2
 selectolax ==0.4.6
 markupever ==0.3.2
	#!/usr/bin/env python
	# vim:fileencoding=utf-8
	# License: Apache 2.0, Copyright: 2026, Rob Brackett, based on an original
	# script licensed under Apache 2.0 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>

	from __future__ import absolute_import, division, print_function, unicode_literals

	import argparse
	import textwrap
	from functools import partial

	import html5_parser
	import html5lib
	from bs4 import BeautifulSoup
	from selectolax.lexbor import LexborHTMLParser, LexborNode
	import markupever

	try:
	from time import monotonic
	except ImportError:
	from time import time as monotonic

	# Path of the locally cached HTML document used as the benchmark input.
	TF = 'test/large.html'
	try:
	    # A context manager closes the handle deterministically instead of
	    # leaving that to the garbage collector.
	    with open(TF, 'rb') as cached:
	        raw = cached.read()
	except (IOError, OSError):
	    # No readable local copy: download the document once and cache it so
	    # later runs work offline.
	    import os
	    try:
	        from urllib.request import urlopen
	    except ImportError:
	        from urllib import urlopen
	    print('Downloading large HTML file...')
	    response = urlopen('https://www.w3.org/TR/html5/single-page.html')
	    try:
	        raw = response.read()
	    finally:
	        response.close()
	    # Create the cache directory if it is missing; otherwise the write
	    # below would fail after the download has already been paid for.
	    cache_dir = os.path.dirname(TF)
	    if cache_dir and not os.path.isdir(cache_dir):
	        os.makedirs(cache_dir)
	    with open(TF, 'wb') as cached:
	        cached.write(raw)

	print('Testing with HTML file of', '{:,}'.format(len(raw)), 'bytes')


	def timeit(func, number=1):
	    """Return the mean number of seconds one call of ``func`` takes.

	    ``func`` is called ``number`` times with no arguments. Every call is
	    timed separately with ``monotonic()`` and the individual durations are
	    averaged. The value returned by ``func`` is discarded only after the
	    clock has been read, so releasing it is not part of the measurement.
	    """
	    total = 0
	    for _ in range(number):
	        st = monotonic()
	        r = func()
	        t = monotonic() - st
	        total += t
	        # Drop the result outside the timed window.
	        del r
	    return total / number


	def doit(name, func, num=20):
	    """Benchmark ``func`` and report its average run time and throughput.

	    Prints a progress line, times ``num`` calls of ``func`` through
	    ``timeit()``, then prints the mean seconds per call together with the
	    rate at which the module-level ``raw`` input was processed.

	    Returns the mean seconds per call.
	    """
	    print('Parsing', num, 'times with', name)
	    t = timeit(func, num)
	    # Throughput is based on the byte length of the shared input document.
	    rate = len(raw) / t
	    print(name, f'took an average of: {t:,.3f} seconds to parse it @ {rate/1000:.1f} KB/s')
	    return t


	# Command line: an optional positional tree builder name (default 'lxml')
	# and a repetition count (--num / -n, default 10).
	p = argparse.ArgumentParser(description='Benchmark html5-parser')
	p.add_argument('treebuilder', nargs='?', default='lxml', choices='lxml soup dom etree'.split())
	p.add_argument(
	'--num',
	'-n',
	default=10,
	type=int,
	help='Number of repetitions for html5lib (html5-parser will use 10x as many reps)')
	args = p.parse_args()

	# Baseline: html5-parser building the tree type chosen on the command line.
	# It runs 10x as many repetitions as --num.
	base_time = doit(
	'html5-parser',
	partial(
	html5_parser.parse,
	raw,
	transport_encoding="utf-8",
	namespace_elements=True,
	treebuilder=args.treebuilder),
	num=args.num * 10)

	# html5-parser building a BeautifulSoup tree; this is the baseline for the
	# two BeautifulSoup rows of the results table.
	soup_time = doit(
	'html5-parser-to-soup',
	partial(html5_parser.parse, raw, transport_encoding="utf-8", treebuilder='soup'),
	num=args.num)

	# html5lib is given the same tree builder name as the baseline run.
	h5time = doit(
	'html5lib',
	partial(html5lib.parse, raw, transport_encoding="utf-8", treebuilder=args.treebuilder),
	num=args.num)
	soup5_time = doit(
	'BeautifulSoup-with-html5lib', partial(BeautifulSoup, raw, 'html5lib'), num=args.num)
	soup4_time = doit('BeautifulSoup-with-lxml', partial(BeautifulSoup, raw, 'lxml'), num=args.num)

	# selectolax and markupever also run 10x as many repetitions as --num,
	# the same count used for the html5-parser baseline.
	selectolax_time = doit(
	'selectolax',
	partial(LexborHTMLParser, raw),
	num=args.num * 10,
	)

	markupever_time = doit(
	'markupever',
	lambda: markupever.parse(raw, markupever.HtmlOptions()),
	num=args.num * 10,
	)


	# Test actually walking through the tree --------------------------------------


	def walk_h5(node):
	    """Walk the tree under ``node`` and return text from its ``p`` elements.

	    If ``node`` itself has the tag ``'p'`` its ``text`` attribute is returned
	    unchanged (which may be ``None``) and its children are not visited.
	    Otherwise every child is walked recursively and the last truthy result
	    is returned, or ``None`` when no child produced one.
	    """
	    if node.tag == 'p':
	        return node.text

	    last = None
	    for child in node:
	        # Empty or missing text must not overwrite an earlier hit.
	        last = walk_h5(child) or last

	    return last


	def parse_and_walk_html5parser():
	    """Parse ``raw`` with html5_parser and traverse the resulting tree.

	    The document is parsed with the ``'lxml'`` tree builder and
	    ``namespace_elements=False``, then handed to ``walk_h5()``. The walk
	    result is discarded; the function exists only to be timed.
	    """
	    tree = html5_parser.parse(
	        raw,
	        transport_encoding="utf-8",
	        namespace_elements=False,
	        treebuilder='lxml',
	    )
	    walk_h5(tree)


	def walk_lexbor(node: LexborNode):
	    """Walk a selectolax tree and return text from its ``p`` nodes.

	    If ``node`` has the tag ``'p'`` the result of ``node.text(deep=False)``
	    is returned and the node's children are not visited. Otherwise every
	    node yielded by ``node.iter()`` (presumably the direct children; confirm
	    against the selectolax docs) is walked recursively and the last truthy
	    result is returned, or ``None`` when there is none.
	    """
	    if node.tag == 'p':
	        return node.text(deep=False)

	    last = None
	    for child in node.iter():
	        # Keep the previous hit when this subtree yields nothing.
	        last = walk_lexbor(child) or last

	    return last


	def parse_and_walk_selectolax():
	    """Parse ``raw`` with selectolax's Lexbor parser and traverse the tree.

	    The parser's ``root`` node is handed to ``walk_lexbor()``. The walk
	    result is discarded; the function exists only to be timed.
	    """
	    dom = LexborHTMLParser(raw).root
	    walk_lexbor(dom)


	def walk_ever(node: markupever.dom.BaseNode):
	    """Walk a markupever tree and return text from its ``p`` elements.

	    Nodes that are not ``markupever.dom.Element`` instances yield ``None``.
	    For an element named ``'p'`` the ``content`` of its direct ``Text``
	    children is concatenated and returned, without descending further.
	    Any other element has its children walked recursively and the last
	    truthy result is returned, or ``None`` when there is none.
	    """
	    if not isinstance(node, markupever.dom.Element):
	        return None

	    if node.name == 'p':
	        # Markupever doesn't seem to have a method for just the immediate text
	        # children like the others, so this is closest. (It's also a little
	        # faster than `node.text()` in the benchmark.)
	        return ''.join(
	            child.content
	            for child in node.children()
	            if isinstance(child, markupever.dom.Text)
	        )

	    last = None
	    for child in node.children():
	        # Keep the previous hit when this subtree yields nothing.
	        last = walk_ever(child) or last

	    return last


	def parse_and_walk_markupever():
	    """Parse ``raw`` with markupever and traverse the resulting tree.

	    The document is parsed with default ``HtmlOptions`` and the walk starts
	    at the last child of the document root. The walk result is discarded;
	    the function exists only to be timed.
	    """
	    dom = markupever.parse(raw, markupever.HtmlOptions())
	    # NOTE(review): the root's last child is presumably the <html> element;
	    # confirm against the markupever docs.
	    walk_ever(dom.root().last_child)


	# Time parse-plus-walk for each library, every one with --num repetitions.
	# The html5-parser figure is the baseline for the "walk-*" rows of the
	# results table.
	base_walk_time = doit(
	'html5_parser',
	parse_and_walk_html5parser,
	num=args.num,
	)
	selectolax_walk_time = doit(
	'selectolax',
	parse_and_walk_selectolax,
	num=args.num,
	)
	markupever_walk_time = doit(
	'markupever',
	parse_and_walk_markupever,
	num=args.num,
	)


	def row(*args):
	    """Print one line of the results table.

	    Each argument is converted with ``str()``, left-aligned and padded to
	    at least 18 characters, and followed by a ``|`` separator. Longer values
	    are not truncated. The line ends with a newline.
	    """
	    for a in args:
	        # The separator is a plain pipe; '\|' is not a valid escape and
	        # would print a stray backslash in every cell.
	        print('{:18s}'.format(str(a)), end='|')
	    print()


	# Print an explanatory paragraph followed by the results table. Every
	# figure in the table is a ratio of two mean times returned by doit().
	print()
	print(textwrap.fill(
	'Results are below. They show how much faster html5-parser is than'
	' each specified parser. Note that there are two additional considerations:'
	' what the final tree is and whether the parsing supports the HTML 5'
	' parsing algorithm. The most apples-to-apples comparison is when the'
	' final tree is lxml and HTML 5 parsing is supported by the parser being compared to.'
	' Note that in this case, we have the largest speedup. In all other cases,'
	' speedup is less because of the overhead of building the final tree'
	' in python instead of C or because the compared parser does not use'
	' the HTML 5 parsing algorithm or both.'))
	print()
	row('Parser', 'Tree', 'Supports HTML 5', 'Speedup (factor)')
	print('=' * 79)
	# html5lib is compared with the html5-parser baseline; the two
	# BeautifulSoup rows are compared with html5-parser building a soup tree.
	# These three ratios are rounded to whole numbers.
	row('html5lib', 'lxml', 'yes', round(h5time / base_time))
	row('soup+html5lib', 'BeautifulSoup', 'yes', round(soup5_time / soup_time))
	row('soup+lxml.html', 'BeautifulSoup', 'no', round(soup4_time / soup_time))
	# The remaining ratios keep two decimals.
	row('selectolax', 'lexbor', 'yes', round(selectolax_time / base_time, 2))
	row('markupever', 'html5ever', 'yes', round(markupever_time / base_time, 2))

	# Parse-plus-walk ratios, relative to html5-parser's parse-plus-walk time.
	row('walk-selectolax', 'lexbor', 'yes', round(selectolax_walk_time / base_walk_time, 2))
	row('walk-markupever', 'html5ever', 'yes', round(markupever_walk_time / base_walk_time, 2))
	beautifulsoup4 ==4.14.3
	html5-parser ==0.4.12 --no-binary lxml
	html5lib ==1.1
	lxml ==6.0.2
	selectolax ==0.4.6
	markupever ==0.3.2