Skip to content

Instantly share code, notes, and snippets.

@pranaysy
Last active November 6, 2025 08:14
Show Gist options
  • Select an option

  • Save pranaysy/268fcc716c45781ca770fc569cbb2bf9 to your computer and use it in GitHub Desktop.

Select an option

Save pranaysy/268fcc716c45781ca770fc569cbb2bf9 to your computer and use it in GitHub Desktop.
Comparison of LZ76 complexity implementations
"""
A demo comparison of three different implementations for estimating Lempel-Ziv complexity (1976)
Implementations compared:
- Pure Python version, by Naereen
- Numba (str), as implemented in EntroPy by Raphael Vallat
- Numba, updated for speed, based on numpy integer arrays
Simple comparisons for equality of estimates and performance to address issue:
https://github.com/raphaelvallat/entropy/issues/14
"""
# Import calls
from numba import njit
from entropy import lziv_complexity as lzc_rv
from collections import Counter
import numpy as np
# %% Function definitions
def lzc_py(sequence):
"""
Naereen's earlier implementation, lines 78 to 103 of:
https://github.com/Naereen/Lempel-Ziv_Complexity/commit/5537546dfcba37dae1079aeb8f8b57d68171ee7e
"""
u, v, w = 0, 1, 1
v_max = 1
length = len(sequence)
complexity = 1
while True:
if sequence[u + v - 1] == sequence[w + v - 1]:
v += 1
if w + v >= length:
complexity += 1
break
else:
if v > v_max:
v_max = v
u += 1
if u == w:
complexity += 1
w += v_max
if w > length:
break
else:
u = 0
v = 1
v_max = 1
else:
v = 1
return complexity
@njit(["uint64(uint64[:])", "uint32(uint32[:])"], nogil=True)
def lzc_numba(string):
"""
Numba adaptation of Naereen's implementation
Slight modifications based on Yacine Mahdid's notebook:
https://github.com/BIAPT/Notebooks/blob/master/features/Lempel-Ziv%20Complexity.ipynb
"""
# Initialize variables
complexity = 1
prefix_len = 1
len_substring = 1
max_len_substring = 1
pointer = 0
# Iterate until the entire string has not been parsed
while prefix_len + len_substring <= len(string):
# Given a prefix length, find the largest substring
if (
string[pointer + len_substring - 1]
== string[prefix_len + len_substring - 1]
):
len_substring += 1 # increase the length of the substring
else:
max_len_substring = max(len_substring, max_len_substring)
pointer += 1
# Since all pointers have been scanned, pick largest as the jump size
if pointer == prefix_len:
# Increment complexity
complexity += 1
# Set prefix length to the max substring size found so far (jump size)
prefix_len += max_len_substring
# Reset pointer and max substring size
pointer = 0
max_len_substring = 1
# Reset length of current substring
len_substring = 1
# Check if final iteration occurred in the middle of a substring
if len_substring != 1:
complexity += 1
return complexity
def generate(size, symbols=[0, 1]):
"""
Generate an array of random symbols
"""
return np.random.choice(symbols, size).astype("uint64")
def convert_to_str(input_array):
"""
Convert input 1D numeric array to a single string
"""
return "".join(str(x) for x in input_array)
def convert_to_array(input_string):
"""
Convert input string to numeric array (ASCII representation)
"""
return np.fromiter(map(ord, input_string), count=len(input_string), dtype='uint64')
# %% Compare the estimates: EntroPy vs pure Python (Naereen) vs new Numba
# Initialize Counter for tracking mismatches
mismatches = Counter()
# Test 3 different input array sizes
for size in [10, 100, 1_000]:
# For each size, run 1000 tests for matching complexities
for _ in range(1_000):
# Generate binary array as well as string
x_array = generate(size)
x_str = convert_to_str(x_array)
# If mismatch, add to Counter
if lzc_rv(x_str) != lzc_numba(x_array) != lzc_py(x_str):
mismatches[size] += 1
# mismatches is an empty Counter instance as all the three implementations match:
print(mismatches)
# Counter()
# %% Compare performance
lzc_results = []
for n in [10, 100, 1_000, 10_000]:
# Generate array and string
array = generate(n)
string = convert_to_str(array)
# Timings for pure Python version
print("\n" + "#" * 80)
print(f"Pure Python: size {n}, Lempel-Ziv complexity runs in:")
res = %timeit -o lzc_py(string)
lzc_results.append({
"size":n,
"measurements":res.timings,
"method":"pure_python"})
# Timings for Numba (string) version
print("\n" + "#" * 80)
print(f"Numba (string): size {n}, Lempel-Ziv complexity runs in:")
res = %timeit -o lzc_rv(string)
lzc_results.append({
"size":n,
"measurements":res.timings,
"method":"numba_string"})
# Timings for Numba (array) version
print("\n" + "#" * 80)
print(f"Numba (array): size {n}, Lempel-Ziv complexity runs in:")
res = %timeit -o lzc_numba(array)
lzc_results.append({
"size":n,
"measurements":res.timings,
"method":"numba_array"})
# Extended length for Numba (array) version
for n in [100_000]:
# Generate array and string
array = generate(n)
string = convert_to_str(array)
# Timings for Numba (array) version
print("\n" + "#" * 80)
print(f"Numba (array): size {n}, Lempel-Ziv complexity runs in:")
res = %timeit -o lzc_numba(array)
lzc_results.append({
"size":n,
"measurements":res.timings,
"method":"numba_array"})
# lzc_results can be stored as a pandas DataFrame
@pranaysy
Copy link
Author

gist_timings
Reference timings for Intel Core i5-1035G4 running Pop OS 20.10 on kernel 5.8.0
Packages: numba 0.51.2, numpy 1.19.2 and entropy 0.1.2 on Python 3.8.5

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment