pranaysy · November 6, 2025 08:14 · pranaysy · Feb 24, 2021
diff --git a/lz76benchmarks.py b/lz76benchmarks.py
 """
 A demo comparison of three different implementations for estimating Lempel-Ziv complexity (1976)

 Implementations compared:
    - Pure Python version, by Naereen
    - Numba (str), as implemented in EntroPy by Raphael Vallat
    - Numba, updated for speed, based on numpy integer arrays

 Simple comparisons for equality of estimates and performance to address issue:
    https://github.com/raphaelvallat/entropy/issues/14
 """

 # Import calls
 from numba import njit
 from entropy import lziv_complexity as lzc_rv
 from collections import Counter
 import numpy as np

 # %% Function definitions
 def lzc_py(sequence):
    """
    Naereen's earlier implementation, lines 78 to 103 of:
        https://github.com/Naereen/Lempel-Ziv_Complexity/commit/5537546dfcba37dae1079aeb8f8b57d68171ee7e
    """
    u, v, w = 0, 1, 1
    v_max = 1
    length = len(sequence)
    complexity = 1
    while True:
        if sequence[u + v - 1] == sequence[w + v - 1]:
            v += 1
            if w + v >= length:
                complexity += 1
                break
        else:
            if v > v_max:
                v_max = v
            u += 1
            if u == w:
                complexity += 1
                w += v_max
                if w > length:
                    break
                else:
                    u = 0
                    v = 1
                    v_max = 1
            else:
                v = 1
    return complexity


 @njit(["uint64(uint64[:])", "uint32(uint32[:])"], nogil=True)
 def lzc_numba(string):
    """
    Numba adaptation of Naereen's implementation
    Slight modifications based on Yacine Mahdid's notebook:
        https://github.com/BIAPT/Notebooks/blob/master/features/Lempel-Ziv%20Complexity.ipynb
    """
    # Initialize variables
    complexity = 1
    prefix_len = 1
    len_substring = 1
    max_len_substring = 1
    pointer = 0

    # Iterate until the entire string has not been parsed
    while prefix_len + len_substring <= len(string):

        # Given a prefix length, find the largest substring
        if (
            string[pointer + len_substring - 1]
            == string[prefix_len + len_substring - 1]
        ):
            len_substring += 1  # increase the length of the substring
        else:

            max_len_substring = max(len_substring, max_len_substring)
            pointer += 1

            # Since all pointers have been scanned, pick largest as the jump size
            if pointer == prefix_len:

                # Increment complexity
                complexity += 1

                # Set prefix length to the max substring size found so far (jump size)
                prefix_len += max_len_substring

                # Reset pointer and max substring size
                pointer = 0
                max_len_substring = 1

            # Reset length of current substring
            len_substring = 1

    # Check if final iteration occurred in the middle of a substring
    if len_substring != 1:
        complexity += 1

    return complexity


 def generate(size, symbols=[0, 1]):
    """
    Generate an array of random symbols
    """
    return np.random.choice(symbols, size).astype("uint64")


 def convert_to_str(input_array):
    """
    Convert input 1D numeric array to a single string
    """
    return "".join(str(x) for x in input_array)

 def convert_to_array(input_string):
    """
    Convert input string to numeric array (ASCII representation)
    """
    return np.fromiter(map(ord, input_string), count=len(input_string), dtype='uint64')

 # %% Compare the estimates: EntroPy vs pure Python (Naereen) vs new Numba

 # Initialize Counter for tracking mismatches
 mismatches = Counter()

 # Test 3 different input array sizes
 for size in [10, 100, 1_000]:

    # For each size, run 1000 tests for matching complexities
    for _ in range(1_000):

        # Generate binary array as well as string
        x_array = generate(size)
        x_str = convert_to_str(x_array)

        # If mismatch, add to Counter
        if lzc_rv(x_str) != lzc_numba(x_array) != lzc_py(x_str):
            mismatches[size] += 1

 # mismatches is an empty Counter instance as all the three implementations match:
 print(mismatches)
 # Counter()

 # %% Compare performance

 lzc_results = []

 for n in [10, 100, 1_000, 10_000]:

    # Generate array and string
    array = generate(n)
    string = convert_to_str(array)

    # Timings for pure Python version
    print("\n" + "#" * 80)
    print(f"Pure Python: size {n}, Lempel-Ziv complexity runs in:")
    res = %timeit -o lzc_py(string)

    lzc_results.append({
        "size":n,
        "measurements":res.timings,
        "method":"pure_python"})

    # Timings for Numba (string) version
    print("\n" + "#" * 80)
    print(f"Numba (string): size {n}, Lempel-Ziv complexity runs in:")
    res = %timeit -o lzc_rv(string)

    lzc_results.append({
        "size":n,
        "measurements":res.timings,
        "method":"numba_string"})

    # Timings for Numba (array) version
    print("\n" + "#" * 80)
    print(f"Numba (array): size {n}, Lempel-Ziv complexity runs in:")
    res = %timeit -o lzc_numba(array)

    lzc_results.append({
        "size":n,
        "measurements":res.timings,
        "method":"numba_array"})

 # Extended length for Numba (array) version
 for n in [100_000]:

    # Generate array and string
    array = generate(n)
    string = convert_to_str(array)

    # Timings for Numba (array) version
    print("\n" + "#" * 80)
    print(f"Numba (array): size {n}, Lempel-Ziv complexity runs in:")
    res = %timeit -o lzc_numba(array)

    lzc_results.append({
        "size":n,
        "measurements":res.timings,
        "method":"numba_array"})

 # lzc_results can be stored as a pandas DataFrame
	"""
	A demo comparison of three different implementations for estimating Lempel-Ziv complexity (1976)

	Implementations compared:
	- Pure Python version, by Naereen
	- Numba (str), as implemented in EntroPy by Raphael Vallat
	- Numba, updated for speed, based on numpy integer arrays

	Simple comparisons for equality of estimates and performance to address issue:
	https://github.com/raphaelvallat/entropy/issues/14
	"""

	# Import calls
	from numba import njit
	from entropy import lziv_complexity as lzc_rv
	from collections import Counter
	import numpy as np

	# %% Function definitions
	def lzc_py(sequence):
	"""
	Naereen's earlier implementation, lines 78 to 103 of:
	https://github.com/Naereen/Lempel-Ziv_Complexity/commit/5537546dfcba37dae1079aeb8f8b57d68171ee7e
	"""
	u, v, w = 0, 1, 1
	v_max = 1
	length = len(sequence)
	complexity = 1
	while True:
	if sequence[u + v - 1] == sequence[w + v - 1]:
	v += 1
	if w + v >= length:
	complexity += 1
	break
	else:
	if v > v_max:
	v_max = v
	u += 1
	if u == w:
	complexity += 1
	w += v_max
	if w > length:
	break
	else:
	u = 0
	v = 1
	v_max = 1
	else:
	v = 1
	return complexity


	@njit(["uint64(uint64[:])", "uint32(uint32[:])"], nogil=True)
	def lzc_numba(string):
	"""
	Numba adaptation of Naereen's implementation
	Slight modifications based on Yacine Mahdid's notebook:
	https://github.com/BIAPT/Notebooks/blob/master/features/Lempel-Ziv%20Complexity.ipynb
	"""
	# Initialize variables
	complexity = 1
	prefix_len = 1
	len_substring = 1
	max_len_substring = 1
	pointer = 0

	# Iterate until the entire string has not been parsed
	while prefix_len + len_substring <= len(string):

	# Given a prefix length, find the largest substring
	if (
	string[pointer + len_substring - 1]
	== string[prefix_len + len_substring - 1]
	):
	len_substring += 1 # increase the length of the substring
	else:

	max_len_substring = max(len_substring, max_len_substring)
	pointer += 1

	# Since all pointers have been scanned, pick largest as the jump size
	if pointer == prefix_len:

	# Increment complexity
	complexity += 1

	# Set prefix length to the max substring size found so far (jump size)
	prefix_len += max_len_substring

	# Reset pointer and max substring size
	pointer = 0
	max_len_substring = 1

	# Reset length of current substring
	len_substring = 1

	# Check if final iteration occurred in the middle of a substring
	if len_substring != 1:
	complexity += 1

	return complexity


	def generate(size, symbols=[0, 1]):
	"""
	Generate an array of random symbols
	"""
	return np.random.choice(symbols, size).astype("uint64")


	def convert_to_str(input_array):
	"""
	Convert input 1D numeric array to a single string
	"""
	return "".join(str(x) for x in input_array)

	def convert_to_array(input_string):
	"""
	Convert input string to numeric array (ASCII representation)
	"""
	return np.fromiter(map(ord, input_string), count=len(input_string), dtype='uint64')

	# %% Compare the estimates: EntroPy vs pure Python (Naereen) vs new Numba

	# Initialize Counter for tracking mismatches
	mismatches = Counter()

	# Test 3 different input array sizes
	for size in [10, 100, 1_000]:

	# For each size, run 1000 tests for matching complexities
	for _ in range(1_000):

	# Generate binary array as well as string
	x_array = generate(size)
	x_str = convert_to_str(x_array)

	# If mismatch, add to Counter
	if lzc_rv(x_str) != lzc_numba(x_array) != lzc_py(x_str):
	mismatches[size] += 1

	# mismatches is an empty Counter instance as all the three implementations match:
	print(mismatches)
	# Counter()

	# %% Compare performance

	lzc_results = []

	for n in [10, 100, 1_000, 10_000]:

	# Generate array and string
	array = generate(n)
	string = convert_to_str(array)

	# Timings for pure Python version
	print("\n" + "#" * 80)
	print(f"Pure Python: size {n}, Lempel-Ziv complexity runs in:")
	res = %timeit -o lzc_py(string)

	lzc_results.append({
	"size":n,
	"measurements":res.timings,
	"method":"pure_python"})

	# Timings for Numba (string) version
	print("\n" + "#" * 80)
	print(f"Numba (string): size {n}, Lempel-Ziv complexity runs in:")
	res = %timeit -o lzc_rv(string)

	lzc_results.append({
	"size":n,
	"measurements":res.timings,
	"method":"numba_string"})

	# Timings for Numba (array) version
	print("\n" + "#" * 80)
	print(f"Numba (array): size {n}, Lempel-Ziv complexity runs in:")
	res = %timeit -o lzc_numba(array)

	lzc_results.append({
	"size":n,
	"measurements":res.timings,
	"method":"numba_array"})

	# Extended length for Numba (array) version
	for n in [100_000]:

	# Generate array and string
	array = generate(n)
	string = convert_to_str(array)

	# Timings for Numba (array) version
	print("\n" + "#" * 80)
	print(f"Numba (array): size {n}, Lempel-Ziv complexity runs in:")
	res = %timeit -o lzc_numba(array)

	lzc_results.append({
	"size":n,
	"measurements":res.timings,
	"method":"numba_array"})

	# lzc_results can be stored as a pandas DataFrame
No results found