Last active
November 6, 2025 08:14
-
-
Save pranaysy/268fcc716c45781ca770fc569cbb2bf9 to your computer and use it in GitHub Desktop.
Comparison of LZ76 complexity implementations
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| A demo comparison of three different implementations for estimating Lempel-Ziv complexity (1976) | |
| Implementations compared: | |
| - Pure Python version, by Naereen | |
| - Numba (str), as implemented in EntroPy by Raphael Vallat | |
| - Numba, updated for speed, based on numpy integer arrays | |
| Simple comparisons for equality of estimates and performance to address issue: | |
| https://github.com/raphaelvallat/entropy/issues/14 | |
| """ | |
| # Import calls | |
| from numba import njit | |
| from entropy import lziv_complexity as lzc_rv | |
| from collections import Counter | |
| import numpy as np | |
| # %% Function definitions | |
| def lzc_py(sequence): | |
| """ | |
| Naereen's earlier implementation, lines 78 to 103 of: | |
| https://github.com/Naereen/Lempel-Ziv_Complexity/commit/5537546dfcba37dae1079aeb8f8b57d68171ee7e | |
| """ | |
| u, v, w = 0, 1, 1 | |
| v_max = 1 | |
| length = len(sequence) | |
| complexity = 1 | |
| while True: | |
| if sequence[u + v - 1] == sequence[w + v - 1]: | |
| v += 1 | |
| if w + v >= length: | |
| complexity += 1 | |
| break | |
| else: | |
| if v > v_max: | |
| v_max = v | |
| u += 1 | |
| if u == w: | |
| complexity += 1 | |
| w += v_max | |
| if w > length: | |
| break | |
| else: | |
| u = 0 | |
| v = 1 | |
| v_max = 1 | |
| else: | |
| v = 1 | |
| return complexity | |
| @njit(["uint64(uint64[:])", "uint32(uint32[:])"], nogil=True) | |
| def lzc_numba(string): | |
| """ | |
| Numba adaptation of Naereen's implementation | |
| Slight modifications based on Yacine Mahdid's notebook: | |
| https://github.com/BIAPT/Notebooks/blob/master/features/Lempel-Ziv%20Complexity.ipynb | |
| """ | |
| # Initialize variables | |
| complexity = 1 | |
| prefix_len = 1 | |
| len_substring = 1 | |
| max_len_substring = 1 | |
| pointer = 0 | |
| # Iterate until the entire string has not been parsed | |
| while prefix_len + len_substring <= len(string): | |
| # Given a prefix length, find the largest substring | |
| if ( | |
| string[pointer + len_substring - 1] | |
| == string[prefix_len + len_substring - 1] | |
| ): | |
| len_substring += 1 # increase the length of the substring | |
| else: | |
| max_len_substring = max(len_substring, max_len_substring) | |
| pointer += 1 | |
| # Since all pointers have been scanned, pick largest as the jump size | |
| if pointer == prefix_len: | |
| # Increment complexity | |
| complexity += 1 | |
| # Set prefix length to the max substring size found so far (jump size) | |
| prefix_len += max_len_substring | |
| # Reset pointer and max substring size | |
| pointer = 0 | |
| max_len_substring = 1 | |
| # Reset length of current substring | |
| len_substring = 1 | |
| # Check if final iteration occurred in the middle of a substring | |
| if len_substring != 1: | |
| complexity += 1 | |
| return complexity | |
| def generate(size, symbols=[0, 1]): | |
| """ | |
| Generate an array of random symbols | |
| """ | |
| return np.random.choice(symbols, size).astype("uint64") | |
| def convert_to_str(input_array): | |
| """ | |
| Convert input 1D numeric array to a single string | |
| """ | |
| return "".join(str(x) for x in input_array) | |
| def convert_to_array(input_string): | |
| """ | |
| Convert input string to numeric array (ASCII representation) | |
| """ | |
| return np.fromiter(map(ord, input_string), count=len(input_string), dtype='uint64') | |
| # %% Compare the estimates: EntroPy vs pure Python (Naereen) vs new Numba | |
| # Initialize Counter for tracking mismatches | |
| mismatches = Counter() | |
| # Test 3 different input array sizes | |
| for size in [10, 100, 1_000]: | |
| # For each size, run 1000 tests for matching complexities | |
| for _ in range(1_000): | |
| # Generate binary array as well as string | |
| x_array = generate(size) | |
| x_str = convert_to_str(x_array) | |
| # If mismatch, add to Counter | |
| if lzc_rv(x_str) != lzc_numba(x_array) != lzc_py(x_str): | |
| mismatches[size] += 1 | |
| # mismatches is an empty Counter instance as all the three implementations match: | |
| print(mismatches) | |
| # Counter() | |
| # %% Compare performance | |
| lzc_results = [] | |
| for n in [10, 100, 1_000, 10_000]: | |
| # Generate array and string | |
| array = generate(n) | |
| string = convert_to_str(array) | |
| # Timings for pure Python version | |
| print("\n" + "#" * 80) | |
| print(f"Pure Python: size {n}, Lempel-Ziv complexity runs in:") | |
| res = %timeit -o lzc_py(string) | |
| lzc_results.append({ | |
| "size":n, | |
| "measurements":res.timings, | |
| "method":"pure_python"}) | |
| # Timings for Numba (string) version | |
| print("\n" + "#" * 80) | |
| print(f"Numba (string): size {n}, Lempel-Ziv complexity runs in:") | |
| res = %timeit -o lzc_rv(string) | |
| lzc_results.append({ | |
| "size":n, | |
| "measurements":res.timings, | |
| "method":"numba_string"}) | |
| # Timings for Numba (array) version | |
| print("\n" + "#" * 80) | |
| print(f"Numba (array): size {n}, Lempel-Ziv complexity runs in:") | |
| res = %timeit -o lzc_numba(array) | |
| lzc_results.append({ | |
| "size":n, | |
| "measurements":res.timings, | |
| "method":"numba_array"}) | |
| # Extended length for Numba (array) version | |
| for n in [100_000]: | |
| # Generate array and string | |
| array = generate(n) | |
| string = convert_to_str(array) | |
| # Timings for Numba (array) version | |
| print("\n" + "#" * 80) | |
| print(f"Numba (array): size {n}, Lempel-Ziv complexity runs in:") | |
| res = %timeit -o lzc_numba(array) | |
| lzc_results.append({ | |
| "size":n, | |
| "measurements":res.timings, | |
| "method":"numba_array"}) | |
| # lzc_results can be stored as a pandas DataFrame |
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Reference timings for Intel Core i5-1035G4 running Pop OS 20.10 on
kernel 5.8.0Packages:
numba 0.51.2,numpy 1.19.2andentropy 0.1.2onPython 3.8.5