Skip to content

Instantly share code, notes, and snippets.

@adhishthite
Last active June 2, 2023 11:49
Show Gist options
  • Select an option

  • Save adhishthite/c0774797429c2c590acdd75e010fbff8 to your computer and use it in GitHub Desktop.

Select an option

Save adhishthite/c0774797429c2c590acdd75e010fbff8 to your computer and use it in GitHub Desktop.
"""
(c) 2023, Adhish Thite.
"""
# The following code installs the required Python packages for the script.
# Faker is a Python package for creating fake data.
# tqdm is a Python library used for printing progress bars.
# bloom-filter2 is a Python implementation of bloom filters, a probabilistic data structure.
!pip install Faker
!pip install tqdm
!pip install bloom-filter2
# Importing necessary Python modules and libraries for the script.
# os and json are built-in Python modules.
# pickle is a Python built-in module for object serialization.
# multiprocessing is a built-in Python library for parallel execution.
# hashlib is a Python library for hash functions.
import os
import json
import pickle
import multiprocessing
import hashlib
from typing import Iterable # Used for type hints
from tqdm import tqdm # For progress bars
from faker import Faker # For generating fake data
from bloom_filter2 import BloomFilter # For creating bloom filters
# Initialize the Faker object and set the number of cores for multiprocessing.
fake: Faker = Faker()  # single shared Faker instance; re-seeded per record in generate_address
num_cores: int = 4 # adjust this based on your system
file_path_suffix: str = "/utils/data/"  # output directory for pickle/bloom files (note the trailing slash)
# Define a function to create a hash digest using hashlib.
# This function takes a list of byte objects and a hash_name as parameters,
# and returns the hexdigest of these byte objects.
def get_digest(byte_objs: Iterable[bytes], hash_name: str = "sha3_256") -> str:
    """Return the hex digest of the concatenation of the given byte chunks.

    Bug fix: the original signature read ``byte_objs=Iterable[bytes]`` —
    an ``=`` instead of ``:`` — which made the *typing construct itself*
    the default value rather than annotating the parameter. Calling the
    function without an argument would then try to iterate
    ``Iterable[bytes]`` and fail. It is now a proper type annotation.

    Args:
        byte_objs: Byte chunks fed to the hasher in order; hashing the
            chunks sequentially is equivalent to hashing their
            concatenation.
        hash_name: Any algorithm name accepted by ``hashlib.new()``
            (defaults to SHA3-256).

    Returns:
        The hexadecimal digest string.
    """
    hasher = hashlib.new(hash_name)
    for obj in byte_objs:
        hasher.update(obj)
    return hasher.hexdigest()
# Define a function to generate a fake address using the Faker library.
# This function takes a seed value as a parameter, which is used to seed the Faker library.
def generate_address(q):
    """Generate one fake address record.

    Seeds the Faker class with ``q`` first, so the same input always
    produces the same address (and distinct inputs across the
    multiprocessing pool produce distinct records).

    Args:
        q: Integer seed for this record.

    Returns:
        A dict with city, country_code, postal_code, street and
        subdivision fields populated by the shared ``fake`` instance.
    """
    Faker.seed(q)
    record = {}
    record["city"] = fake.city()
    record["country_code"] = fake.country_code()
    record["postal_code"] = fake.postcode()
    record["street"] = fake.street_address()
    record["subdivision"] = fake.street_suffix()
    return record
# Begin main code execution
# Measure the time taken to execute the entire block with IPython magic command '%%time'
%%time
# Prepare ranges of data sizes for fake addresses generation
# and empty list for storing filenames.
data_range: list = [100, 1_000, 10_000, 100_000, 1_000_000]
filenames: list = []
# Generate data and Bloom Filters for each range size
for n_range in data_range:
k_count: int = int(n_range / 1000) # Convert the range into 'k' count
# Prepare filenames for storing hash and Bloom Filter data
address_hash_file_name: str = f"address_hash_{k_count}k.pickle"
bloom_file_name: str = f"bloom_{k_count}k.bin"
print(f"\n\nSIZE:\t\t{k_count}k RECORDS")
# Step 1: Generate Fake Addresses using multiprocessing for efficiency
print(f"\tGenerating Data...")
address_data: list = []
# Use multiprocessing.Pool() to create a pool of worker processes.
# Then generate the fake address data in parallel using these processes.
with multiprocessing.Pool(num_cores) as p:
address_data = list(tqdm(p.imap(generate_address, range(n_range))))
print(f"\t{len(address_data)} Fake Addresses Generated")
# Create a Bloom Filter for the generated data
bloom = BloomFilter(
max_elements=n_range + 1,
error_rate=0.01,
filename=(f'{file_path_suffix}/{bloom_file_name}', -1)
)
# STEP 2: Generate address hashes and store in a set
print(f"\n\tCreating Bloom Filter and HashSet for the Fake Dataset...")
address_hash = set()
# Hash each address and add it to the hash set and Bloom Filter
for address in tqdm(address_data):
digest = get_digest(map(lambda x: x.encode(), address.values()))
address_hash.add(digest)
bloom.add(digest)
print(f"\tCreated Bloom Filter and HashSet")
# STEP 4: Save the address_hash to a pickle file
with open(f'{file_path_suffix}/{address_hash_file_name}', 'wb') as handle:
pickle.dump(address_hash, handle, protocol=pickle.HIGHEST_PROTOCOL)
# Add generated file names to the list
filenames.append((address_hash_file_name, bloom_file_name))
"""
OUTPUT
SIZE: 0k RECORDS
Generating Data...
100 Fake Addresses Generated
Creating Bloom Filter and HashSet for the Fake Dataset...
Created Bloom Filter and HashSet
SIZE: 1k RECORDS
Generating Data...
1000 Fake Addresses Generated
Creating Bloom Filter and HashSet for the Fake Dataset...
Created Bloom Filter and HashSet
SIZE: 10k RECORDS
Generating Data...
10000 Fake Addresses Generated
Creating Bloom Filter and HashSet for the Fake Dataset...
Created Bloom Filter and HashSet
SIZE: 100k RECORDS
Generating Data...
100000 Fake Addresses Generated
Creating Bloom Filter and HashSet for the Fake Dataset...
Created Bloom Filter and HashSet
SIZE: 1000k RECORDS
Generating Data...
1000000 Fake Addresses Generated
Creating Bloom Filter and HashSet for the Fake Dataset...
Created Bloom Filter and HashSet
CPU times: user 2min 25s, sys: 17.8 s, total: 2min 43s
Wall time: 3min 31s
"""
# Display the file sizes in MB for each generated data and Bloom Filter file.
# Fix: the original variables were named *_gb although the computation
# (bytes / 1024**2) and the printed unit are megabytes; they are renamed
# *_mb. Paths also drop the duplicate "/" (file_path_suffix already ends
# with one).
for f_addr, f_bloom in filenames:
    addr_path: str = f'{file_path_suffix}{f_addr}'
    bloom_path: str = f'{file_path_suffix}{f_bloom}'
    # st_size is in bytes; divide by 1024**2 to convert to MB.
    addr_file_size_mb = os.stat(addr_path).st_size / (1024**2)
    bloom_file_size_mb = os.stat(bloom_path).st_size / (1024**2)
    print(f'{f_addr}:\t\t{addr_file_size_mb:.6f} MB')
    print(f'{f_bloom}:\t\t\t{bloom_file_size_mb:.6f} MB')
    print()
"""
address_hash_0k.pickle: 0.006405 MB
bloom_0k.bin: 0.000118 MB
address_hash_1k.pickle: 0.063922 MB
bloom_1k.bin: 0.001146 MB
address_hash_10k.pickle: 0.639082 MB
bloom_10k.bin: 0.011430 MB
address_hash_100k.pickle: 6.390699 MB
bloom_100k.bin: 0.114266 MB
address_hash_1000k.pickle: 63.906865 MB
bloom_1000k.bin: 1.142632 MB
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment