Skip to content

Instantly share code, notes, and snippets.

@adhishthite
Last active June 2, 2023 11:49
Show Gist options
  • Select an option

  • Save adhishthite/c0774797429c2c590acdd75e010fbff8 to your computer and use it in GitHub Desktop.

Select an option

Save adhishthite/c0774797429c2c590acdd75e010fbff8 to your computer and use it in GitHub Desktop.
"""
(c) 2023, Adhish Thite.
"""
# The following code installs the required Python packages for the script.
# Faker is a Python package for creating fake data.
# tqdm is a Python library used for printing progress bars.
# bloom-filter2 is a Python implementation of bloom filters, a probabilistic data structure.
!pip install Faker
!pip install tqdm
!pip install bloom-filter2
# Importing necessary Python modules and libraries for the script.
# os and json are built-in Python modules.
# pickle is a Python built-in module for object serialization.
# multiprocessing is a built-in Python library for parallel execution.
# hashlib is a Python library for hash functions.
import os
import json
import pickle
import multiprocessing
import hashlib
from typing import Iterable # Used for type hints
from tqdm import tqdm # For progress bars
from faker import Faker # For generating fake data
from bloom_filter2 import BloomFilter # For creating bloom filters
# Initialize the Faker object and set the number of cores for multiprocessing.
fake: Faker = Faker()  # single shared Faker instance; re-seeded per record in generate_address
num_cores: int = 4 # adjust this based on your system
file_path_suffix: str = "/utils/data/"  # output directory for pickle/bloom files (note the trailing slash)
# Define a function to create a hash digest using hashlib.
# This function takes a list of byte objects and a hash_name as parameters,
# and returns the hexdigest of these byte objects.
def get_digest(byte_objs: Iterable[bytes], hash_name: str = "sha3_256") -> str:
    """Return the hex digest of the concatenation of the given byte chunks.

    Bug fix: the original signature read ``byte_objs=Iterable[bytes]`` —
    an ``=`` instead of ``:`` — which made the *typing construct itself*
    the default value rather than annotating the parameter. Calling the
    function without an argument would then try to iterate
    ``Iterable[bytes]`` and fail. It is now a proper type annotation.

    Args:
        byte_objs: Byte chunks fed to the hasher in order; hashing the
            chunks sequentially is equivalent to hashing their
            concatenation.
        hash_name: Any algorithm name accepted by ``hashlib.new()``
            (defaults to SHA3-256).

    Returns:
        The hexadecimal digest string.
    """
    hasher = hashlib.new(hash_name)
    for obj in byte_objs:
        hasher.update(obj)
    return hasher.hexdigest()
# Define a function to generate a fake address using the Faker library.
# This function takes a seed value as a parameter, which is used to seed the Faker library.
def generate_address(q):
    """Generate one fake address record.

    Seeds the Faker class with ``q`` first, so the same input always
    produces the same address (and distinct inputs across the
    multiprocessing pool produce distinct records).

    Args:
        q: Integer seed for this record.

    Returns:
        A dict with city, country_code, postal_code, street and
        subdivision fields populated by the shared ``fake`` instance.
    """
    Faker.seed(q)
    record = {}
    record["city"] = fake.city()
    record["country_code"] = fake.country_code()
    record["postal_code"] = fake.postcode()
    record["street"] = fake.street_address()
    record["subdivision"] = fake.street_suffix()
    return record
# Begin main code execution
# Measure the time taken to execute the entire block with IPython magic command '%%time'
%%time
# Prepare ranges of data sizes for fake addresses generation
# and empty list for storing filenames.
data_range: list = [100, 1_000, 10_000, 100_000, 1_000_000]
filenames: list = []
# Generate data and Bloom Filters for each range size
for n_range in data_range:
k_count: int = int(n_range / 1000) # Convert the range into 'k' count
# Prepare filenames for storing hash and Bloom Filter data
address_hash_file_name: str = f"address_hash_{k_count}k.pickle"
bloom_file_name: str = f"bloom_{k_count}k.bin"
print(f"\n\nSIZE:\t\t{k_count}k RECORDS")
# Step 1: Generate Fake Addresses using multiprocessing for efficiency
print(f"\tGenerating Data...")
address_data: list = []
# Use multiprocessing.Pool() to create a pool of worker processes.
# Then generate the fake address data in parallel using these processes.
with multiprocessing.Pool(num_cores) as p:
address_data = list(tqdm(p.imap(generate_address, range(n_range))))
print(f"\t{len(address_data)} Fake Addresses Generated")
# Create a Bloom Filter for the generated data
bloom = BloomFilter(
max_elements=n_range + 1,
error_rate=0.01,
filename=(f'{file_path_suffix}/{bloom_file_name}', -1)
)
# STEP 2: Generate address hashes and store in a set
print(f"\n\tCreating Bloom Filter and HashSet for the Fake Dataset...")
address_hash = set()
# Hash each address and add it to the hash set and Bloom Filter
for address in tqdm(address_data):
digest = get_digest(map(lambda x: x.encode(), address.values()))
address_hash.add(digest)
bloom.add(digest)
print(f"\tCreated Bloom Filter and HashSet")
# STEP 4: Save the address_hash to a pickle file
with open(f'{file_path_suffix}/{address_hash_file_name}', 'wb') as handle:
pickle.dump(address_hash, handle, protocol=pickle.HIGHEST_PROTOCOL)
# Add generated file names to the list
filenames.append((address_hash_file_name, bloom_file_name))
"""
OUTPUT
SIZE: 0k RECORDS
Generating Data...
100 Fake Addresses Generated
Creating Bloom Filter and HashSet for the Fake Dataset...
Created Bloom Filter and HashSet
SIZE: 1k RECORDS
Generating Data...
1000 Fake Addresses Generated
Creating Bloom Filter and HashSet for the Fake Dataset...
Created Bloom Filter and HashSet
SIZE: 10k RECORDS
Generating Data...
10000 Fake Addresses Generated
Creating Bloom Filter and HashSet for the Fake Dataset...
Created Bloom Filter and HashSet
SIZE: 100k RECORDS
Generating Data...
100000 Fake Addresses Generated
Creating Bloom Filter and HashSet for the Fake Dataset...
Created Bloom Filter and HashSet
SIZE: 1000k RECORDS
Generating Data...
1000000 Fake Addresses Generated
Creating Bloom Filter and HashSet for the Fake Dataset...
Created Bloom Filter and HashSet
CPU times: user 2min 25s, sys: 17.8 s, total: 2min 43s
Wall time: 3min 31s
"""
# Display the file sizes in MB for each generated data and Bloom Filter file.
# Fix: the original variables were named *_gb although the computation
# (bytes / 1024**2) and the printed unit are megabytes; they are renamed
# *_mb. Paths also drop the duplicate "/" (file_path_suffix already ends
# with one).
for f_addr, f_bloom in filenames:
    addr_path: str = f'{file_path_suffix}{f_addr}'
    bloom_path: str = f'{file_path_suffix}{f_bloom}'
    # st_size is in bytes; divide by 1024**2 to convert to MB.
    addr_file_size_mb = os.stat(addr_path).st_size / (1024**2)
    bloom_file_size_mb = os.stat(bloom_path).st_size / (1024**2)
    print(f'{f_addr}:\t\t{addr_file_size_mb:.6f} MB')
    print(f'{f_bloom}:\t\t\t{bloom_file_size_mb:.6f} MB')
    print()
"""
address_hash_0k.pickle: 0.006405 MB
bloom_0k.bin: 0.000118 MB
address_hash_1k.pickle: 0.063922 MB
bloom_1k.bin: 0.001146 MB
address_hash_10k.pickle: 0.639082 MB
bloom_10k.bin: 0.011430 MB
address_hash_100k.pickle: 6.390699 MB
bloom_100k.bin: 0.114266 MB
address_hash_1000k.pickle: 63.906865 MB
bloom_1000k.bin: 1.142632 MB
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment