Last active
June 2, 2023 11:49
-
-
Save adhishthite/c0774797429c2c590acdd75e010fbff8 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| (c) 2023, Adhish Thite. | |
| """ | |
| # The following code installs the required Python packages for the script. | |
| # Faker is a Python package for creating fake data. | |
| # tqdm is a Python library used for printing progress bars. | |
| # bloom-filter2 is a Python implementation of bloom filters, a probabilistic data structure. | |
| !pip install Faker | |
| !pip install tqdm | |
| !pip install bloom-filter2 | |
| # Importing necessary Python modules and libraries for the script. | |
| # os and json are built-in Python modules. | |
| # pickle is a Python built-in module for object serialization. | |
| # multiprocessing is a built-in Python library for parallel execution. | |
| # hashlib is a Python library for hash functions. | |
| import os | |
| import json | |
| import pickle | |
| import multiprocessing | |
| import hashlib | |
| from typing import Iterable # Used for type hints | |
| from tqdm import tqdm # For progress bars | |
| from faker import Faker # For generating fake data | |
| from bloom_filter2 import BloomFilter # For creating bloom filters | |
# Module-level Faker instance shared by every generate_address() call.
fake: Faker = Faker()
# Worker-process count for multiprocessing.Pool below.
num_cores = 4  # adjust this based on your system
# Directory where the pickle and Bloom-filter files are written.
# NOTE(review): this is an absolute path rooted at "/utils", and the f-strings
# below prepend it with an extra "/" (e.g. "/utils/data//bloom_0k.bin") —
# harmless on POSIX, but confirm it matches the notebook's filesystem layout.
file_path_suffix = "/utils/data/"
def get_digest(byte_objs: Iterable[bytes], hash_name: str = "sha3_256") -> str:
    """Return the hex digest of all byte objects fed through a single hasher.

    Args:
        byte_objs: Iterable of byte strings; they are hashed in iteration
            order, so ``[b"a", b"b"]`` and ``[b"ab"]`` produce the same digest.
        hash_name: Any algorithm name accepted by ``hashlib.new()``
            (default: ``"sha3_256"``).

    Returns:
        The hexadecimal digest string of the concatenated input.
    """
    # BUG FIX: the original signature read `byte_objs=Iterable[bytes]`,
    # which made the typing object the *default value* rather than a type
    # annotation; calling get_digest() with no arguments would then try to
    # hash the typing construct itself. `:` makes it a proper annotation.
    hasher = hashlib.new(hash_name)
    for obj in byte_objs:
        hasher.update(obj)
    return hasher.hexdigest()
def generate_address(q):
    """Return one deterministic fake address dict for seed value ``q``.

    Seeding the Faker class with the record index makes the generated data
    reproducible across runs and across worker processes.
    """
    Faker.seed(q)
    # Keyword form of the dict literal; insertion order is identical, which
    # matters because get_digest() hashes address.values() in order.
    return dict(
        city=fake.city(),
        country_code=fake.country_code(),
        postal_code=fake.postcode(),
        street=fake.street_address(),
        subdivision=fake.street_suffix(),
    )
# ---------------------------------------------------------------------------
# Main notebook cell: generate fake addresses at increasing scales, hash each
# record, and persist both an exact hash set (pickle) and a Bloom filter for
# every scale.  '%%time' is an IPython cell magic that reports the cell's
# CPU and wall-clock time — this code only runs inside a notebook.
# ---------------------------------------------------------------------------
%%time
# Dataset sizes to generate (100 .. 1,000,000 records) and the list that
# collects one (pickle_name, bloom_name) pair per size.
data_range: list = [100, 1_000, 10_000, 100_000, 1_000_000]
filenames: list = []

for n_range in data_range:
    # Size expressed in thousands for human-readable file names
    # (note: 100 records yields "0k" because of the integer conversion).
    k_count: int = int(n_range / 1000)
    # File names for this scale's hash-set pickle and Bloom-filter binary.
    address_hash_file_name: str = f"address_hash_{k_count}k.pickle"
    bloom_file_name: str = f"bloom_{k_count}k.bin"
    print(f"\n\nSIZE:\t\t{k_count}k RECORDS")

    # Step 1: generate fake addresses in parallel.  imap preserves input
    # order, so record i is always produced with seed i (deterministic data).
    print(f"\tGenerating Data...")
    address_data: list = []
    with multiprocessing.Pool(num_cores) as p:
        address_data = list(tqdm(p.imap(generate_address, range(n_range))))
    print(f"\t{len(address_data)} Fake Addresses Generated")

    # File-backed Bloom filter sized for this dataset with a 1% target
    # false-positive rate.  NOTE(review): per bloom-filter2's convention the
    # (path, -1) tuple maps the filter to the file on disk — confirm against
    # the installed package version.
    bloom = BloomFilter(
        max_elements=n_range + 1,
        error_rate=0.01,
        filename=(f'{file_path_suffix}/{bloom_file_name}', -1)
    )

    # Step 2: hash every address (its values, encoded and concatenated in
    # insertion order) and record the digest in both structures.
    print(f"\n\tCreating Bloom Filter and HashSet for the Fake Dataset...")
    address_hash = set()
    for address in tqdm(address_data):
        digest = get_digest(map(lambda x: x.encode(), address.values()))
        address_hash.add(digest)
        bloom.add(digest)
    print(f"\tCreated Bloom Filter and HashSet")

    # Step 3: persist the exact hash set so lookups can later be compared
    # against the Bloom filter.  (The original comment said "STEP 4" — there
    # is no step between 2 and this one.)
    with open(f'{file_path_suffix}/{address_hash_file_name}', 'wb') as handle:
        pickle.dump(address_hash, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Remember the produced file pair for the size report below.
    filenames.append((address_hash_file_name, bloom_file_name))

"""
OUTPUT
SIZE: 0k RECORDS
Generating Data...
100 Fake Addresses Generated
Creating Bloom Filter and HashSet for the Fake Dataset...
Created Bloom Filter and HashSet
SIZE: 1k RECORDS
Generating Data...
1000 Fake Addresses Generated
Creating Bloom Filter and HashSet for the Fake Dataset...
Created Bloom Filter and HashSet
SIZE: 10k RECORDS
Generating Data...
10000 Fake Addresses Generated
Creating Bloom Filter and HashSet for the Fake Dataset...
Created Bloom Filter and HashSet
SIZE: 100k RECORDS
Generating Data...
100000 Fake Addresses Generated
Creating Bloom Filter and HashSet for the Fake Dataset...
Created Bloom Filter and HashSet
SIZE: 1000k RECORDS
Generating Data...
1000000 Fake Addresses Generated
Creating Bloom Filter and HashSet for the Fake Dataset...
Created Bloom Filter and HashSet
CPU times: user 2min 25s, sys: 17.8 s, total: 2min 43s
Wall time: 3min 31s
"""
# ---------------------------------------------------------------------------
# Report the on-disk size, in megabytes, of every generated pickle /
# Bloom-filter file pair recorded in `filenames`.
# ---------------------------------------------------------------------------
for f_addr, f_bloom in filenames:
    addr_path: str = f'{file_path_suffix}/{f_addr}'
    bloom_path: str = f'{file_path_suffix}/{f_bloom}'
    # FIX: the original locals were suffixed `_gb` although the division by
    # 1024**2 yields megabytes (matching the printed "MB" unit); renamed to
    # `_mb` so the names agree with the values.
    addr_file_size_mb = os.stat(addr_path).st_size / (1024**2)
    bloom_file_size_mb = os.stat(bloom_path).st_size / (1024**2)
    print(f'{f_addr}:\t\t{addr_file_size_mb:.6f} MB')
    print(f'{f_bloom}:\t\t\t{bloom_file_size_mb:.6f} MB')
    print()

"""
address_hash_0k.pickle: 0.006405 MB
bloom_0k.bin: 0.000118 MB
address_hash_1k.pickle: 0.063922 MB
bloom_1k.bin: 0.001146 MB
address_hash_10k.pickle: 0.639082 MB
bloom_10k.bin: 0.011430 MB
address_hash_100k.pickle: 6.390699 MB
bloom_100k.bin: 0.114266 MB
address_hash_1000k.pickle: 63.906865 MB
bloom_1000k.bin: 1.142632 MB
"""
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment