Created August 19, 2020 20:07
-
-
Save bersena911/e211f47f2bf909c6c10bb763e20eef16 to your computer and use it in GitHub Desktop.
import script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import csv | |
| import json | |
| import pandas as pd | |
| from crawler.spider_runner import SpiderRunner | |
| from crawler.spiders.main_spider import Spider | |
| from services.encryption import Encryption | |
# Translates the spreadsheet's ``category`` value into the
# ``license_category`` string sent with each search request.
types = dict(
    Beauty='Professional Beauty',
    homeservices='Home Services',
)
# Numeric rank of each match-strength label. A larger rank wins when the
# best match for a row is selected.
match_map = dict(zip(('EXACT', 'STRONG', 'WEAK', 'NO MATCH'), (3, 2, 1, 0)))
class ImportLicenses:
    """Run licence searches for the businesses listed in a spreadsheet.

    Each processed row is turned into a search request for the
    ``'washington'`` spider. The outcome is written to two CSV files in the
    current working directory: ``result.csv`` (a summary per input row) and
    ``detailed_result.csv`` (one line per search result).
    """

    def __init__(self, path: str) -> None:
        """Store the location of the input workbook.

        Args:
            path: Path of the spreadsheet. Only names ending in ``.xlsx``
                are processed by :meth:`read_file`.
        """
        self.path = path

    @staticmethod
    def _clean_numbers(raw, placeholder):
        """Split a comma-separated licence field into non-empty numbers.

        Every occurrence of ``placeholder`` is removed from each fragment
        and the fragment is stripped *before* the emptiness check, so
        fragments such as ``' na'`` do not survive as empty licences.
        """
        # NOTE(review): the placeholder is removed wherever it occurs, also
        # inside a real licence number; kept as in the original behaviour.
        fragments = (part.replace(placeholder, '').strip() for part in raw.split(','))
        return [number for number in fragments if number]

    @staticmethod
    def _decode_details(result):
        """Decrypt a result's viewstate and return the processed detail dict."""
        decrypted = Encryption(result['viewstate']).decrypt()
        details = Spider().pipeline(json.loads(decrypted))
        details['match_details'] = result['match_details']
        return details

    def _collect_results(self, item, search_request):
        """Run every search needed for one row and return the combined results.

        Primary licence numbers are searched first, then secondary numbers
        that were not already searched as primary (with ``secondary=True``
        added to the request). When the row has no usable licence number, a
        single search without a licence is performed instead.
        """
        primary = self._clean_numbers(item.get('lic_number_primary', ''), 'na')
        already_searched = set(primary)
        secondary = [
            number
            for number in self._clean_numbers(item.get('lic_number_secondary', ''), '-')
            if number not in already_searched
        ]

        results = []
        if not primary and not secondary:
            # Nothing usable in either licence column: search by business data only.
            print(search_request)
            results += SpiderRunner('washington', **search_request).get_results()
            return results

        for number in primary:
            search_request['license'] = number
            print(search_request)
            results += SpiderRunner('washington', **search_request).get_results()

        for number in secondary:
            search_request['license'] = number
            search_request['secondary'] = True
            print(search_request)
            secondary_results = SpiderRunner('washington', **search_request).get_results()
            print(secondary_results)
            results += secondary_results
        return results

    def read_file(self) -> None:
        """Process the workbook and write ``result.csv`` / ``detailed_result.csv``.

        Returns immediately, without creating any file, when ``self.path``
        does not end in ``.xlsx``. Only rows whose ``state`` is ``'WA'`` are
        processed.

        ``result.csv`` gets the input columns plus the number of results and
        the strongest match label (``'NO MATCH'`` when there is none).
        ``detailed_result.csv`` gets one line per result (input columns,
        strength, decoded detail values), or the bare input row when the
        search returned nothing. Its header is written on the first result.

        Raises:
            KeyError: If a row's ``category`` is not a key of ``types``.
        """
        if not self.path.endswith('.xlsx'):
            return

        df = pd.read_excel(self.path)
        df.fillna('', inplace=True)
        headers = list(df.columns.values)

        # ``with`` guarantees both files are flushed and closed, even on
        # error; ``newline=''`` is what the csv module requires for writers.
        with open('detailed_result.csv', 'w', newline='') as detailed_file, \
                open('result.csv', 'w', newline='') as summary_file:
            detailed_writer = csv.writer(detailed_file)
            summary_writer = csv.writer(summary_file)
            summary_writer.writerow(headers + ['# of Results', 'Best Match'])
            detailed_header_written = False

            for _, item in df.iterrows():
                if item['state'] != 'WA':
                    continue

                search_request = {
                    'business_name': item['business_name'],
                    'license_category': types[item['category']],
                    'address': item['address'],
                    'city': item['city'],
                    # Drops a fractional part such as the ".0" of a float-typed zip.
                    'zip_code': str(item['zip']).split('.')[0],
                    # Drops the first character of the phone value
                    # (presumably a leading country digit or "+" - TODO confirm).
                    'phone_number': str(item['phone'])[1:],
                }
                results = self._collect_results(item, search_request)

                row_values = list(item)
                best_match = 'NO MATCH'
                if not results:
                    detailed_writer.writerow(row_values)
                for result in results:
                    strength = result['match_details']['strength']
                    details = self._decode_details(result)
                    if not detailed_header_written:
                        # Header columns come from the first decoded result.
                        detailed_writer.writerow(headers + ['strength'] + list(details.keys()))
                        detailed_header_written = True
                    detailed_writer.writerow(row_values + [strength] + list(details.values()))
                    if match_map[strength] > match_map[best_match]:
                        best_match = strength

                summary_writer.writerow(row_values + [len(results), best_match])
if __name__ == '__main__':
    # Script entry point: process the workbook named below, resolved
    # relative to the current working directory.
    importer = ImportLicenses('businesses-ca.xlsx')
    importer.read_file()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment