Skip to content

Instantly share code, notes, and snippets.

@bersena911
Created August 19, 2020 20:07
Show Gist options
  • Select an option

  • Save bersena911/e211f47f2bf909c6c10bb763e20eef16 to your computer and use it in GitHub Desktop.

Select an option

Save bersena911/e211f47f2bf909c6c10bb763e20eef16 to your computer and use it in GitHub Desktop.
import script
import csv
import json
import pandas as pd
from crawler.spider_runner import SpiderRunner
from crawler.spiders.main_spider import Spider
from services.encryption import Encryption
# Maps the spreadsheet's `category` value to the license category name
# passed along in the search request.
types = dict(
    Beauty='Professional Beauty',
    homeservices='Home Services',
)
# Numeric rank for each match-strength label; a larger number is a better
# match, which lets two labels be compared with `>`.
match_map = dict(zip(
    ('EXACT', 'STRONG', 'WEAK', 'NO MATCH'),
    (3, 2, 1, 0),
))
class ImportLicenses:
    """Search license records for the businesses listed in a spreadsheet.

    `read_file()` loads the ``.xlsx`` file at `path`, runs a 'washington'
    `SpiderRunner` search for every row whose ``state`` is ``'WA'`` and
    writes two CSV files into the current working directory:

    * ``detailed_result.csv`` - one line per search result (or one bare
      line for a row without results), with the decoded result details.
    * ``result.csv`` - one line per processed row with the number of
      results and the best match strength.
    """

    def __init__(self, path):
        """Store the spreadsheet path; nothing is read until `read_file()`."""
        self.path = path

    @staticmethod
    def _primary_licenses(raw):
        """Return the usable numbers from a comma-separated primary field.

        Tokens that are empty after removing the ``'na'`` placeholder and
        surrounding whitespace are dropped, so an input such as ``"123, "``
        no longer produces a search for an empty license number.
        """
        # NOTE(review): 'na' is removed wherever it occurs inside a token,
        # not only when the whole token is the placeholder - kept as in the
        # original; confirm no real license number contains "na".
        cleaned = (part.replace('na', '').strip() for part in raw.split(','))
        return [number for number in cleaned if number]

    @staticmethod
    def _secondary_licenses(raw, already_searched):
        """Return the usable numbers from a comma-separated secondary field.

        Hyphens are removed from every token; empty tokens and numbers that
        are already in `already_searched` are skipped.
        """
        cleaned = (part.replace('-', '').strip() for part in raw.split(','))
        return [
            number for number in cleaned
            if number and number not in already_searched
        ]

    @staticmethod
    def _build_search_request(item):
        """Build the keyword arguments for `SpiderRunner` from one row.

        Raises:
            KeyError: if a required column is missing or the row's
                ``category`` is not a key of the module-level `types`.
        """
        return {
            'business_name': item['business_name'],
            'license_category': types[item['category']],
            'address': item['address'],
            'city': item['city'],
            # Keep only the part before a '.', e.g. "98101.0" -> "98101".
            'zip_code': str(item['zip']).split('.')[0],
            # Drops the first character of the phone value - presumably a
            # leading "+" or country digit; TODO confirm against the data.
            'phone_number': str(item['phone'])[1:],
        }

    @staticmethod
    def _collect_results(item, search_request):
        """Run the searches for one row and return all results as a list.

        One search is issued per usable primary license number, then one per
        usable secondary number (flagged with ``secondary=True``). When the
        row has no usable license number at all, a single search without a
        license is issued instead. Previously a row whose license fields
        held only placeholders/separators (e.g. ``"na,na"``) was not
        searched at all.
        """
        primary = ImportLicenses._primary_licenses(
            item.get('lic_number_primary', ''))
        secondary = ImportLicenses._secondary_licenses(
            item.get('lic_number_secondary', ''), set(primary))

        results = []
        if not primary and not secondary:
            print(search_request)
            results += SpiderRunner('washington', **search_request).get_results()
            return results

        for lic_number in primary:
            search_request['license'] = lic_number
            print(search_request)
            results += SpiderRunner('washington', **search_request).get_results()

        for lic_number in secondary:
            search_request['license'] = lic_number
            search_request['secondary'] = True
            print(search_request)
            sec_result = SpiderRunner('washington', **search_request).get_results()
            print(sec_result)
            results += sec_result
        return results

    @staticmethod
    def _decode_details(result):
        """Decrypt a result's ``viewstate`` and return its detail mapping.

        The decrypted JSON is passed through `Spider().pipeline` and the
        result's ``match_details`` is attached under the same key.
        """
        decoded_result = Encryption(result['viewstate']).decrypt()
        details = Spider().pipeline(json.loads(decoded_result))
        details['match_details'] = result['match_details']
        return details

    def read_file(self):
        """Process the spreadsheet and write both CSV reports.

        Returns None. Files that do not end in ``.xlsx`` are ignored without
        creating any output.
        """
        if not self.path.endswith('.xlsx'):
            return

        df = pd.read_excel(self.path)
        df.fillna('', inplace=True)
        headers = list(df.columns.values)

        # `with` guarantees both files are flushed and closed even when a
        # search fails midway; newline='' is what the csv module requires to
        # avoid blank lines between rows on Windows.
        with open('detailed_result.csv', 'w', newline='') as detailed_file, \
                open('result.csv', 'w', newline='') as summary_file:
            detailed_writer = csv.writer(detailed_file)
            summary_writer = csv.writer(summary_file)
            summary_writer.writerow(headers + ['# of Results', 'Best Match'])

            # The detailed header depends on the keys of the first decoded
            # result, so it is written lazily, exactly once.
            header_written = False

            for _, item in df.iterrows():
                if item['state'] != 'WA':
                    continue

                search_request = self._build_search_request(item)
                results = self._collect_results(item, search_request)

                search = list(item)
                best_result = 'NO MATCH'

                if not results:
                    detailed_writer.writerow(search)

                for result in results:
                    strength = result['match_details']['strength']
                    details = self._decode_details(result)
                    if not header_written:
                        header_written = True
                        detailed_writer.writerow(
                            headers + ['strength'] + list(details.keys()))
                    detailed_writer.writerow(
                        search + [strength] + list(details.values()))
                    if match_map[strength] > match_map[best_result]:
                        best_result = strength

                summary_writer.writerow(search + [len(results), best_result])
if __name__ == '__main__':
    # Script entry point: process the bundled spreadsheet.
    # NOTE(review): the input is named "businesses-ca" while read_file()
    # only processes rows whose state is 'WA' - confirm this is intended.
    importer = ImportLicenses('businesses-ca.xlsx')
    importer.read_file()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment