@shellward
Last active January 22, 2023 23:12
Everything you'd need to create a gradio app for asking questions about Kentucky Laws
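# A minimal sketch of the environment this script assumes (package names are my best guess for
# the versions in use at the time; gpt_index has since been renamed llama_index):
#   pip install gradio gpt-index requests beautifulsoup4 pandas numpy PyPDF2
# An OpenAI key is also expected in the OPENAI_API_KEY environment variable (see the
# commented-out line below).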
# All the code you need for a gradio app that can answer questions about Kentucky law.
central_url = 'https://apps.legislature.ky.gov/law/statutes/'
import gradio as gr
import os
import json
from gpt_index import Document, SimpleDirectoryReader, GPTSimpleVectorIndex, MockLLMPredictor
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import time
from PyPDF2 import PdfReader
# os.environ['OPENAI_API_KEY'] = OPEN_AI_KEY
def get_chapter_links(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    chapter_links = []
    for link in soup.find_all('a', class_=['chapter', 'subchapter']):
        if 'href' in link.attrs:
            chapter_links.append((link.text, link['href']))
    return chapter_links
def get_statute_links(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    statute_links = []
    for link in soup.find_all('a', class_='statute'):
        if 'href' in link.attrs:
            statute_links.append((link.text, link['href']))
    return statute_links
def download_statute(url, name):
    r = requests.get(url)
    name = re.sub(r'[\\/*?:"<>|]', '', name)
    name = name.strip()
    with open(f'{name}.pdf', 'wb') as f:
        f.write(r.content)
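# A quick sanity check before kicking off the full crawl (illustrative only, not part of the
# original pipeline): confirm the link selectors still match the statutes page.
# chapters = get_chapter_links(central_url)
# print(chapters[:3])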
def main(min_sleep=6, max_sleep=12):
    if not os.path.exists('data'):
        os.makedirs('data')
    os.chdir('data')
    chapter_links = get_chapter_links(central_url)
    for chapter in chapter_links:
        chapter_name = chapter[0]
        chapter_url = chapter[1]
        chapter_name = re.sub(r'[\\/*?:"<>|]', '', chapter_name)
        chapter_name = chapter_name.replace('.', '')
        if not os.path.exists(chapter_name):
            os.makedirs(chapter_name)
        os.chdir(chapter_name)
        statute_links = get_statute_links(f'https://apps.legislature.ky.gov/law/statutes/{chapter_url}')
        for statute in statute_links:
            statute_name = statute[0]
            statute_url = statute[1]
            if len(statute_name) > 128:
                statute_name = statute_name[:128]
            # Sanitize the name the same way download_statute does, so the existence check
            # matches the file that actually gets written.
            statute_name = re.sub(r'[\\/*?:"<>|]', '', statute_name).strip()
            if not os.path.exists(f'{statute_name}.pdf'):
                download_statute(f'https://apps.legislature.ky.gov/law/statutes/{statute_url}', statute_name)
                time.sleep(np.random.randint(min_sleep, max_sleep))
            else:
                # Re-download anything that exists on disk but is unreadable.
                try:
                    pdf = PdfReader(f'{statute_name}.pdf')
                except Exception:
                    os.remove(f'{statute_name}.pdf')
                    download_statute(f'https://apps.legislature.ky.gov/law/statutes/{statute_url}', statute_name)
                    time.sleep(np.random.randint(min_sleep, max_sleep))
        os.chdir('..')
        print(f'Finished downloading {chapter_name}.')
    os.chdir('..')
# This function was generated by Copilot; I lost the notebook that had what I really used,
# but the idea is that you copy all the PDFs into one directory, avoiding name collisions.
def flatten_directories():
    directories = [d for d in os.listdir('data') if os.path.isdir(os.path.join('data', d))]
    if not os.path.exists('flattened'):
        os.makedirs('flattened')
    for directory in directories:
        chapter_dir = os.path.join('data', directory)
        statutes = [s for s in os.listdir(chapter_dir) if os.path.isfile(os.path.join(chapter_dir, s))]
        for statute in statutes:
            # Truncate very long filenames and pick a non-colliding name in flattened/.
            name = statute[:64] if len(statute) > 64 else statute
            destination = os.path.join('flattened', name)
            if not os.path.exists(destination):
                os.rename(os.path.join(chapter_dir, statute), destination)
            else:
                i = 1
                while os.path.exists(f'{destination}_{i}'):
                    i += 1
                os.rename(os.path.join(chapter_dir, statute), f'{destination}_{i}')
# These two functions are from Copilot as well. You just want to make sure all of your PDFs
# are valid before trying to load them.
def check_pdf(filename):
    try:
        pdf = PdfReader(filename)
    except Exception:
        return False
    return True
def check_pdfs(remove_bad_files=True):
    bad_files = []
    os.chdir('flattened')
    files = [f for f in os.listdir('.') if os.path.isfile(f)]
    for file in files:
        if not check_pdf(file):
            bad_files.append(file)
            if remove_bad_files:
                os.remove(file)
    os.chdir('..')
    return bad_files
def estimate_cost():
    documents = SimpleDirectoryReader('flattened', errors='ignore').load_data()
    llm_predictor = MockLLMPredictor()
    index = GPTSimpleVectorIndex(documents, llm_predictor=llm_predictor)
    # 0.0004 is the assumed price per 1,000 tokens.
    print(f'Estimated cost: ${llm_predictor.last_token_usage / 1000 * .0004:.2f}')
def build_index():
    documents = SimpleDirectoryReader('flattened', errors='ignore').load_data()
    index = GPTSimpleVectorIndex(documents)
    index.save_to_disk('krs_gpt_index.json')
    return index
def load_index_from_disk():
    # Load the same file that build_index() saves.
    index = GPTSimpleVectorIndex.load_from_disk('krs_gpt_index.json')
    return index
def q(query):
    response = index.query(f"""
    You are responding to a question about Kentucky State Law. You have access to most of the KRS, the Kentucky Revised Statutes, to cite statutes specific to the following question. It is most important to use the newest, most up-to-date law available, to be factual, and to step through each of your conclusions.
    Please be as thorough and professional as possible in your response. The question is:
    {query}
    """)
    res = response.response
    return res
main()                 # scrape every chapter and statute PDF into data/
flatten_directories()  # move the PDFs into a single flattened/ directory
check_pdfs()           # drop any corrupt downloads
estimate_cost()        # rough token-cost estimate before spending money
index = build_index()  # embed everything and save the index to disk
app = gr.Interface(fn=q, inputs="text", outputs="text", title="Kentucky State Law", description="A gradio app that answers questions about Kentucky State Law.", allow_flagging=False)
app.launch(debug=True, share=True, inbrowser=True, enable_queue=True)
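# On later runs you can skip the scrape and rebuild by reloading the saved index instead
# (a sketch, assuming krs_gpt_index.json from build_index() already exists):
# index = load_index_from_disk()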