@shellward
Last active January 22, 2023 23:12
Everything you'd need to create a gradio app for asking questions about Kentucky Laws
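# A minimal sketch of the environment this script assumes (package names are my best guess for
# the versions in use at the time; gpt_index has since been renamed llama_index):
#   pip install gradio gpt-index requests beautifulsoup4 pandas numpy PyPDF2
# An OpenAI key is also expected in the OPENAI_API_KEY environment variable (see the
# commented-out line below).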
# All the code you need for a gradio app that can answer questions about Kentucky law.
central_url = 'https://apps.legislature.ky.gov/law/statutes/'
import gradio as gr
import os
import json
from gpt_index import Document, SimpleDirectoryReader, GPTSimpleVectorIndex, MockLLMPredictor
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import time
from PyPDF2 import PdfReader
# os.environ['OPENAI_API_KEY'] = OPEN_AI_KEY
def get_chapter_links(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    chapter_links = []
    for link in soup.find_all('a', class_=['chapter', 'subchapter']):
        if 'href' in link.attrs:
            chapter_links.append((link.text, link['href']))
    return chapter_links
def get_statute_links(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    statute_links = []
    for link in soup.find_all('a', class_='statute'):
        if 'href' in link.attrs:
            statute_links.append((link.text, link['href']))
    return statute_links
def download_statute(url, name):
    r = requests.get(url)
    name = re.sub(r'[\\/*?:"<>|]', '', name)
    name = name.strip()
    with open(f'{name}.pdf', 'wb') as f:
        f.write(r.content)
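# A quick sanity check before kicking off the full crawl (illustrative only, not part of the
# original pipeline): confirm the link selectors still match the statutes page.
# chapters = get_chapter_links(central_url)
# print(chapters[:3])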
def main(min_sleep=6, max_sleep=12):
    if not os.path.exists('data'):
        os.makedirs('data')
    os.chdir('data')
    chapter_links = get_chapter_links(central_url)
    for chapter in chapter_links:
        chapter_name = chapter[0]
        chapter_url = chapter[1]
        chapter_name = re.sub(r'[\\/*?:"<>|]', '', chapter_name)
        chapter_name = chapter_name.replace('.', '')
        if not os.path.exists(chapter_name):
            os.makedirs(chapter_name)
        os.chdir(chapter_name)
        statute_links = get_statute_links(f'https://apps.legislature.ky.gov/law/statutes/{chapter_url}')
        for statute in statute_links:
            statute_name = statute[0]
            statute_url = statute[1]
            if len(statute_name) > 128:
                statute_name = statute_name[:128]
            # Sanitize the name the same way download_statute does, so the existence check
            # matches the file that actually gets written.
            statute_name = re.sub(r'[\\/*?:"<>|]', '', statute_name).strip()
            if not os.path.exists(f'{statute_name}.pdf'):
                download_statute(f'https://apps.legislature.ky.gov/law/statutes/{statute_url}', statute_name)
                time.sleep(np.random.randint(min_sleep, max_sleep))
            else:
                # Re-download anything that exists on disk but is unreadable.
                try:
                    pdf = PdfReader(f'{statute_name}.pdf')
                except Exception:
                    os.remove(f'{statute_name}.pdf')
                    download_statute(f'https://apps.legislature.ky.gov/law/statutes/{statute_url}', statute_name)
                    time.sleep(np.random.randint(min_sleep, max_sleep))
        os.chdir('..')
        print(f'Finished downloading {chapter_name}.')
    os.chdir('..')
# This function was generated by Copilot; I lost the notebook that had what I really used,
# but the idea is that you copy all the PDFs into one directory, avoiding name collisions.
def flatten_directories():
    directories = [d for d in os.listdir('data') if os.path.isdir(os.path.join('data', d))]
    if not os.path.exists('flattened'):
        os.makedirs('flattened')
    for directory in directories:
        chapter_dir = os.path.join('data', directory)
        statutes = [s for s in os.listdir(chapter_dir) if os.path.isfile(os.path.join(chapter_dir, s))]
        for statute in statutes:
            # Truncate very long filenames and pick a non-colliding name in flattened/.
            name = statute[:64] if len(statute) > 64 else statute
            destination = os.path.join('flattened', name)
            if not os.path.exists(destination):
                os.rename(os.path.join(chapter_dir, statute), destination)
            else:
                i = 1
                while os.path.exists(f'{destination}_{i}'):
                    i += 1
                os.rename(os.path.join(chapter_dir, statute), f'{destination}_{i}')
# These two functions are from Copilot as well. You just want to make sure all of your PDFs
# are valid before trying to load them.
def check_pdf(filename):
    try:
        pdf = PdfReader(filename)
    except Exception:
        return False
    return True
def check_pdfs(remove_bad_files=True):
    bad_files = []
    os.chdir('flattened')
    files = [f for f in os.listdir('.') if os.path.isfile(f)]
    for file in files:
        if not check_pdf(file):
            bad_files.append(file)
            if remove_bad_files:
                os.remove(file)
    os.chdir('..')
    return bad_files
def estimate_cost():
    documents = SimpleDirectoryReader('flattened', errors='ignore').load_data()
    llm_predictor = MockLLMPredictor()
    index = GPTSimpleVectorIndex(documents, llm_predictor=llm_predictor)
    # 0.0004 is the assumed price per 1,000 tokens.
    print(f'Estimated cost: ${llm_predictor.last_token_usage / 1000 * .0004:.2f}')
def build_index():
    documents = SimpleDirectoryReader('flattened', errors='ignore').load_data()
    index = GPTSimpleVectorIndex(documents)
    index.save_to_disk('krs_gpt_index.json')
    return index
def load_index_from_disk():
    # Load the same file that build_index() saves.
    index = GPTSimpleVectorIndex.load_from_disk('krs_gpt_index.json')
    return index
def q(query):
    response = index.query(f"""
    You are responding to a question about Kentucky State Law. You have access to most of the KRS, the Kentucky Revised Statutes, to cite statutes specific to the following question. It is most important to use the newest, most up-to-date law available, to be factual, and to step through each of your conclusions.
    Please be as thorough and professional as possible in your response. The question is:
    {query}
    """)
    res = response.response
    return res
main()                 # scrape every chapter and statute PDF into data/
flatten_directories()  # move the PDFs into a single flattened/ directory
check_pdfs()           # drop any corrupt downloads
estimate_cost()        # rough token-cost estimate before spending money
index = build_index()  # embed everything and save the index to disk
app = gr.Interface(fn=q, inputs="text", outputs="text", title="Kentucky State Law", description="A gradio app that answers questions about Kentucky State Law.", allow_flagging=False)
app.launch(debug=True, share=True, inbrowser=True, enable_queue=True)
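# On later runs you can skip the scrape and rebuild by reloading the saved index instead
# (a sketch, assuming krs_gpt_index.json from build_index() already exists):
# index = load_index_from_disk()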