Last active
January 22, 2023 23:12
-
-
Save shellward/22f677d7b328e9052b1e5896ea3b49ec to your computer and use it in GitHub Desktop.
Everything you'd need to create a gradio app for asking questions about Kentucky Laws
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # All the code you need for a Gradio app that can answer questions about Kentucky law. | |
| # Landing page of the statutes listing; main() scrapes the chapter links from this URL. | |
| central_url = 'https://apps.legislature.ky.gov/law/statutes/' | |
| import gradio as gr | |
| import os | |
| import json | |
| from gpt_index import Document, SimpleDirectoryReader, GPTSimpleVectorIndex, MockLLMPredictor | |
| import requests | |
| from bs4 import BeautifulSoup | |
| import pandas as pd | |
| import numpy as np | |
| import re | |
| import time | |
| from PyPDF2 import PdfReader | |
| # os.environ['OPENAI_API_KEY'] = OPEN_AI_KEY | |
def _scrape_links(url, css_class, timeout=30):
    """Return ``(link text, href)`` pairs for the matching anchors on a page.

    Args:
        url: Page to fetch.
        css_class: CSS class (or list of classes) an ``<a>`` tag must carry.
        timeout: Seconds to wait for the server before giving up; without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        A list of ``(text, href)`` tuples in document order. Anchors without
        an ``href`` attribute are skipped.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
            Failing loudly avoids treating an error page as "no links".
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    return [
        (anchor.text, anchor['href'])
        for anchor in soup.find_all('a', class_=css_class)
        if 'href' in anchor.attrs
    ]


def get_chapter_links(url, timeout=30):
    """Return ``(name, href)`` for every chapter/subchapter link on *url*.

    Args:
        url: Page listing the chapters.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A list of ``(link text, href)`` tuples for anchors whose class is
        ``chapter`` or ``subchapter``.
    """
    return _scrape_links(url, ['chapter', 'subchapter'], timeout)


def get_statute_links(url, timeout=30):
    """Return ``(name, href)`` for every statute link on *url*.

    Args:
        url: Page listing the statutes of one chapter.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A list of ``(link text, href)`` tuples for anchors whose class is
        ``statute``.
    """
    return _scrape_links(url, 'statute', timeout)
def download_statute(url, name, timeout=30):
    """Download one statute and save it as ``<name>.pdf`` in the current directory.

    Characters that are not allowed in file names (``\\ / * ? : " < > |``) are
    removed from *name* and surrounding whitespace is stripped before it is
    used as the file name.

    Args:
        url: Address of the statute document.
        name: Human-readable statute name used to build the file name.
        timeout: Seconds to wait for the server before giving up; without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        True when the file was written, False when the server answered with
        an error status. In the latter case nothing is written, so an HTML
        error page is never stored under a ``.pdf`` name and a later run can
        retry the download.
    """
    name = re.sub(r'[\\/*?:"<>|]', '', name).strip()
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        print(f'Skipping {name}: HTTP {response.status_code} for {url}')
        return False
    with open(f'{name}.pdf', 'wb') as f:
        f.write(response.content)
    return True
def main(min_sleep=6, max_sleep=12):
    """Download every statute PDF into ``data/<chapter name>/``.

    The run is resumable: a statute whose PDF already exists and can be opened
    by ``PdfReader`` is skipped, while a missing or unreadable PDF is
    downloaded (again). The function pauses after every download so the
    server is not hammered.

    Args:
        min_sleep: Lower bound (inclusive), in whole seconds, of the pause.
        max_sleep: Upper bound (exclusive), in whole seconds, of the pause.
            When it is not greater than ``min_sleep`` the pause is exactly
            ``min_sleep`` seconds.

    The working directory is changed while downloading (``download_statute``
    writes relative to it) and is always restored before returning, even when
    an error interrupts the run.
    """
    from urllib.parse import urljoin

    illegal_chars = re.compile(r'[\\/*?:"<>|]')

    def pause():
        # np.random.randint raises ValueError for an empty range.
        if max_sleep > min_sleep:
            time.sleep(np.random.randint(min_sleep, max_sleep))
        else:
            time.sleep(min_sleep)

    start_dir = os.getcwd()
    os.makedirs('data', exist_ok=True)
    try:
        os.chdir('data')
        data_dir = os.getcwd()
        for chapter_name, chapter_url in get_chapter_links(central_url):
            chapter_name = illegal_chars.sub('', chapter_name).replace('.', '')
            os.makedirs(chapter_name, exist_ok=True)
            os.chdir(chapter_name)
            statute_links = get_statute_links(urljoin(central_url, chapter_url))
            for statute_name, statute_url in statute_links:
                # Build exactly the name download_statute() writes to disk
                # (truncate, drop illegal characters, strip); otherwise the
                # existence check below never matches names that needed
                # cleaning and those statutes are fetched again on every run.
                statute_name = illegal_chars.sub('', statute_name[:128]).strip()
                pdf_path = f'{statute_name}.pdf'
                if os.path.exists(pdf_path):
                    try:
                        PdfReader(pdf_path)
                    except Exception:
                        # Unreadable download: discard it and fetch it again.
                        os.remove(pdf_path)
                    else:
                        continue
                download_statute(urljoin(central_url, statute_url), statute_name)
                pause()
            os.chdir(data_dir)
            print(f'Finished downloading {chapter_name}.')
    finally:
        os.chdir(start_dir)
# NOTE: the code the author originally used for this step was lost and the
# previous version here was generated by Copilot. The idea is to copy all the
# PDFs into one directory while avoiding name collisions.
def flatten_directories(source_dir='data', target_dir='flattened', max_name_length=64):
    """Copy every file from ``source_dir/<chapter>/`` into one flat directory.

    Args:
        source_dir: Directory holding one sub-directory per chapter.
        target_dir: Destination directory; created when missing.
        max_name_length: Longest file name (extension included) to produce.
            Longer names are shortened without touching the extension.

    Name collisions are resolved by appending ``_1``, ``_2``, ... before the
    extension, so every copy keeps its ``.pdf`` suffix. A file whose identical
    copy already exists under one of its candidate names is not copied again,
    which makes repeated runs idempotent. Source files are left in place and
    the working directory is never changed.

    Raises:
        FileNotFoundError: ``source_dir`` does not exist.
    """
    import filecmp
    import shutil

    os.makedirs(target_dir, exist_ok=True)
    # Sorted traversal keeps the collision suffixes stable between runs.
    for chapter in sorted(os.listdir(source_dir)):
        chapter_path = os.path.join(source_dir, chapter)
        if not os.path.isdir(chapter_path):
            continue
        for filename in sorted(os.listdir(chapter_path)):
            source_path = os.path.join(chapter_path, filename)
            if not os.path.isfile(source_path):
                continue
            stem, extension = os.path.splitext(filename)
            stem = stem[:max(1, max_name_length - len(extension))]
            target_path = os.path.join(target_dir, f'{stem}{extension}')
            attempt = 0
            already_copied = False
            while os.path.exists(target_path):
                if filecmp.cmp(source_path, target_path, shallow=False):
                    already_copied = True
                    break
                attempt += 1
                target_path = os.path.join(target_dir, f'{stem}_{attempt}{extension}')
            if not already_copied:
                shutil.copy2(source_path, target_path)
# check_pdf and check_pdfs were generated by Copilot as well. You just want to
# make sure all of your PDFs are valid before trying to load them.
def check_pdf(filename):
    """Return True when ``PdfReader`` can open *filename*, otherwise False.

    Args:
        filename: Path of the file to probe.

    Any ordinary exception raised while opening the file marks it as invalid.
    KeyboardInterrupt and SystemExit are deliberately not caught, so a long
    validation run can still be interrupted.
    """
    try:
        PdfReader(filename)
    except Exception:
        return False
    return True
def check_pdfs(remove_bad_files=True, directory='flattened'):
    """Validate every file in *directory* with ``check_pdf``.

    Args:
        remove_bad_files: When true, delete each file that fails validation.
        directory: Directory to scan (sub-directories are not entered).

    Returns:
        The names of the files that failed validation, in sorted order.

    The working directory is left untouched so that later steps can keep
    using paths relative to the directory the script was started from.
    """
    bad_files = []
    for entry in sorted(os.listdir(directory)):
        path = os.path.join(directory, entry)
        if not os.path.isfile(path) or check_pdf(path):
            continue
        bad_files.append(entry)
        if remove_bad_files:
            os.remove(path)
    return bad_files
def estimate_cost():
    """Print and return a dry-run cost estimate for indexing ``flattened/``.

    The documents are loaded from the ``flattened`` directory (relative to the
    current working directory, which is not changed) and an index is built
    with a ``MockLLMPredictor``. The predictor's ``last_token_usage`` is then
    priced at $0.0004 per 1,000 tokens.

    Returns:
        The estimated cost in dollars.
    """
    documents = SimpleDirectoryReader('flattened', errors='ignore').load_data()
    llm_predictor = MockLLMPredictor()
    # NOTE(review): only tokens seen by the mock LLM predictor are counted.
    # Building a vector index may also call the embedding model, which is not
    # mocked here -- confirm against the gpt_index cost-analysis docs.
    GPTSimpleVectorIndex(documents, llm_predictor=llm_predictor)
    estimated_cost = llm_predictor.last_token_usage / 1000 * .0004
    print(f'Estimated cost: ${estimated_cost:.2f}')
    return estimated_cost
def build_index():
    """Build the vector index over ``flattened/`` and save it to disk.

    The documents are read from the ``flattened`` directory and the index is
    written to ``krs_gpt_index.json``; both paths are relative to the current
    working directory, which is not changed.

    Returns:
        The freshly built ``GPTSimpleVectorIndex``.
    """
    documents = SimpleDirectoryReader('flattened', errors='ignore').load_data()
    index = GPTSimpleVectorIndex(documents)
    index.save_to_disk('krs_gpt_index.json')
    return index
def load_index_from_disk(path='krs_gpt_index.json'):
    """Load a previously saved index instead of rebuilding it.

    Args:
        path: Index file to read. The default is the file name that
            ``build_index`` writes.

    Returns:
        The loaded ``GPTSimpleVectorIndex``.
    """
    # load_from_disk is called on the class; no throwaway empty index is
    # constructed first.
    return GPTSimpleVectorIndex.load_from_disk(path)
def q(query):
    """Answer a question about Kentucky law using the module-level ``index``.

    Args:
        query: The user's question as plain text.

    Returns:
        The text of the index's response, or a short hint when the question
        is empty or only whitespace (no query is sent in that case).
    """
    if not query or not query.strip():
        return 'Please enter a question about Kentucky law.'
    prompt = (
        "\n"
        "You are responding to a question about Kentucky State Law. You have access to "
        "most of KRS, the Kentucky Revised Statutes, to cite statutes specific to the "
        "following question. It is most important to use the newest most updated law "
        "available and be factual and step through each of your conclusions.\n"
        "Please be as thorough and professional as possible in your response. "
        "The question is:\n"
        f"{query}\n"
    )
    response = index.query(prompt)
    return response.response
# Run the whole pipeline: scrape the statutes, flatten them into one directory,
# drop unreadable PDFs, print a cost estimate, build the index, then serve the
# question-answering UI.
# NOTE: these statements run whenever the module is executed or imported, and
# q() reads the module-level `index` assigned here.
main()
flatten_directories()
check_pdfs()
estimate_cost()
index = build_index()
app = gr.Interface(
    fn=q,
    inputs="text",
    outputs="text",
    title="Kentucky State Law",
    description="A gradio app that answers questions about Kentucky State Law.",
    # Gradio expects "auto", "manual" or "never" here; booleans are deprecated.
    allow_flagging="never",
)
# launch(enable_queue=...) is deprecated; queuing is switched on with .queue().
app.queue()
app.launch(debug=True, share=True, inbrowser=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment