Skip to content

Instantly share code, notes, and snippets.

@truevis
Created March 30, 2024 21:48
Show Gist options
  • Select an option

  • Save truevis/a43004ee7a02c67a8b1ab36443b150d8 to your computer and use it in GitHub Desktop.

Select an option

Save truevis/a43004ee7a02c67a8b1ab36443b150d8 to your computer and use it in GitHub Desktop.
import os
from llama_parse import LlamaParse # pip install llama-parse
from llama_index.core import SimpleDirectoryReader # pip install llama-index
# Folder that is scanned for input files, and folder that receives the
# extracted text files.
source_directory = r'\data'
target_directory = r'\tables'

# LlamaParse client that handles the PDF files found in the source folder.
parser = LlamaParse(
    api_key="...",  # may instead be supplied via the LLAMA_CLOUD_API_KEY env var
    result_type="markdown",  # "markdown" and "text" are available
    parsing_instruction="Electrical engineering codes",
    language="en",
    num_workers=9,
    verbose=True,
)

# Only the ".pdf" extension is mapped to the parser.
file_extractor = {".pdf": parser}
reader = SimpleDirectoryReader(source_directory, file_extractor=file_extractor)

# Load (and parse) everything in the source folder into a list of documents.
documents = reader.load_data()
# Write the text of every parsed document to "<original stem>-lp.txt" inside
# target_directory, echoing the text to stdout along the way.

# open() does not create missing parent folders, so make sure the output
# directory exists before the first write.
os.makedirs(target_directory, exist_ok=True)

for doc in documents:
    file_name = doc.metadata['file_name']
    txt_file_name = os.path.splitext(file_name)[0] + "-lp.txt"
    txt_file_path = os.path.join(target_directory, txt_file_name)
    print(doc.text)
    # NOTE(review): mode 'w' truncates, so if the loader ever yields several
    # documents with the same file_name, only the last one is kept -- confirm
    # the parser returns one document per file.
    with open(txt_file_path, 'w', encoding='utf-8') as txt_file:
        txt_file.write(doc.text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment