Created
March 30, 2024 21:48
-
-
Save truevis/a43004ee7a02c67a8b1ab36443b150d8 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Parse every PDF found in `source_directory` with LlamaParse and save the
# parsed text of each source file as "<original-stem>-lp.txt" inside
# `target_directory`.
import os

from llama_parse import LlamaParse  # pip install llama-parse
from llama_index.core import SimpleDirectoryReader  # pip install llama-index

source_directory = r'\data'
target_directory = r'\tables'

parser = LlamaParse(
    api_key="...",  # can also be set in your env as LLAMA_CLOUD_API_KEY
    result_type="markdown",  # "markdown" and "text" are available
    verbose=True,
    num_workers=9,
    language="en",
    parsing_instruction="Electrical engineering codes",
)

# Route only .pdf files through LlamaParse; other extensions fall back to
# whatever SimpleDirectoryReader does for them by default.
file_extractor = {".pdf": parser}
reader = SimpleDirectoryReader(source_directory, file_extractor=file_extractor)
documents = reader.load_data()

# open(..., 'w') does not create missing parent directories, so make sure the
# output folder exists before writing anything into it.
os.makedirs(target_directory, exist_ok=True)

# The reader may return more than one document for the same source file.
# Collect the pieces per file name (dicts keep insertion order) so that a
# later piece does not overwrite an earlier one when the file is written.
texts_by_file = {}
for doc in documents:
    print(doc.text)
    texts_by_file.setdefault(doc.metadata['file_name'], []).append(doc.text)

for file_name, texts in texts_by_file.items():
    txt_file_name = os.path.splitext(file_name)[0] + "-lp.txt"
    txt_file_path = os.path.join(target_directory, txt_file_name)
    with open(txt_file_path, 'w', encoding='utf-8') as txt_file:
        # With a single document per file this writes exactly doc.text;
        # multiple pieces are separated by a blank line.
        txt_file.write("\n\n".join(texts))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment