Created
February 14, 2026 19:40
-
-
Save leodr/1e72fdda17cdb5acd1b62bae7449ec1f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import shutil
import tempfile
from pathlib import Path

import fitz  # PyMuPDF
import lmstudio as lms
from pydantic import BaseModel
# File suffixes handled as images. main() compares the lower-cased suffix of
# the input file against this set, so entries must be lowercase and include
# the leading dot.
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff"}
# Structured answer requested from the LLM for a PDF document. It is passed as
# ``response_format`` in extract_file_info() and consumed by
# construct_filename() to build the date-prefixed filename.
class FileInfoSchema(BaseModel):
    # Kind of document; the prompt suggests terms such as "Rechnung" or "Vertrag".
    document_type: str
    # Very short title identifying the document (the prompt asks for 3-5 words).
    title: str
    # Date components as integers. The prompt asks for month 1-12 and day 1-31,
    # but nothing in this model enforces those ranges.
    year: int
    month: int
    day: int
# Structured answer requested from the LLM for an image. It is passed as
# ``response_format`` in extract_image_caption().
class ImageCaptionSchema(BaseModel):
    # Short description of the image; becomes the new filename stem after
    # sanitisation in construct_image_filename().
    caption: str
def pdf_to_image_files(
    pdf_path: Path, temp_dir: Path, max_size: int = 1500
) -> list[Path]:
    """Render every page of a PDF to a JPEG file inside ``temp_dir``.

    Each page is scaled uniformly so that its longer side becomes roughly
    ``max_size`` pixels; small pages are scaled up, large ones down.

    Args:
        pdf_path: PDF file to render.
        temp_dir: Directory that receives the ``page_NNNN.jpg`` files. It is
            not created here, so it must already exist.
        max_size: Target length in pixels of each page's longer side.

    Returns:
        Paths of the written images, in page order.
    """
    image_paths: list[Path] = []
    # The context manager closes the document even when rendering or saving a
    # page raises, so the PDF file handle is never leaked.
    with fitz.open(pdf_path) as pdf_document:
        for page_num, page in enumerate(pdf_document):
            # Zoom factor that maps the longer page side onto max_size.
            rect = page.rect
            scale = max_size / max(rect.width, rect.height)
            pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale))
            # Save straight from the pixmap; no detour through PIL is needed.
            image_path = temp_dir / f"page_{page_num:04d}.jpg"
            pix.save(image_path)
            image_paths.append(image_path)
    return image_paths
def extract_image_caption(file_path: Path) -> ImageCaptionSchema:
    """Ask the vision LLM for a brief, filename-friendly caption of an image.

    Args:
        file_path: Image file to describe.

    Returns:
        The model's answer as an ``ImageCaptionSchema``.
    """
    llm = lms.llm("mistralai/ministral-3-3b")
    conversation = lms.Chat()
    instructions = (
        "Generate a short caption (max 5-7 words) that describes this image. "
        "The caption should be concise and descriptive, suitable for use as a filename."
    )
    conversation.add_user_message(
        instructions, images=[lms.prepare_image(str(file_path))]
    )
    result = llm.respond(conversation, response_format=ImageCaptionSchema).parsed
    # The parsed result may arrive as a plain dict; wrap it so callers always
    # receive the schema type.
    return ImageCaptionSchema(**result) if isinstance(result, dict) else result
def extract_file_info(file_path: Path) -> FileInfoSchema:
    """Extract structured info from a PDF using the LLM.

    Every page is rendered to a temporary JPEG, all page images are sent to
    the model together with the extraction prompt, and the structured answer
    is returned.

    Args:
        file_path: PDF file to analyse.

    Returns:
        The model's answer as a ``FileInfoSchema``.
    """
    model = lms.llm("mistralai/ministral-3-3b")
    chat = lms.Chat()
    prompt = (
        "Extract the following information from this document:\n"
        "- document_type: The type of document (e.g. Rechnung, Vertrag, Brief, Bescheid, Kontoauszug, Rezept, Arztbrief, Gutschrift, Mahnung, Angebot, Lieferschein, Quittung, Zertifikat, etc.)\n"
        "- title: A very short title (max 3-5 words) identifying the document, e.g. 'Hausarztpraxis München' or 'Amazon Bestellung'\n"
        "- year: The year mentioned or relevant to the document\n"
        "- month: The month (1-12)\n"
        "- day: The day of the month (1-31)\n\n"
        "If any date information is not available, make a reasonable guess based on context."
    )
    # A uniquely named scratch directory next to the PDF. Unlike a fixed
    # ".tmp_<stem>" name it can never collide with a directory the user already
    # has, so the cleanup on exit only ever deletes files created here. The
    # context manager removes it on success and on error alike.
    with tempfile.TemporaryDirectory(
        prefix=f".tmp_{file_path.stem}_", dir=file_path.parent
    ) as temp_name:
        image_paths = pdf_to_image_files(file_path, Path(temp_name))
        image_handles = [lms.prepare_image(str(p)) for p in image_paths]
        chat.add_user_message(prompt, images=image_handles)
        # Keep the page images on disk until the model has answered.
        prediction = model.respond(chat, response_format=FileInfoSchema)
        parsed = prediction.parsed
    # The parsed result may arrive as a plain dict; wrap it so callers always
    # receive the schema type.
    if isinstance(parsed, dict):
        return FileInfoSchema(**parsed)
    return parsed
# Characters deleted outright: path separators, punctuation reserved on
# Windows (\ / : * ? " < > |) and the dot, so a component can never introduce
# an extra file extension.
_FORBIDDEN_FILENAME_CHARS = '/\\:*?"<>|.'

# Control characters (newline, tab, NUL, ...) are not allowed in Windows
# filenames and are confusing elsewhere. They are mapped to a space rather
# than deleted so that words separated by a line break stay separated.
_FILENAME_TRANSLATION = str.maketrans(
    {
        **{chr(code): " " for code in (*range(32), 127)},
        **dict.fromkeys(_FORBIDDEN_FILENAME_CHARS),
    }
)


def sanitize_filename(s: str) -> str:
    """Return ``s`` reduced to text that is safe as a filename component.

    Forbidden punctuation is removed, control characters become spaces, and
    runs of whitespace are collapsed to a single space with leading and
    trailing whitespace stripped.

    Args:
        s: Raw text, typically produced by the language model.

    Returns:
        The cleaned text; an empty string if nothing usable remains.
    """
    cleaned = s.translate(_FILENAME_TRANSLATION)
    # split() without arguments drops leading/trailing whitespace and treats
    # any whitespace run as one separator.
    return " ".join(cleaned.split())
def construct_filename(info: FileInfoSchema, original_path: Path) -> str:
    """Construct a new filename from the extracted info (for PDFs).

    The result has the form ``YYYY-MM-DD <type> <title><suffix>``, where the
    suffix is taken unchanged from ``original_path``.

    Args:
        info: Document metadata extracted by the model.
        original_path: Current path of the file; only its suffix is used.

    Returns:
        The new filename (name only, without a directory).
    """
    date_str = f"{info.year:04d}-{info.month:02d}-{info.day:02d}"
    components = [
        date_str,
        sanitize_filename(info.document_type),
        sanitize_filename(info.title),
    ]
    # Skip components that sanitised to nothing so the name never contains a
    # doubled space or a space directly before the extension.
    stem = " ".join(part for part in components if part)
    return f"{stem}{original_path.suffix}"
def construct_image_filename(caption: ImageCaptionSchema, original_path: Path) -> str:
    """Construct a new filename from the caption (for images).

    Args:
        caption: Caption produced by the model.
        original_path: Current path of the image; supplies the suffix and the
            fallback stem.

    Returns:
        ``<sanitised caption><suffix>``, or the original filename when the
        caption contains nothing usable.
    """
    safe_caption = sanitize_filename(caption.caption)
    if not safe_caption:
        # An empty stem would yield a bare ".jpg"-style hidden file; keep the
        # existing stem instead so the file retains a meaningful name.
        safe_caption = original_path.stem
    return f"{safe_caption}{original_path.suffix}"
def main() -> int:
    """Rename one image or PDF file according to its LLM-extracted content.

    Reads the file path from the command line, asks the model for a caption
    (images) or document metadata (PDFs), and renames the file in place within
    its directory.

    Returns:
        0 on success, 1 when the input is missing, is not a regular file, has
        an unsupported suffix, or the target name is already taken.
    """
    parser = argparse.ArgumentParser(
        description="Rename an image or PDF file based on extracted content."
    )
    parser.add_argument("file", type=str, help="Path to the image or PDF file")
    args = parser.parse_args()

    file_path = Path(args.file).resolve()
    if not file_path.exists():
        print(f"Error: File '{file_path}' does not exist.")
        return 1
    # A directory named e.g. "scans.pdf" exists and has a supported suffix;
    # reject it here instead of trying to process or rename it.
    if not file_path.is_file():
        print(f"Error: '{file_path}' is not a regular file.")
        return 1

    suffix = file_path.suffix.lower()
    if suffix not in IMAGE_EXTENSIONS and suffix != ".pdf":
        print(f"Error: File '{file_path}' is not a supported image or PDF file.")
        return 1

    print(f"Processing: {file_path}")
    if suffix in IMAGE_EXTENSIONS:
        caption = extract_image_caption(file_path)
        print(f"Generated caption: {caption.caption}")
        new_filename = construct_image_filename(caption, file_path)
    else:
        info = extract_file_info(file_path)
        print(f"Extracted info: {info}")
        new_filename = construct_filename(info, file_path)

    new_path = file_path.parent / new_filename
    # Refuse to overwrite: Path.rename() may silently replace an existing file.
    if new_path.exists():
        print(f"Error: Target file '{new_path}' already exists.")
        return 1

    file_path.rename(new_path)
    print(f"Renamed to: {new_path}")
    return 0
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status. SystemExit
    # is always available, whereas the bare exit() helper is only injected by
    # the site module and is meant for interactive use.
    raise SystemExit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment