shhommychon · April 6, 2025 13:36
diff --git a/.my_pdf_to_image.ipynb b/.my_pdf_to_image.ipynb
 {
    "nbformat": 4,
    "nbformat_minor": 0,
    "metadata": {
        "colab": {
            "provenance": [],
            "collapsed_sections": [
                "DCFI_fgT0O_y",
                "J4dArR1q0RQB",
                "8A8d2Lc31KK8",
                "tOHgmwjQ4C_h"
            ]
        },
        "kernelspec": {
            "name": "python3",
            "display_name": "Python 3"
        },
        "language_info": {
            "name": "python"
        }
    },
    "cells": [
        {
            "cell_type": "markdown",
            "source": [
                "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/shhommychon/823e7f590b3cd0e85ae40df6b6c55e2d)"
            ],
            "metadata": {
                "id": "R1zMGDzzzm8v"
            }
        },
        {
            "cell_type": "markdown",
            "source": [
                "# PDF → 이미지\n",
                "- [pdf2png](https://pdf2png.com/) 같은 외부 유틸을 쓰는데 파일 크기가 너무 커서 다운받는 데 1시간 이상 걸릴 때 쓰려고 챗지피티로 간단히 만든 페이지"
            ],
            "metadata": {
                "id": "cTCLqj-Wzv1w"
            }
        },
        {
            "cell_type": "markdown",
            "source": [
                "##### setup\n",
                "- 다시 시작하라는 경고 메시지가 떠도 놀라지 말고 다시 실행하세요"
            ],
            "metadata": {
                "id": "DCFI_fgT0O_y"
            }
        },
        {
            "cell_type": "markdown",
            "source": [
                "###### dependencies\n",
                "- `backports` 라이브러리에 의한 재시작이 요구될 수 있음."
            ],
            "metadata": {
                "id": "J4dArR1q0RQB"
            }
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "metadata": {
                "id": "RKB3KQ1fzl8X"
            },
            "outputs": [],
            "source": [
                "# PyMuPDF itself provides the `fitz` module imported by my_utility.py.\n",
                "# The separate PyPI package named `fitz` is an unrelated project that conflicts\n",
                "# with PyMuPDF's module of the same name, so it must not be installed.\n",
                "%pip install --quiet pymupdf"
            ]
        },
        {
            "cell_type": "markdown",
            "source": [
                "###### utility functions"
            ],
            "metadata": {
                "id": "8A8d2Lc31KK8"
            }
        },
        {
            "cell_type": "code",
            "source": [
                "!git clone https://gist.github.com/823e7f590b3cd0e85ae40df6b6c55e2d.git gist\n",
                "!cp -r gist/. .\n",
                "!rm -rf gist/\n",
                "!rm -r ./*.ipynb"
            ],
            "metadata": {
                "id": "Q_c5Yf5918Gu"
            },
            "execution_count": null,
            "outputs": []
        },
        {
            "cell_type": "code",
            "source": [
                "from my_utility import extract_images_from_pdf, render_pdf_to_images, compress_images_to_zip"
            ],
            "metadata": {
                "id": "mGXksrMh1OzW"
            },
            "execution_count": null,
            "outputs": []
        },
        {
            "cell_type": "markdown",
            "source": [
                "###### widgets"
            ],
            "metadata": {
                "id": "tOHgmwjQ4C_h"
            }
        },
        {
            "cell_type": "code",
            "source": [
                "from IPython.display import display, clear_output\n",
                "import ipywidgets as widgets"
            ],
            "metadata": {
                "id": "8b-oO4NB4J4w"
            },
            "execution_count": null,
            "outputs": []
        },
        {
            "cell_type": "code",
            "source": [
                "# Conversion mode read by the final run(mode=MODE, ...) cell.\n",
                "MODE = \"extract\" # must be either \"extract\" or \"render\"\n",
                "\n",
                "# One button per mode, plus an output area that shows the latest choice.\n",
                "extract_button = widgets.Button(description=\"추출\", button_style=\"success\")\n",
                "render_button = widgets.Button(description=\"인쇄\", button_style=\"info\")\n",
                "output_area = widgets.Output()\n",
                "\n",
                "\n",
                "def _switch_mode(new_mode, notice):\n",
                "    \"\"\"Store new_mode in the global MODE and replace the text shown in output_area.\"\"\"\n",
                "    global MODE\n",
                "    MODE = new_mode\n",
                "    with output_area:\n",
                "        # Remove the previous notice so only the current mode stays visible.\n",
                "        clear_output()\n",
                "        print(notice)\n",
                "\n",
                "\n",
                "def set_mode_to_extract(b):\n",
                "    \"\"\"Click handler of extract_button; the argument passed by on_click is unused.\"\"\"\n",
                "    _switch_mode(\"extract\", \"추출 모드: PDF 파일 내 모든 이미지 객체들을 개별 이미지 파일들로 추출합니다.\")\n",
                "\n",
                "\n",
                "def set_mode_to_render(b):\n",
                "    \"\"\"Click handler of render_button; the argument passed by on_click is unused.\"\"\"\n",
                "    _switch_mode(\"render\", \"인쇄 모드: PDF 파일 각 페이지들을 이미지로 렌더하여 저장합니다.\")\n",
                "\n",
                "\n",
                "# Wire each button to its handler.\n",
                "extract_button.on_click(set_mode_to_extract)\n",
                "render_button.on_click(set_mode_to_render)"
            ],
            "metadata": {
                "id": "eBJYsHe14Et5"
            },
            "execution_count": null,
            "outputs": []
        },
        {
            "cell_type": "markdown",
            "source": [
                "###### main function"
            ],
            "metadata": {
                "id": "vJ9sxt4Q42rU"
            }
        },
        {
            "cell_type": "code",
            "source": [
                "from glob import glob\n",
                "import os\n",
                "\n",
                "def run(mode=\"extract\", dpi=72, input_dir=\"/content\"):\n",
                "    \"\"\"Convert every PDF in `input_dir` to images and zip each result.\n",
                "\n",
                "    Args:\n",
                "        mode (str): 'extract' saves embedded image objects; 'render' rasterizes pages.\n",
                "        dpi (int): Resolution handed to render_pdf_to_images; unused in 'extract' mode.\n",
                "        input_dir (str): Folder scanned for PDFs; output folders and ZIPs are created in it.\n",
                "    \"\"\"\n",
                "    # Reject a bad mode once, before any folder or empty ZIP gets created.\n",
                "    if mode not in (\"extract\", \"render\"):\n",
                "        print('Invalid mode. Choose \"extract\" or \"render\".')\n",
                "        return\n",
                "\n",
                "    print(f\"Running in '{mode}' mode...\")\n",
                "    print()\n",
                "\n",
                "    # Compare extensions case-insensitively so names such as a.Pdf are found too.\n",
                "    pdf_files = sorted(\n",
                "        path for path in glob(os.path.join(input_dir, \"*\"))\n",
                "        if os.path.isfile(path) and path.lower().endswith(\".pdf\")\n",
                "    )\n",
                "    if not pdf_files:\n",
                "        print(f\"No PDF files found in {input_dir}\")\n",
                "        return\n",
                "\n",
                "    for pdf_file in pdf_files:\n",
                "        # splitext removes only the last extension: report.v2.pdf -> report.v2\n",
                "        stem = os.path.splitext(os.path.basename(pdf_file))[0]\n",
                "        output_folder = os.path.join(input_dir, stem)\n",
                "        os.makedirs(output_folder, exist_ok=True)\n",
                "\n",
                "        print(f\"Processing PDF: {pdf_file}\")\n",
                "        if mode == \"extract\":\n",
                "            extract_images_from_pdf(pdf_file, output_folder)\n",
                "            print(\"\\nImage extraction complete. Compressing...\")\n",
                "        else:\n",
                "            render_pdf_to_images(pdf_file, output_folder, dpi)\n",
                "            print(\"\\nPDF rendering complete. Compressing...\")\n",
                "\n",
                "        compress_images_to_zip(output_folder, output_folder + \".zip\")\n",
                "        print(f\"\\nCompression complete. ZIP file saved at: {os.path.basename(output_folder)}.zip\")\n",
                "        print()"
            ],
            "metadata": {
                "id": "81V0B16x45bG"
            },
            "execution_count": null,
            "outputs": []
        },
        {
            "cell_type": "markdown",
            "source": [
                "##### run"
            ],
            "metadata": {
                "id": "cjY8tnAP8LS6"
            }
        },
        {
            "cell_type": "code",
            "source": [
                "print(\"`run()`을 실행하기 전에 아래 버튼을 통해 모드를 선택하세요:\")\n",
                "display(extract_button, render_button, output_area)"
            ],
            "metadata": {
                "id": "NeHl-Rs_8Qdi"
            },
            "execution_count": null,
            "outputs": []
        },
        {
            "cell_type": "code",
            "source": [
                "run(mode=MODE, dpi=300)"
            ],
            "metadata": {
                "id": "L01UukGm8R6x"
            },
            "execution_count": null,
            "outputs": []
        }
    ]
 }
diff --git a/my_utility.py b/my_utility.py
 import fitz
 import os
 import zipfile


 # PDF에서 이미지를 추출하여 저장하는 함수
 def extract_images_from_pdf(pdf_path, output_folder, *args, **kwargs):
    """Extract the image objects embedded in a PDF and save each one as a file.

    Files are written as ``page<PPP>_img<III>.<ext>`` with 1-based, zero-padded
    counters; ``<ext>`` is the format PyMuPDF reports for the image.

    Args:
        pdf_path (str): Path of the PDF file to read.
        output_folder (str): Folder receiving the images; created when missing.
        *args: Accepted but unused.
        **kwargs: Accepted but unused.
    """
    os.makedirs(output_folder, exist_ok=True)

    # The context manager closes the document even when a page fails midway.
    with fitz.open(pdf_path) as pdf_document:
        page_count = len(pdf_document)
        # Keep the 3-digit padding, widening it only when the page count needs
        # more digits so the file names still sort in page order.
        page_digits = max(3, len(str(page_count)))

        for page_index in range(page_count):
            page = pdf_document[page_index]

            # With full=True each entry's first item is the image's xref.
            for img_index, img in enumerate(page.get_images(full=True), start=1):
                base_image = pdf_document.extract_image(img[0])
                image_ext = base_image["ext"]

                image_filename = (
                    f"page{page_index + 1:0>{page_digits}}_img{img_index:0>3}.{image_ext}"
                )
                image_path = os.path.join(output_folder, image_filename)

                with open(image_path, "wb") as image_file:
                    image_file.write(base_image["image"])

                print(f"\tSaved image: {image_path}")

 # PDF의 각 페이지를 이미지로 렌더링하는 함수
 def render_pdf_to_images(pdf_path, output_folder, dpi=72, *args, **kwargs):
    """Render every page of a PDF to a PNG file.

    Files are written as ``page<PPP>.png`` with a 1-based, zero-padded counter.

    Args:
        pdf_path (str): Path of the PDF file to read.
        output_folder (str): Folder receiving the PNG files; created when missing.
        dpi (int): Target resolution (default 72, i.e. a scale factor of 1.0).
        *args: Accepted but unused.
        **kwargs: Accepted but unused.

    Raises:
        ValueError: If ``dpi`` is not a positive number.
    """
    if dpi <= 0:
        raise ValueError(f"dpi must be positive, got {dpi!r}")

    os.makedirs(output_folder, exist_ok=True)

    # 72 DPI is the unscaled baseline, so the same dpi / 72 factor is applied
    # to both axes.
    zoom = dpi / 72
    matrix = fitz.Matrix(zoom, zoom)

    # The context manager closes the document even when a page fails midway.
    with fitz.open(pdf_path) as pdf_document:
        page_count = len(pdf_document)
        # Keep the 3-digit padding, widening it only when the page count needs
        # more digits so the file names still sort in page order.
        page_digits = max(3, len(str(page_count)))

        for page_index in range(page_count):
            page = pdf_document[page_index]

            # alpha=False renders without a transparency channel.
            pixmap = page.get_pixmap(matrix=matrix, alpha=False)

            image_filename = f"page{page_index + 1:0>{page_digits}}.png"
            image_path = os.path.join(output_folder, image_filename)
            pixmap.save(image_path)

            print(f"\tRendered page {page_index + 1} at {dpi} DPI as image: {image_path}")

 # 이미지 파일들을 ZIP으로 압축하는 함수
 def compress_images_to_zip(input_folder, output_zip):
    """Bundle the regular files found directly in a folder into a ZIP archive.

    Entries are stored flat (file name only, no folder structure) and ordered
    case-insensitively by name. Sub-directories are skipped.

    Args:
        input_folder (str): Folder whose files are archived.
        output_zip (str): Path of the ZIP file to create.
    """
    archive_path = os.path.realpath(output_zip)

    def _is_archivable(name):
        candidate = os.path.join(input_folder, name)
        # A stale archive at output_zip inside input_folder must not be packed
        # into itself: opening it with mode 'w' truncates it first.
        return os.path.isfile(candidate) and os.path.realpath(candidate) != archive_path

    image_files = sorted(filter(_is_archivable, os.listdir(input_folder)), key=str.lower)

    # ZIP_STORED writes the members without compressing them.
    with zipfile.ZipFile(output_zip, 'w', zipfile.ZIP_STORED) as zipf:
        for image_file in image_files:
            zipf.write(os.path.join(input_folder, image_file), arcname=image_file)
            print(f"\tAdded to ZIP: {image_file}")
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"provenance": [],
	"collapsed_sections": [
	"DCFI_fgT0O_y",
	"J4dArR1q0RQB",
	"8A8d2Lc31KK8",
	"tOHgmwjQ4C_h"
	]
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	},
	"language_info": {
	"name": "python"
	}
	},
	"cells": [
	{
	"cell_type": "markdown",
	"source": [
	"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/shhommychon/823e7f590b3cd0e85ae40df6b6c55e2d)"
	],
	"metadata": {
	"id": "R1zMGDzzzm8v"
	}
	},
	{
	"cell_type": "markdown",
	"source": [
	"# PDF → 이미지\n",
	"- [pdf2png](https://pdf2png.com/) 같은 외부 유틸을 쓰는데 파일 크기가 너무 커서 다운받는 데 1시간 이상 걸릴 때 쓰려고 챗지피티로 간단히 만든 페이지"
	],
	"metadata": {
	"id": "cTCLqj-Wzv1w"
	}
	},
	{
	"cell_type": "markdown",
	"source": [
	"##### setup\n",
	"- 다시 시작하라는 경고 메시지가 떠도 놀라지 말고 다시 실행하세요"
	],
	"metadata": {
	"id": "DCFI_fgT0O_y"
	}
	},
	{
	"cell_type": "markdown",
	"source": [
	"###### dependencies\n",
	"- `backports` 라이브러리에 의한 재시작이 요구될 수 있음."
	],
	"metadata": {
	"id": "J4dArR1q0RQB"
	}
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"id": "RKB3KQ1fzl8X"
	},
	"outputs": [],
	"source": [
	"# PyMuPDF itself provides the `fitz` module imported by my_utility.py.\n",
	"# The separate PyPI package named `fitz` is an unrelated project that conflicts\n",
	"# with PyMuPDF's module of the same name, so it must not be installed.\n",
	"%pip install --quiet pymupdf"
	]
	},
	{
	"cell_type": "markdown",
	"source": [
	"###### utility functions"
	],
	"metadata": {
	"id": "8A8d2Lc31KK8"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"!git clone https://gist.github.com/823e7f590b3cd0e85ae40df6b6c55e2d.git gist\n",
	"!cp -r gist/. .\n",
	"!rm -rf gist/\n",
	"!rm -r ./*.ipynb"
	],
	"metadata": {
	"id": "Q_c5Yf5918Gu"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"from my_utility import extract_images_from_pdf, render_pdf_to_images, compress_images_to_zip"
	],
	"metadata": {
	"id": "mGXksrMh1OzW"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"source": [
	"###### widgets"
	],
	"metadata": {
	"id": "tOHgmwjQ4C_h"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"from IPython.display import display, clear_output\n",
	"import ipywidgets as widgets"
	],
	"metadata": {
	"id": "8b-oO4NB4J4w"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# Conversion mode read by the final run(mode=MODE, ...) cell.\n",
	"MODE = \"extract\" # must be either \"extract\" or \"render\"\n",
	"\n",
	"# One button per mode, plus an output area that shows the latest choice.\n",
	"extract_button = widgets.Button(description=\"추출\", button_style=\"success\")\n",
	"render_button = widgets.Button(description=\"인쇄\", button_style=\"info\")\n",
	"output_area = widgets.Output()\n",
	"\n",
	"\n",
	"def set_mode_to_extract(b):\n",
	"    \"\"\"Click handler of extract_button; the argument passed by on_click is unused.\"\"\"\n",
	"    global MODE\n",
	"    MODE = \"extract\"\n",
	"    with output_area:\n",
	"        # Remove the previous notice so only the current mode stays visible.\n",
	"        clear_output()\n",
	"        print(\"추출 모드: PDF 파일 내 모든 이미지 객체들을 개별 이미지 파일들로 추출합니다.\")\n",
	"\n",
	"\n",
	"def set_mode_to_render(b):\n",
	"    \"\"\"Click handler of render_button; the argument passed by on_click is unused.\"\"\"\n",
	"    global MODE\n",
	"    MODE = \"render\"\n",
	"    with output_area:\n",
	"        # Remove the previous notice so only the current mode stays visible.\n",
	"        clear_output()\n",
	"        print(\"인쇄 모드: PDF 파일 각 페이지들을 이미지로 렌더하여 저장합니다.\")\n",
	"\n",
	"\n",
	"# Wire each button to its handler.\n",
	"extract_button.on_click(set_mode_to_extract)\n",
	"render_button.on_click(set_mode_to_render)"
	],
	"metadata": {
	"id": "eBJYsHe14Et5"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"source": [
	"###### main function"
	],
	"metadata": {
	"id": "vJ9sxt4Q42rU"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"from glob import glob\n",
	"import os\n",
	"\n",
	"def run(mode=\"extract\", dpi=72, input_dir=\"/content\"):\n",
	"    \"\"\"Convert every PDF in `input_dir` to images and zip each result.\n",
	"\n",
	"    Args:\n",
	"        mode (str): 'extract' saves embedded image objects; 'render' rasterizes pages.\n",
	"        dpi (int): Resolution handed to render_pdf_to_images; unused in 'extract' mode.\n",
	"        input_dir (str): Folder scanned for PDFs; output folders and ZIPs are created in it.\n",
	"    \"\"\"\n",
	"    # Reject a bad mode once, before any folder or empty ZIP gets created.\n",
	"    if mode not in (\"extract\", \"render\"):\n",
	"        print('Invalid mode. Choose \"extract\" or \"render\".')\n",
	"        return\n",
	"\n",
	"    print(f\"Running in '{mode}' mode...\")\n",
	"    print()\n",
	"\n",
	"    # Compare extensions case-insensitively so names such as a.Pdf are found too.\n",
	"    pdf_files = sorted(\n",
	"        path for path in glob(os.path.join(input_dir, \"*\"))\n",
	"        if os.path.isfile(path) and path.lower().endswith(\".pdf\")\n",
	"    )\n",
	"    if not pdf_files:\n",
	"        print(f\"No PDF files found in {input_dir}\")\n",
	"        return\n",
	"\n",
	"    for pdf_file in pdf_files:\n",
	"        # splitext removes only the last extension: report.v2.pdf -> report.v2\n",
	"        stem = os.path.splitext(os.path.basename(pdf_file))[0]\n",
	"        output_folder = os.path.join(input_dir, stem)\n",
	"        os.makedirs(output_folder, exist_ok=True)\n",
	"\n",
	"        print(f\"Processing PDF: {pdf_file}\")\n",
	"        if mode == \"extract\":\n",
	"            extract_images_from_pdf(pdf_file, output_folder)\n",
	"            print(\"\\nImage extraction complete. Compressing...\")\n",
	"        else:\n",
	"            render_pdf_to_images(pdf_file, output_folder, dpi)\n",
	"            print(\"\\nPDF rendering complete. Compressing...\")\n",
	"\n",
	"        compress_images_to_zip(output_folder, output_folder + \".zip\")\n",
	"        print(f\"\\nCompression complete. ZIP file saved at: {os.path.basename(output_folder)}.zip\")\n",
	"        print()"
	],
	"metadata": {
	"id": "81V0B16x45bG"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"source": [
	"##### run"
	],
	"metadata": {
	"id": "cjY8tnAP8LS6"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"print(\"`run()`을 실행하기 전에 아래 버튼을 통해 모드를 선택하세요:\")\n",
	"display(extract_button, render_button, output_area)"
	],
	"metadata": {
	"id": "NeHl-Rs_8Qdi"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"run(mode=MODE, dpi=300)"
	],
	"metadata": {
	"id": "L01UukGm8R6x"
	},
	"execution_count": null,
	"outputs": []
	}
	]
	}
	import fitz
	import os
	import zipfile


	# PDF에서 이미지를 추출하여 저장하는 함수
	def extract_images_from_pdf(pdf_path, output_folder, *args, **kwargs):
	    """Extract the image objects embedded in a PDF and save each one as a file.

	    Files are written as ``page<PPP>_img<III>.<ext>`` with 1-based, zero-padded
	    counters; ``<ext>`` is the format PyMuPDF reports for the image.

	    Args:
	        pdf_path (str): Path of the PDF file to read.
	        output_folder (str): Folder receiving the images; created when missing.
	        *args: Accepted but unused.
	        **kwargs: Accepted but unused.
	    """
	    os.makedirs(output_folder, exist_ok=True)

	    # The context manager closes the document even when a page fails midway.
	    with fitz.open(pdf_path) as pdf_document:
	        page_count = len(pdf_document)
	        # Keep the 3-digit padding, widening it only when the page count needs
	        # more digits so the file names still sort in page order.
	        page_digits = max(3, len(str(page_count)))

	        for page_index in range(page_count):
	            page = pdf_document[page_index]

	            # With full=True each entry's first item is the image's xref.
	            for img_index, img in enumerate(page.get_images(full=True), start=1):
	                base_image = pdf_document.extract_image(img[0])
	                image_ext = base_image["ext"]

	                image_filename = (
	                    f"page{page_index + 1:0>{page_digits}}_img{img_index:0>3}.{image_ext}"
	                )
	                image_path = os.path.join(output_folder, image_filename)

	                with open(image_path, "wb") as image_file:
	                    image_file.write(base_image["image"])

	                print(f"\tSaved image: {image_path}")

	# PDF의 각 페이지를 이미지로 렌더링하는 함수
	def render_pdf_to_images(pdf_path, output_folder, dpi=72, *args, **kwargs):
	    """Render every page of a PDF to a PNG file.

	    Files are written as ``page<PPP>.png`` with a 1-based, zero-padded counter.

	    Args:
	        pdf_path (str): Path of the PDF file to read.
	        output_folder (str): Folder receiving the PNG files; created when missing.
	        dpi (int): Target resolution (default 72, i.e. a scale factor of 1.0).
	        *args: Accepted but unused.
	        **kwargs: Accepted but unused.

	    Raises:
	        ValueError: If ``dpi`` is not a positive number.
	    """
	    if dpi <= 0:
	        raise ValueError(f"dpi must be positive, got {dpi!r}")

	    os.makedirs(output_folder, exist_ok=True)

	    # 72 DPI is the unscaled baseline, so the same dpi / 72 factor is applied
	    # to both axes.
	    zoom = dpi / 72
	    matrix = fitz.Matrix(zoom, zoom)

	    # The context manager closes the document even when a page fails midway.
	    with fitz.open(pdf_path) as pdf_document:
	        page_count = len(pdf_document)
	        # Keep the 3-digit padding, widening it only when the page count needs
	        # more digits so the file names still sort in page order.
	        page_digits = max(3, len(str(page_count)))

	        for page_index in range(page_count):
	            page = pdf_document[page_index]

	            # alpha=False renders without a transparency channel.
	            pixmap = page.get_pixmap(matrix=matrix, alpha=False)

	            image_filename = f"page{page_index + 1:0>{page_digits}}.png"
	            image_path = os.path.join(output_folder, image_filename)
	            pixmap.save(image_path)

	            print(f"\tRendered page {page_index + 1} at {dpi} DPI as image: {image_path}")

	# 이미지 파일들을 ZIP으로 압축하는 함수
	def compress_images_to_zip(input_folder, output_zip):
	    """Bundle the regular files found directly in a folder into a ZIP archive.

	    Entries are stored flat (file name only, no folder structure) and ordered
	    case-insensitively by name. Sub-directories are skipped.

	    Args:
	        input_folder (str): Folder whose files are archived.
	        output_zip (str): Path of the ZIP file to create.
	    """
	    archive_path = os.path.realpath(output_zip)

	    def _is_archivable(name):
	        candidate = os.path.join(input_folder, name)
	        # A stale archive at output_zip inside input_folder must not be packed
	        # into itself: opening it with mode 'w' truncates it first.
	        return os.path.isfile(candidate) and os.path.realpath(candidate) != archive_path

	    image_files = sorted(filter(_is_archivable, os.listdir(input_folder)), key=str.lower)

	    # ZIP_STORED writes the members without compressing them.
	    with zipfile.ZipFile(output_zip, 'w', zipfile.ZIP_STORED) as zipf:
	        for image_file in image_files:
	            zipf.write(os.path.join(input_folder, image_file), arcname=image_file)
	            print(f"\tAdded to ZIP: {image_file}")