Created
November 12, 2025 23:26
-
-
Save atomicink/04bd1fb8305859ecd64188669ef6962b to your computer and use it in GitHub Desktop.
Convert Kmoe EPUB files to CBZ
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
import argparse
import html
import os
import posixpath
import re
import sys
import traceback
import xml.etree.ElementTree as ET
import zipfile
from pathlib import Path, PurePosixPath
from urllib.parse import unquote
# XML namespaces used when querying an EPUB's container.xml and its
# package (OPF) document with ElementTree.
NAMESPACES = {
    'container': 'urn:oasis:names:tc:opendocument:xmlns:container',
    'opf': 'http://www.idpf.org/2007/opf',
}
# Precompiled once at import time so it is not rebuilt for every document.
# Group 1 captures an <img> ``src``; group 2 captures an SVG <image>
# ``xlink:href`` (or plain ``href``).
# The attribute name must be preceded by whitespace, so look-alikes such as
# ``data-src`` are not mistaken for the real attribute, and the lazy
# ``[^>]*?`` makes the first real occurrence in the tag win.
IMAGE_PATTERN = re.compile(
    r'<img\b[^>]*?\ssrc\s*=\s*["\']([^"\']+)["\']|'
    r'<image\b[^>]*?\s(?:xlink:)?href\s*=\s*["\']([^"\']+)["\']',
    re.IGNORECASE
)
def find_opf_path(epub_zip: zipfile.ZipFile) -> "str | None":
    """Locate the package document (.opf) inside an EPUB archive.

    Reads ``META-INF/container.xml`` and returns the percent-decoded
    ``full-path`` attribute of the first ``rootfile`` element found.

    :param epub_zip: The EPUB, opened as a ZIP archive.
    :return: Archive-internal path of the .opf file, or ``None`` when
        container.xml is missing or malformed, or when no ``rootfile``
        with a ``full-path`` attribute is declared.
    """
    try:
        container_data = epub_zip.read('META-INF/container.xml')
        root = ET.fromstring(container_data)
    except (KeyError, ET.ParseError) as e:
        print(f" - 错误: 无法解析 META-INF/container.xml。 {e}")
        return None

    rootfile_element = root.find('.//container:rootfile', NAMESPACES)
    if rootfile_element is None:
        return None

    # A rootfile without full-path yields None here; passing that to
    # unquote() would raise TypeError, so treat it as "not found".
    full_path = rootfile_element.get('full-path')
    return unquote(full_path) if full_path else None
def get_image_paths_from_html_fast(html_content: str) -> list[str]:
    """Extract every image reference from HTML/XHTML markup, in document order.

    Uses the precompiled ``IMAGE_PATTERN`` instead of a full parser. Each
    captured attribute value has its character references decoded (for
    example ``&amp;`` becomes ``&``), because the markup stores the value
    in escaped form while the archive member name is unescaped.
    Percent-decoding is left to the caller.

    :param html_content: Decoded text of one content document.
    :return: Referenced image paths exactly as written in the document
        (relative to that document), duplicates included.
    """
    return [
        html.unescape(path)
        for match in IMAGE_PATTERN.finditer(html_content)
        # Exactly one of the two alternatives captured something.
        if (path := match.group(1) or match.group(2))
    ]
def normalize_zip_path(path_str: str) -> str:
    """Normalize an archive-internal path to the form used for ZIP member names.

    Collapses ``.`` and ``..`` segments and duplicate slashes using POSIX
    rules (ZIP member names always use ``/``), independent of the host OS.

    :param path_str: Path built by joining a directory and a reference;
        may contain the host separator if it was joined with ``os.path``.
    :return: A ``/``-separated path with no leading slash.
    """
    # Callers may have joined segments with os.path.join, so fold the host
    # separator into '/' before applying POSIX normalization.
    unified = path_str.replace(os.sep, '/')
    # Member names are relative to the archive root; a root-relative
    # reference ("/images/x.jpg") can only match once the slash is dropped.
    return posixpath.normpath(unified).lstrip('/')
def _spine_document_paths(opf_root: ET.Element, opf_dir: str) -> list[str]:
    """Return archive paths of the spine's content documents in reading order."""
    manifest = {}
    for item in opf_root.findall('.//opf:manifest/opf:item', NAMESPACES):
        item_id = item.get('id')
        item_href = item.get('href')
        if item_id and item_href:
            # Manifest hrefs are relative to the directory holding the .opf.
            manifest[item_id] = normalize_zip_path(
                posixpath.join(opf_dir, unquote(item_href))
            )

    spine_ids = (
        itemref.get('idref')
        for itemref in opf_root.findall('.//opf:spine/opf:itemref', NAMESPACES)
    )
    return [manifest[idref] for idref in spine_ids if idref in manifest]


def _referenced_image_paths(epub_zip: zipfile.ZipFile, html_paths: list[str]) -> list[str]:
    """Collect unique image paths referenced by the documents, first use wins."""
    ordered = []
    seen = set()
    for html_path in html_paths:
        try:
            html_content = epub_zip.read(html_path).decode('utf-8', errors='ignore')
        except KeyError:
            print(f" - 警告: Spine中引用的HTML文件在压缩包中不存在: '{html_path}'")
            continue

        # Image references are relative to the document that contains them.
        html_dir = posixpath.dirname(html_path)
        for ref in get_image_paths_from_html_fast(html_content):
            img_path = normalize_zip_path(posixpath.join(html_dir, unquote(ref)))
            if img_path not in seen:
                seen.add(img_path)
                ordered.append(img_path)
    return ordered


def _write_cbz(epub_zip: zipfile.ZipFile, image_paths: list[str], cbz_path: Path) -> None:
    """Copy the images into a new CBZ under zero-padded sequential names."""
    padding = len(str(len(image_paths)))
    # Build the archive under a temporary name and rename it at the end, so
    # an interrupted run never leaves a truncated .cbz that a later run
    # would skip as "already exists".
    tmp_path = cbz_path.with_name(cbz_path.name + '.part')
    try:
        # Images are already compressed, so they are stored without deflate.
        with zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_STORED) as cbz_zip:
            for index, img_path in enumerate(image_paths):
                file_extension = PurePosixPath(img_path).suffix
                cbz_zip.writestr(
                    f"{index:0{padding}d}{file_extension}",
                    epub_zip.read(img_path),
                )
        os.replace(tmp_path, cbz_path)
    finally:
        # No-op after a successful rename; removes the partial file otherwise.
        tmp_path.unlink(missing_ok=True)


def convert_epub_to_cbz(epub_path: Path):
    """Convert a single EPUB file into a CBZ placed next to it.

    Images are taken in reading order: the spine of the package document
    is walked, each content document is scanned for image references, and
    every distinct image is written once under a sequential, zero-padded
    name so that viewers sort the pages correctly. Referenced images that
    are absent from the archive are reported and skipped without leaving
    gaps in the numbering. Nothing is written when the target ``.cbz``
    already exists or when no image can be extracted.

    Errors are reported on stdout rather than raised.

    :param epub_path: Path object pointing at the EPUB file.
    """
    cbz_path = epub_path.with_suffix('.cbz')
    if cbz_path.exists():
        print(f" - 跳过: CBZ 文件已存在 '{cbz_path.name}'")
        return

    print(f" - 正在处理: '{epub_path.name}'")
    try:
        with zipfile.ZipFile(epub_path, 'r') as epub_zip:
            opf_path_str = find_opf_path(epub_zip)
            if not opf_path_str:
                print(" - 错误: 在 EPUB 中未找到 .opf 文件。")
                return

            opf_root = ET.fromstring(epub_zip.read(opf_path_str))
            html_paths = _spine_document_paths(
                opf_root, posixpath.dirname(opf_path_str)
            )

            # Drop references to images that are not in the archive up
            # front, so page numbers stay contiguous and the reported
            # count matches what is actually written.
            members = set(epub_zip.namelist())
            image_paths = []
            for img_path in _referenced_image_paths(epub_zip, html_paths):
                if img_path in members:
                    image_paths.append(img_path)
                else:
                    print(f" - 警告: HTML中引用的图片文件不存在: '{img_path}'")

            if not image_paths:
                print(" - 错误: 未能从 EPUB 的阅读顺序中提取任何图片。")
                return

            _write_cbz(epub_zip, image_paths, cbz_path)
            print(f" - 成功: 已创建 '{cbz_path.name}',包含 {len(image_paths)} 张图片。")
    except zipfile.BadZipFile:
        print(" - 错误: 文件不是一个有效的 EPUB (ZIP) 文件。")
    except Exception as e:
        # Top-level boundary for one file: report and let the batch go on.
        print(f" - 发生未知错误: {e}")
        traceback.print_exc()
def main():
    """Parse the command line and convert every EPUB in the given folder.

    Takes one positional argument, the folder to scan (non-recursively).
    Exits with status 1 when the argument is not a directory. Files are
    processed in sorted order; the ``.epub`` extension is matched
    case-insensitively and directories are ignored.
    """
    parser = argparse.ArgumentParser(
        description="将指定文件夹内的 EPUB 漫画文件转换为图片顺序正确的 CBZ 文件。",
        formatter_class=argparse.RawTextHelpFormatter
    )
    parser.add_argument(
        "input_dir",
        type=str,
        help="包含 EPUB 文件的文件夹路径。"
    )
    args = parser.parse_args()

    input_path = Path(args.input_dir)
    if not input_path.is_dir():
        print(f"错误: 提供的路径 '{input_path}' 不是一个有效的文件夹。")
        sys.exit(1)

    print(f"开始扫描文件夹: '{input_path}'")
    # Compare the suffix explicitly: a glob for '*.epub' is case-sensitive
    # on POSIX (missing 'X.EPUB') and would also match directories.
    epub_files = sorted(
        entry for entry in input_path.iterdir()
        if entry.is_file() and entry.suffix.lower() == '.epub'
    )
    if not epub_files:
        print("未在该文件夹下找到任何 .epub 文件。")
        return

    total_files = len(epub_files)
    print(f"找到 {total_files} 个 EPUB 文件,开始转换...")
    for index, epub_file in enumerate(epub_files, start=1):
        print(f"\n[{index}/{total_files}] 正在处理文件: {epub_file.name}")
        convert_epub_to_cbz(epub_file)
    print("\n所有任务处理完毕。")
# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment