Skip to content

Instantly share code, notes, and snippets.

@atomicink
Created November 12, 2025 23:26
Show Gist options
  • Select an option

  • Save atomicink/04bd1fb8305859ecd64188669ef6962b to your computer and use it in GitHub Desktop.

Select an option

Save atomicink/04bd1fb8305859ecd64188669ef6962b to your computer and use it in GitHub Desktop.
convert kmoe epub to cbz
#!/usr/bin/env python3
import argparse
import os
import zipfile
import xml.etree.ElementTree as ET
from pathlib import Path, PurePosixPath
import sys
from urllib.parse import unquote
import re
# XML namespace prefixes used when querying EPUB container and OPF documents.
NAMESPACES = dict(
    container='urn:oasis:names:tc:opendocument:xmlns:container',
    opf='http://www.idpf.org/2007/opf',
)

# Compiled once at import time so callers never rebuild it inside a loop.
# Group 1 captures the src of an <img> tag; group 2 captures the xlink:href
# of an <image> tag. Matching is case-insensitive.
IMAGE_PATTERN = re.compile(
    r'<img[^>]+src\s*=\s*["\']([^"\']+)["\']|'
    r'<image[^>]+xlink:href\s*=\s*["\']([^"\']+)["\']',
    flags=re.IGNORECASE,
)
def find_opf_path(epub_zip: zipfile.ZipFile) -> "str | None":
    """Locate the OPF package document inside an EPUB archive.

    Reads ``META-INF/container.xml`` and returns the percent-decoded
    ``full-path`` attribute of its first ``rootfile`` element.

    :param epub_zip: An opened EPUB archive.
    :return: The archive-internal path of the .opf file, or ``None`` when the
        container file is missing, is not well-formed XML, or has no
        ``rootfile`` element carrying a non-empty ``full-path`` attribute.
    """
    # Only the read and the parse can raise the errors handled here, so the
    # try body is kept to exactly those two calls.
    try:
        container_data = epub_zip.read('META-INF/container.xml')
        root = ET.fromstring(container_data)
    except (KeyError, ET.ParseError) as e:
        print(f" - 错误: 无法解析 META-INF/container.xml。 {e}")
        return None

    rootfile_element = root.find('.//container:rootfile', NAMESPACES)
    if rootfile_element is None:
        return None

    full_path = rootfile_element.get('full-path')
    # Without this guard a rootfile lacking full-path would reach
    # unquote(None), which raises TypeError instead of reporting "not found".
    if not full_path:
        return None
    return unquote(full_path)
def get_image_paths_from_html_fast(html_content: str) -> list[str]:
    """Return every image reference found in HTML/XHTML markup, in match order.

    The markup is scanned with the module-level ``IMAGE_PATTERN`` regex
    instead of being parsed. Each match contributes the value captured by
    whichever alternative matched (``<img src>`` or ``<image xlink:href>``).

    :param html_content: Decoded text of one HTML/XHTML document.
    :return: The captured reference strings, exactly as written in the markup.
    """
    captured = (
        found.group(1) or found.group(2)
        for found in IMAGE_PATTERN.finditer(html_content)
    )
    return [reference for reference in captured if reference]
def normalize_zip_path(path_str: str) -> str:
    """Normalize an archive-internal path.

    Collapses ``.`` and ``..`` segments with ``os.path.normpath`` and then
    rewrites the platform separator to ``/``.

    :param path_str: Path to normalize.
    :return: The normalized path using ``/`` as the separator.
    """
    collapsed = os.path.normpath(path_str)
    return '/'.join(collapsed.split(os.sep))
def _read_spine_documents(epub_zip: zipfile.ZipFile, opf_path_str: str) -> list[str]:
    """Return the archive paths of the spine documents, in reading order."""
    opf_dir = os.path.dirname(opf_path_str)
    opf_root = ET.fromstring(epub_zip.read(opf_path_str))

    # Manifest hrefs are relative to the OPF file, so resolve them against
    # its directory to get archive-internal paths.
    manifest = {}
    for item in opf_root.findall('.//opf:manifest/opf:item', NAMESPACES):
        item_id = item.get('id')
        item_href = item.get('href')
        if item_id and item_href:
            manifest[item_id] = normalize_zip_path(
                os.path.join(opf_dir, unquote(item_href))
            )

    spine_ids = (
        itemref.get('idref')
        for itemref in opf_root.findall('.//opf:spine/opf:itemref', NAMESPACES)
    )
    # Spine entries whose idref has no manifest item are silently dropped.
    return [manifest[idref] for idref in spine_ids if idref in manifest]


def _collect_image_paths(epub_zip: zipfile.ZipFile, html_paths: list[str]) -> list[str]:
    """Return the de-duplicated image paths referenced by *html_paths*, in first-seen order."""
    ordered_image_paths = []
    seen_images = set()
    for html_path in html_paths:
        try:
            html_content = epub_zip.read(html_path).decode('utf-8', errors='ignore')
        except KeyError:
            print(f" - 警告: Spine中引用的HTML文件在压缩包中不存在: '{html_path}'")
            continue
        # Image references are relative to the document that contains them.
        html_dir = os.path.dirname(html_path)
        for ref in get_image_paths_from_html_fast(html_content):
            img_path = normalize_zip_path(os.path.join(html_dir, unquote(ref)))
            if img_path not in seen_images:
                seen_images.add(img_path)
                ordered_image_paths.append(img_path)
    return ordered_image_paths


def _write_cbz(epub_zip: zipfile.ZipFile, image_paths: list[str], target: Path) -> int:
    """Copy *image_paths* from the EPUB into a new archive at *target*; return how many were written."""
    # Zero-padded names keep lexicographic order equal to reading order.
    padding = len(str(len(image_paths)))
    written = 0
    with zipfile.ZipFile(target, 'w', zipfile.ZIP_STORED) as cbz_zip:
        for index, img_path in enumerate(image_paths):
            try:
                image_data = epub_zip.read(img_path)
            except KeyError:
                print(f" - 警告: HTML中引用的图片文件不存在: '{img_path}'")
                continue
            file_extension = PurePosixPath(img_path).suffix
            cbz_zip.writestr(f"{index:0{padding}d}{file_extension}", image_data)
            written += 1
    return written


def convert_epub_to_cbz(epub_path: Path) -> None:
    """Convert a single EPUB file into a CBZ archive next to it.

    Images are taken in the order they are referenced by the documents of
    the EPUB spine and stored, uncompressed, under zero-padded numeric names.
    An existing ``.cbz`` with the same stem is never overwritten.

    The archive is first written to a ``.cbz.part`` sibling and only renamed
    to its final name once it is complete, so a failure or interruption
    cannot leave a truncated ``.cbz`` that later runs would skip as done.

    All problems are reported on stdout; no exception derived from
    ``Exception`` propagates to the caller.

    :param epub_path: Path object pointing at the EPUB file.
    """
    cbz_path = epub_path.with_suffix('.cbz')
    if cbz_path.exists():
        print(f" - 跳过: CBZ 文件已存在 '{cbz_path.name}'")
        return

    print(f" - 正在处理: '{epub_path.name}'")
    partial_path = cbz_path.with_name(cbz_path.name + '.part')
    try:
        with zipfile.ZipFile(epub_path, 'r') as epub_zip:
            opf_path_str = find_opf_path(epub_zip)
            if not opf_path_str:
                print(" - 错误: 在 EPUB 中未找到 .opf 文件。")
                return

            html_paths = _read_spine_documents(epub_zip, opf_path_str)
            image_paths = _collect_image_paths(epub_zip, html_paths)
            if not image_paths:
                print(" - 错误: 未能从 EPUB 的阅读顺序中提取任何图片。")
                return

            written = _write_cbz(epub_zip, image_paths, partial_path)

        # Every referenced image was missing: an empty CBZ is useless and
        # would block a later retry, so treat it as "no images extracted".
        if written == 0:
            print(" - 错误: 未能从 EPUB 的阅读顺序中提取任何图片。")
            return

        partial_path.replace(cbz_path)
        print(f" - 成功: 已创建 '{cbz_path.name}',包含 {written} 张图片。")
    except zipfile.BadZipFile:
        print(" - 错误: 文件不是一个有效的 EPUB (ZIP) 文件。")
    except Exception as e:
        print(f" - 发生未知错误: {e}")
        import traceback
        traceback.print_exc()
    finally:
        # After a successful rename the partial file no longer exists; in
        # every other case (error, early return, Ctrl+C) remove the leftover.
        if partial_path.exists():
            partial_path.unlink()
def main():
    """Command-line entry point.

    Takes one positional argument, a directory, and converts every ``*.epub``
    file directly inside it (sorted by path) with ``convert_epub_to_cbz``.
    Exits with status 1 when the argument is not a directory; returns
    normally when the directory holds no EPUB files.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description="将指定文件夹内的 EPUB 漫画文件转换为图片顺序正确的 CBZ 文件。",
    )
    parser.add_argument("input_dir", type=str, help="包含 EPUB 文件的文件夹路径。")
    input_path = Path(parser.parse_args().input_dir)

    if not input_path.is_dir():
        print(f"错误: 提供的路径 '{input_path}' 不是一个有效的文件夹。")
        sys.exit(1)

    print(f"开始扫描文件夹: '{input_path}'")
    epub_files = sorted(input_path.glob('*.epub'))
    if not epub_files:
        print("未在该文件夹下找到任何 .epub 文件。")
        return

    total_files = len(epub_files)
    print(f"找到 {total_files} 个 EPUB 文件,开始转换...")
    for position, epub_file in enumerate(epub_files, start=1):
        print(f"\n[{position}/{total_files}] 正在处理文件: {epub_file.name}")
        convert_epub_to_cbz(epub_file)
    print("\n所有任务处理完毕。")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment