atomicink · November 12, 2025 23:26
diff --git a/epub_to_cbz_cli.py b/epub_to_cbz_cli.py
 #!/usr/bin/env python3
 import argparse
 import os
 import zipfile
 import xml.etree.ElementTree as ET
 from pathlib import Path, PurePosixPath
 import sys
 from urllib.parse import unquote
 import re

 # XML namespaces commonly found in EPUB files, keyed by the prefix that the
 # ElementTree path expressions in this module use.
 NAMESPACES = {
    "container": "urn:oasis:names:tc:opendocument:xmlns:container",
    "opf": "http://www.idpf.org/2007/opf",
 }

 # Compiled once at import time so it is not rebuilt for every document.
 # Group 1 captures the `src` of an <img>; group 2 captures the `xlink:href`
 # (or plain SVG 2 `href`) of an SVG <image>.
 # The attribute name must be preceded by whitespace so that look-alikes such
 # as `data-src` are not mistaken for `src`, and the lazy `[^>]*?` makes the
 # first real attribute in the tag win instead of the last one.
 IMAGE_PATTERN = re.compile(
    r'<img\b[^>]*?\ssrc\s*=\s*["\']([^"\']+)["\']|'
    r'<image\b[^>]*?\s(?:xlink:)?href\s*=\s*["\']([^"\']+)["\']',
    re.IGNORECASE
 )

 def find_opf_path(epub_zip: zipfile.ZipFile) -> "str | None":
    """Locate the package document (.opf) inside an EPUB archive.

    Reads ``META-INF/container.xml`` and returns the percent-decoded
    ``full-path`` attribute of the first ``<rootfile>`` element.

    :param epub_zip: An EPUB archive opened for reading.
    :return: Archive-internal path of the .opf file, or ``None`` when
        container.xml is missing or malformed, contains no ``<rootfile>``,
        or the element has no non-empty ``full-path`` attribute.
    """
    try:
        container_data = epub_zip.read('META-INF/container.xml')
        root = ET.fromstring(container_data)
    except (KeyError, ET.ParseError) as e:
        print(f"    - 错误: 无法解析 META-INF/container.xml。 {e}")
        return None

    rootfile_element = root.find('.//container:rootfile', NAMESPACES)
    if rootfile_element is None:
        return None

    # unquote(None) raises TypeError, so a <rootfile> that lacks the
    # attribute has to be rejected before decoding.
    full_path = rootfile_element.get('full-path')
    if not full_path:
        return None
    return unquote(full_path)

 def get_image_paths_from_html_fast(html_content: str) -> list[str]:
    """Extract every image reference from HTML/XHTML text, in document order.

    Scans the text with the module-level pre-compiled ``IMAGE_PATTERN`` and
    takes whichever of its two capture groups matched. References are
    returned exactly as written in the markup (no decoding or resolving).

    :param html_content: Decoded HTML/XHTML source.
    :return: The captured references; duplicates are kept.
    """
    candidates = (
        found.group(1) or found.group(2)
        for found in IMAGE_PATTERN.finditer(html_content)
    )
    return [ref for ref in candidates if ref]

 def normalize_zip_path(path_str: str) -> str:
    """Normalise an archive-internal path.

    Collapses ``.`` and ``..`` segments with ``os.path.normpath`` and then
    rewrites the platform separator as ``/``, the separator used inside ZIP
    archives.

    :param path_str: Path to normalise.
    :return: The normalised path using ``/`` separators.
    """
    collapsed = os.path.normpath(path_str)
    segments = collapsed.split(os.sep)
    return '/'.join(segments)

 # Matches a leading URL scheme ("data:", "http:", ...). Such references do
 # not point into the archive.
 _URL_SCHEME = re.compile(r'[A-Za-z][A-Za-z0-9+.-]*:')


 def _spine_document_paths(epub_zip: zipfile.ZipFile, opf_path_str: str) -> list[str]:
    """Return the archive paths of the spine documents in reading order."""
    opf_dir = os.path.dirname(opf_path_str)
    opf_root = ET.fromstring(epub_zip.read(opf_path_str))

    # Manifest hrefs are relative to the directory that holds the .opf file.
    manifest = {}
    for item in opf_root.findall('.//opf:manifest/opf:item', NAMESPACES):
        item_id = item.get('id')
        item_href = item.get('href')
        if item_id and item_href:
            relative_path = os.path.join(opf_dir, unquote(item_href))
            manifest[item_id] = normalize_zip_path(relative_path)

    spine_ids = [
        itemref.get('idref')
        for itemref in opf_root.findall('.//opf:spine/opf:itemref', NAMESPACES)
    ]
    # Spine entries whose idref is not in the manifest are dropped.
    return [manifest[idref] for idref in spine_ids if idref in manifest]


 def _collect_image_paths(epub_zip: zipfile.ZipFile, html_paths: list[str]) -> list[str]:
    """Return the de-duplicated image paths referenced by the documents, in order."""
    ordered_image_paths = []
    seen_images = set()
    for html_path in html_paths:
        html_dir = os.path.dirname(html_path)
        try:
            html_content = epub_zip.read(html_path).decode('utf-8', errors='ignore')
        except KeyError:
            print(f"    - 警告: Spine中引用的HTML文件在压缩包中不存在: '{html_path}'")
            continue

        for ref in get_image_paths_from_html_fast(html_content):
            # data: URIs and remote URLs are not files inside the archive.
            if _URL_SCHEME.match(ref):
                continue
            # A "#fragment" suffix is not part of the file name.
            local_ref = ref.split('#', 1)[0]
            if not local_ref:
                continue
            # Image references are relative to the referencing document.
            img_path = normalize_zip_path(os.path.join(html_dir, unquote(local_ref)))
            if img_path not in seen_images:
                seen_images.add(img_path)
                ordered_image_paths.append(img_path)
    return ordered_image_paths


 def convert_epub_to_cbz(epub_path: Path):
    """Convert one EPUB file into a CBZ archive placed next to it.

    Images are taken in reading order (spine order, then order of appearance
    inside each document), de-duplicated, and stored uncompressed under
    zero-padded sequential names that keep their original extension.
    Progress and problems are reported with ``print``; nothing is raised for
    ordinary failures.

    An existing ``.cbz`` with the same stem is never overwritten. If the
    conversion fails or is interrupted after the output file was opened, or
    if none of the referenced images exist in the archive, the incomplete
    output is deleted so that a later run does not skip the book.

    :param epub_path: Path of the EPUB file to convert.
    """
    cbz_path = epub_path.with_suffix('.cbz')
    if cbz_path.exists():
        print(f"  - 跳过: CBZ 文件已存在 '{cbz_path.name}'")
        return

    print(f"  - 正在处理: '{epub_path.name}'")

    cbz_started = False  # True once the output file may exist on disk.
    succeeded = False
    try:
        with zipfile.ZipFile(epub_path, 'r') as epub_zip:
            opf_path_str = find_opf_path(epub_zip)
            if not opf_path_str:
                print("    - 错误: 在 EPUB 中未找到 .opf 文件。")
                return

            html_paths = _spine_document_paths(epub_zip, opf_path_str)
            ordered_image_paths = _collect_image_paths(epub_zip, html_paths)
            if not ordered_image_paths:
                print("    - 错误: 未能从 EPUB 的阅读顺序中提取任何图片。")
                return

            padding = len(str(len(ordered_image_paths)))
            written = 0
            cbz_started = True
            with zipfile.ZipFile(cbz_path, 'w', zipfile.ZIP_STORED) as cbz_zip:
                for img_path in ordered_image_paths:
                    try:
                        image_data = epub_zip.read(img_path)
                    except KeyError:
                        print(f"    - 警告: HTML中引用的图片文件不存在: '{img_path}'")
                        continue
                    file_extension = PurePosixPath(img_path).suffix
                    # Number by images actually written so that missing
                    # files leave no gaps in the page sequence.
                    cbz_zip.writestr(f"{written:0{padding}d}{file_extension}", image_data)
                    written += 1

            if written == 0:
                print("    - 错误: 引用的图片在压缩包中均不存在，未生成 CBZ 文件。")
                return

            succeeded = True
            print(f"  - 成功: 已创建 '{cbz_path.name}'，包含 {written} 张图片。")

    except zipfile.BadZipFile:
        print("    - 错误: 文件不是一个有效的 EPUB (ZIP) 文件。")
    except Exception as e:
        print(f"    - 发生未知错误: {e}")
        import traceback
        traceback.print_exc()
    finally:
        # Also runs for early returns and interrupts: never leave a partial
        # or empty CBZ behind.
        if cbz_started and not succeeded and cbz_path.exists():
            cbz_path.unlink()

 def main():
    """Command-line entry point: convert every EPUB in a folder to CBZ.

    Parses the single positional ``input_dir`` argument, exits with status 1
    if it is not a directory, and otherwise converts each EPUB directly
    inside it (no recursion), in sorted order, printing progress as it goes.
    """
    parser = argparse.ArgumentParser(
        description="将指定文件夹内的 EPUB 漫画文件转换为图片顺序正确的 CBZ 文件。",
        formatter_class=argparse.RawTextHelpFormatter
    )
    parser.add_argument(
        "input_dir",
        type=str,
        help="包含 EPUB 文件的文件夹路径。"
    )
    args = parser.parse_args()

    input_path = Path(args.input_dir)

    if not input_path.is_dir():
        print(f"错误: 提供的路径 '{input_path}' 不是一个有效的文件夹。")
        sys.exit(1)

    print(f"开始扫描文件夹: '{input_path}'")
    # Match the extension case-insensitively on every platform (a plain
    # glob('*.epub') misses "BOOK.EPUB" on case-sensitive file systems) and
    # skip directories that merely carry an .epub name.
    epub_files = sorted(
        entry for entry in input_path.iterdir()
        if entry.name.lower().endswith('.epub') and entry.is_file()
    )

    if not epub_files:
        print("未在该文件夹下找到任何 .epub 文件。")
        return

    total_files = len(epub_files)
    print(f"找到 {total_files} 个 EPUB 文件，开始转换...")

    for position, epub_file in enumerate(epub_files, start=1):
        print(f"\n[{position}/{total_files}] 正在处理文件: {epub_file.name}")
        convert_epub_to_cbz(epub_file)

    print("\n所有任务处理完毕。")

 # Run the converter only when this file is executed as a script, not when
 # it is imported as a module.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	import argparse
	import os
	import zipfile
	import xml.etree.ElementTree as ET
	from pathlib import Path, PurePosixPath
	import sys
	from urllib.parse import unquote
	import re

	# XML namespaces commonly found in EPUB files, keyed by the prefix that the
	# ElementTree path expressions in this module use.
	NAMESPACES = {
	    "container": "urn:oasis:names:tc:opendocument:xmlns:container",
	    "opf": "http://www.idpf.org/2007/opf",
	}

	# Compiled once at import time so it is not rebuilt for every document.
	# Group 1 captures the `src` of an <img>; group 2 captures the `xlink:href`
	# (or plain SVG 2 `href`) of an SVG <image>.
	# Whitespace around `=` is optional (`\s*`) and the two alternatives are
	# joined by a real alternation `|`; the attribute name must be preceded by
	# whitespace so that look-alikes such as `data-src` are not matched.
	IMAGE_PATTERN = re.compile(
	    r'<img\b[^>]*?\ssrc\s*=\s*["\']([^"\']+)["\']|'
	    r'<image\b[^>]*?\s(?:xlink:)?href\s*=\s*["\']([^"\']+)["\']',
	    re.IGNORECASE
	)

	def find_opf_path(epub_zip: zipfile.ZipFile) -> "str | None":
	    """Locate the package document (.opf) inside an EPUB archive.

	    Reads ``META-INF/container.xml`` and returns the percent-decoded
	    ``full-path`` attribute of the first ``<rootfile>`` element.

	    :param epub_zip: An EPUB archive opened for reading.
	    :return: Archive-internal path of the .opf file, or ``None`` when
	        container.xml is missing or malformed, contains no ``<rootfile>``,
	        or the element has no non-empty ``full-path`` attribute.
	    """
	    try:
	        container_data = epub_zip.read('META-INF/container.xml')
	        root = ET.fromstring(container_data)
	    except (KeyError, ET.ParseError) as e:
	        print(f" - 错误: 无法解析 META-INF/container.xml。 {e}")
	        return None

	    rootfile_element = root.find('.//container:rootfile', NAMESPACES)
	    if rootfile_element is None:
	        return None

	    # unquote(None) raises TypeError, so a <rootfile> that lacks the
	    # attribute has to be rejected before decoding.
	    full_path = rootfile_element.get('full-path')
	    if not full_path:
	        return None
	    return unquote(full_path)

	def get_image_paths_from_html_fast(html_content: str) -> list[str]:
	    """Extract every image reference from HTML/XHTML text, in document order.

	    Scans the text with the module-level pre-compiled ``IMAGE_PATTERN`` and
	    takes whichever of its two capture groups matched. References are
	    returned exactly as written in the markup (no decoding or resolving).

	    :param html_content: Decoded HTML/XHTML source.
	    :return: The captured references; duplicates are kept.
	    """
	    images = []
	    for match in IMAGE_PATTERN.finditer(html_content):
	        path = match.group(1) or match.group(2)
	        if path:
	            images.append(path)
	    return images

	def normalize_zip_path(path_str: str) -> str:
	    """Normalise an archive-internal path.

	    Collapses ``.`` and ``..`` segments with ``os.path.normpath`` and then
	    rewrites the platform separator as ``/``, the separator used inside ZIP
	    archives.

	    :param path_str: Path to normalise.
	    :return: The normalised path using ``/`` separators.
	    """
	    return os.path.normpath(path_str).replace(os.sep, '/')

	# Matches a leading URL scheme ("data:", "http:", ...). Such references do
	# not point into the archive.
	_URL_SCHEME = re.compile(r'[A-Za-z][A-Za-z0-9+.-]*:')


	def _spine_document_paths(epub_zip: zipfile.ZipFile, opf_path_str: str) -> list[str]:
	    """Return the archive paths of the spine documents in reading order."""
	    opf_dir = os.path.dirname(opf_path_str)
	    opf_root = ET.fromstring(epub_zip.read(opf_path_str))

	    # Manifest hrefs are relative to the directory that holds the .opf file.
	    manifest = {}
	    for item in opf_root.findall('.//opf:manifest/opf:item', NAMESPACES):
	        item_id = item.get('id')
	        item_href = item.get('href')
	        if item_id and item_href:
	            relative_path = os.path.join(opf_dir, unquote(item_href))
	            manifest[item_id] = normalize_zip_path(relative_path)

	    spine_ids = [
	        itemref.get('idref')
	        for itemref in opf_root.findall('.//opf:spine/opf:itemref', NAMESPACES)
	    ]
	    # Spine entries whose idref is not in the manifest are dropped.
	    return [manifest[idref] for idref in spine_ids if idref in manifest]


	def _collect_image_paths(epub_zip: zipfile.ZipFile, html_paths: list[str]) -> list[str]:
	    """Return the de-duplicated image paths referenced by the documents, in order."""
	    ordered_image_paths = []
	    seen_images = set()
	    for html_path in html_paths:
	        html_dir = os.path.dirname(html_path)
	        try:
	            html_content = epub_zip.read(html_path).decode('utf-8', errors='ignore')
	        except KeyError:
	            print(f" - 警告: Spine中引用的HTML文件在压缩包中不存在: '{html_path}'")
	            continue

	        for ref in get_image_paths_from_html_fast(html_content):
	            # data: URIs and remote URLs are not files inside the archive.
	            if _URL_SCHEME.match(ref):
	                continue
	            # A "#fragment" suffix is not part of the file name.
	            local_ref = ref.split('#', 1)[0]
	            if not local_ref:
	                continue
	            # Image references are relative to the referencing document.
	            img_path = normalize_zip_path(os.path.join(html_dir, unquote(local_ref)))
	            if img_path not in seen_images:
	                seen_images.add(img_path)
	                ordered_image_paths.append(img_path)
	    return ordered_image_paths


	def convert_epub_to_cbz(epub_path: Path):
	    """Convert one EPUB file into a CBZ archive placed next to it.

	    Images are taken in reading order (spine order, then order of appearance
	    inside each document), de-duplicated, and stored uncompressed under
	    zero-padded sequential names that keep their original extension.
	    Progress and problems are reported with ``print``; nothing is raised for
	    ordinary failures.

	    An existing ``.cbz`` with the same stem is never overwritten. If the
	    conversion fails or is interrupted after the output file was opened, or
	    if none of the referenced images exist in the archive, the incomplete
	    output is deleted so that a later run does not skip the book.

	    :param epub_path: Path of the EPUB file to convert.
	    """
	    cbz_path = epub_path.with_suffix('.cbz')
	    if cbz_path.exists():
	        print(f" - 跳过: CBZ 文件已存在 '{cbz_path.name}'")
	        return

	    print(f" - 正在处理: '{epub_path.name}'")

	    cbz_started = False  # True once the output file may exist on disk.
	    succeeded = False
	    try:
	        with zipfile.ZipFile(epub_path, 'r') as epub_zip:
	            opf_path_str = find_opf_path(epub_zip)
	            if not opf_path_str:
	                print(" - 错误: 在 EPUB 中未找到 .opf 文件。")
	                return

	            html_paths = _spine_document_paths(epub_zip, opf_path_str)
	            ordered_image_paths = _collect_image_paths(epub_zip, html_paths)
	            if not ordered_image_paths:
	                print(" - 错误: 未能从 EPUB 的阅读顺序中提取任何图片。")
	                return

	            padding = len(str(len(ordered_image_paths)))
	            written = 0
	            cbz_started = True
	            with zipfile.ZipFile(cbz_path, 'w', zipfile.ZIP_STORED) as cbz_zip:
	                for img_path in ordered_image_paths:
	                    try:
	                        image_data = epub_zip.read(img_path)
	                    except KeyError:
	                        print(f" - 警告: HTML中引用的图片文件不存在: '{img_path}'")
	                        continue
	                    file_extension = PurePosixPath(img_path).suffix
	                    # Number by images actually written so that missing
	                    # files leave no gaps in the page sequence.
	                    cbz_zip.writestr(f"{written:0{padding}d}{file_extension}", image_data)
	                    written += 1

	            if written == 0:
	                print(" - 错误: 引用的图片在压缩包中均不存在，未生成 CBZ 文件。")
	                return

	            succeeded = True
	            print(f" - 成功: 已创建 '{cbz_path.name}'，包含 {written} 张图片。")

	    except zipfile.BadZipFile:
	        print(" - 错误: 文件不是一个有效的 EPUB (ZIP) 文件。")
	    except Exception as e:
	        print(f" - 发生未知错误: {e}")
	        import traceback
	        traceback.print_exc()
	    finally:
	        # Also runs for early returns and interrupts: never leave a partial
	        # or empty CBZ behind.
	        if cbz_started and not succeeded and cbz_path.exists():
	            cbz_path.unlink()

	def main():
	    """Command-line entry point: convert every EPUB in a folder to CBZ.

	    Parses the single positional ``input_dir`` argument, exits with status 1
	    if it is not a directory, and otherwise converts each EPUB directly
	    inside it (no recursion), in sorted order, printing progress as it goes.
	    """
	    parser = argparse.ArgumentParser(
	        description="将指定文件夹内的 EPUB 漫画文件转换为图片顺序正确的 CBZ 文件。",
	        formatter_class=argparse.RawTextHelpFormatter
	    )
	    parser.add_argument(
	        "input_dir",
	        type=str,
	        help="包含 EPUB 文件的文件夹路径。"
	    )
	    args = parser.parse_args()

	    input_path = Path(args.input_dir)

	    if not input_path.is_dir():
	        print(f"错误: 提供的路径 '{input_path}' 不是一个有效的文件夹。")
	        sys.exit(1)

	    print(f"开始扫描文件夹: '{input_path}'")
	    # Match the extension case-insensitively on every platform (a plain
	    # glob('*.epub') misses "BOOK.EPUB" on case-sensitive file systems) and
	    # skip directories that merely carry an .epub name.
	    epub_files = sorted(
	        entry for entry in input_path.iterdir()
	        if entry.name.lower().endswith('.epub') and entry.is_file()
	    )

	    if not epub_files:
	        print("未在该文件夹下找到任何 .epub 文件。")
	        return

	    total_files = len(epub_files)
	    print(f"找到 {total_files} 个 EPUB 文件，开始转换...")

	    for position, epub_file in enumerate(epub_files, start=1):
	        print(f"\n[{position}/{total_files}] 正在处理文件: {epub_file.name}")
	        convert_epub_to_cbz(epub_file)

	    print("\n所有任务处理完毕。")

	# Run the converter only when this file is executed as a script, not when
	# it is imported as a module.
	if __name__ == "__main__":
	    main()
No results found