-
-
Save apetenchea/4df556a49f9a2543be877c31355b4164 to your computer and use it in GitHub Desktop.
| # This script gathers all the pages of a manual and merges them into a PDF. | |
| # You'll need to play a bit with inspect-element in order to figure out the correct URL format, | |
| # but it should be easy to adapt it to any manual. | |
| # This script is specifically for https://www.manua.ls/audi/q3-2018/manual. | |
| # Their url format is https://www.manua.ls/viewer/{manual-id}/{page-number}/bg{page-number-hex}.png | |
| # Example: https://www.manua.ls/viewer/668006/100/bg64.png | |
| # Enjoy! | |
| import requests | |
| from tqdm import tqdm | |
| from PIL import Image | |
| from io import BytesIO | |
| from reportlab.pdfgen import canvas | |
| from reportlab.lib.pagesizes import letter | |
| from reportlab.lib.utils import ImageReader | |
def download_image(url):
    """Download a single page image and return it as a PIL image.

    Args:
        url: Address of the page image to fetch.

    Returns:
        The decoded ``PIL.Image.Image``, or ``None`` when the request fails,
        the server answers with a status other than 200, or the response
        body is not an image Pillow can identify.
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to download {url}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to download {url}")
        return None

    try:
        return Image.open(BytesIO(response.content))
    except OSError as exc:
        # PIL.UnidentifiedImageError is an OSError subclass. It is raised when
        # the server returns 200 with a non-image body (e.g. an HTML error
        # page because the URL pattern is wrong). Skip the page instead of
        # crashing the whole download.
        print(f"Failed to decode {url}: {exc}")
        return None
def save_images_as_pdf(images, pdf_filename):
    """Write every image onto its own letter-sized page of a new PDF.

    Each image is scaled to the page width while keeping its aspect ratio;
    if that makes it taller than the page, it is scaled to the page height
    instead. The image is anchored at the left edge and top of the page.

    Args:
        images: Iterable of PIL images, one per output page.
        pdf_filename: Path of the PDF file to create.
    """
    pdf = canvas.Canvas(pdf_filename, pagesize=letter)
    page_w, page_h = letter

    for picture in images:
        pic_w, pic_h = picture.size
        ratio = pic_w / pic_h

        # Fit to the page width first; fall back to fitting the height.
        draw_w, draw_h = page_w, page_w / ratio
        if draw_h > page_h:
            draw_w, draw_h = page_h * ratio, page_h

        # Re-encode the PIL image as PNG bytes so reportlab can read it.
        buffer = BytesIO()
        picture.save(buffer, format='PNG')
        buffer.seek(0)

        # PDF coordinates start at the bottom-left, hence the y offset.
        pdf.drawImage(
            ImageReader(buffer),
            0,
            page_h - draw_h,
            width=draw_w,
            height=draw_h,
        )
        pdf.showPage()

    pdf.save()
def main():
    """Download pages 1-230 of the manual and merge them into output.pdf.

    Page image URLs follow the pattern
    ``{base_url}{page}/bg{page in lowercase hex}.png``. Pages that fail to
    download are skipped.
    """
    base_url = "https://www.manua.ls/viewer/668006/"
    images = []

    for page in tqdm(range(1, 231)):  # Adjust the range as needed
        fetched = download_image(f"{base_url}{page}/bg{page:x}.png")
        if fetched:
            images.append(fetched)

    if not images:
        print("No images downloaded")
        return

    save_images_as_pdf(images, "output.pdf")
    print("PDF created successfully")


if __name__ == "__main__":
    main()
| # Use this script for webp manuals | |
| # example: https://www.manua.ls/growatt/min-3000-11400tl-xh-us/manual?p=1 | |
| # pip install selenium webdriver-manager pillow tqdm | |
| # By default Firefox is used, but it's easy to adapt to chrome, see below | |
| """ | |
| from selenium.webdriver.chrome.options import Options | |
| from selenium.webdriver.chrome.service import Service | |
| from webdriver_manager.chrome import ChromeDriverManager | |
| options = Options() | |
| options.headless = True | |
| options.add_argument("--window-size=1200,1600") | |
| driver = webdriver.Chrome(options=options) | |
| """ | |
| from selenium import webdriver | |
| from selenium.webdriver.firefox.options import Options | |
| from selenium.webdriver.firefox.service import Service | |
| from webdriver_manager.firefox import GeckoDriverManager | |
| from selenium.webdriver.common.by import By | |
| from selenium.webdriver.support.ui import WebDriverWait | |
| from selenium.webdriver.support import expected_conditions as EC | |
| from PIL import Image | |
| from tqdm import tqdm | |
| import io | |
| import time | |
def get_screenshot(driver, url, consent):
    """Load a manual page and return a screenshot of its viewer element.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Address of the manual page to capture.
        consent: When true, wait up to 3 seconds for the cookie-consent
            button and click it; otherwise sleep for one second so the
            page's JavaScript can render the viewer.

    Returns:
        A ``PIL.Image.Image`` decoded from the PNG screenshot of the element
        whose id is ``viewer``.
    """
    # Imported locally so the block does not depend on a new file-level import.
    from selenium.common.exceptions import WebDriverException

    driver.get(url)

    # Wait for consent and give time for JS to load elements
    if consent:
        try:
            consent_button = WebDriverWait(driver, 3).until(
                EC.element_to_be_clickable((By.XPATH, '//button[@aria-label="Consent"]'))
            )
            consent_button.click()
        except WebDriverException:
            # Best effort: the banner may be absent (timeout) or not
            # clickable. Only Selenium errors are ignored, so Ctrl+C and
            # programming errors still surface.
            pass
    else:
        time.sleep(1)

    viewer_div = driver.find_element(By.ID, "viewer")

    # Save screenshot of just one element
    png = viewer_div.screenshot_as_png

    return Image.open(io.BytesIO(png))
def main():
    """Screenshot every page of the manual in headless Firefox and save a PDF.

    Pages 1-81 are loaded one at a time; the cookie-consent banner is only
    handled on the first page. The browser is always shut down, even if a
    page fails, and the collected screenshots are written to output.pdf.
    """
    options = Options()
    # The ``headless`` attribute is deprecated in Selenium 4 and does not
    # work for Firefox; the command-line flag is the supported way.
    options.add_argument("--headless")
    options.set_preference("layout.css.devPixelsPerPx", "1.5")

    driver = webdriver.Firefox(options=options)
    base_url = "https://www.manua.ls"

    images = []
    try:
        for i in tqdm(range(1, 82)):  # number of pages 81
            url = f"{base_url}/growatt/min-3000-11400tl-xh-us/manual?p={i}"  # manual name may differ
            img = get_screenshot(driver, url, consent=(i == 1))
            if img:
                images.append(img)
    finally:
        # Single shutdown point; the original quit the driver a second time
        # after this block.
        driver.quit()

    if images:
        images[0].save("output.pdf", save_all=True, append_images=images[1:])
        print("PDF created successfully")
    else:
        print("No screenshots taken")


if __name__ == "__main__":
    main()
this is great. i made two changes:
viewer_div = driver.find_element(By.CLASS_NAME, "viewer-page")
doing it this way removes the viewer UI (the arrows still show. i know selenium can hide elements, but this is good enough for what i need)
and
options.add_argument("--headless")
the other headless method works for chrome but not FF.
how to use? I can't run it.
❯ python manuals.py
Traceback (most recent call last):
File "D:\Applications\Manua.ls downloader\manuals.py", line 9, in <module>
from tqdm import tqdm
ModuleNotFoundError: No module named 'tqdm'
UPDATE:
alright I thought I should install package written in the from ... import ... section. I tried to download as much as I can but still didn't work.
❯ python manuals.py
0%| | 0/230 [00:01<?, ?it/s]
Traceback (most recent call last):
File "D:\Applications\Manua.ls downloader\manuals.py", line 62, in <module>
main()
~~~~^^
File "D:\Applications\Manua.ls downloader\manuals.py", line 51, in main
image = download_image(url)
File "D:\Applications\Manua.ls downloader\manuals.py", line 19, in download_image
return Image.open(BytesIO(response.content))
~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\komi\scoop\apps\python\current\Lib\site-packages\PIL\Image.py", line 3498, in open
raise UnidentifiedImageError(msg)
PIL.UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x000002C3E22E3D80>
❯ pip install BytesIO
ERROR: Could not find a version that satisfies the requirement BytesIO (from versions: none)
[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: No matching distribution found for BytesIO
how to use? I can't run it.
❯ python manuals.py Traceback (most recent call last): File "D:\Applications\Manua.ls downloader\manuals.py", line 9, in <module> from tqdm import tqdm ModuleNotFoundError: No module named 'tqdm'UPDATE: alright I thought I should install package written in the
from ... import ...section. I tried to download as much as I can but still didn't work.❯ python manuals.py 0%| | 0/230 [00:01<?, ?it/s] Traceback (most recent call last): File "D:\Applications\Manua.ls downloader\manuals.py", line 62, in <module> main() ~~~~^^ File "D:\Applications\Manua.ls downloader\manuals.py", line 51, in main image = download_image(url) File "D:\Applications\Manua.ls downloader\manuals.py", line 19, in download_image return Image.open(BytesIO(response.content)) ~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "C:\Users\komi\scoop\apps\python\current\Lib\site-packages\PIL\Image.py", line 3498, in open raise UnidentifiedImageError(msg) PIL.UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x000002C3E22E3D80>❯ pip install BytesIO ERROR: Could not find a version that satisfies the requirement BytesIO (from versions: none) [notice] A new release of pip is available: 25.2 -> 25.3 [notice] To update, run: python.exe -m pip install --upgrade pip ERROR: No matching distribution found for BytesIO
Put these in a file called requirements.txt:
requests
tqdm
Pillow
reportlab
selenium
webdriver-manager
Run pip install -r requirements.txt, or python -m pip install -r requirements.txt (in case you have multiple Python versions).
Oh wait it failed because I changed the base url to this:
base_url = "https://www.manua.ls/asus/rog-strix-b860-i-gaming-wifi"
I guess I'll have to figure out how to get the base URL.
Oh this is only grabbing images, the manuals that I wanted to download have text in it.
Webp script worked for me, thanks. I was targeting https://www.manua.ls/honda/life-2010/manual and just had to change the main code to this:
base_url = "https://www.manua.ls"
images = []
try:
for i in tqdm(range(1, 248)):
url = f"{base_url}/honda/life-2010/manual?p={i}"
thanks, ill mess with that