@praveenc
Created January 30, 2026 15:24
Scrapes linked AWS documentation pages to a local directory (supports HTML, Markdown, and plain-text output formats).
# /// script
# requires-python = ">=3.12.9"
# dependencies = [
#     "loguru==0.7.3",
#     "beautifulsoup4==4.12.3",
#     "requests",
#     "trafilatura[all]==2.0.0",
#     "rich==13.9.4",
# ]
# ///
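"""Scrape a chain of AWS documentation pages linked via "Next topic" into local files.

Illustrative usage (the file name and start URL below are examples, not part of the
gist; `uv run` resolves the inline dependency block above before executing):

    uv run scrape_aws_docs.py \
        --url https://docs.aws.amazon.com/<service>/latest/userguide/what-is.html \
        --target_dir ./data/markdown \
        --format markdown
"""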
import time
from argparse import ArgumentParser
from copy import deepcopy
from pathlib import Path
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from loguru import logger
from rich.progress import (
    BarColumn,
    Progress,
    SpinnerColumn,
    TextColumn,
    TimeElapsedColumn,
)
from trafilatura import extract, fetch_url
from trafilatura.settings import DEFAULT_CONFIG

# logger.info(f"inside {Path(__file__).name}")


# Configure Trafilatura settings
def get_crawler_config():
    config = deepcopy(DEFAULT_CONFIG)
    config['DEFAULT']['DOWNLOAD_TIMEOUT'] = '30'
    config['DEFAULT']['SLEEP_TIME'] = '5'
    config['DEFAULT']['MIN_FILE_SIZE'] = '10'
    config['DEFAULT']['EXTRACTION_TIMEOUT'] = '30'
    config['DEFAULT']['EXTENSIVE_DATE_SEARCH'] = 'off'
    return config
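
# Note: Trafilatura's DEFAULT_CONFIG is a configparser-style mapping, so every
# override above must be a string ('30', 'off', ...). Working on a deepcopy keeps
# the module-level defaults untouched for any other code importing Trafilatura.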


# Find the URL of the next topic in the HTML content
def find_next_url(html_content, base_url):
    soup = BeautifulSoup(html_content, "html.parser")
    next_topic = soup.find("div", {"class": "next-link"})
    if next_topic and next_topic.get("href"):
        return f"{base_url}{next_topic.get('href')}".replace("./", "")
    return None
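
# Illustrative example of the markup find_next_url() expects (the exact AWS page
# structure may differ):
#
#     <div class="next-link" href="./next-topic.html">Next topic</div>
#
# With base_url "https://docs.aws.amazon.com/<service>/latest/guide/" this returns
# "https://docs.aws.amazon.com/<service>/latest/guide/next-topic.html".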


# Get appropriate filename based on URL and format
def get_filename(url, output_format):
    parsed_url = urlparse(url)
    file_name = parsed_url.path.split("/")[-1] or "index"
    if output_format == "html":
        if not file_name.endswith('.html'):
            file_name = f"{file_name}.html"
    elif output_format == "markdown":
        file_name = f"{file_name.split('.')[0]}.md"
    else:  # text
        file_name = f"{file_name.split('.')[0]}.txt"
    return file_name
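
# For example (hypothetical URLs):
#   get_filename("https://docs.aws.amazon.com/service/latest/guide/what-is.html", "markdown") -> "what-is.md"
#   get_filename("https://docs.aws.amazon.com/service/latest/guide/", "html") -> "index.html"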


def main():
    # Parse command line arguments
    parser = ArgumentParser(description="Web scraper that converts HTML to various formats")
    parser.add_argument("--url", type=str, required=True, help="Starting URL to scrape")
    parser.add_argument("--target_dir", type=str, default="./data/markdown", help="Target directory for output files")
    parser.add_argument(
        "--format",
        type=str,
        choices=["html", "markdown", "text"],
        default="markdown",
        help="Output format: html, markdown, or text",
    )
    args = parser.parse_args()

    # Setup directories and variables
    output_format = args.format
    target_dir = Path(args.target_dir)
    if not target_dir.exists():
        target_dir.mkdir(parents=True, exist_ok=True)
    logger.info(f"Files will be saved to: {target_dir}")

    # Validate URL
    if not args.url:
        logger.error("URL not provided")
        return {"statusCode": 400, "body": "URL not provided."}

    # Extract base URL for constructing full URLs
    base_url = "/".join(args.url.split("/")[:-1]) + "/"

    # Initialize variables
    current_url = args.url
    failed_downloads = []
    crawler_config = get_crawler_config()
    # Trafilatura's extract() names its plain-text format "txt", not "text"
    extract_format = "txt" if output_format == "text" else output_format

    with Progress(
        SpinnerColumn(),
        TextColumn("[bold blue]{task.description}"),
        BarColumn(),
        TextColumn("[bold green]{task.fields[url]}"),
        TimeElapsedColumn(),
    ) as progress:
        task = progress.add_task("[cyan]Scraping...", total=None, url=current_url)

        # Main scraping loop
        while current_url:
            file_name = get_filename(current_url, output_format)
            file_path = target_dir / file_name
            # Check if the file already exists on disk; the page is still fetched
            # so the chain of "next topic" links can be followed.
            already_saved = file_path.exists()
            if already_saved:
                logger.warning(f"File {file_name} already exists. Skipping extraction.")
                progress.update(task, description="File already exists", url=current_url)
            else:
                progress.update(task, description="Processing", url=current_url)
                # logger.info(f"Processing: {current_url} -> {file_name}")

            # Fetch content
            html_content = fetch_url(current_url)
            if not html_content:
                logger.error(f"Failed to download {current_url}")
                failed_downloads.append(current_url)
                # Without the page there is no way to discover the next topic
                break

            if not already_saved:
                # Extract and save content
                content_to_save = extract(
                    html_content,
                    include_comments=False,
                    include_tables=True,
                    output_format=extract_format,
                    with_metadata=False,
                    config=crawler_config,
                )
                if content_to_save:
                    file_path.write_text(content_to_save, encoding="utf-8")
                    progress.update(task, advance=1)
                    # logger.debug(f"Saved {output_format} content to {file_path}")
                else:
                    logger.error(f"Failed to extract content from {current_url}")
                    failed_downloads.append(current_url)

            # Find next URL
            next_url = find_next_url(html_content, base_url)
            if next_url:
                current_url = next_url
            else:
                progress.update(task, description="[bold green]Complete!")
                logger.info("No more 'Next Topic' found. Exiting.")
                break
            time.sleep(0.65)

    # Report results
    if failed_downloads:
        logger.warning(f"Failed to process {len(failed_downloads)} URLs")
        logger.debug(f"Failed URLs: {failed_downloads}")
    logger.info(f"Scraping completed. Content saved in {output_format} format.")
    return {"statusCode": 200, "body": f"Scraping completed. Content saved in {output_format} format."}


if __name__ == "__main__":
    main()
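
The scraper follows each guide's "Next topic" link rather than crawling every hyperlink, so a single run stays within one documentation guide. A page that fails to extract is logged and skipped, a download failure ends the run, and failed URLs are reported at the end rather than retried.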