Scrapes linked AWS documentation pages into a local directory. Output can be saved as HTML, Markdown, or plain text.
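Because the script declares its dependencies in a PEP 723 inline-metadata block (the `# /// script` header below), it can be run directly with `uv`, which resolves the pinned packages on the fly. A minimal invocation might look like the sketch below; the script filename and the starting URL are placeholders, not values from the gist.

```sh
# Hypothetical filename and URL -- substitute your own
uv run scrape_aws_docs.py \
  --url "https://docs.aws.amazon.com/<service>/latest/<guide>/<start-page>.html" \
  --target_dir ./data/markdown \
  --format markdown
```

The scraper then follows each page's "Next Topic" link until no further link is found.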
```python
# /// script
# requires-python = ">=3.12.9"
# dependencies = [
#   "loguru==0.7.3",
#   "beautifulsoup4==4.12.3",
#   "requests",
#   "trafilatura[all]==2.0.0",
#   "rich==13.9.4"
# ]
# ///

import time
from argparse import ArgumentParser
from copy import deepcopy
from pathlib import Path
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from loguru import logger
from rich.progress import (
    BarColumn,
    Progress,
    SpinnerColumn,
    TextColumn,
    TimeElapsedColumn,
)
from trafilatura import extract, fetch_url
from trafilatura.settings import DEFAULT_CONFIG

# logger.info(f"inside {Path(__file__).name}")


# Configure Trafilatura settings
def get_crawler_config():
    config = deepcopy(DEFAULT_CONFIG)
    config['DEFAULT']['DOWNLOAD_TIMEOUT'] = '30'
    config['DEFAULT']['SLEEP_TIME'] = '5'
    config['DEFAULT']['MIN_FILE_SIZE'] = '10'
    config['DEFAULT']['EXTRACTION_TIMEOUT'] = '30'
    config['DEFAULT']['EXTENSIVE_DATE_SEARCH'] = 'off'
    return config


# Find the URL of the next topic in the HTML content
def find_next_url(html_content, base_url):
    soup = BeautifulSoup(html_content, "html.parser")
    next_topic = soup.find("div", {"class": "next-link"})
    if next_topic and next_topic.get("href"):
        return f"{base_url}{next_topic.get('href')}".replace("./", "")
    return None


# Get appropriate filename based on URL and format
def get_filename(url, output_format):
    parsed_url = urlparse(url)
    file_name = parsed_url.path.split("/")[-1] or "index"
    if output_format == "html":
        if not file_name.endswith('.html'):
            file_name = f"{file_name}.html"
    elif output_format == "markdown":
        file_name = f"{file_name.split('.')[0]}.md"
    else:  # text
        file_name = f"{file_name.split('.')[0]}.txt"
    return file_name


def main():
    # Parse command line arguments
    parser = ArgumentParser(description="Web scraper that converts HTML to various formats")
    parser.add_argument("--url", type=str, required=True, help="Starting URL to scrape")
    parser.add_argument("--target_dir", type=str, default="./data/markdown", help="Target directory for output files")
    parser.add_argument("--format", type=str, choices=["html", "markdown", "text"],
                        default="markdown", help="Output format: html, markdown, or text")
    args = parser.parse_args()

    # Setup directories and variables
    output_format = args.format
    target_dir = Path(args.target_dir)
    if not target_dir.exists():
        target_dir.mkdir(parents=True, exist_ok=True)
    logger.info(f"Files will be saved to: {target_dir}")

    # Validate URL
    if not args.url:
        logger.error("URL not provided")
        return {"statusCode": 400, "body": "URL not provided."}

    # Extract base URL for constructing full URLs
    base_url = "/".join(args.url.split("/")[:-1]) + "/"

    # Initialize variables
    current_url = args.url
    failed_downloads = []
    crawler_config = get_crawler_config()

    with Progress(
        SpinnerColumn(),
        TextColumn("[bold blue]{task.description}"),
        BarColumn(),
        TextColumn("[bold green]{task.fields[url]}"),
        TimeElapsedColumn(),
    ) as progress:
        task = progress.add_task("[cyan]Scraping...", total=None, url=current_url)

        # Main scraping loop
        while current_url:
            file_name = get_filename(current_url, output_format)
            file_path = target_dir / file_name
            progress.update(task, description="Processing", url=current_url)
            # logger.info(f"Processing: {current_url} -> {file_name}")

            # Fetch content. Even when the output file already exists we still
            # need the page HTML to locate the "Next Topic" link.
            html_content = fetch_url(current_url)
            if not html_content:
                logger.error(f"Failed to download {current_url}")
                failed_downloads.append(current_url)
                # Without the page HTML the next URL cannot be found, so stop here.
                break

            if file_path.exists():
                logger.warning(f"File {file_name} already exists. Skipping extraction.")
                progress.update(task, description="File already exists", url=current_url)
            else:
                # Extract and save content
                content_to_save = extract(
                    html_content,
                    include_comments=False,
                    include_tables=True,
                    # trafilatura expects "txt" (not "text") for plain-text output
                    output_format="txt" if output_format == "text" else output_format,
                    with_metadata=False,
                    config=crawler_config,
                )
                if content_to_save:
                    file_path.write_text(content_to_save, encoding="utf-8")
                    progress.update(task, advance=1)
                    # logger.debug(f"Saved {output_format} content to {file_path}")
                else:
                    logger.error(f"Failed to extract content from {current_url}")
                    failed_downloads.append(current_url)

            # Find next URL
            next_url = find_next_url(html_content, base_url)
            if next_url:
                current_url = next_url
            else:
                progress.update(task, description="[bold green]Complete!")
                logger.info("No more 'Next Topic' found. Exiting.")
                break

            # Small delay between requests
            time.sleep(0.65)

    # Report results
    if failed_downloads:
        logger.warning(f"Failed to process {len(failed_downloads)} URLs")
        logger.debug(f"Failed URLs: {failed_downloads}")

    logger.info(f"Scraping completed. Content saved in {output_format} format.")
    return {"statusCode": 200, "body": f"Scraping completed. Content saved in {output_format} format."}


if __name__ == "__main__":
    main()
```