Scrapes linked AWS documentation pages into a local directory. Output can be saved as HTML, Markdown, or plain text.
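Because the script declares its dependencies in a PEP 723 inline-metadata block (the `# /// script` header below), it can be run directly with `uv`, which resolves the pinned packages on the fly. A minimal invocation might look like the sketch below; the script filename and the starting URL are placeholders, not values from the gist.

```sh
# Hypothetical filename and URL -- substitute your own
uv run scrape_aws_docs.py \
  --url "https://docs.aws.amazon.com/<service>/latest/<guide>/<start-page>.html" \
  --target_dir ./data/markdown \
  --format markdown
```

The scraper then follows each page's "Next Topic" link until no further link is found.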
```python
# /// script
# requires-python = ">=3.12.9"
# dependencies = [
#   "loguru==0.7.3",
#   "beautifulsoup4==4.12.3",
#   "requests",
#   "trafilatura[all]==2.0.0",
#   "rich==13.9.4"
# ]
# ///

import time
from argparse import ArgumentParser
from copy import deepcopy
from pathlib import Path
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from loguru import logger
from rich.progress import (
    BarColumn,
    Progress,
    SpinnerColumn,
    TextColumn,
    TimeElapsedColumn,
)
from trafilatura import extract, fetch_url
from trafilatura.settings import DEFAULT_CONFIG

# logger.info(f"inside {Path(__file__).name}")


# Configure Trafilatura settings
def get_crawler_config():
    config = deepcopy(DEFAULT_CONFIG)
    config['DEFAULT']['DOWNLOAD_TIMEOUT'] = '30'
    config['DEFAULT']['SLEEP_TIME'] = '5'
    config['DEFAULT']['MIN_FILE_SIZE'] = '10'
    config['DEFAULT']['EXTRACTION_TIMEOUT'] = '30'
    config['DEFAULT']['EXTENSIVE_DATE_SEARCH'] = 'off'
    return config


# Find the URL of the next topic in the HTML content
def find_next_url(html_content, base_url):
    soup = BeautifulSoup(html_content, "html.parser")
    next_topic = soup.find("div", {"class": "next-link"})
    if next_topic and next_topic.get("href"):
        return f"{base_url}{next_topic.get('href')}".replace("./", "")
    return None


# Get appropriate filename based on URL and format
def get_filename(url, output_format):
    parsed_url = urlparse(url)
    file_name = parsed_url.path.split("/")[-1] or "index"
    if output_format == "html":
        if not file_name.endswith('.html'):
            file_name = f"{file_name}.html"
    elif output_format == "markdown":
        file_name = f"{file_name.split('.')[0]}.md"
    else:  # text
        file_name = f"{file_name.split('.')[0]}.txt"
    return file_name


def main():
    # Parse command line arguments
    parser = ArgumentParser(description="Web scraper that converts HTML to various formats")
    parser.add_argument("--url", type=str, required=True, help="Starting URL to scrape")
    parser.add_argument("--target_dir", type=str, default="./data/markdown", help="Target directory for output files")
    parser.add_argument("--format", type=str, choices=["html", "markdown", "text"],
                        default="markdown", help="Output format: html, markdown, or text")
    args = parser.parse_args()

    # Setup directories and variables
    output_format = args.format
    target_dir = Path(args.target_dir)
    if not target_dir.exists():
        target_dir.mkdir(parents=True, exist_ok=True)
    logger.info(f"Files will be saved to: {target_dir}")

    # Validate URL
    if not args.url:
        logger.error("URL not provided")
        return {"statusCode": 400, "body": "URL not provided."}

    # Extract base URL for constructing full URLs
    base_url = "/".join(args.url.split("/")[:-1]) + "/"

    # Initialize variables
    current_url = args.url
    failed_downloads = []
    crawler_config = get_crawler_config()

    with Progress(
        SpinnerColumn(),
        TextColumn("[bold blue]{task.description}"),
        BarColumn(),
        TextColumn("[bold green]{task.fields[url]}"),
        TimeElapsedColumn(),
    ) as progress:
        task = progress.add_task("[cyan]Scraping...", total=None, url=current_url)

        # Main scraping loop
        while current_url:
            file_name = get_filename(current_url, output_format)
            file_path = target_dir / file_name
            progress.update(task, description="Processing", url=current_url)
            # logger.info(f"Processing: {current_url} -> {file_name}")

            # Fetch content. Even when the output file already exists we still
            # need the page HTML to locate the "Next Topic" link.
            html_content = fetch_url(current_url)
            if not html_content:
                logger.error(f"Failed to download {current_url}")
                failed_downloads.append(current_url)
                # Without the page HTML the next URL cannot be found, so stop here.
                break

            if file_path.exists():
                logger.warning(f"File {file_name} already exists. Skipping extraction.")
                progress.update(task, description="File already exists", url=current_url)
            else:
                # Extract and save content
                content_to_save = extract(
                    html_content,
                    include_comments=False,
                    include_tables=True,
                    # trafilatura expects "txt" (not "text") for plain-text output
                    output_format="txt" if output_format == "text" else output_format,
                    with_metadata=False,
                    config=crawler_config,
                )
                if content_to_save:
                    file_path.write_text(content_to_save, encoding="utf-8")
                    progress.update(task, advance=1)
                    # logger.debug(f"Saved {output_format} content to {file_path}")
                else:
                    logger.error(f"Failed to extract content from {current_url}")
                    failed_downloads.append(current_url)

            # Find next URL
            next_url = find_next_url(html_content, base_url)
            if next_url:
                current_url = next_url
            else:
                progress.update(task, description="[bold green]Complete!")
                logger.info("No more 'Next Topic' found. Exiting.")
                break

            # Small delay between requests
            time.sleep(0.65)

    # Report results
    if failed_downloads:
        logger.warning(f"Failed to process {len(failed_downloads)} URLs")
        logger.debug(f"Failed URLs: {failed_downloads}")

    logger.info(f"Scraping completed. Content saved in {output_format} format.")
    return {"statusCode": 200, "body": f"Scraping completed. Content saved in {output_format} format."}


if __name__ == "__main__":
    main()
```