Created
December 30, 2025 00:48
-
-
Save ethanabrooks/7e0d3265d2d5adfb4d875d5528767184 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "id": "0a3a1d08", | |
| "metadata": {}, | |
| "source": [ | |
| "# Web Content Extraction Tool Comparison\n", | |
| "\n", | |
| "Comparing tools for extracting readable text/markdown from web pages.\n", | |
| "\n", | |
| "## Tools Evaluated\n", | |
| "\n", | |
| "| Tool | Type | Notes |\n", | |
| "|------|------|-------|\n", | |
| "| trafilatura | Python | Purpose-built for web text extraction |\n", | |
| "| newspaper3k | Python | News article extraction |\n", | |
| "| readability-lxml | Python | Python port of Mozilla Readability |\n", | |
| "| Mozilla Readability | JavaScript | Original Firefox Reader View library |\n", | |
| "| Playwright | Python | Browser automation for JS-rendered pages |\n", | |
| "| html2text | Python | HTML to Markdown converter |\n", | |
| "| BeautifulSoup | Python | Manual extraction baseline |\n", | |
| "| Parallel.ai | API | Commercial service (requires API key) |\n", | |
| "| Exa | API | AI-native search and content extraction API (requires API key) |" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "025255de", | |
| "metadata": {}, | |
| "source": [ | |
| "## Configuration" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "d880d862", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2025-12-30T00:46:32.595548Z", | |
| "iopub.status.busy": "2025-12-30T00:46:32.595263Z", | |
| "iopub.status.idle": "2025-12-30T00:46:32.603824Z", | |
| "shell.execute_reply": "2025-12-30T00:46:32.602935Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "MAX_CHARS = 3000 # Maximum characters to display per output" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "b24ee9dd", | |
| "metadata": {}, | |
| "source": [ | |
| "## Setup" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "97de6407", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2025-12-30T00:46:32.606893Z", | |
| "iopub.status.busy": "2025-12-30T00:46:32.606666Z", | |
| "iopub.status.idle": "2025-12-30T00:46:32.659835Z", | |
| "shell.execute_reply": "2025-12-30T00:46:32.659436Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import json\n", | |
| "import os\n", | |
| "import subprocess\n", | |
| "import requests\n", | |
| "from pathlib import Path" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "ee68e648", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2025-12-30T00:46:32.661065Z", | |
| "iopub.status.busy": "2025-12-30T00:46:32.660990Z", | |
| "iopub.status.idle": "2025-12-30T00:46:32.871743Z", | |
| "shell.execute_reply": "2025-12-30T00:46:32.871398Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Fetched 2,003 bytes\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "TEST_URL = \"https://www.amazon.com/\"\n", | |
| "\n", | |
| "headers = {\n", | |
| "    \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36\",\n", | |
| "    \"Accept\": \"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\",\n", | |
| "    \"Accept-Language\": \"en-US,en;q=0.9\",\n", | |
| "}\n", | |
| "\n", | |
| "# Exactly one of these is set once the request resolves; later cells skip\n", | |
| "# their work and report fetch_error when html_content is falsy.\n", | |
| "html_content = None\n", | |
| "fetch_error = None\n", | |
| "\n", | |
| "try:\n", | |
| "    response = requests.get(TEST_URL, headers=headers, timeout=30)\n", | |
| "except requests.RequestException as exc:\n", | |
| "    # Connection errors and timeouts raise instead of returning a response;\n", | |
| "    # record them like an HTTP failure so the rest of the notebook still runs.\n", | |
| "    fetch_error = f\"{type(exc).__name__}: {exc}\"\n", | |
| "    print(f\"Fetch failed: {fetch_error}\")\n", | |
| "else:\n", | |
| "    if response.ok:\n", | |
| "        html_content = response.text\n", | |
| "        # response.content is the raw body, so this is a real byte count\n", | |
| "        # (len(response.text) would count decoded characters).\n", | |
| "        print(f\"Fetched {len(response.content):,} bytes\")\n", | |
| "    else:\n", | |
| "        fetch_error = f\"HTTP {response.status_code}: {response.reason}\"\n", | |
| "        print(f\"Fetch failed: {fetch_error}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "20a00121", | |
| "metadata": {}, | |
| "source": [ | |
| "## 1. Trafilatura\n", | |
| "\n", | |
| "[trafilatura](https://trafilatura.readthedocs.io/) - Purpose-built for web text extraction with native markdown output." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "id": "77dc050c", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2025-12-30T00:46:32.873026Z", | |
| "iopub.status.busy": "2025-12-30T00:46:32.872952Z", | |
| "iopub.status.idle": "2025-12-30T00:46:33.017733Z", | |
| "shell.execute_reply": "2025-12-30T00:46:33.017294Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "JavaScript is disabled\n", | |
| "In order to continue, we need to verify that you're not a robot. This requires JavaScript. Enable JavaScript and then reload the page.\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "import trafilatura\n", | |
| "\n", | |
| "if not html_content:\n", | |
| "    # Nothing was fetched, so there is nothing to extract.\n", | |
| "    trafilatura_text = None\n", | |
| "    print(f\"Skipped: {fetch_error}\")\n", | |
| "else:\n", | |
| "    # Ask for markdown with tables and links included and images excluded.\n", | |
| "    trafilatura_text = trafilatura.extract(\n", | |
| "        html_content,\n", | |
| "        output_format=\"markdown\",\n", | |
| "        include_tables=True,\n", | |
| "        include_links=True,\n", | |
| "        include_images=False,\n", | |
| "    )\n", | |
| "    if trafilatura_text:\n", | |
| "        print(trafilatura_text[:MAX_CHARS])\n", | |
| "    else:\n", | |
| "        print(\"No content\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "37293fa6", | |
| "metadata": {}, | |
| "source": [ | |
| "## 2. Newspaper3k\n", | |
| "\n", | |
| "[newspaper3k](https://newspaper.readthedocs.io/) - Designed for news article extraction." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "id": "f406ca17", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2025-12-30T00:46:33.018957Z", | |
| "iopub.status.busy": "2025-12-30T00:46:33.018851Z", | |
| "iopub.status.idle": "2025-12-30T00:46:33.115833Z", | |
| "shell.execute_reply": "2025-12-30T00:46:33.115421Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "No content\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "from newspaper import Article\n", | |
| "\n", | |
| "# The Article is created even when the fetch failed because the summary cell reads article.text.\n", | |
| "article = Article(TEST_URL)\n", | |
| "newspaper_error = None\n", | |
| "\n", | |
| "if not html_content:\n", | |
| "    print(f\"Skipped: {fetch_error}\")\n", | |
| "else:\n", | |
| "    # Reuse the HTML fetched earlier instead of downloading the page again.\n", | |
| "    article.set_html(html_content)\n", | |
| "    try:\n", | |
| "        article.parse()\n", | |
| "    except ValueError as exc:\n", | |
| "        newspaper_error = f\"ValueError: {exc}\"\n", | |
| "        print(f\"Newspaper3k error: {newspaper_error}\")\n", | |
| "    else:\n", | |
| "        if article.text:\n", | |
| "            print(article.text[:MAX_CHARS])\n", | |
| "        else:\n", | |
| "            print(\"No content\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "4580af76", | |
| "metadata": {}, | |
| "source": [ | |
| "## 3. Readability-lxml\n", | |
| "\n", | |
| "[readability-lxml](https://github.com/buriy/python-readability) - Python port of Mozilla Readability.\n", | |
| "Outputs HTML, so we pipe through html2text for markdown." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "id": "a0111b20", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2025-12-30T00:46:33.116932Z", | |
| "iopub.status.busy": "2025-12-30T00:46:33.116852Z", | |
| "iopub.status.idle": "2025-12-30T00:46:33.124238Z", | |
| "shell.execute_reply": "2025-12-30T00:46:33.123919Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "# JavaScript is disabled\n", | |
| "\n", | |
| "In order to continue, we need to verify that you're not a robot. This requires JavaScript. Enable JavaScript and then reload the page. \n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "from readability import Document\n", | |
| "import html2text\n", | |
| "\n", | |
| "# Shared HTML-to-Markdown converter; later cells reuse h2t as well.\n", | |
| "h2t = html2text.HTML2Text()\n", | |
| "h2t.ignore_links = False\n", | |
| "h2t.ignore_images = True\n", | |
| "h2t.body_width = 0\n", | |
| "\n", | |
| "readability_markdown = \"\"\n", | |
| "readability_error = None\n", | |
| "\n", | |
| "if not html_content:\n", | |
| "    print(f\"Skipped: {fetch_error}\")\n", | |
| "else:\n", | |
| "    doc = Document(html_content)\n", | |
| "    try:\n", | |
| "        # summary() yields HTML, which is then converted to markdown.\n", | |
| "        readable_html = doc.summary()\n", | |
| "        readability_markdown = h2t.handle(readable_html)\n", | |
| "    except Exception as exc:\n", | |
| "        readability_error = f\"{type(exc).__name__}: {exc}\"\n", | |
| "        print(f\"Readability-lxml error: {readability_error}\")\n", | |
| "    else:\n", | |
| "        print(readability_markdown[:MAX_CHARS])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "3ef93b51", | |
| "metadata": {}, | |
| "source": [ | |
| "## 4. Mozilla Readability (JavaScript)\n", | |
| "\n", | |
| "[Mozilla Readability](https://github.com/mozilla/readability) - Original Firefox Reader View library, called via Node.js." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "id": "3dba3e69", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2025-12-30T00:46:33.125289Z", | |
| "iopub.status.busy": "2025-12-30T00:46:33.125227Z", | |
| "iopub.status.idle": "2025-12-30T00:46:34.222748Z", | |
| "shell.execute_reply": "2025-12-30T00:46:34.222344Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "script_path = Path(\"readability_extract.js\")\n", | |
| "mozilla_markdown = \"\"\n", | |
| "mozilla_error = None\n", | |
| "\n", | |
| "if not html_content:\n", | |
| "    print(f\"Skipped: {fetch_error}\")\n", | |
| "elif not script_path.exists():\n", | |
| "    print(\"readability_extract.js not found\")\n", | |
| "else:\n", | |
| "    try:\n", | |
| "        # The page HTML is piped to the Node script on stdin; its stdout is parsed as JSON.\n", | |
| "        result = subprocess.run(\n", | |
| "            [\"node\", str(script_path)],\n", | |
| "            input=html_content,\n", | |
| "            capture_output=True,\n", | |
| "            text=True,\n", | |
| "            timeout=30,\n", | |
| "        )\n", | |
| "        if result.returncode != 0:\n", | |
| "            mozilla_error = result.stderr\n", | |
| "        else:\n", | |
| "            # Assumes the script prints a JSON object (or null) - TODO confirm against\n", | |
| "            # readability_extract.js. A null document or null \"content\" becomes empty HTML.\n", | |
| "            mozilla_result = json.loads(result.stdout) or {}\n", | |
| "            mozilla_markdown = h2t.handle(mozilla_result.get(\"content\") or \"\")\n", | |
| "    except (OSError, subprocess.TimeoutExpired, json.JSONDecodeError) as exc:\n", | |
| "        # OSError: `node` could not be started; TimeoutExpired: the script ran past 30 s;\n", | |
| "        # JSONDecodeError: stdout was not valid JSON. Report instead of aborting the notebook.\n", | |
| "        mozilla_error = f\"{type(exc).__name__}: {exc}\"\n", | |
| "\n", | |
| "    # Compare against None so an empty stderr on a failed run is still reported as an error.\n", | |
| "    if mozilla_error is None:\n", | |
| "        print(mozilla_markdown[:MAX_CHARS])\n", | |
| "    else:\n", | |
| "        print(f\"Error: {mozilla_error}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "ceaffb93", | |
| "metadata": {}, | |
| "source": [ | |
| "## 5. Playwright\n", | |
| "\n", | |
| "[Playwright](https://playwright.dev/) - Browser automation that renders JavaScript before extraction." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "id": "5e9bfba5", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2025-12-30T00:46:34.224089Z", | |
| "iopub.status.busy": "2025-12-30T00:46:34.223998Z", | |
| "iopub.status.idle": "2025-12-30T00:47:36.273783Z", | |
| "shell.execute_reply": "2025-12-30T00:47:36.273172Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Playwright error: Timeout after 60000ms\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "import asyncio\n", | |
| "from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout\n", | |
| "import nest_asyncio\n", | |
| "\n", | |
| "nest_asyncio.apply()\n", | |
| "\n", | |
| "\n", | |
| "async def fetch_with_playwright(\n", | |
| "    url: str, timeout: int = 60000, wait_until: str = \"networkidle\"\n", | |
| ") -> tuple[str | None, str | None]:\n", | |
| "    \"\"\"Load ``url`` in headless Chromium and return the rendered page HTML.\n", | |
| "\n", | |
| "    Args:\n", | |
| "        url: Page to navigate to.\n", | |
| "        timeout: Navigation timeout in milliseconds, passed to ``page.goto``.\n", | |
| "        wait_until: Load state ``page.goto`` waits for. Defaults to\n", | |
| "            \"networkidle\" (the previous hard-coded value); pass \"load\" or\n", | |
| "            \"domcontentloaded\" for pages that never stop making requests.\n", | |
| "\n", | |
| "    Returns:\n", | |
| "        ``(html, None)`` on success, or ``(None, error_message)`` on a\n", | |
| "        timeout, an HTTP status >= 400, or any other exception.\n", | |
| "    \"\"\"\n", | |
| "    try:\n", | |
| "        async with async_playwright() as p:\n", | |
| "            browser = await p.chromium.launch(headless=True)\n", | |
| "            try:\n", | |
| "                page = await browser.new_page()\n", | |
| "                response = await page.goto(url, wait_until=wait_until, timeout=timeout)\n", | |
| "                html = await page.content()\n", | |
| "            finally:\n", | |
| "                # Close the browser even when navigation raises (e.g. on timeout).\n", | |
| "                await browser.close()\n", | |
| "        # goto() may return no response object; treat that as \"no status to check\".\n", | |
| "        status = response.status if response else None\n", | |
| "        if status and status >= 400:\n", | |
| "            return None, f\"HTTP {status}\"\n", | |
| "        return html, None\n", | |
| "    except PlaywrightTimeout:\n", | |
| "        return None, f\"Timeout after {timeout}ms\"\n", | |
| "    except Exception as e:\n", | |
| "        return None, f\"{type(e).__name__}: {e}\"\n", | |
| "\n", | |
| "\n", | |
| "playwright_html = None\n", | |
| "playwright_extracted = None\n", | |
| "playwright_error = None\n", | |
| "\n", | |
| "# Run the coroutine to completion on the current event loop and unpack (html, error).\n", | |
| "loop = asyncio.get_event_loop()\n", | |
| "result = loop.run_until_complete(fetch_with_playwright(TEST_URL))\n", | |
| "playwright_html, playwright_error = result\n", | |
| "\n", | |
| "if not playwright_html:\n", | |
| "    print(f\"Playwright error: {playwright_error}\")\n", | |
| "else:\n", | |
| "    # Same trafilatura settings as the plain-requests extraction, applied to the rendered HTML.\n", | |
| "    playwright_extracted = trafilatura.extract(\n", | |
| "        playwright_html,\n", | |
| "        output_format=\"markdown\",\n", | |
| "        include_tables=True,\n", | |
| "        include_links=True,\n", | |
| "        include_images=False,\n", | |
| "    )\n", | |
| "    if playwright_extracted:\n", | |
| "        print(playwright_extracted[:MAX_CHARS])\n", | |
| "    else:\n", | |
| "        print(\"No content extracted from HTML\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "6e522bf1", | |
| "metadata": {}, | |
| "source": [ | |
| "## 6. Parallel.ai\n", | |
| "\n", | |
| "[Parallel.ai](https://docs.parallel.ai/) - Commercial API for web extraction using the Python SDK." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "id": "a66a4eb3", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2025-12-30T00:47:36.275380Z", | |
| "iopub.status.busy": "2025-12-30T00:47:36.275258Z", | |
| "iopub.status.idle": "2025-12-30T00:47:37.534930Z", | |
| "shell.execute_reply": "2025-12-30T00:47:37.534413Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "## Skip to\n", | |
| "\n", | |
| "* [Main content]()\n", | |
| "\n", | |
| "* * *\n", | |
| "\n", | |
| "## Keyboard shortcuts\n", | |
| "\n", | |
| "* [Search alt \\+ /](javascript:void\\(0\\))\n", | |
| "* [Cart shift \\+ alt \\+ C](javascript:void\\(0\\))\n", | |
| "* [Home shift \\+ alt \\+ H](javascript:void\\(0\\))\n", | |
| "* [Orders shift \\+ alt \\+ O](javascript:void\\(0\\))\n", | |
| "* Show/Hide shortcuts\n", | |
| " \n", | |
| " shift \\+ alt \\+ Z\n", | |
| "\n", | |
| "To move between items, use your keyboard's up or down arrows.\n", | |
| "\n", | |
| "[.us](/ref=nav_logo)\n", | |
| "\n", | |
| "Delivering to Washington 20001 Update location\n", | |
| "\n", | |
| "All\n", | |
| "\n", | |
| "Select the department you want to search in All Departments Alexa Skills Amazon Autos Amazon Devices Amazon Fresh Amazon Global Store Amazon Haul Amazon One Medical Amazon Pharmacy Amazon Resale Appliances Apps & Games Arts, Crafts & Sewing Audible Books & Originals Automotive Parts & Accessories Baby Beauty & Personal Care Books CDs & Vinyl Cell Phones & Accessories Clothing, Shoes & Jewelry Women's Clothing, Shoes & Jewelry Men's Clothing, Shoes & Jewelry Girl's Clothing, Shoes & Jewelry Boy's Clothing, Shoes & Jewelry Baby Clothing, Shoes & Jewelry Collectibles & Fine Art Computers Credit and Payment Cards Digital Music Electronics Garden & Outdoor Gift Cards Grocery & Gourmet Food Handmade Health, Household & Baby Care Home & Business Services Home & Kitchen Industrial & Scientific Just for Prime Kindle Store Luggage & Travel Gear Luxury Stores Magazine Subscriptions Movies & TV Musical Instruments Office Products Pet Supplies Premium Beauty Prime Video Same-Day Store Smart Home Software Sports & Outdoors Subscribe & Save Subscription Boxes Tools & Home Improvement Toys & Games Under $10 Video Games Weis Whole Foods Market\n", | |
| "\n", | |
| "Search Amazon\n", | |
| "\n", | |
| "[EN](/customer-preferences/edit?ie=UTF8&preferencesReturnUrl=%2F&ref_=topnav_lang)\n", | |
| "\n", | |
| "[Hello, sign in Account & Lists](https://www.amazon.com/ap/signin?openid.pape.max_auth_age=0&openid.return_to=https%3A%2F%2Fwww.amazon.com%2F%3F_encoding%3DUTF8%26ref_%3Dnav_ya_signin&openid.identity=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.assoc_handle=usflex&openid.mode=checkid_setup&openid.claimed_id=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.ns=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0)\n", | |
| "\n", | |
| "[Returns & Orders](/gp/css/order-history?ref_=nav_orders_first) [0 Cart](/gp/cart/view.html?ref_=nav_cart)\n", | |
| "\n", | |
| "[_Previous slide_](#)\n", | |
| "\n", | |
| "1. [](/prime/?_encoding=UTF8&pd_rd_w=D2xET&content-id=amzn1.sym.12323c3f-5f7f-4fd2-b5fa-9a0f8d235248&pf_rd_p=12323c3f-5f7f-4fd2-b5fa-9a0f8d235248&pf_rd_r=PHMKKZA0836966GKS86P&pd_rd_wg=uFK58&pd_rd_r=9ec656f8-8a84-4f69-89e9-4bd6afe85675&ref_=pd_hp_d_hero_unk)\n", | |
| "2. \n", | |
| "3. \n", | |
| "4. \n", | |
| "5. \n", | |
| "6. \n", | |
| "7.\n", | |
| "\n", | |
| "[_Next slide_](#)\n", | |
| "\n", | |
| "## 15% off pre-loved pieces\n", | |
| "\n", | |
| "[Chanel](/s/?_encoding=UTF8&i=luxury&bbn=207247666011&pd_rd_w=dZq8w&content-id=amzn1.sym.685d4b15-38d8-4316-8e51-9e52d6ea700a&pf_rd_p=685d4b15-38d8-4316-8e51-9e52d6ea700a&pf_rd_r=PHMKKZA0836966GKS86P&pd_rd_wg=B2kXn&pd_rd_r=34bcd810-c58c-4ebf-8389-47772e5842f6&ref_=pd_hp_d_atf_unk)\n", | |
| "\n", | |
| "[Van Cleef](/s/?_encoding=UTF8&i=luxury&srs=207247666011&bbn=207247666011&rh=n%3A207247666011%2Cp_123%3A235110&s=date-desc\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "from parallel import Parallel\n", | |
| "\n", | |
| "parallel_result = None\n", | |
| "parallel_error = None\n", | |
| "\n", | |
| "api_key = os.getenv(\"PARALLEL_API_KEY\")\n", | |
| "if not api_key:\n", | |
| "    parallel_error = \"PARALLEL_API_KEY not set\"\n", | |
| "else:\n", | |
| "    try:\n", | |
| "        client = Parallel(api_key=api_key)\n", | |
| "        extract = client.beta.extract(\n", | |
| "            urls=[TEST_URL],\n", | |
| "            objective=\"Extract the main content of this page\",\n", | |
| "            excerpts=True,\n", | |
| "            full_content=True,\n", | |
| "        )\n", | |
| "    except Exception as exc:\n", | |
| "        # A failed API call is reported like any other tool failure so the\n", | |
| "        # remaining cells and the summary still run.\n", | |
| "        parallel_error = f\"{type(exc).__name__}: {exc}\"\n", | |
| "    else:\n", | |
| "        parallel_result = extract.results\n", | |
| "        if not parallel_result:\n", | |
| "            # Without this the message below would read \"Parallel.ai error: None\".\n", | |
| "            parallel_error = \"No results returned\"\n", | |
| "\n", | |
| "if parallel_result:\n", | |
| "    # `item` rather than `result`, so the name used by earlier cells is not clobbered.\n", | |
| "    for item in parallel_result:\n", | |
| "        if item.full_content:\n", | |
| "            print(item.full_content[:MAX_CHARS])\n", | |
| "        else:\n", | |
| "            print(\"No content\")\n", | |
| "else:\n", | |
| "    print(f\"Parallel.ai error: {parallel_error}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "fb8e4c91", | |
| "metadata": {}, | |
| "source": [ | |
| "## 7. Exa\n", | |
| "\n", | |
| "[Exa](https://exa.ai/) - AI-native search and content extraction API." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "id": "fd2bc1d0", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2025-12-30T00:47:37.536129Z", | |
| "iopub.status.busy": "2025-12-30T00:47:37.536054Z", | |
| "iopub.status.idle": "2025-12-30T00:47:38.125967Z", | |
| "shell.execute_reply": "2025-12-30T00:47:38.125403Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Amazon.com. Spend less. Smile more.\n", | |
| "\n", | |
| "[.us](https://www.amazon.com/ref=nav_logo)\n", | |
| "[\n", | |
| "Delivering to Buffalo 14205Update location\n", | |
| "]()\n", | |
| "All**\n", | |
| "Select the department you want to search inAll DepartmentsAlexa SkillsAmazon AutosAmazon DevicesAmazon Global StoreAmazon HaulAmazon One MedicalAmazon PharmacyAmazon ResaleAppliancesApps & GamesArts, Crafts & SewingAudible Books & OriginalsAutomotive Parts & AccessoriesBabyBeauty & Personal CareBooksCDs & VinylCell Phones & AccessoriesClothing, Shoes & JewelryWomen's Clothing, Shoes & JewelryMen's Clothing, Shoes & JewelryGirl's Clothing, Shoes & JewelryBoy's Clothing, Shoes & JewelryBaby Clothing, Shoes & JewelryCollectibles & Fine ArtComputersCredit and Payment CardsDigital MusicElectronicsGarden & OutdoorGift CardsGrocery & Gourmet FoodHandmadeHealth, Household & Baby CareHome & Business ServicesHome & KitchenIndustrial & ScientificJust for PrimeKindle StoreLuggage & Travel GearLuxury StoresMagazine SubscriptionsMovies & TVMusical InstrumentsOffice ProductsPet SuppliesPremium BeautyPrime VideoSame-Day StoreSmart HomeSoftwareSports & OutdoorsSubscribe & SaveSubscription BoxesTools & Home ImprovementToys & GamesUnder $10Video GamesWhole Foods Market\n", | |
| "Search Amazon\n", | |
| "[\n", | |
| "EN\n", | |
| "](https://www.amazon.com/customer-preferences/edit?ie=UTF8&preferencesReturnUrl=/&ref_=topnav_lang)\n", | |
| "[\n", | |
| "Hello, sign in\n", | |
| "Account & Lists](https://www.amazon.com/ap/signin?openid.pape.max_auth_age=0&openid.return_to=https://www.amazon.com/?ref_=nav_ya_signin&openid.identity=http://specs.openid.net/auth/2.0/identifier_select&openid.assoc_handle=usflex&openid.mode=checkid_setup&openid.claimed_id=http://specs.openid.net/auth/2.0/identifier_select&openid.ns=http://specs.openid.net/auth/2.0)\n", | |
| "[Returns& Orders](https://www.amazon.com/gp/css/order-history?ref_=nav_orders_first)[\n", | |
| "0\n", | |
| "Cart\n", | |
| "](https://www.amazon.com/gp/cart/view.html?ref_=nav_cart)\n", | |
| "[Sign in](https://www.amazon.com/ap/signin?openid.pape.max_auth_age=0&openid.return_to=https://www.amazon.com/?ref_=nav_signin&openid.identity=http://specs.openid.net/auth/2.0/identifier_select&openid.assoc_handle=usflex&openid.mode=checkid_setup&openid.claimed_id=http://specs.openid.net/auth/2.0/identifier_select&openid.ns=http://specs.openid.net/auth/2.0)\n", | |
| "New customer?[Start here.](https://www.amazon.com/ap/register?openid.pape.max_auth_age=0&openid.return_to=https://www.amazon.com/?_encoding=UTF8&ref_=nav_newcust&openid.identity=http://specs.openid.net/auth/2.0/identifier_select&openid.assoc_handle=usflex&openid.mode=checkid_setup&openid.claimed_id=http://specs.openid.net\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "from exa_py import Exa\n", | |
| "\n", | |
| "exa_result = None\n", | |
| "exa_error = None\n", | |
| "\n", | |
| "exa_api_key = os.getenv(\"EXA_API_KEY\")\n", | |
| "if not exa_api_key:\n", | |
| "    exa_error = \"EXA_API_KEY not set\"\n", | |
| "else:\n", | |
| "    try:\n", | |
| "        exa = Exa(exa_api_key)\n", | |
| "        exa_response = exa.get_contents(urls=[TEST_URL], text=True)\n", | |
| "    except Exception as exc:\n", | |
| "        # A failed API call is reported like any other tool failure so the\n", | |
| "        # remaining cells and the summary still run.\n", | |
| "        exa_error = f\"{type(exc).__name__}: {exc}\"\n", | |
| "    else:\n", | |
| "        if exa_response.results:\n", | |
| "            exa_result = exa_response.results[0].text\n", | |
| "        if not exa_result:\n", | |
| "            # Without this the message below would read \"Exa error: None\".\n", | |
| "            exa_error = \"No content returned\"\n", | |
| "\n", | |
| "if exa_result:\n", | |
| "    print(exa_result[:MAX_CHARS])\n", | |
| "else:\n", | |
| "    print(f\"Exa error: {exa_error}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "b5c89c1a", | |
| "metadata": {}, | |
| "source": [ | |
| "## 8. html2text (direct)\n", | |
| "\n", | |
| "[html2text](https://github.com/Alir3z4/html2text) - Converts HTML to Markdown without readability filtering." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "id": "298e1fb0", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2025-12-30T00:47:38.127230Z", | |
| "iopub.status.busy": "2025-12-30T00:47:38.127131Z", | |
| "iopub.status.idle": "2025-12-30T00:47:38.129467Z", | |
| "shell.execute_reply": "2025-12-30T00:47:38.129036Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "# JavaScript is disabled\n", | |
| "\n", | |
| "In order to continue, we need to verify that you're not a robot. This requires JavaScript. Enable JavaScript and then reload the page. \n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "if not html_content:\n", | |
| "    html2text_output = \"\"\n", | |
| "    print(f\"Skipped: {fetch_error}\")\n", | |
| "else:\n", | |
| "    # Convert the full fetched page with the shared h2t converter, no readability pass first.\n", | |
| "    html2text_output = h2t.handle(html_content)\n", | |
| "    print(html2text_output[:MAX_CHARS])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "f6584f23", | |
| "metadata": {}, | |
| "source": [ | |
| "## 9. BeautifulSoup\n", | |
| "\n", | |
| "[BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) - Manual text extraction baseline." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "id": "1a165cb6", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2025-12-30T00:47:38.130779Z", | |
| "iopub.status.busy": "2025-12-30T00:47:38.130673Z", | |
| "iopub.status.idle": "2025-12-30T00:47:38.134792Z", | |
| "shell.execute_reply": "2025-12-30T00:47:38.134323Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "JavaScript is disabled\n", | |
| "In order to continue, we need to verify that you're not a robot.\n", | |
| " This requires JavaScript. Enable JavaScript and then reload the page.\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "from bs4 import BeautifulSoup\n", | |
| "\n", | |
| "if not html_content:\n", | |
| "    bs_text = \"\"\n", | |
| "    print(f\"Skipped: {fetch_error}\")\n", | |
| "else:\n", | |
| "    soup = BeautifulSoup(html_content, \"lxml\")\n", | |
| "    # Remove script/style/nav/footer/header elements before collecting text.\n", | |
| "    for el in soup.find_all([\"script\", \"style\", \"nav\", \"footer\", \"header\"]):\n", | |
| "        el.decompose()\n", | |
| "\n", | |
| "    # NOTE(review): \"mw-content-text\" looks like a MediaWiki-specific container id;\n", | |
| "    # for other sites this lookup presumably finds nothing and the whole page is used - confirm.\n", | |
| "    content = soup.find(\"div\", id=\"mw-content-text\")\n", | |
| "    bs_text = (content or soup).get_text(separator=\"\\n\", strip=True)\n", | |
| "    print(bs_text[:MAX_CHARS])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "899b203c", | |
| "metadata": {}, | |
| "source": [ | |
| "## Summary" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "id": "0888b242", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2025-12-30T00:47:38.135838Z", | |
| "iopub.status.busy": "2025-12-30T00:47:38.135771Z", | |
| "iopub.status.idle": "2025-12-30T00:47:38.138270Z", | |
| "shell.execute_reply": "2025-12-30T00:47:38.137881Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "exa : 146,107 chars\n", | |
| "parallel.ai : 48,564 chars\n", | |
| "beautifulsoup : 165 chars\n", | |
| "readability-lxml : 162 chars\n", | |
| "html2text : 162 chars\n", | |
| "trafilatura : 157 chars\n", | |
| "mozilla readability : 1 chars\n", | |
| "newspaper3k : 0 chars\n", | |
| "playwright : 0 chars\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Length in characters of each tool's extracted text; 0 means the tool was\n", | |
| "# skipped, failed, or returned nothing.\n", | |
| "char_counts = {\n", | |
| "    \"trafilatura\": len(trafilatura_text or \"\"),\n", | |
| "    \"newspaper3k\": (\n", | |
| "        len(article.text or \"\") if html_content and not newspaper_error else 0\n", | |
| "    ),\n", | |
| "    \"readability-lxml\": len(readability_markdown),\n", | |
| "    \"mozilla readability\": len(mozilla_markdown),\n", | |
| "    \"playwright\": len(playwright_extracted or \"\"),\n", | |
| "    \"parallel.ai\": len(parallel_result[0].full_content or \"\") if parallel_result else 0,\n", | |
| "    \"exa\": len(exa_result or \"\"),\n", | |
| "    \"html2text\": len(html2text_output),\n", | |
| "    \"beautifulsoup\": len(bs_text),\n", | |
| "}\n", | |
| "\n", | |
| "if fetch_error:\n", | |
| "    # No cell falls back to Playwright's HTML: the tools that parse html_content are\n", | |
| "    # skipped, while Playwright, Parallel.ai and Exa fetch the URL themselves.\n", | |
| "    print(\n", | |
| "        f\"Note: requests fetch failed ({fetch_error}); tools that parse the requests-fetched HTML were skipped\\n\"\n", | |
| "    )\n", | |
| "\n", | |
| "# Largest extraction first.\n", | |
| "for name, length in sorted(char_counts.items(), key=lambda item: -item[1]):\n", | |
| "    print(f\"{name:25s}: {length:>8,} chars\")" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.12.9" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment