Created
December 30, 2025 14:36
-
-
Save ethanabrooks/304b5a40d4f99372e0cafdda9e7155b6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "id": "0a3a1d08", | |
| "metadata": { | |
| "papermill": { | |
| "duration": 0.00224, | |
| "end_time": "2025-12-30T14:30:24.032859", | |
| "exception": false, | |
| "start_time": "2025-12-30T14:30:24.030619", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "source": [ | |
| "# Web Content Extraction Tool Comparison\n", | |
| "\n", | |
| "Comparing tools for extracting readable text/markdown from web pages.\n", | |
| "\n", | |
| "## Tools Evaluated\n", | |
| "\n", | |
| "| Tool | Type | Notes |\n", | |
| "| ------------------- | ---------- | ---------------------------------------- |\n", | |
| "| trafilatura | Python | Purpose-built for web text extraction |\n", | |
| "| newspaper3k | Python | News article extraction |\n", | |
| "| readability-lxml | Python | Python port of arc90's Readability |\n", | |
| "| Mozilla Readability | JavaScript | Original Firefox Reader View library |\n", | |
| "| Playwright | Python | Browser automation for JS-rendered pages |\n", | |
| "| html2text | Python | HTML to Markdown converter |\n", | |
| "| BeautifulSoup | Python | Manual extraction baseline |\n", | |
| "| Parallel.ai | API | Commercial service (requires API key) |\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "025255de", | |
| "metadata": { | |
| "papermill": { | |
| "duration": 0.00236, | |
| "end_time": "2025-12-30T14:30:24.036856", | |
| "exception": false, | |
| "start_time": "2025-12-30T14:30:24.034496", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "source": [ | |
| "## Configuration\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "d880d862", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2025-12-30T14:30:24.041045Z", | |
| "iopub.status.busy": "2025-12-30T14:30:24.040921Z", | |
| "iopub.status.idle": "2025-12-30T14:30:24.046650Z", | |
| "shell.execute_reply": "2025-12-30T14:30:24.045800Z" | |
| }, | |
| "papermill": { | |
| "duration": 0.00931, | |
| "end_time": "2025-12-30T14:30:24.047725", | |
| "exception": false, | |
| "start_time": "2025-12-30T14:30:24.038415", | |
| "status": "completed" | |
| }, | |
| "tags": [ | |
| "parameters" | |
| ] | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Parameters - these can be overridden by papermill\n", | |
| "TEST_URL = \"https://en.wikipedia.org/wiki/WBA_interim_middleweight_championship#List_of_interim_champions\"\n", | |
| "MAX_CHARS = 3000 # Maximum characters to display per output" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "39cce182", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2025-12-30T14:30:24.053419Z", | |
| "iopub.status.busy": "2025-12-30T14:30:24.053291Z", | |
| "iopub.status.idle": "2025-12-30T14:30:24.054988Z", | |
| "shell.execute_reply": "2025-12-30T14:30:24.054587Z" | |
| }, | |
| "papermill": { | |
| "duration": 0.005357, | |
| "end_time": "2025-12-30T14:30:24.055503", | |
| "exception": false, | |
| "start_time": "2025-12-30T14:30:24.050146", | |
| "status": "completed" | |
| }, | |
| "tags": [ | |
| "injected-parameters" | |
| ] | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Parameters\n", | |
| "TEST_URL = \"https://news.ycombinator.com/\"\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "b24ee9dd", | |
| "metadata": { | |
| "papermill": { | |
| "duration": 0.001515, | |
| "end_time": "2025-12-30T14:30:24.058600", | |
| "exception": false, | |
| "start_time": "2025-12-30T14:30:24.057085", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "source": [ | |
| "## Setup\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "97de6407", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2025-12-30T14:30:24.062781Z", | |
| "iopub.status.busy": "2025-12-30T14:30:24.062677Z", | |
| "iopub.status.idle": "2025-12-30T14:30:24.094339Z", | |
| "shell.execute_reply": "2025-12-30T14:30:24.093585Z" | |
| }, | |
| "papermill": { | |
| "duration": 0.034742, | |
| "end_time": "2025-12-30T14:30:24.095287", | |
| "exception": false, | |
| "start_time": "2025-12-30T14:30:24.060545", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import json\n", | |
| "import os\n", | |
| "import subprocess\n", | |
| "import requests\n", | |
| "from pathlib import Path" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "id": "ee68e648", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2025-12-30T14:30:24.099467Z", | |
| "iopub.status.busy": "2025-12-30T14:30:24.099351Z", | |
| "iopub.status.idle": "2025-12-30T14:30:24.510198Z", | |
| "shell.execute_reply": "2025-12-30T14:30:24.509291Z" | |
| }, | |
| "papermill": { | |
| "duration": 0.414033, | |
| "end_time": "2025-12-30T14:30:24.511095", | |
| "exception": false, | |
| "start_time": "2025-12-30T14:30:24.097062", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Fetched 34,463 bytes\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "headers = {\n", | |
| " \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36\",\n", | |
| " \"Accept\": \"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\",\n", | |
| " \"Accept-Language\": \"en-US,en;q=0.9\",\n", | |
| "}\n", | |
| "response = requests.get(TEST_URL, headers=headers, timeout=30)\n", | |
| "html_content = None\n", | |
| "fetch_error = None\n", | |
| "\n", | |
| "if response.ok:\n", | |
| " html_content = response.text\n", | |
| " print(f\"Fetched {len(html_content):,} bytes\")\n", | |
| "else:\n", | |
| " fetch_error = f\"HTTP {response.status_code}: {response.reason}\"\n", | |
| " print(f\"Fetch failed: {fetch_error}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "20a00121", | |
| "metadata": { | |
| "papermill": { | |
| "duration": 0.001574, | |
| "end_time": "2025-12-30T14:30:24.514679", | |
| "exception": false, | |
| "start_time": "2025-12-30T14:30:24.513105", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "source": [ | |
| "## 1. Trafilatura\n", | |
| "\n", | |
| "[trafilatura](https://trafilatura.readthedocs.io/) - Purpose-built for web text extraction with native markdown output.\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "id": "77dc050c", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2025-12-30T14:30:24.518985Z", | |
| "iopub.status.busy": "2025-12-30T14:30:24.518845Z", | |
| "iopub.status.idle": "2025-12-30T14:30:24.687646Z", | |
| "shell.execute_reply": "2025-12-30T14:30:24.687204Z" | |
| }, | |
| "papermill": { | |
| "duration": 0.172026, | |
| "end_time": "2025-12-30T14:30:24.688269", | |
| "exception": false, | |
| "start_time": "2025-12-30T14:30:24.516243", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Hacker News\n", | |
| "new\n", | |
| "|\n", | |
| "past\n", | |
| "|\n", | |
| "comments\n", | |
| "|\n", | |
| "ask\n", | |
| "|\n", | |
| "show\n", | |
| "|\n", | |
| "jobs\n", | |
| "|\n", | |
| "submit\n", | |
| "login\n", | |
| "1.\n", | |
| "Netflix: Open Content\n", | |
| "(\n", | |
| "netflix.com\n", | |
| ")\n", | |
| "246 points\n", | |
| "by\n", | |
| "tosh\n", | |
| "4 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "38 comments\n", | |
| "2.\n", | |
| "Non-Zero-Sum Games\n", | |
| "(\n", | |
| "nonzerosum.games\n", | |
| ")\n", | |
| "118 points\n", | |
| "by\n", | |
| "8organicbits\n", | |
| "2 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "17 comments\n", | |
| "3.\n", | |
| "Times New American: A Tale of Two Fonts\n", | |
| "(\n", | |
| "hsu.cy\n", | |
| ")\n", | |
| "46 points\n", | |
| "by\n", | |
| "firexcy\n", | |
| "1 hour ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "18 comments\n", | |
| "4.\n", | |
| "The British Empire's Resilient Subsea Telegraph Network\n", | |
| "(\n", | |
| "subseacables.blogspot.com\n", | |
| ")\n", | |
| "18 points\n", | |
| "by\n", | |
| "giuliomagnifico\n", | |
| "1 hour ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "1 comment\n", | |
| "5.\n", | |
| "Approachable Swift Concurrency\n", | |
| "(\n", | |
| "fuckingapproachableswiftconcurrency.com\n", | |
| ")\n", | |
| "26 points\n", | |
| "by\n", | |
| "wrxd\n", | |
| "1 hour ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "2 comments\n", | |
| "6.\n", | |
| "Google is dead. Where do we go now?\n", | |
| "(\n", | |
| "circusscientist.com\n", | |
| ")\n", | |
| "924 points\n", | |
| "by\n", | |
| "tomjuggler\n", | |
| "17 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "734 comments\n", | |
| "7.\n", | |
| "Win32 is the stable Linux ABI\n", | |
| "(\n", | |
| "loss32.org\n", | |
| ")\n", | |
| "64 points\n", | |
| "by\n", | |
| "krautburglar\n", | |
| "1 hour ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "23 comments\n", | |
| "8.\n", | |
| "Go Away Python\n", | |
| "(\n", | |
| "lorentz.app\n", | |
| ")\n", | |
| "147 points\n", | |
| "by\n", | |
| "baalimago\n", | |
| "5 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "80 comments\n", | |
| "9.\n", | |
| "No strcpy either\n", | |
| "(\n", | |
| "haxx.se\n", | |
| ")\n", | |
| "46 points\n", | |
| "by\n", | |
| "firesteelrain\n", | |
| "1 hour ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "13 comments\n", | |
| "10.\n", | |
| "GOG is getting acquired by its original co-founder\n", | |
| "(\n", | |
| "gog.com\n", | |
| ")\n", | |
| "761 points\n", | |
| "by\n", | |
| "haunter\n", | |
| "21 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "449 comments\n", | |
| "11.\n", | |
| "Crimson (YC X25) is hiring founding engineers in London\n", | |
| "(\n", | |
| "ycombinator.com\n", | |
| ")\n", | |
| "2 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "12.\n", | |
| "Show HN: One clean, developer-focused page for every Unicode symbol\n", | |
| "(\n", | |
| "fontgenerator.design\n", | |
| ")\n", | |
| "80 points\n", | |
| "by\n", | |
| "yarlinghe\n", | |
| "7 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "38 comments\n", | |
| "13.\n", | |
| "Stranger Things creator says turn off \"garbage\" settings\n", | |
| "(\n", | |
| "screenrant.com\n", | |
| ")\n", | |
| "259 points\n", | |
| "by\n", | |
| "1970-01-01\n", | |
| "14 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "457 comments\n", | |
| "14.\n", | |
| "Hacking Washing Machines [video]\n", | |
| "(\n", | |
| "ccc.de\n", | |
| ")\n", | |
| "162 points\n", | |
| "by\n", | |
| "clausecker\n", | |
| "12 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "32 comments\n", | |
| "15.\n", | |
| "Tesla's 4680 battery supply chain collapses as partner writes down deal by 99%\n", | |
| "(\n", | |
| "electrek.co\n", | |
| ")\n", | |
| "531 points\n", | |
| "by\n", | |
| "coloneltcb\n", | |
| "20 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "585 comments\n", | |
| "16.\n", | |
| "ManusAI Joins Meta\n", | |
| "(\n", | |
| "manus.im\n", | |
| ")\n", | |
| "266 points\n", | |
| "by\n", | |
| "gniting\n", | |
| "16 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "160 comments\n", | |
| "17.\n", | |
| "UNIX Fourth Edition\n", | |
| "(\n", | |
| "squoze.net\n", | |
| ")\n", | |
| "77 points\n", | |
| "by\n", | |
| "dcminter\n", | |
| "8 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "6 comments\n", | |
| "18.\n", | |
| "The future of software development is software developers\n", | |
| "(\n", | |
| "codemanship.wordpress.com\n", | |
| ")\n", | |
| "281 points\n", | |
| "by\n", | |
| "cdrnsf\n", | |
| "19 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "287 comments\n", | |
| "19.\n", | |
| "Graph Algorithms in Rayon\n", | |
| "(\n", | |
| "davidlattimore.github.io\n", | |
| ")\n", | |
| "26 points\n", | |
| "by\n", | |
| "PaulHoule\n", | |
| "5 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "discuss\n", | |
| "20.\n", | |
| "Charm Ruby – Glamorous Terminal Libraries for Ruby\n", | |
| "(\n", | |
| "charm-ruby.dev\n", | |
| ")\n", | |
| "43 points\n", | |
| "by\n", | |
| "todsacerdoti\n", | |
| "6 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "5 comments\n", | |
| "21.\n", | |
| "AI is forcing us to write good code\n", | |
| "(\n", | |
| "logic.inc\n", | |
| ")\n", | |
| "228 points\n", | |
| "by\n", | |
| "sgk284\n", | |
| "19 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "163 comments\n", | |
| "22.\n", | |
| "Concurrent Hash Table Designs\n", | |
| "(\n", | |
| "bluuewhale.github.io\n", | |
| ")\n", | |
| "6 points\n", | |
| "by\n", | |
| "signa11\n", | |
| "2 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "discuss\n", | |
| "23.\n", | |
| "Turning an old Amazon Kindle into a eInk development platform (2021)\n", | |
| "(\n", | |
| "lidskialf.net\n", | |
| ")\n", | |
| "36 points\n", | |
| "by\n", | |
| "fanf2\n", | |
| "7 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "7 comments\n", | |
| "24.\n", | |
| "Groq investor sounds alarm on data centers\n", | |
| "(\n", | |
| "axios.com\n", | |
| ")\n", | |
| "18 points\n", | |
| "by\n", | |
| "giuliomagnifico\n", | |
| "1 hour ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "10 comments\n", | |
| "25.\n", | |
| "MongoDB Server Security Update, December 2025\n", | |
| "(\n", | |
| "mongodb.com\n", | |
| ")\n", | |
| "96 points\n", | |
| "by\n", | |
| "plorkyeran\n", | |
| "14 hours ag\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "import trafilatura\n", | |
| "\n", | |
| "if html_content:\n", | |
| " trafilatura_text = trafilatura.extract(\n", | |
| " html_content,\n", | |
| " output_format=\"markdown\",\n", | |
| " include_tables=True,\n", | |
| " include_links=True,\n", | |
| " include_images=False,\n", | |
| " )\n", | |
| " print(trafilatura_text[:MAX_CHARS] if trafilatura_text else \"No content\")\n", | |
| "else:\n", | |
| " trafilatura_text = None\n", | |
| " print(f\"Skipped: {fetch_error}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "37293fa6", | |
| "metadata": { | |
| "papermill": { | |
| "duration": 0.00159, | |
| "end_time": "2025-12-30T14:30:24.691676", | |
| "exception": false, | |
| "start_time": "2025-12-30T14:30:24.690086", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "source": [ | |
| "## 2. Newspaper3k\n", | |
| "\n", | |
| "[newspaper3k](https://newspaper.readthedocs.io/) - Designed for news article extraction.\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "id": "f406ca17", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2025-12-30T14:30:24.695894Z", | |
| "iopub.status.busy": "2025-12-30T14:30:24.695731Z", | |
| "iopub.status.idle": "2025-12-30T14:30:24.864174Z", | |
| "shell.execute_reply": "2025-12-30T14:30:24.863301Z" | |
| }, | |
| "papermill": { | |
| "duration": 0.171896, | |
| "end_time": "2025-12-30T14:30:24.865327", | |
| "exception": false, | |
| "start_time": "2025-12-30T14:30:24.693431", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "No content\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "from newspaper import Article\n", | |
| "\n", | |
| "article = Article(TEST_URL)\n", | |
| "newspaper_error = None\n", | |
| "\n", | |
| "if html_content:\n", | |
| " article.set_html(html_content)\n", | |
| " try:\n", | |
| " article.parse()\n", | |
| " except ValueError as e:\n", | |
| " newspaper_error = f\"ValueError: {e}\"\n", | |
| "\n", | |
| " if newspaper_error:\n", | |
| " print(f\"Newspaper3k error: {newspaper_error}\")\n", | |
| " else:\n", | |
| " print(article.text[:MAX_CHARS] if article.text else \"No content\")\n", | |
| "else:\n", | |
| " print(f\"Skipped: {fetch_error}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "4580af76", | |
| "metadata": { | |
| "papermill": { | |
| "duration": 0.002573, | |
| "end_time": "2025-12-30T14:30:24.871134", | |
| "exception": false, | |
| "start_time": "2025-12-30T14:30:24.868561", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "source": [ | |
| "## 3. Readability-lxml\n", | |
| "\n", | |
| "[readability-lxml](https://github.com/buriy/python-readability) - Python port of arc90's Readability, the project Mozilla Readability is also derived from.\n", | |
| "Outputs HTML, so we pipe through html2text for markdown.\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "id": "a0111b20", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2025-12-30T14:30:24.880091Z", | |
| "iopub.status.busy": "2025-12-30T14:30:24.879844Z", | |
| "iopub.status.idle": "2025-12-30T14:30:24.905435Z", | |
| "shell.execute_reply": "2025-12-30T14:30:24.905042Z" | |
| }, | |
| "papermill": { | |
| "duration": 0.032438, | |
| "end_time": "2025-12-30T14:30:24.905850", | |
| "exception": false, | |
| "start_time": "2025-12-30T14:30:24.873412", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "from readability import Document\n", | |
| "import html2text\n", | |
| "\n", | |
| "h2t = html2text.HTML2Text()\n", | |
| "h2t.ignore_links = False\n", | |
| "h2t.ignore_images = True\n", | |
| "h2t.body_width = 0\n", | |
| "\n", | |
| "readability_markdown = \"\"\n", | |
| "readability_error = None\n", | |
| "\n", | |
| "if html_content:\n", | |
| " doc = Document(html_content)\n", | |
| " try:\n", | |
| " readable_html = doc.summary()\n", | |
| " readability_markdown = h2t.handle(readable_html)\n", | |
| " except Exception as e:\n", | |
| " readability_error = f\"{type(e).__name__}: {e}\"\n", | |
| "\n", | |
| " if readability_error:\n", | |
| " print(f\"Readability-lxml error: {readability_error}\")\n", | |
| " else:\n", | |
| " print(readability_markdown[:MAX_CHARS])\n", | |
| "else:\n", | |
| " print(f\"Skipped: {fetch_error}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "3ef93b51", | |
| "metadata": { | |
| "papermill": { | |
| "duration": 0.00157, | |
| "end_time": "2025-12-30T14:30:24.909148", | |
| "exception": false, | |
| "start_time": "2025-12-30T14:30:24.907578", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "source": [ | |
| "## 4. Mozilla Readability (JavaScript)\n", | |
| "\n", | |
| "[Mozilla Readability](https://github.com/mozilla/readability) - Original Firefox Reader View library, called via Node.js.\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "id": "3dba3e69", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2025-12-30T14:30:24.912963Z", | |
| "iopub.status.busy": "2025-12-30T14:30:24.912866Z", | |
| "iopub.status.idle": "2025-12-30T14:30:25.695609Z", | |
| "shell.execute_reply": "2025-12-30T14:30:25.695121Z" | |
| }, | |
| "papermill": { | |
| "duration": 0.78519, | |
| "end_time": "2025-12-30T14:30:25.696083", | |
| "exception": false, | |
| "start_time": "2025-12-30T14:30:24.910893", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "| 1.| [](https://example.com/vote?id=46431560&how=up&goto=news)| [Netflix: Open Content](https://opencontent.netflix.com/) ([netflix.com](https://example.com/from?site=netflix.com)) \n", | |
| "---|---|--- \n", | |
| "| 246 points by [tosh](https://example.com/user?id=tosh) [4 hours ago](https://example.com/item?id=46431560) | [hide](https://example.com/hide?id=46431560&goto=news) | [38 comments](https://example.com/item?id=46431560) \n", | |
| "2.| [](https://example.com/vote?id=46432311&how=up&goto=news)| [Non-Zero-Sum Games](https://nonzerosum.games/) ([nonzerosum.games](https://example.com/from?site=nonzerosum.games)) \n", | |
| "| 118 points by [8organicbits](https://example.com/user?id=8organicbits) [2 hours ago](https://example.com/item?id=46432311) | [hide](https://example.com/hide?id=46432311&goto=news) | [17 comments](https://example.com/item?id=46432311) \n", | |
| "3.| [](https://example.com/vote?id=46432862&how=up&goto=news)| [Times New American: A Tale of Two Fonts](https://hsu.cy/2025/12/times-new-american/) ([hsu.cy](https://example.com/from?site=hsu.cy)) \n", | |
| "| 46 points by [firexcy](https://example.com/user?id=firexcy) [1 hour ago](https://example.com/item?id=46432862) | [hide](https://example.com/hide?id=46432862&goto=news) | [18 comments](https://example.com/item?id=46432862) \n", | |
| "4.| [](https://example.com/vote?id=46432999&how=up&goto=news)| [The British Empire's Resilient Subsea Telegraph Network](https://subseacables.blogspot.com/2025/12/the-british-empires-resilient-subsea.html) ([subseacables.blogspot.com](https://example.com/from?site=subseacables.blogspot.com)) \n", | |
| "| 18 points by [giuliomagnifico](https://example.com/user?id=giuliomagnifico) [1 hour ago](https://example.com/item?id=46432999) | [hide](https://example.com/hide?id=46432999&goto=news) | [1 comment](https://example.com/item?id=46432999) \n", | |
| "5.| [](https://example.com/vote?id=46432916&how=up&goto=news)| [Approachable Swift Concurrency](https://fuckingapproachableswiftconcurrency.com/en/) ([fuckingapproachableswiftconcurrency.com](https://example.com/from?site=fuckingapproachableswiftconcurrency.com)) \n", | |
| "| 26 points by [wrxd](https://example.com/user?id=wrxd) [1 hour ago](https://example.com/item?id=46432916) | [hide](https://example.com/hide?id=46432916&goto=news) | [2 comments](https://example.com/item?id=46432916) \n", | |
| "6.| [](https://example.com/vote?id=46425198&how=up&goto=news)| [Google is dead. Where do we go now?](https://www.circusscientist.com/2025/12/29/google-is-dead-where-do-we-go-now/) ([circusscientist.com](https://example.com/from?site=circusscientist.com)) \n", | |
| "| 924 points by [tomjuggler](https://example.com/user?id=tomjuggler) [17 hours ago](https://example.com/item?id=46425198) | [hide](https://example.com/hide?id=46425198&goto=news) | [734 comments](https://example.com/item?id=46425198) \n", | |
| "7.| [](https://example.com/vote?id=46433035&how=up&goto=news)| [Win32 is the stable Linux ABI](https://loss32.org/) ([loss32.org](https://example.com/from?site=loss32.org)) \n", | |
| "| 64 points by [krautburglar](https://example.\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "script_path = Path(\"readability_extract.js\")\n", | |
| "mozilla_markdown = \"\"\n", | |
| "\n", | |
| "if not html_content:\n", | |
| " print(f\"Skipped: {fetch_error}\")\n", | |
| "elif not script_path.exists():\n", | |
| " print(\"readability_extract.js not found\")\n", | |
| "else:\n", | |
| " result = subprocess.run(\n", | |
| " [\"node\", str(script_path)],\n", | |
| " input=html_content,\n", | |
| " capture_output=True,\n", | |
| " text=True,\n", | |
| " timeout=30,\n", | |
| " )\n", | |
| " if result.returncode == 0:\n", | |
| " mozilla_result = json.loads(result.stdout)\n", | |
| " mozilla_markdown = h2t.handle(mozilla_result.get(\"content\", \"\"))\n", | |
| " print(mozilla_markdown[:MAX_CHARS])\n", | |
| " else:\n", | |
| " print(f\"Error: {result.stderr}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "ceaffb93", | |
| "metadata": { | |
| "papermill": { | |
| "duration": 0.001767, | |
| "end_time": "2025-12-30T14:30:25.699720", | |
| "exception": false, | |
| "start_time": "2025-12-30T14:30:25.697953", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "source": [ | |
| "## 5. Playwright\n", | |
| "\n", | |
| "[Playwright](https://playwright.dev/) - Browser automation that renders JavaScript before extraction.\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "id": "5e9bfba5", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2025-12-30T14:30:25.704470Z", | |
| "iopub.status.busy": "2025-12-30T14:30:25.704317Z", | |
| "iopub.status.idle": "2025-12-30T14:30:29.537793Z", | |
| "shell.execute_reply": "2025-12-30T14:30:29.537141Z" | |
| }, | |
| "papermill": { | |
| "duration": 3.836854, | |
| "end_time": "2025-12-30T14:30:29.538455", | |
| "exception": false, | |
| "start_time": "2025-12-30T14:30:25.701601", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "| 1. | | [Netflix: Open Content](https://opencontent.netflix.com/) ([netflix.com](from?site=netflix.com)) |\n", | |
| "| 246 points by [tosh](user?id=tosh) [4 hours ago](item?id=46431560) | [hide](hide?id=46431560&goto=news) | [38 comments](item?id=46431560) |\n", | |
| "|\n", | |
| "| 2. | | [Non-Zero-Sum Games](https://nonzerosum.games/) ([nonzerosum.games](from?site=nonzerosum.games)) |\n", | |
| "| 118 points by [8organicbits](user?id=8organicbits) [2 hours ago](item?id=46432311) | [hide](hide?id=46432311&goto=news) | [17 comments](item?id=46432311) |\n", | |
| "|\n", | |
| "| 3. | | [Times New American: A Tale of Two Fonts](https://hsu.cy/2025/12/times-new-american/) ([hsu.cy](from?site=hsu.cy)) |\n", | |
| "| 46 points by [firexcy](user?id=firexcy) [1 hour ago](item?id=46432862) | [hide](hide?id=46432862&goto=news) | [18 comments](item?id=46432862) |\n", | |
| "|\n", | |
| "| 4. | | [The British Empire's Resilient Subsea Telegraph Network](https://subseacables.blogspot.com/2025/12/the-british-empires-resilient-subsea.html) ([subseacables.blogspot.com](from?site=subseacables.blogspot.com)) |\n", | |
| "| 18 points by [giuliomagnifico](user?id=giuliomagnifico) [1 hour ago](item?id=46432999) | [hide](hide?id=46432999&goto=news) | [1 comment](item?id=46432999) |\n", | |
| "|\n", | |
| "| 5. | | [Approachable Swift Concurrency](https://fuckingapproachableswiftconcurrency.com/en/) ([fuckingapproachableswiftconcurrency.com](from?site=fuckingapproachableswiftconcurrency.com)) |\n", | |
| "| 26 points by [wrxd](user?id=wrxd) [1 hour ago](item?id=46432916) | [hide](hide?id=46432916&goto=news) | [2 comments](item?id=46432916) |\n", | |
| "|\n", | |
| "| 6. | | [Google is dead. Where do we go now?](https://www.circusscientist.com/2025/12/29/google-is-dead-where-do-we-go-now/) ([circusscientist.com](from?site=circusscientist.com)) |\n", | |
| "| 924 points by [tomjuggler](user?id=tomjuggler) [17 hours ago](item?id=46425198) | [hide](hide?id=46425198&goto=news) | [734 comments](item?id=46425198) |\n", | |
| "|\n", | |
| "| 7. | | [Win32 is the stable Linux ABI](https://loss32.org/) ([loss32.org](from?site=loss32.org)) |\n", | |
| "| 64 points by [krautburglar](user?id=krautburglar) [1 hour ago](item?id=46433035) | [hide](hide?id=46433035&goto=news) | [23 comments](item?id=46433035) |\n", | |
| "|\n", | |
| "| 8. | | [Go Away Python](https://lorentz.app/blog-item.html?id=go-shebang) ([lorentz.app](from?site=lorentz.app)) |\n", | |
| "| 147 points by [baalimago](user?id=baalimago) [5 hours ago](item?id=46431028) | [hide](hide?id=46431028&goto=news) | [80 comments](item?id=46431028) |\n", | |
| "|\n", | |
| "| 9. | | [No strcpy either](https://daniel.haxx.se/blog/2025/12/29/no-strcpy-either/) ([haxx.se](from?site=haxx.se)) |\n", | |
| "| 46 points by [firesteelrain](user?id=firesteelrain) [1 hour ago](item?id=46433029) | [hide](hide?id=46433029&goto=news) | [13 comments](item?id=46433029) |\n", | |
| "|\n", | |
| "| 10. | | [GOG is getting acquired by its original co-founder](https://www.gog.com/blog/gog-is-getting-acquired-by-its-original-co-founder-what-it-means-for-you/) ([gog.com](from?site=gog.com)) |\n", | |
| "| 761 points by [haunter](user?id=haunter) [21 hours ago](item?id=46422412) | [hide](hide?id=46422412&goto=news) | [449 comments](item?id=464224\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "import asyncio\n", | |
| "from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout\n", | |
| "import nest_asyncio\n", | |
| "\n", | |
| "nest_asyncio.apply()\n", | |
| "\n", | |
| "\n", | |
| "async def fetch_with_playwright(\n", | |
| " url: str, timeout: int = 30000\n", | |
| ") -> tuple[str | None, str | None]:\n", | |
| " \"\"\"Returns (html, error). One will be None.\"\"\"\n", | |
| " try:\n", | |
| " async with async_playwright() as p:\n", | |
| " browser = await p.chromium.launch(headless=True)\n", | |
| " page = await browser.new_page()\n", | |
| " response = await page.goto(url, wait_until=\"domcontentloaded\", timeout=timeout)\n", | |
| " await page.wait_for_timeout(3000) # Let JS render\n", | |
| " html = await page.content()\n", | |
| " await browser.close()\n", | |
| " status = response.status if response else None\n", | |
| " if status and status >= 400:\n", | |
| " return None, f\"HTTP {status}\"\n", | |
| " return html, None\n", | |
| " except PlaywrightTimeout:\n", | |
| " return None, f\"Timeout after {timeout}ms\"\n", | |
| " except Exception as e:\n", | |
| " return None, f\"{type(e).__name__}: {e}\"\n", | |
| "\n", | |
| "\n", | |
| "playwright_html = None\n", | |
| "playwright_extracted = None\n", | |
| "playwright_error = None\n", | |
| "\n", | |
| "loop = asyncio.get_event_loop()\n", | |
| "result = loop.run_until_complete(fetch_with_playwright(TEST_URL))\n", | |
| "playwright_html, playwright_error = result\n", | |
| "\n", | |
| "if playwright_html:\n", | |
| " playwright_extracted = trafilatura.extract(\n", | |
| " playwright_html,\n", | |
| " output_format=\"markdown\",\n", | |
| " include_tables=True,\n", | |
| " include_links=True,\n", | |
| " include_images=False,\n", | |
| " )\n", | |
| " print(\n", | |
| " playwright_extracted[:MAX_CHARS]\n", | |
| " if playwright_extracted\n", | |
| " else \"No content extracted from HTML\"\n", | |
| " )\n", | |
| "else:\n", | |
| " print(f\"Playwright error: {playwright_error}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "6e522bf1", | |
| "metadata": { | |
| "papermill": { | |
| "duration": 0.001866, | |
| "end_time": "2025-12-30T14:30:29.543006", | |
| "exception": false, | |
| "start_time": "2025-12-30T14:30:29.541140", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "source": [ | |
| "## 6. Parallel.ai\n", | |
| "\n", | |
| "[Parallel.ai](https://docs.parallel.ai/) - Commercial API for web extraction using the Python SDK.\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "id": "a66a4eb3", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2025-12-30T14:30:29.548577Z", | |
| "iopub.status.busy": "2025-12-30T14:30:29.548365Z", | |
| "iopub.status.idle": "2025-12-30T14:30:29.891992Z", | |
| "shell.execute_reply": "2025-12-30T14:30:29.891524Z" | |
| }, | |
| "papermill": { | |
| "duration": 0.347494, | |
| "end_time": "2025-12-30T14:30:29.892792", | |
| "exception": false, | |
| "start_time": "2025-12-30T14:30:29.545298", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "||[](https://news.ycombinator.com) |**[Hacker News](news)** [new](newest) | [past](front) | [comments](newcomments) | <ask> | <show> | <jobs> | <submit> |[login](login?goto=news) |\n", | |
| "| --- | --- | --- | |\n", | |
| "| --- |\n", | |
| "||1\\. |[](vote?id=46431560&how=up&goto=news) |[Netflix: Open Content](https://opencontent.netflix.com/) ( [netflix.com](from?site=netflix.com) ) |\n", | |
| "| --- | --- | --- |\n", | |
| "| |121 points by [tosh](user?id=tosh) [2 hours ago](item?id=46431560) | [hide](hide?id=46431560&goto=news) | [12 comments](item?id=46431560) |\n", | |
| "|2\\. |[](vote?id=46432311&how=up&goto=news) |[Non-Zero-Sum Games](https://nonzerosum.games/) ( [nonzerosum.games](from?site=nonzerosum.games) ) |\n", | |
| "| |26 points by [8organicbits](user?id=8organicbits) [47 minutes ago](item?id=46432311) | [hide](hide?id=46432311&goto=news) | [1 comment](item?id=46432311) |\n", | |
| "|3\\. |[](vote?id=46425198&how=up&goto=news) |[Google is dead. Where do we go now?](https://www.circusscientist.com/2025/12/29/google-is-dead-where-do-we-go-now/) ( [circusscientist.com](from?site=circusscientist.com) ) |\n", | |
| "| |866 points by [tomjuggler](user?id=tomjuggler) [16 hours ago](item?id=46425198) | [hide](hide?id=46425198&goto=news) | [693 comments](item?id=46425198) |\n", | |
| "|4\\. |[](vote?id=46431453&how=up&goto=news) |[HSBC blocks its app due to F-Droid-installed Bitwarden](https://mastodon.neilzone.co.uk/@neil/115807834298031971) ( [neilzone.co.uk](from?site=neilzone.co.uk) ) |\n", | |
| "| |185 points by [\\_\\_\\_\\_\\_k](user?id=_____k) [2 hours ago](item?id=46431453) | [hide](hide?id=46431453&goto=news) | [155 comments](item?id=46431453) |\n", | |
| "|5\\. |[](vote?id=46431028&how=up&goto=news) |[Go Away Python](https://lorentz.app/blog-item.html?id=go-shebang) ( [lorentz.app](from?site=lorentz.app) ) |\n", | |
| "| |89 points by [baalimago](user?id=baalimago) [3 hours ago](item?id=46431028) | [hide](hide?id=46431028&goto=news) | [32 comments](item?id=46431028) |\n", | |
| "|6\\. |[](vote?id=46422412&how=up&goto=news) |[GOG is getting acquired by its original co-founder](https://www.gog.com/blog/gog-is-getting-acquired-by-its-original-co-founder-what-it-means-for-you/) ( [gog.com](from?site=gog.com) ) |\n", | |
| "| |724 points by [haunter](user?id=haunter) [19 hours ago](item?id=46422412) | [hide](hide?id=46422412&goto=news) | [425 comments](item?id=46422412) |\n", | |
| "|7\\. |[](vote?id=46432057&how=up&goto=news) |[Nicolas Guillou, French ICC judge sanctioned by the US and \"debanked\"](https://www.lemonde.fr/en/international/article/2025/11/19/nicolas-guillou-french-icc-judge-sanctioned-by-the-us-you-are-effectively-blacklisted-by-much-of-the-world-s-banking-system_6747628_4.html) ( [lemonde.fr](from?site=lemonde.fr) ) |\n", | |
| "| |74 points by [lifeisstillgood](user?id=lifeisstillgood) [1 hour ago](item?id=46432057) | [hide](hide?id=46432057&goto=news) | [31 comments](item?id=46432057) |\n", | |
| "|8\\. |[](vote?id=46384975&how=up&goto=news) |[Show HN: One clean, developer-focused page for every Unicode symbol](https://fontgenerator.design/symbols) ( [fontgenerator.design](from?site=fontgenerator.design) ) |\n", | |
| "| |55 poi\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "from parallel import Parallel\n", | |
| "\n", | |
| "parallel_result = None\n", | |
| "parallel_error = None\n", | |
| "\n", | |
| "api_key = os.getenv(\"PARALLEL_API_KEY\")\n", | |
| "if not api_key:\n", | |
| " parallel_error = \"PARALLEL_API_KEY not set\"\n", | |
| "else:\n", | |
| " client = Parallel(api_key=api_key)\n", | |
| " extract = client.beta.extract(\n", | |
| " urls=[TEST_URL],\n", | |
| " objective=\"Extract the main content of this page\",\n", | |
| " excerpts=True,\n", | |
| " full_content=True,\n", | |
| " )\n", | |
| " parallel_result = extract.results\n", | |
| "\n", | |
| "if parallel_result:\n", | |
| " for result in parallel_result:\n", | |
| " if result.full_content:\n", | |
| " print(result.full_content[:MAX_CHARS])\n", | |
| "else:\n", | |
| " print(f\"Parallel.ai error: {parallel_error}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "fb8e4c91", | |
| "metadata": { | |
| "papermill": { | |
| "duration": 0.001914, | |
| "end_time": "2025-12-30T14:30:29.897137", | |
| "exception": false, | |
| "start_time": "2025-12-30T14:30:29.895223", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "source": [ | |
| "## 7. Exa\n", | |
| "\n", | |
| "[Exa](https://exa.ai/) - AI-native search and content extraction API.\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "id": "fd2bc1d0", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2025-12-30T14:30:29.901659Z", | |
| "iopub.status.busy": "2025-12-30T14:30:29.901510Z", | |
| "iopub.status.idle": "2025-12-30T14:30:30.272089Z", | |
| "shell.execute_reply": "2025-12-30T14:30:30.271726Z" | |
| }, | |
| "papermill": { | |
| "duration": 0.373964, | |
| "end_time": "2025-12-30T14:30:30.272840", | |
| "exception": false, | |
| "start_time": "2025-12-30T14:30:29.898876", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Hacker News[](https://news.ycombinator.com)|**[Hacker News](news)**[new](newest)|[past](front)|[comments](newcomments)|[ask](ask)|[show](show)|[jobs](jobs)|[submit](submit)|[login](login?goto=news)|\n", | |
| "|\n", | |
| "1.|[\n", | |
| "](vote?id=46431560&how=up&goto=news)|[Netflix: Open Content](https://opencontent.netflix.com/)([netflix.com](from?site=netflix.com))|\n", | |
| "|200 pointsby[tosh](user?id=tosh)[3 hours ago](item?id=46431560)|[hide](hide?id=46431560&goto=news)|[33comments](item?id=46431560)|\n", | |
| "2.|[\n", | |
| "](vote?id=46432311&how=up&goto=news)|[Non-Zero-Sum Games](https://nonzerosum.games/)([nonzerosum.games](from?site=nonzerosum.games))|\n", | |
| "|96 pointsby[8organicbits](user?id=8organicbits)[2 hours ago](item?id=46432311)|[hide](hide?id=46432311&goto=news)|[12comments](item?id=46432311)|\n", | |
| "3.|[\n", | |
| "](vote?id=46432862&how=up&goto=news)|[Times New American: A Tale of Two Fonts](https://hsu.cy/2025/12/times-new-american/)([hsu.cy](from?site=hsu.cy))|\n", | |
| "|30 pointsby[firexcy](user?id=firexcy)[1 hour ago](item?id=46432862)|[hide](hide?id=46432862&goto=news)|[11comments](item?id=46432862)|\n", | |
| "4.|[\n", | |
| "](vote?id=46425198&how=up&goto=news)|[Google is dead. Where do we go now?](https://www.circusscientist.com/2025/12/29/google-is-dead-where-do-we-go-now/)([circusscientist.com](from?site=circusscientist.com))|\n", | |
| "|909 pointsby[tomjuggler](user?id=tomjuggler)[17 hours ago](item?id=46425198)|[hide](hide?id=46425198&goto=news)|[719comments](item?id=46425198)|\n", | |
| "5.|[\n", | |
| "](vote?id=46431028&how=up&goto=news)|[Go Away Python](https://lorentz.app/blog-item.html?id=go-shebang)([lorentz.app](from?site=lorentz.app))|\n", | |
| "|135 pointsby[baalimago](user?id=baalimago)[5 hours ago](item?id=46431028)|[hide](hide?id=46431028&goto=news)|[71comments](item?id=46431028)|\n", | |
| "6.|[\n", | |
| "](vote?id=46433035&how=up&goto=news)|[Win32 is the stable Linux ABI](https://loss32.org/)([loss32.org](from?site=loss32.org))|\n", | |
| "|37 pointsby[krautburglar](user?id=krautburglar)[43 minutes ago](item?id=46433035)|[hide](hide?id=46433035&goto=news)|[7comments](item?id=46433035)|\n", | |
| "7.|[\n", | |
| "](vote?id=46433029&how=up&goto=news)|[No strcpy either](https://daniel.haxx.se/blog/2025/12/29/no-strcpy-either/)([haxx.se](from?site=haxx.se))|\n", | |
| "|27 pointsby[firesteelrain](user?id=firesteelrain)[44 minutes ago](item?id=46433029)|[hide](hide?id=46433029&goto=news)|[3comments](item?id=46433029)|\n", | |
| "8.|[\n", | |
| "](vote?id=46432916&how=up&goto=news)|[Approachable Swift Concurrency](https://fuckingapproachableswiftconcurrency.com/en/)([fuckingapproachableswiftconcurrency.com](from?site=fuckingapproachableswiftconcurrency.com))|\n", | |
| "|10 pointsby[wrxd](user?id=wrxd)[56 minutes ago](item?id=46432916)|[hide](hide?id=46432916&goto=news)|[discuss](item?id=46432916)|\n", | |
| "9.|[\n", | |
| "](vote?id=46422412&how=up&goto=news)|[GOG is getting acquired by its original co-founder](https://www.gog.com/blog/gog-is-getting-acquired-by-its-original-co-founder-what-it-means-for-you/)([gog.com](from?site=gog.com))|\n", | |
| "|753 pointsby[haunter\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "from exa_py import Exa\n", | |
| "\n", | |
| "exa_result = None\n", | |
| "exa_error = None\n", | |
| "\n", | |
| "exa_api_key = os.getenv(\"EXA_API_KEY\")\n", | |
| "if not exa_api_key:\n", | |
| " exa_error = \"EXA_API_KEY not set\"\n", | |
| "else:\n", | |
| " exa = Exa(exa_api_key)\n", | |
| " results = exa.get_contents(urls=[TEST_URL], text=True)\n", | |
| " if results.results:\n", | |
| " exa_result = results.results[0].text\n", | |
| "\n", | |
| "if exa_result:\n", | |
| " print(exa_result[:MAX_CHARS])\n", | |
| "else:\n", | |
| " print(f\"Exa error: {exa_error}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "b5c89c1a", | |
| "metadata": { | |
| "papermill": { | |
| "duration": 0.002529, | |
| "end_time": "2025-12-30T14:30:30.277915", | |
| "exception": false, | |
| "start_time": "2025-12-30T14:30:30.275386", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "source": [ | |
| "## 8. html2text (direct)\n", | |
| "\n", | |
| "[html2text](https://github.com/Alir3z4/html2text) - Converts HTML to Markdown without readability filtering.\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "id": "298e1fb0", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2025-12-30T14:30:30.282693Z", | |
| "iopub.status.busy": "2025-12-30T14:30:30.282560Z", | |
| "iopub.status.idle": "2025-12-30T14:30:30.291870Z", | |
| "shell.execute_reply": "2025-12-30T14:30:30.291441Z" | |
| }, | |
| "papermill": { | |
| "duration": 0.012447, | |
| "end_time": "2025-12-30T14:30:30.292415", | |
| "exception": false, | |
| "start_time": "2025-12-30T14:30:30.279968", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "| [](https://news.ycombinator.com)| **[Hacker News](news)**[new](newest) | [past](front) | [comments](newcomments) | [ask](ask) | [show](show) | [jobs](jobs) | [submit](submit)| [login](login?goto=news) \n", | |
| "---|---|--- \n", | |
| "| 1.| [](vote?id=46431560&how=up&goto=news)| [Netflix: Open Content](https://opencontent.netflix.com/) ([netflix.com](from?site=netflix.com)) \n", | |
| "---|---|--- \n", | |
| "| 246 points by [tosh](user?id=tosh) [4 hours ago](item?id=46431560) | [hide](hide?id=46431560&goto=news) | [38 comments](item?id=46431560) \n", | |
| "2.| [](vote?id=46432311&how=up&goto=news)| [Non-Zero-Sum Games](https://nonzerosum.games/) ([nonzerosum.games](from?site=nonzerosum.games)) \n", | |
| "| 118 points by [8organicbits](user?id=8organicbits) [2 hours ago](item?id=46432311) | [hide](hide?id=46432311&goto=news) | [17 comments](item?id=46432311) \n", | |
| "3.| [](vote?id=46432862&how=up&goto=news)| [Times New American: A Tale of Two Fonts](https://hsu.cy/2025/12/times-new-american/) ([hsu.cy](from?site=hsu.cy)) \n", | |
| "| 46 points by [firexcy](user?id=firexcy) [1 hour ago](item?id=46432862) | [hide](hide?id=46432862&goto=news) | [18 comments](item?id=46432862) \n", | |
| "4.| [](vote?id=46432999&how=up&goto=news)| [The British Empire's Resilient Subsea Telegraph Network](https://subseacables.blogspot.com/2025/12/the-british-empires-resilient-subsea.html) ([subseacables.blogspot.com](from?site=subseacables.blogspot.com)) \n", | |
| "| 18 points by [giuliomagnifico](user?id=giuliomagnifico) [1 hour ago](item?id=46432999) | [hide](hide?id=46432999&goto=news) | [1 comment](item?id=46432999) \n", | |
| "5.| [](vote?id=46432916&how=up&goto=news)| [Approachable Swift Concurrency](https://fuckingapproachableswiftconcurrency.com/en/) ([fuckingapproachableswiftconcurrency.com](from?site=fuckingapproachableswiftconcurrency.com)) \n", | |
| "| 26 points by [wrxd](user?id=wrxd) [1 hour ago](item?id=46432916) | [hide](hide?id=46432916&goto=news) | [2 comments](item?id=46432916) \n", | |
| "6.| [](vote?id=46425198&how=up&goto=news)| [Google is dead. Where do we go now?](https://www.circusscientist.com/2025/12/29/google-is-dead-where-do-we-go-now/) ([circusscientist.com](from?site=circusscientist.com)) \n", | |
| "| 924 points by [tomjuggler](user?id=tomjuggler) [17 hours ago](item?id=46425198) | [hide](hide?id=46425198&goto=news) | [734 comments](item?id=46425198) \n", | |
| "7.| [](vote?id=46433035&how=up&goto=news)| [Win32 is the stable Linux ABI](https://loss32.org/) ([loss32.org](from?site=loss32.org)) \n", | |
| "| 64 points by [krautburglar](user?id=krautburglar) [1 hour ago](item?id=46433035) | [hide](hide?id=46433035&goto=news) | [23 comments](item?id=46433035) \n", | |
| "8.| [](vote?id=46431028&how=up&goto=news)| [Go Away Python](https://lorentz.app/blog-item.html?id=go-shebang) ([lorentz.app](from?site=lorentz.app)) \n", | |
| "| 147 points by [baalimago](user?id=baalimago) [5 hours ago](item?id=46431028) | [hide](hide?id=46431028&goto=news) | [80 comments](item?id=46431028) \n", | |
| "9.| [](vote?id=46433029&how=up&goto=news)| [No strcpy either](https://daniel.haxx.se/blog/2025/12/29/no-strcpy-either/) ([haxx.s\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "if html_content:\n", | |
| " html2text_output = h2t.handle(html_content)\n", | |
| " print(html2text_output[:MAX_CHARS])\n", | |
| "else:\n", | |
| " html2text_output = \"\"\n", | |
| " print(f\"Skipped: {fetch_error}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "f6584f23", | |
| "metadata": { | |
| "papermill": { | |
| "duration": 0.001836, | |
| "end_time": "2025-12-30T14:30:30.297021", | |
| "exception": false, | |
| "start_time": "2025-12-30T14:30:30.295185", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "source": [ | |
| "## 9. BeautifulSoup\n", | |
| "\n", | |
| "[BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) - Manual text extraction baseline.\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "id": "1a165cb6", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2025-12-30T14:30:30.301260Z", | |
| "iopub.status.busy": "2025-12-30T14:30:30.301130Z", | |
| "iopub.status.idle": "2025-12-30T14:30:30.310135Z", | |
| "shell.execute_reply": "2025-12-30T14:30:30.309787Z" | |
| }, | |
| "papermill": { | |
| "duration": 0.012073, | |
| "end_time": "2025-12-30T14:30:30.310752", | |
| "exception": false, | |
| "start_time": "2025-12-30T14:30:30.298679", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Hacker News\n", | |
| "Hacker News\n", | |
| "new\n", | |
| "|\n", | |
| "past\n", | |
| "|\n", | |
| "comments\n", | |
| "|\n", | |
| "ask\n", | |
| "|\n", | |
| "show\n", | |
| "|\n", | |
| "jobs\n", | |
| "|\n", | |
| "submit\n", | |
| "login\n", | |
| "1.\n", | |
| "Netflix: Open Content\n", | |
| "(\n", | |
| "netflix.com\n", | |
| ")\n", | |
| "246 points\n", | |
| "by\n", | |
| "tosh\n", | |
| "4 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "38 comments\n", | |
| "2.\n", | |
| "Non-Zero-Sum Games\n", | |
| "(\n", | |
| "nonzerosum.games\n", | |
| ")\n", | |
| "118 points\n", | |
| "by\n", | |
| "8organicbits\n", | |
| "2 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "17 comments\n", | |
| "3.\n", | |
| "Times New American: A Tale of Two Fonts\n", | |
| "(\n", | |
| "hsu.cy\n", | |
| ")\n", | |
| "46 points\n", | |
| "by\n", | |
| "firexcy\n", | |
| "1 hour ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "18 comments\n", | |
| "4.\n", | |
| "The British Empire's Resilient Subsea Telegraph Network\n", | |
| "(\n", | |
| "subseacables.blogspot.com\n", | |
| ")\n", | |
| "18 points\n", | |
| "by\n", | |
| "giuliomagnifico\n", | |
| "1 hour ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "1 comment\n", | |
| "5.\n", | |
| "Approachable Swift Concurrency\n", | |
| "(\n", | |
| "fuckingapproachableswiftconcurrency.com\n", | |
| ")\n", | |
| "26 points\n", | |
| "by\n", | |
| "wrxd\n", | |
| "1 hour ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "2 comments\n", | |
| "6.\n", | |
| "Google is dead. Where do we go now?\n", | |
| "(\n", | |
| "circusscientist.com\n", | |
| ")\n", | |
| "924 points\n", | |
| "by\n", | |
| "tomjuggler\n", | |
| "17 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "734 comments\n", | |
| "7.\n", | |
| "Win32 is the stable Linux ABI\n", | |
| "(\n", | |
| "loss32.org\n", | |
| ")\n", | |
| "64 points\n", | |
| "by\n", | |
| "krautburglar\n", | |
| "1 hour ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "23 comments\n", | |
| "8.\n", | |
| "Go Away Python\n", | |
| "(\n", | |
| "lorentz.app\n", | |
| ")\n", | |
| "147 points\n", | |
| "by\n", | |
| "baalimago\n", | |
| "5 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "80 comments\n", | |
| "9.\n", | |
| "No strcpy either\n", | |
| "(\n", | |
| "haxx.se\n", | |
| ")\n", | |
| "46 points\n", | |
| "by\n", | |
| "firesteelrain\n", | |
| "1 hour ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "13 comments\n", | |
| "10.\n", | |
| "GOG is getting acquired by its original co-founder\n", | |
| "(\n", | |
| "gog.com\n", | |
| ")\n", | |
| "761 points\n", | |
| "by\n", | |
| "haunter\n", | |
| "21 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "449 comments\n", | |
| "11.\n", | |
| "Crimson (YC X25) is hiring founding engineers in London\n", | |
| "(\n", | |
| "ycombinator.com\n", | |
| ")\n", | |
| "2 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "12.\n", | |
| "Show HN: One clean, developer-focused page for every Unicode symbol\n", | |
| "(\n", | |
| "fontgenerator.design\n", | |
| ")\n", | |
| "80 points\n", | |
| "by\n", | |
| "yarlinghe\n", | |
| "7 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "38 comments\n", | |
| "13.\n", | |
| "Stranger Things creator says turn off \"garbage\" settings\n", | |
| "(\n", | |
| "screenrant.com\n", | |
| ")\n", | |
| "259 points\n", | |
| "by\n", | |
| "1970-01-01\n", | |
| "14 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "457 comments\n", | |
| "14.\n", | |
| "Hacking Washing Machines [video]\n", | |
| "(\n", | |
| "ccc.de\n", | |
| ")\n", | |
| "162 points\n", | |
| "by\n", | |
| "clausecker\n", | |
| "12 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "32 comments\n", | |
| "15.\n", | |
| "Tesla's 4680 battery supply chain collapses as partner writes down deal by 99%\n", | |
| "(\n", | |
| "electrek.co\n", | |
| ")\n", | |
| "531 points\n", | |
| "by\n", | |
| "coloneltcb\n", | |
| "20 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "585 comments\n", | |
| "16.\n", | |
| "ManusAI Joins Meta\n", | |
| "(\n", | |
| "manus.im\n", | |
| ")\n", | |
| "266 points\n", | |
| "by\n", | |
| "gniting\n", | |
| "16 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "160 comments\n", | |
| "17.\n", | |
| "UNIX Fourth Edition\n", | |
| "(\n", | |
| "squoze.net\n", | |
| ")\n", | |
| "77 points\n", | |
| "by\n", | |
| "dcminter\n", | |
| "8 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "6 comments\n", | |
| "18.\n", | |
| "The future of software development is software developers\n", | |
| "(\n", | |
| "codemanship.wordpress.com\n", | |
| ")\n", | |
| "281 points\n", | |
| "by\n", | |
| "cdrnsf\n", | |
| "19 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "287 comments\n", | |
| "19.\n", | |
| "Graph Algorithms in Rayon\n", | |
| "(\n", | |
| "davidlattimore.github.io\n", | |
| ")\n", | |
| "26 points\n", | |
| "by\n", | |
| "PaulHoule\n", | |
| "5 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "discuss\n", | |
| "20.\n", | |
| "Charm Ruby – Glamorous Terminal Libraries for Ruby\n", | |
| "(\n", | |
| "charm-ruby.dev\n", | |
| ")\n", | |
| "43 points\n", | |
| "by\n", | |
| "todsacerdoti\n", | |
| "6 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "5 comments\n", | |
| "21.\n", | |
| "AI is forcing us to write good code\n", | |
| "(\n", | |
| "logic.inc\n", | |
| ")\n", | |
| "228 points\n", | |
| "by\n", | |
| "sgk284\n", | |
| "19 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "163 comments\n", | |
| "22.\n", | |
| "Concurrent Hash Table Designs\n", | |
| "(\n", | |
| "bluuewhale.github.io\n", | |
| ")\n", | |
| "6 points\n", | |
| "by\n", | |
| "signa11\n", | |
| "2 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "discuss\n", | |
| "23.\n", | |
| "Turning an old Amazon Kindle into a eInk development platform (2021)\n", | |
| "(\n", | |
| "lidskialf.net\n", | |
| ")\n", | |
| "36 points\n", | |
| "by\n", | |
| "fanf2\n", | |
| "7 hours ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "7 comments\n", | |
| "24.\n", | |
| "Groq investor sounds alarm on data centers\n", | |
| "(\n", | |
| "axios.com\n", | |
| ")\n", | |
| "18 points\n", | |
| "by\n", | |
| "giuliomagnifico\n", | |
| "1 hour ago\n", | |
| "|\n", | |
| "hide\n", | |
| "|\n", | |
| "10 comments\n", | |
| "25.\n", | |
| "MongoDB Server Security Update, December 2025\n", | |
| "(\n", | |
| "mongodb.com\n", | |
| ")\n", | |
| "96 points\n", | |
| "by\n", | |
| "plorkyeran\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "from bs4 import BeautifulSoup\n", | |
| "\n", | |
| "if html_content:\n", | |
| " soup = BeautifulSoup(html_content, \"lxml\")\n", | |
| " for el in soup([\"script\", \"style\", \"nav\", \"footer\", \"header\"]):\n", | |
| " el.decompose()\n", | |
| "\n", | |
| " content = soup.find(\"div\", {\"id\": \"mw-content-text\"})\n", | |
| " bs_text = (\n", | |
| " content.get_text(separator=\"\\n\", strip=True)\n", | |
| " if content\n", | |
| " else soup.get_text(separator=\"\\n\", strip=True)\n", | |
| " )\n", | |
| " print(bs_text[:MAX_CHARS])\n", | |
| "else:\n", | |
| " bs_text = \"\"\n", | |
| " print(f\"Skipped: {fetch_error}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "899b203c", | |
| "metadata": { | |
| "papermill": { | |
| "duration": 0.001806, | |
| "end_time": "2025-12-30T14:30:30.314694", | |
| "exception": false, | |
| "start_time": "2025-12-30T14:30:30.312888", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "source": [ | |
| "## Summary\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "id": "0888b242", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2025-12-30T14:30:30.320395Z", | |
| "iopub.status.busy": "2025-12-30T14:30:30.320260Z", | |
| "iopub.status.idle": "2025-12-30T14:30:30.323654Z", | |
| "shell.execute_reply": "2025-12-30T14:30:30.323240Z" | |
| }, | |
| "papermill": { | |
| "duration": 0.0075, | |
| "end_time": "2025-12-30T14:30:30.324247", | |
| "exception": false, | |
| "start_time": "2025-12-30T14:30:30.316747", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "mozilla readability : 14,015 chars\n", | |
| "parallel.ai : 11,460 chars\n", | |
| "html2text : 10,985 chars\n", | |
| "exa : 10,946 chars\n", | |
| "playwright : 9,519 chars\n", | |
| "beautifulsoup : 3,762 chars\n", | |
| "trafilatura : 3,750 chars\n", | |
| "readability-lxml : 1 chars\n", | |
| "newspaper3k : 0 chars\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "results = {\n", | |
| " \"trafilatura\": len(trafilatura_text or \"\"),\n", | |
| " \"newspaper3k\": len(article.text or \"\")\n", | |
| " if html_content and not newspaper_error\n", | |
| " else 0,\n", | |
| " \"readability-lxml\": len(readability_markdown),\n", | |
| " \"mozilla readability\": len(mozilla_markdown),\n", | |
| " \"playwright\": len(playwright_extracted or \"\"),\n", | |
| " \"parallel.ai\": len(parallel_result[0].full_content or \"\") if parallel_result else 0,\n", | |
| " \"exa\": len(exa_result or \"\"),\n", | |
| " \"html2text\": len(html2text_output),\n", | |
| " \"beautifulsoup\": len(bs_text),\n", | |
| "}\n", | |
| "\n", | |
| "if fetch_error:\n", | |
| " print(\n", | |
| " f\"Note: requests fetch failed ({fetch_error}), some tools used Playwright-fetched HTML\\n\"\n", | |
| " )\n", | |
| "\n", | |
| "for name, length in sorted(results.items(), key=lambda x: -x[1]):\n", | |
| " print(f\"{name:25s}: {length:>8,} chars\")" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.12.9" | |
| }, | |
| "papermill": { | |
| "default_parameters": {}, | |
| "duration": 7.340255, | |
| "end_time": "2025-12-30T14:30:30.543496", | |
| "environment_variables": {}, | |
| "exception": null, | |
| "input_path": "compare_extractors.ipynb", | |
| "output_path": "hackernews.ipynb", | |
| "parameters": { | |
| "TEST_URL": "https://news.ycombinator.com/" | |
| }, | |
| "start_time": "2025-12-30T14:30:23.203241", | |
| "version": "2.6.0" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment