ethanabrooks · December 30, 2025 00:45
diff --git a/arxiv.ipynb b/arxiv.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0a3a1d08",
   "metadata": {},
   "source": [
    "# Web Content Extraction Tool Comparison\n",
    "\n",
    "Comparing tools for extracting readable text/markdown from web pages.\n",
    "\n",
    "## Tools Evaluated\n",
    "\n",
    "| Tool                | Type       | Notes                                    |\n",
    "| ------------------- | ---------- | ---------------------------------------- |\n",
    "| trafilatura         | Python     | Purpose-built for web text extraction    |\n",
    "| newspaper3k         | Python     | News article extraction                  |\n",
    "| readability-lxml    | Python     | Python port of Mozilla Readability       |\n",
    "| Mozilla Readability | JavaScript | Original Firefox Reader View library     |\n",
    "| Playwright          | Python     | Browser automation for JS-rendered pages |\n",
    "| html2text           | Python     | HTML to Markdown converter               |\n",
    "| BeautifulSoup       | Python     | Manual extraction baseline               |\n",
    "| Parallel.ai         | API        | Commercial service (requires API key)    |\n",
    "| Exa                 | API        | Search/extraction API (requires API key) |\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "025255de",
   "metadata": {},
   "source": [
    "## Configuration\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d880d862",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-12-30T00:41:21.283503Z",
     "iopub.status.busy": "2025-12-30T00:41:21.283341Z",
     "iopub.status.idle": "2025-12-30T00:41:21.287073Z",
     "shell.execute_reply": "2025-12-30T00:41:21.286349Z"
    }
   },
   "outputs": [],
   "source": [
    "MAX_CHARS = 3000  # Maximum characters to display per output"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b24ee9dd",
   "metadata": {},
   "source": [
    "## Setup\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "97de6407",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-12-30T00:41:21.289177Z",
     "iopub.status.busy": "2025-12-30T00:41:21.289059Z",
     "iopub.status.idle": "2025-12-30T00:41:21.337303Z",
     "shell.execute_reply": "2025-12-30T00:41:21.336749Z"
    }
   },
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "import subprocess\n",
    "import requests\n",
    "from pathlib import Path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "ee68e648",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-12-30T00:41:21.338834Z",
     "iopub.status.busy": "2025-12-30T00:41:21.338746Z",
     "iopub.status.idle": "2025-12-30T00:41:26.368835Z",
     "shell.execute_reply": "2025-12-30T00:41:26.368366Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fetched 2,446,996 bytes\n"
     ]
    }
   ],
   "source": [
    "TEST_URL = \"https://www.arxiv.org/pdf/2510.02387\"\n",
    "\n",
    "headers = {\n",
    "    \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36\",\n",
    "    \"Accept\": \"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\",\n",
    "    \"Accept-Language\": \"en-US,en;q=0.9\",\n",
    "}\n",
    "\n",
    "# Define both result variables before the request so that later cells can\n",
    "# always test them, even if the request itself raises.\n",
    "html_content = None\n",
    "fetch_error = None\n",
    "\n",
    "try:\n",
    "    response = requests.get(TEST_URL, headers=headers, timeout=30)\n",
    "except requests.RequestException as e:\n",
    "    # Connection errors and timeouts raise instead of returning a response.\n",
    "    fetch_error = f\"{type(e).__name__}: {e}\"\n",
    "    print(f\"Fetch failed: {fetch_error}\")\n",
    "else:\n",
    "    if response.ok:\n",
    "        html_content = response.text\n",
    "        # Report the raw payload size; len(response.text) would count decoded characters.\n",
    "        print(f\"Fetched {len(response.content):,} bytes\")\n",
    "        content_type = response.headers.get(\"Content-Type\", \"\")\n",
    "        if \"html\" not in content_type.lower():\n",
    "            # html_content is still set, so the extractors below run on non-HTML input.\n",
    "            print(f\"Warning: Content-Type is {content_type!r}, not HTML\")\n",
    "    else:\n",
    "        fetch_error = f\"HTTP {response.status_code}: {response.reason}\"\n",
    "        print(f\"Fetch failed: {fetch_error}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "20a00121",
   "metadata": {},
   "source": [
    "## 1. Trafilatura\n",
    "\n",
    "[trafilatura](https://trafilatura.readthedocs.io/) - Purpose-built for web text extraction with native markdown output.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "77dc050c",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-12-30T00:41:26.371108Z",
     "iopub.status.busy": "2025-12-30T00:41:26.370954Z",
     "iopub.status.idle": "2025-12-30T00:41:26.495319Z",
     "shell.execute_reply": "2025-12-30T00:41:26.494754Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "No content\n"
     ]
    }
   ],
   "source": [
    "import trafilatura\n",
    "\n",
    "# Run the extractor only when the fetch produced content; otherwise keep None.\n",
    "trafilatura_text = (\n",
    "    trafilatura.extract(\n",
    "        html_content,\n",
    "        output_format=\"markdown\",\n",
    "        include_tables=True,\n",
    "        include_links=True,\n",
    "        include_images=False,\n",
    "    )\n",
    "    if html_content\n",
    "    else None\n",
    ")\n",
    "\n",
    "if not html_content:\n",
    "    print(f\"Skipped: {fetch_error}\")\n",
    "elif trafilatura_text:\n",
    "    print(trafilatura_text[:MAX_CHARS])\n",
    "else:\n",
    "    print(\"No content\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "37293fa6",
   "metadata": {},
   "source": [
    "## 2. Newspaper3k\n",
    "\n",
    "[newspaper3k](https://newspaper.readthedocs.io/) - Designed for news article extraction.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "f406ca17",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-12-30T00:41:26.496672Z",
     "iopub.status.busy": "2025-12-30T00:41:26.496549Z",
     "iopub.status.idle": "2025-12-30T00:41:26.625297Z",
     "shell.execute_reply": "2025-12-30T00:41:26.624906Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Newspaper3k error: ValueError: All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters\n"
     ]
    }
   ],
   "source": [
    "from newspaper import Article\n",
    "\n",
    "article = Article(TEST_URL)\n",
    "newspaper_error = None\n",
    "\n",
    "if html_content:\n",
    "    article.set_html(html_content)\n",
    "    try:\n",
    "        article.parse()\n",
    "    except Exception as e:\n",
    "        # Record any parser failure (not only ValueError) as this tool's result\n",
    "        # instead of aborting the notebook; the exception type is kept in the message.\n",
    "        newspaper_error = f\"{type(e).__name__}: {e}\"\n",
    "\n",
    "    if newspaper_error:\n",
    "        print(f\"Newspaper3k error: {newspaper_error}\")\n",
    "    else:\n",
    "        print(article.text[:MAX_CHARS] if article.text else \"No content\")\n",
    "else:\n",
    "    print(f\"Skipped: {fetch_error}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4580af76",
   "metadata": {},
   "source": [
    "## 3. Readability-lxml\n",
    "\n",
    "[readability-lxml](https://github.com/buriy/python-readability) - Python port of Mozilla Readability.\n",
    "Outputs HTML, so we pipe through html2text for markdown.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "a0111b20",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-12-30T00:41:26.626705Z",
     "iopub.status.busy": "2025-12-30T00:41:26.626621Z",
     "iopub.status.idle": "2025-12-30T00:41:26.648512Z",
     "shell.execute_reply": "2025-12-30T00:41:26.648095Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "error getting summary: \n",
      "Traceback (most recent call last):\n",
      "  File \"/Users/ethan/exa-replacement/.venv/lib/python3.12/site-packages/readability/readability.py\", line 227, in summary\n",
      "    self._html(True)\n",
      "  File \"/Users/ethan/exa-replacement/.venv/lib/python3.12/site-packages/readability/readability.py\", line 153, in _html\n",
      "    self.html = self._parse(self.input)\n",
      "                ^^^^^^^^^^^^^^^^^^^^^^^\n",
      "  File \"/Users/ethan/exa-replacement/.venv/lib/python3.12/site-packages/readability/readability.py\", line 167, in _parse\n",
      "    doc = html_cleaner.clean_html(doc)\n",
      "          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
      "  File \"/Users/ethan/exa-replacement/.venv/lib/python3.12/site-packages/lxml_html_clean/clean.py\", line 633, in clean_html\n",
      "    self(doc)\n",
      "  File \"/Users/ethan/exa-replacement/.venv/lib/python3.12/site-packages/lxml_html_clean/clean.py\", line 452, in __call__\n",
      "    _kill.popleft().drop_tree()  # popleft to start with innermost elements\n",
      "    ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
      "  File \"/Users/ethan/exa-replacement/.venv/lib/python3.12/site-packages/lxml/html/__init__.py\", line 326, in drop_tree\n",
      "    previous.tail = (previous.tail or '') + self.tail\n",
      "    ^^^^^^^^^^^^^\n",
      "  File \"src/lxml/etree.pyx\", line 1180, in lxml.etree._Element.tail.__set__\n",
      "  File \"src/lxml/apihelpers.pxi\", line 762, in lxml.etree._setTailText\n",
      "  File \"src/lxml/apihelpers.pxi\", line 737, in lxml.etree._createTextNode\n",
      "  File \"src/lxml/apihelpers.pxi\", line 1538, in lxml.etree._utf8\n",
      "ValueError: All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Readability-lxml error: Unparseable: All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters\n"
     ]
    }
   ],
   "source": [
    "from readability import Document\n",
    "import html2text\n",
    "\n",
    "# HTML-to-Markdown converter, also reused by later cells: keep links, drop images.\n",
    "h2t = html2text.HTML2Text()\n",
    "h2t.body_width = 0\n",
    "h2t.ignore_images = True\n",
    "h2t.ignore_links = False\n",
    "\n",
    "readability_markdown, readability_error = \"\", None\n",
    "\n",
    "if not html_content:\n",
    "    print(f\"Skipped: {fetch_error}\")\n",
    "else:\n",
    "    doc = Document(html_content)\n",
    "    try:\n",
    "        readable_html = doc.summary()\n",
    "        readability_markdown = h2t.handle(readable_html)\n",
    "    except Exception as e:\n",
    "        # Keep the failure as this tool's result rather than stopping the notebook.\n",
    "        readability_error = f\"{type(e).__name__}: {e}\"\n",
    "        print(f\"Readability-lxml error: {readability_error}\")\n",
    "    else:\n",
    "        print(readability_markdown[:MAX_CHARS])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3ef93b51",
   "metadata": {},
   "source": [
    "## 4. Mozilla Readability (JavaScript)\n",
    "\n",
    "[Mozilla Readability](https://github.com/mozilla/readability) - Original Firefox Reader View library, called via Node.js.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "3dba3e69",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-12-30T00:41:26.649708Z",
     "iopub.status.busy": "2025-12-30T00:41:26.649619Z",
     "iopub.status.idle": "2025-12-30T00:41:30.565322Z",
     "shell.execute_reply": "2025-12-30T00:41:30.564873Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "�K�a� \u0015�\u0001����� \u001a�\u00105[��{ \u0001\u0001[\u0005 _d\u0015��ƹW\u0010��C�L�Xgm\u0005�\u0014R&��Iʅ�)0�d &#�su�u��\u0007�\u0003J�v�� � &�HL�~ ��\"\u0003I�\u001aNd�M�ܐ��h;KAjR'�$R����.�h�K&X��(\b�\u0018��(��(�!�˜11]�gŲ��\u001a� ��);a(�e(� �\u0019U\u001aB ����\u0019P3�T\u001a�y��+\u0003�$���gH��v�{����-\u0007 �\u000e�ɑ��FƔ 8�7�a��XE�l�\u0017�u�7(\u0012 �?{��\u0003g�@\u0017�,���GZ���/�$���o$ z���\\ؗ�\u0010玁\u0016$\\�C\u0016�y(�' 8`��q����W�PA����.\u0016%l\u000e��k@\u000fj���.���8\b��Ģ2��Q� :�\u000e2?O�Q��*��2_\u0014��!�\u0004_�����t��\u0011��Xc�\u0014s�\u0004vu�b �\u001bZ\b�Zn��֣�;^.��\\P��y�\u0004�E�t��,�0��S�J �a���Y2\u0005ڳ<��/�0$f� �7� t\u0012���.� ��)2x�� �u ��}�s @�+T_۝\u00071�$\u0004����W:�\u001a�W���=\u0015�� X�DΥ��;{�-��\u0012�\"J��t�[G!pD�c%c��f��.�T2^�w���fK\u0001� �%\\ �v�gi\u0016Q\u0001�i'�+� ��wc4��\u0010&��3;{�4�����C�Ab,Q��̬\\å�σ�)�����\u0004������EEe�-0o[\u0016��i�{\"��}�P���|���A���܊\u0002����\u0001�\b���\u0001\u0014\u0018�/ �Q�KT����(��h yK�t6r\u001a{~\u001a .��2ERI[ ��\u0006���\u0017�a �P<�@��@�x/\u0005* \u0007j���߃LĜ�v��{�/!ea\\���+.\u0013<\u0014{J+�K�\u0011��-ZI\u0016gi��r\u0010�R c�:�) t߁Z\u0019�X`\u0016\u0003ً��T�u���F�C�K]�\u0017�T!H�8\u001ak$͡5U%9Y<֮#��LYrgW\u0017'I�z�����> �c*��\")Y\u0004��\u001b�[��Fmя\u0013VPn!���f蛺���c�G����B��,��R��* �\u0019�s\u0018)��\u0014�y\u0004�Ȋ���s�F���\u0018@}\u0007�f\u0018��\u0005��9 �H�;)\u0003!���W.�. 
s��Kǚ����\u0010�m\u001a ��!���:\u0010\u00158�N�$S�� ���2M6�����U5�q4\u0003�\u001b�\u000e�\u0013ӹ�vCێ\u0005C옗7��B��蜨0<��A��p1Z�i�;\u0001��]��\u0002\u0019ɠs�6���� N쎠�\u0010[^��.��+w#\u0006oQ �n�W�< ��Wi4{˖\u0005��)���Ou�A �-^}`s� \" ��;Ѕa\u0004\bx�'@�z�Ϡ\\ s���\u0007n�3񵸓�!���;�T�i\u0004��o��2&v�\u0017��\u00170�6�.Vd�˛�wc4Ə�\"�G�\u000f�HY��x\u0014\u0005\u0001*R?p�Sa ѕf�TYŪ�<���\u0002��\u000eΔ��wO�\"�p*ӂ��A���H�|�IN\u0014!N�;� +\u0015�:��s�F� Skv�\u0011�e�����\u0015�.�; �\u0002A �.>M��\u001a�Ͽr���\u001b���J�wR ��]��K�Z�\\ �] �$xE {�Żjy<\u0005��0{��6]A臁��G[���QU�^A�g�g^ ��/T��B:Ŭ�x�\u0005�\u00164C+�\u0014F ��ι �Ƨ���\u000e]�Q`SXP>����� ��=B�Y�M�s\u0005�S�==$\\�j\u0003{l��\u001b�ъFRXp�m?�?\u0015�]L/�\"g�ƛJT��ƥ� ����^���V\u000e8J�{U;�Lh\"��\u0004���G.\u0006�f\u000f�\"\u000eRƇ�2ѽ��k9�_��G� �\u0003 P4S� ,��%pW � ���:V�����@җ�\"v�b��7\u0017�\u0001�\u0006�� endstream endobj 46 0 obj << /Length 11 /Filter /FlateDecode >> stream x�� �\u0002�f endstream endobj 47 0 obj << /Filter /FlateDecode /Length 140 >> stream x�E�� \u00021\u0010D������� 7�\u0003�\u0010D��H'\u0016��b\u0011E�09\u0005�)޼a�I�D�g��鋞U�Ď �S�Y�����G �����o����~\u0011|TkC\u0017\u0003\u0006( �#�0�\"\u0016�UT�\u0018\u0010��anv����#X�M� [ Җ։v�\u0001v}&� endstream endobj 48 0 obj << /CS /DeviceRGB /I true /S /Transparency /Type /Group >> endobj 49 0 obj << /Count 6 /Kids [ 9 0 R 99 0 R 100 0 R 101 0 R 102 0 R 103 0 R ] /Parent 12 0 R /Type /Pages >> endobj 50 0 obj << /ColorSpace 104 0 R /ExtGState 105 0 R /Font << /F103 106 0 R /F110 107 0 R /F122 108 0 R /F32 109 0 R /F35 110 0 R /F41 111 0 R /F50 112 0 R /F91 113 0 R /Times-Roman 114 0 R >> /Pattern 115 0 R /ProcSet [ /PDF /Text ] /XObject << /Im1 116 0 R >> >> endobj 51 0 obj << /D (section.1) /S /GoTo >> endobj 52 0 obj << /A 117 0 R /Count -3 /First 118 0 R /Last 119 0 R /Next 120 0 R /Parent 6 0 R /Prev 10 0 R /Title 121 0 R >> endobj 53 0 obj  endobj 54 0 obj << /D (appendix.K) /S /GoTo >> endobj 55 0 obj << /A 122 0 R /Next 11 0 R /Parent 6 0 R /Prev 123 0 R 
/Title 124 0 R >> endobj 56 0 obj  endobj 57 0 obj << /Count 6 /Kids [ 125 0 R 126 0 R 127 0 R 128 0 R 129 0 R 130 0 R ] /Parent 12 0 R /Type /Pages >> endobj 58 0 obj << /Count 6 /Kids [ 131 0 R 132 0 R 133 0 R 134 0 R 135 0 R 136 0 R ] /Parent 12 0 R /Type /Pages >> endobj 59 0 obj << /Count 6 /Kids [ 137 0 R 138 0 R 139 0 R 140 0 R 141 0 R 142 0 R ] /Parent 12 0 R /Type /Pages >> endobj 60 0 obj << /Count \n"
     ]
    }
   ],
   "source": [
    "script_path = Path(\"readability_extract.js\")\n",
    "mozilla_markdown = \"\"\n",
    "mozilla_error = None\n",
    "\n",
    "if not html_content:\n",
    "    print(f\"Skipped: {fetch_error}\")\n",
    "elif not script_path.exists():\n",
    "    print(\"readability_extract.js not found\")\n",
    "else:\n",
    "    try:\n",
    "        result = subprocess.run(\n",
    "            [\"node\", str(script_path)],\n",
    "            input=html_content,\n",
    "            capture_output=True,\n",
    "            text=True,\n",
    "            timeout=30,\n",
    "        )\n",
    "    except (OSError, subprocess.TimeoutExpired) as e:\n",
    "        # OSError: node could not be started; TimeoutExpired: the script ran past the timeout.\n",
    "        mozilla_error = f\"{type(e).__name__}: {e}\"\n",
    "    else:\n",
    "        if result.returncode != 0:\n",
    "            mozilla_error = result.stderr\n",
    "        else:\n",
    "            try:\n",
    "                mozilla_result = json.loads(result.stdout)\n",
    "            except json.JSONDecodeError as e:\n",
    "                mozilla_error = f\"Invalid JSON from {script_path}: {e}\"\n",
    "            else:\n",
    "                # A JSON null result or a null \"content\" field is treated as empty content.\n",
    "                mozilla_markdown = h2t.handle(\n",
    "                    (mozilla_result or {}).get(\"content\") or \"\"\n",
    "                )\n",
    "\n",
    "    if mozilla_error is not None:\n",
    "        print(f\"Error: {mozilla_error}\")\n",
    "    else:\n",
    "        print(mozilla_markdown[:MAX_CHARS])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ceaffb93",
   "metadata": {},
   "source": [
    "## 5. Playwright\n",
    "\n",
    "[Playwright](https://playwright.dev/) - Browser automation that renders JavaScript before extraction.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "5e9bfba5",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-12-30T00:41:30.567221Z",
     "iopub.status.busy": "2025-12-30T00:41:30.567085Z",
     "iopub.status.idle": "2025-12-30T00:41:31.554785Z",
     "shell.execute_reply": "2025-12-30T00:41:31.554247Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Playwright error: Error: Page.goto: Download is starting\n",
      "Call log:\n",
      "  - navigating to \"https://www.arxiv.org/pdf/2510.02387\", waiting until \"networkidle\"\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import asyncio\n",
    "from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout\n",
    "import nest_asyncio\n",
    "\n",
    "nest_asyncio.apply()\n",
    "\n",
    "\n",
    "async def fetch_with_playwright(\n",
    "    url: str, timeout: int = 60000\n",
    ") -> tuple[str | None, str | None]:\n",
    "    \"\"\"Load ``url`` in headless Chromium and return ``(html, error)``.\n",
    "\n",
    "    Exactly one element of the returned tuple is None.\n",
    "\n",
    "    Args:\n",
    "        url: Address to navigate to.\n",
    "        timeout: Navigation timeout in milliseconds, passed to ``page.goto``.\n",
    "\n",
    "    Returns:\n",
    "        ``(html, None)`` on success, or ``(None, message)`` when navigation\n",
    "        times out, the response status is >= 400, or any other exception\n",
    "        is raised.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        async with async_playwright() as p:\n",
    "            browser = await p.chromium.launch(headless=True)\n",
    "            try:\n",
    "                page = await browser.new_page()\n",
    "                response = await page.goto(\n",
    "                    url, wait_until=\"networkidle\", timeout=timeout\n",
    "                )\n",
    "                status = response.status if response else None\n",
    "                # Check the status first so no HTML is fetched for an error response.\n",
    "                if status and status >= 400:\n",
    "                    return None, f\"HTTP {status}\"\n",
    "                return await page.content(), None\n",
    "            finally:\n",
    "                # Close the browser on every path, including when goto() raises.\n",
    "                await browser.close()\n",
    "    except PlaywrightTimeout:\n",
    "        return None, f\"Timeout after {timeout}ms\"\n",
    "    except Exception as e:\n",
    "        return None, f\"{type(e).__name__}: {e}\"\n",
    "\n",
    "\n",
    "# Start from a clean slate so a failed run leaves all three results as None.\n",
    "playwright_html = playwright_extracted = playwright_error = None\n",
    "\n",
    "loop = asyncio.get_event_loop()\n",
    "result = loop.run_until_complete(fetch_with_playwright(TEST_URL))\n",
    "playwright_html, playwright_error = result\n",
    "\n",
    "if not playwright_html:\n",
    "    print(f\"Playwright error: {playwright_error}\")\n",
    "else:\n",
    "    # Run the rendered HTML through trafilatura with markdown output.\n",
    "    playwright_extracted = trafilatura.extract(\n",
    "        playwright_html,\n",
    "        output_format=\"markdown\",\n",
    "        include_tables=True,\n",
    "        include_links=True,\n",
    "        include_images=False,\n",
    "    )\n",
    "    if playwright_extracted:\n",
    "        print(playwright_extracted[:MAX_CHARS])\n",
    "    else:\n",
    "        print(\"No content extracted from HTML\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6e522bf1",
   "metadata": {},
   "source": [
    "## 6. Parallel.ai\n",
    "\n",
    "[Parallel.ai](https://docs.parallel.ai/) - Commercial API for web extraction using the Python SDK.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "a66a4eb3",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-12-30T00:41:31.556107Z",
     "iopub.status.busy": "2025-12-30T00:41:31.556014Z",
     "iopub.status.idle": "2025-12-30T00:41:33.367159Z",
     "shell.execute_reply": "2025-12-30T00:41:33.366312Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "**CWM: An Open-Weights LLM for Research on Code**\n",
      "\n",
      "**Generation with World Models**\n",
      "\n",
      "**Meta FAIR CodeGen Team**\n",
      "\n",
      "We release Code World Model ( CWM ), a 32-billion-parameter open-weights LLM, to advance research\n",
      "\n",
      "on code generation with world models. To improve code understanding beyond what can be learned\n",
      "\n",
      "from training on static code alone, we mid-train CWM on a large amount of observation-action\n",
      "\n",
      "trajectories from Python interpreter and agentic Docker environments, and perform extensive multi-\n",
      "\n",
      "task reasoning RL in verifiable coding, math, and multi-turn software engineering environments. With\n",
      "\n",
      "CWM , we provide a strong testbed for researchers to explore the opportunities world modeling affords\n",
      "\n",
      "for improving code generation with reasoning and planning in computational environments. We\n",
      "\n",
      "present first steps of how world models can benefit agentic coding, enable step-by-step simulation of\n",
      "\n",
      "Python code execution, and show early results of how reasoning can benefit from the latter. CWM is\n",
      "\n",
      "a dense, decoder-only LLM trained with a context size of up to 131 k tokens. Independent of its world\n",
      "\n",
      "modeling capabilities, CWM offers strong performance on general coding and math tasks: it reaches\n",
      "\n",
      "pass@1 scores of 65 _._ 8 % on SWE-bench Verified (with test-time scaling), 68 _._ 6 % on LiveCodeBench,\n",
      "\n",
      "96 _._ 6 % on Math-500, and 76 _._ 0 % on AIME 2024. To support further research on code world modeling,\n",
      "\n",
      "we release model checkpoints after mid-training, SFT, and RL.\n",
      "\n",
      "**Date:** September 29, 2025\n",
      "\n",
      "**Inference Code:** github.com/facebookresearch/cwm\n",
      "\n",
      "**Model Weights:** ai.meta.com/resources/models-and-libraries/cwm-downloads ,\n",
      "\n",
      "huggingface.co/facebook/cwm , ../cwm-sft , ../cwm-pretrain\n",
      "\n",
      "**1**\n",
      "\n",
      "**Introduction**\n",
      "\n",
      "Software development is one of the domains where Large Language Models (LLMs) have already had a\n",
      "\n",
      "significant real-world impact ( Cui et al. , 2024 ; Bick et al. , 2024 ). They have quickly been adopted into the\n",
      "\n",
      "workflows of software engineers worldwide, and their capabilities are advancing fast: from only supporting\n",
      "\n",
      "programmers with small snippets of code to fixing issues or writing code bases autonomously ( Yeverechyahu\n",
      "\n",
      "et al. , 2024 ; Handa et al. , 2025 ). However, reliably generating high-quality code remains a challenge even for\n",
      "\n",
      "the current generation of LLMs, with benchmarks consistently revealing shortcomings upon release ( Hendrycks\n",
      "\n",
      "et al. , 2021a ; Chen et al. , 2021 ; Aider Team , 2025 ; Jimenez et al. , 2024 ).\n",
      "\n",
      "We believe that advancing code generation with LLMs may require new training and modeling paradigms.\n",
      "\n",
      "Typically, code is treated the same as any other text data during pre-training: the model learns to predict\n",
      "\n",
      "code line by line, from left to right and top to bottom. We think this is not sufficient – to master coding, one\n",
      "\n",
      "must understand not just what code _looks like_ but what it _does_ when executed. Such skill is instrumental to\n",
      "\n",
      "the everyday work of software engineers: at a local level, they understand how the execution of \n"
     ]
    }
   ],
   "source": [
    "from parallel import Parallel\n",
    "\n",
    "parallel_result = None\n",
    "parallel_error = None\n",
    "\n",
    "api_key = os.getenv(\"PARALLEL_API_KEY\")\n",
    "if not api_key:\n",
    "    parallel_error = \"PARALLEL_API_KEY not set\"\n",
    "else:\n",
    "    try:\n",
    "        client = Parallel(api_key=api_key)\n",
    "        extract = client.beta.extract(\n",
    "            urls=[TEST_URL],\n",
    "            objective=\"Extract the main content of this page\",\n",
    "            excerpts=True,\n",
    "            full_content=True,\n",
    "        )\n",
    "        parallel_result = extract.results\n",
    "    except Exception as e:\n",
    "        # Record an API failure as this tool's result instead of aborting the notebook.\n",
    "        parallel_error = f\"{type(e).__name__}: {e}\"\n",
    "\n",
    "# Separate a real error from an empty response so \"error: None\" is never printed.\n",
    "if parallel_error:\n",
    "    print(f\"Parallel.ai error: {parallel_error}\")\n",
    "elif not parallel_result:\n",
    "    print(\"No content\")\n",
    "else:\n",
    "    for result in parallel_result:\n",
    "        print(result.full_content[:MAX_CHARS] if result.full_content else \"No content\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fb8e4c91",
   "metadata": {},
   "source": [
    "## 7. Exa\n",
    "\n",
    "[Exa](https://exa.ai/) - AI-native search and content extraction API.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "fd2bc1d0",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-12-30T00:41:33.369925Z",
     "iopub.status.busy": "2025-12-30T00:41:33.369719Z",
     "iopub.status.idle": "2025-12-30T00:41:33.855113Z",
     "shell.execute_reply": "2025-12-30T00:41:33.854644Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CWM: An Open-Weights LLM for Research on Code\n",
      "Generation with World Models\n",
      "Meta FAIR CodeGen Team\n",
      "We release Code World Model (CWM), a 32-billion-parameter open-weights LLM, to advance research\n",
      "on code generation with world models. To improve code understanding beyond what can be learned\n",
      "from training on static code alone, we mid-train CWM on a large amount of observation-action\n",
      "trajectories from Python interpreter and agentic Docker environments, and perform extensive multi\u0002task reasoning RL in verifiable coding, math, and multi-turn software engineering environments. With\n",
      "CWM, we provide a strong testbed for researchers to explore the opportunities world modeling affords\n",
      "for improving code generation with reasoning and planning in computational environments. We\n",
      "present first steps of how world models can benefit agentic coding, enable step-by-step simulation of\n",
      "Python code execution, and show early results of how reasoning can benefit from the latter. CWM is\n",
      "a dense, decoder-only LLM trained with a context size of up to 131 k tokens. Independent of its world\n",
      "modeling capabilities, CWM offers strong performance on general coding and math tasks: it reaches\n",
      "pass@1 scores of 65.8 % on SWE-bench Verified (with test-time scaling), 68.6 % on LiveCodeBench,\n",
      "96.6 % on Math-500, and 76.0 % on AIME 2024. To support further research on code world modeling,\n",
      "we release model checkpoints after mid-training, SFT, and RL.\n",
      "Date: September 29, 2025\n",
      "Inference Code: github.com/facebookresearch/cwm\n",
      "Model Weights: ai.meta.com/resources/models-and-libraries/cwm-downloads,\n",
      "huggingface.co/facebook/cwm, ../cwm-sft, ../cwm-pretrain\n",
      "1 Introduction\n",
      "Software development is one of the domains where Large Language Models (LLMs) have already had a\n",
      "significant real-world impact (Cui et al., 2024; Bick et al., 2024). They have quickly been adopted into the\n",
      "workflows of software engineers worldwide, and their capabilities are advancing fast: from only supporting\n",
      "programmers with small snippets of code to fixing issues or writing code bases autonomously (Yeverechyahu\n",
      "et al., 2024; Handa et al., 2025). However, reliably generating high-quality code remains a challenge even for\n",
      "the current generation of LLMs, with benchmarks consistently revealing shortcomings upon release (Hendrycks\n",
      "et al., 2021a; Chen et al., 2021; Aider Team, 2025; Jimenez et al., 2024).\n",
      "We believe that advancing code generation with LLMs may require new training and modeling paradigms.\n",
      "Typically, code is treated the same as any other text data during pre-training: the model learns to predict\n",
      "code line by line, from left to right and top to bottom. We think this is not sufficient – to master coding, one\n",
      "must understand not just what code looks like but what it does when executed. Such skill is instrumental to\n",
      "the everyday work of software engineers: at a local level, they understand how the execution of a line of code\n",
      "changes the state of the local variables, and, at a global level, they can make predictions about\n"
     ]
    }
   ],
   "source": [
    "from exa_py import Exa\n",
    "\n",
    "exa_result = None\n",
    "exa_error = None\n",
    "\n",
    "exa_api_key = os.getenv(\"EXA_API_KEY\")\n",
    "if not exa_api_key:\n",
    "    exa_error = \"EXA_API_KEY not set\"\n",
    "else:\n",
    "    try:\n",
    "        exa = Exa(exa_api_key)\n",
    "        results = exa.get_contents(urls=[TEST_URL], text=True)\n",
    "    except Exception as e:\n",
    "        # Record an API failure as this tool's result instead of aborting the notebook.\n",
    "        exa_error = f\"{type(e).__name__}: {e}\"\n",
    "    else:\n",
    "        if results.results:\n",
    "            exa_result = results.results[0].text\n",
    "\n",
    "# Separate a real error from an empty response so \"error: None\" is never printed.\n",
    "if exa_error:\n",
    "    print(f\"Exa error: {exa_error}\")\n",
    "elif exa_result:\n",
    "    print(exa_result[:MAX_CHARS])\n",
    "else:\n",
    "    print(\"No content\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b5c89c1a",
   "metadata": {},
   "source": [
    "## 8. html2text (direct)\n",
    "\n",
    "[html2text](https://github.com/Alir3z4/html2text) - Converts HTML to Markdown without readability filtering.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "298e1fb0",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-12-30T00:41:33.856501Z",
     "iopub.status.busy": "2025-12-30T00:41:33.856404Z",
     "iopub.status.idle": "2025-12-30T00:41:33.964718Z",
     "shell.execute_reply": "2025-12-30T00:41:33.964259Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "%PDF-1.7 %���� 1 0 obj << /Metadata 3 0 R /Names 4 0 R /OpenAction 5 0 R /Outlines 6 0 R /PageMode /UseOutlines /Pages 7 0 R /Type /Catalog >> endobj 2 0 obj << /Author (FAIR CodeGen team; Quentin Carbonneaux; Gal Cohen; Jonas Gehring; Jacob Kahn; Jannik Kossen; Felix Kreuk; Emily McMilin; Michel Meyer; Yuxiang Wei; David Zhang; Kunhao Zheng; Jordi Armengol-Estap�; Pedram Bashiri; Maximilian Beck; Pierre Chambon; Abhishek Charnalia; Chris Cummins; Juliette Decugis; Zacharias V. Fisches; Fran�ois Fleuret; Fabian Gloeckle; Alex Gu; Michael Hassid; Daniel Haziza; Badr Youbi Idrissi; Christian Keller; Rahul Kindi; Hugh Leather; Gallil Maimon; Aram Markosyan; Francisco Massa; Pierre-Emmanuel Mazar�; Vegard Mella; Naila Murray; Keyur Muzumdar; Peter O'Hearn; Matteo Pagliardini; Dmitrii Pedchenko; Tal Remez; Volker Seeker; Marco Selvi; Oren Sultan; Sida Wang; Luca Wehrstedt; Ori Yoran; Lingming Zhang; Taco Cohen; Yossi Adi; Gabriel Synnaeve) /Creator (arXiv GenPDF \\\\(tex2pdf:\\\\)) /DOI (https://doi.org/10.48550/arXiv.2510.02387) /License (http://arxiv.org/licenses/nonexclusive-distrib/1.0/) /PTEX.Fullbanner (This is pdfTeX, Version 3.141592653-2.6-1.40.28 \\\\(TeX Live 2025\\\\) kpathsea version 6.4.1) /Producer (pikepdf 8.15.1) /Title (CWM: An Open-Weights LLM for Research on Code Generation with World Models) /Trapped /False /arXivID (https://arxiv.org/abs/2510.02387v1) >> endobj 3 0 obj << /Subtype /XML /Type /Metadata /Length 3121 >> stream  CWM: An Open-Weights LLM for Research on Code Generation with World ModelsFAIR CodeGen teamQuentin CarbonneauxGal CohenJonas GehringJacob KahnJannik KossenFelix KreukEmily McMilinMichel MeyerYuxiang WeiDavid ZhangKunhao ZhengJordi Armengol-EstapéPedram BashiriMaximilian BeckPierre ChambonAbhishek CharnaliaChris CumminsJuliette DecugisZacharias V. 
FischesFrançois FleuretFabian GloeckleAlex GuMichael HassidDaniel HazizaBadr Youbi IdrissiChristian KellerRahul KindiHugh LeatherGallil MaimonAram MarkosyanFrancisco MassaPierre-Emmanuel MazaréVegard MellaNaila MurrayKeyur MuzumdarPeter O'HearnMatteo PagliardiniDmitrii PedchenkoTal RemezVolker SeekerMarco SelviOren SultanSida WangLuca WehrstedtOri YoranLingming ZhangTaco CohenYossi AdiGabriel Synnaevehttp://arxiv.org/licenses/nonexclusive-distrib/1.0/cs.SEcs.AIcs.LG endstream endobj 4 0 obj << /Dests 8 0 R >> endobj 5 0 obj << /D [ 9 0 R /Fit ] /S /GoTo >> endobj 6 0 obj << /Count 20 /First 10 0 R /Last 11 0 R /Type /Outlines >> endobj 7 0 obj << /Count 58 /Kids [ 12 0 R 13 0 R ] /Type /Pages >> endobj 8 0 obj << /Kids [ 14 0 R 15 0 R 16 0 R 17 0 R 18 0 R 19 0 R ] /Limits [ (Doc-Start) (table.caption.8) ] >> endobj 9 0 obj << /Annots [ 20 0 R 21 0 R 22 0 R 23 0 R 24 0 R 25 0 R 26 0 R 27 0 R 28 0 R 29 0 R 30 0 R 31 0 R 32 0 R 33 0 R 34 0 R 35 0 R 36 0 R 37 0 R 38 0 R 39 0 R 40 0 R 41 0 R 42 0 R 43 0 R ] /Contents [ 44 0 R 45 0 R 46 0 R 47 0 R ] /Group 48 0 R /MediaBox [ 0 0 612 792 ] /Parent 49 0 R /Resources 50 0 R /Type /Page >> endobj 10 0 obj << /A 51 0 R /Next 52 0 R /Pa\n"
     ]
    }
   ],
   "source": [
    "# Convert the fetched page to Markdown with html2text. When nothing was\n",
    "# fetched, report why and keep an empty string so the result is always a str.\n",
    "if not html_content:\n",
    "    html2text_output = \"\"\n",
    "    print(f\"Skipped: {fetch_error}\")\n",
    "else:\n",
    "    html2text_output = h2t.handle(html_content)\n",
    "    # Only a MAX_CHARS-long preview is displayed; the full text is kept.\n",
    "    print(html2text_output[:MAX_CHARS])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f6584f23",
   "metadata": {},
   "source": [
    "## 9. BeautifulSoup\n",
    "\n",
    "[BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) - Manual text extraction baseline.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "1a165cb6",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-12-30T00:41:33.966032Z",
     "iopub.status.busy": "2025-12-30T00:41:33.965944Z",
     "iopub.status.idle": "2025-12-30T00:41:34.032760Z",
     "shell.execute_reply": "2025-12-30T00:41:34.032358Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "%PDF-1.7\n",
      "%����\n",
      "1 0 obj\n",
      "<< /Metadata 3 0 R /Names 4 0 R /OpenAction 5 0 R /Outlines 6 0 R /PageMode /UseOutlines /Pages 7 0 R /Type /Catalog >>\n",
      "endobj\n",
      "2 0 obj\n",
      "<< /Author (FAIR CodeGen team; Quentin Carbonneaux; Gal Cohen; Jonas Gehring; Jacob Kahn; Jannik Kossen; Felix Kreuk; Emily McMilin; Michel Meyer; Yuxiang Wei; David Zhang; Kunhao Zheng; Jordi Armengol-Estap�; Pedram Bashiri; Maximilian Beck; Pierre Chambon; Abhishek Charnalia; Chris Cummins; Juliette Decugis; Zacharias V. Fisches; Fran�ois Fleuret; Fabian Gloeckle; Alex Gu; Michael Hassid; Daniel Haziza; Badr Youbi Idrissi; Christian Keller; Rahul Kindi; Hugh Leather; Gallil Maimon; Aram Markosyan; Francisco Massa; Pierre-Emmanuel Mazar�; Vegard Mella; Naila Murray; Keyur Muzumdar; Peter O'Hearn; Matteo Pagliardini; Dmitrii Pedchenko; Tal Remez; Volker Seeker; Marco Selvi; Oren Sultan; Sida Wang; Luca Wehrstedt; Ori Yoran; Lingming Zhang; Taco Cohen; Yossi Adi; Gabriel Synnaeve) /Creator (arXiv GenPDF \\(tex2pdf:\\)) /DOI (https://doi.org/10.48550/arXiv.2510.02387) /License (http://arxiv.org/licenses/nonexclusive-distrib/1.0/) /PTEX.Fullbanner (This is pdfTeX, Version 3.141592653-2.6-1.40.28 \\(TeX Live 2025\\) kpathsea version 6.4.1) /Producer (pikepdf 8.15.1) /Title (CWM: An Open-Weights LLM for Research on Code Generation with World Models) /Trapped /False /arXivID (https://arxiv.org/abs/2510.02387v1) >>\n",
      "endobj\n",
      "3 0 obj\n",
      "<< /Subtype /XML /Type /Metadata /Length 3121 >>\n",
      "stream\n",
      "CWM: An Open-Weights LLM for Research on Code Generation with World Models\n",
      "FAIR CodeGen team\n",
      "Quentin Carbonneaux\n",
      "Gal Cohen\n",
      "Jonas Gehring\n",
      "Jacob Kahn\n",
      "Jannik Kossen\n",
      "Felix Kreuk\n",
      "Emily McMilin\n",
      "Michel Meyer\n",
      "Yuxiang Wei\n",
      "David Zhang\n",
      "Kunhao Zheng\n",
      "Jordi Armengol-Estapé\n",
      "Pedram Bashiri\n",
      "Maximilian Beck\n",
      "Pierre Chambon\n",
      "Abhishek Charnalia\n",
      "Chris Cummins\n",
      "Juliette Decugis\n",
      "Zacharias V. Fisches\n",
      "François Fleuret\n",
      "Fabian Gloeckle\n",
      "Alex Gu\n",
      "Michael Hassid\n",
      "Daniel Haziza\n",
      "Badr Youbi Idrissi\n",
      "Christian Keller\n",
      "Rahul Kindi\n",
      "Hugh Leather\n",
      "Gallil Maimon\n",
      "Aram Markosyan\n",
      "Francisco Massa\n",
      "Pierre-Emmanuel Mazaré\n",
      "Vegard Mella\n",
      "Naila Murray\n",
      "Keyur Muzumdar\n",
      "Peter O'Hearn\n",
      "Matteo Pagliardini\n",
      "Dmitrii Pedchenko\n",
      "Tal Remez\n",
      "Volker Seeker\n",
      "Marco Selvi\n",
      "Oren Sultan\n",
      "Sida Wang\n",
      "Luca Wehrstedt\n",
      "Ori Yoran\n",
      "Lingming Zhang\n",
      "Taco Cohen\n",
      "Yossi Adi\n",
      "Gabriel Synnaeve\n",
      "http://arxiv.org/licenses/nonexclusive-distrib/1.0/\n",
      "cs.SE\n",
      "cs.AI\n",
      "cs.LG\n",
      "endstream\n",
      "endobj\n",
      "4 0 obj\n",
      "<< /Dests 8 0 R >>\n",
      "endobj\n",
      "5 0 obj\n",
      "<< /D [ 9 0 R /Fit ] /S /GoTo >>\n",
      "endobj\n",
      "6 0 obj\n",
      "<< /Count 20 /First 10 0 R /Last 11 0 R /Type /Outlines >>\n",
      "endobj\n",
      "7 0 obj\n",
      "<< /Count 58 /Kids [ 12 0 R 13 0 R ] /Type /Pages >>\n",
      "endobj\n",
      "8 0 obj\n",
      "<< /Kids [ 14 0 R 15 0 R 16 0 R 17 0 R 18 0 R 19 0 R ] /Limits [ (Doc-Start) (table.caption.8) ] >>\n",
      "endobj\n",
      "9 0 obj\n",
      "<< /Annots [ 20 0 R 21 0 R 22 0 R 23 0 R 24 0 R 25 0 R 26 0 R 27 0 R 28 0 R 29 0 R 30 0 R 31 0 R 32 0 R 33 0 R 34 0 R 35 0 R 36 0 R 37 0 R 38 0 R 39 0 R 40 0 R 41 0 R 42 0 R 43 0 R ] /Contents [ 44 0 R 45 0 R 46 0 R 47 0 R ] /Group 48 0 R /MediaBox [ 0 0 612 792 ] /Parent 49 0 R /Resources 50 0 R /Type /Page\n"
     ]
    }
   ],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "\n",
    "# Baseline extraction: parse with lxml, drop non-content tags, then take\n",
    "# the visible text. With no fetched content, report why and keep an empty str.\n",
    "if not html_content:\n",
    "    bs_text = \"\"\n",
    "    print(f\"Skipped: {fetch_error}\")\n",
    "else:\n",
    "    soup = BeautifulSoup(html_content, \"lxml\")\n",
    "    # Remove scripts, styles and page chrome so they do not pollute the text.\n",
    "    for el in soup.find_all([\"script\", \"style\", \"nav\", \"footer\", \"header\"]):\n",
    "        el.decompose()\n",
    "\n",
    "    # Prefer the `mw-content-text` div (presumably the MediaWiki article body;\n",
    "    # TODO confirm it is still wanted here) and otherwise use the whole document.\n",
    "    content = soup.find(\"div\", {\"id\": \"mw-content-text\"})\n",
    "    bs_text = (content if content else soup).get_text(separator=\"\\n\", strip=True)\n",
    "    print(bs_text[:MAX_CHARS])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "899b203c",
   "metadata": {},
   "source": [
    "## Summary\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0888b242",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-12-30T00:41:34.034135Z",
     "iopub.status.busy": "2025-12-30T00:41:34.034058Z",
     "iopub.status.idle": "2025-12-30T00:41:34.036724Z",
     "shell.execute_reply": "2025-12-30T00:41:34.036241Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "html2text                : 2,427,273 chars\n",
      "beautifulsoup            : 2,106,014 chars\n",
      "mozilla readability      : 2,074,369 chars\n",
      "exa                      :  220,035 chars\n",
      "parallel.ai              :  100,000 chars\n",
      "trafilatura              :        0 chars\n",
      "newspaper3k              :        0 chars\n",
      "readability-lxml         :        0 chars\n",
      "playwright               :        0 chars\n"
     ]
    }
   ],
   "source": [
    "# Number of characters each tool extracted; 0 means the tool was skipped,\n",
    "# failed, or returned nothing.\n",
    "results = {\n",
    "    \"trafilatura\": len(trafilatura_text or \"\"),\n",
    "    \"newspaper3k\": len(article.text or \"\")\n",
    "    if html_content and not newspaper_error\n",
    "    else 0,\n",
    "    \"readability-lxml\": len(readability_markdown),\n",
    "    \"mozilla readability\": len(mozilla_markdown),\n",
    "    \"playwright\": len(playwright_extracted or \"\"),\n",
    "    \"parallel.ai\": len(parallel_result[0].full_content or \"\") if parallel_result else 0,\n",
    "    \"exa\": len(exa_result or \"\"),\n",
    "    \"html2text\": len(html2text_output),\n",
    "    \"beautifulsoup\": len(bs_text),\n",
    "}\n",
    "\n",
    "if fetch_error:\n",
    "    print(\n",
    "        f\"Note: requests fetch failed ({fetch_error}), some tools used Playwright-fetched HTML\\n\"\n",
    "    )\n",
    "\n",
    "# Size the count column from the data: a fixed width of 8 is too narrow for\n",
    "# counts of 1,000,000 or more (9+ characters with separators), which left the\n",
    "# table misaligned. 8 stays the minimum so small results look as before.\n",
    "count_width = max(8, *(len(f\"{length:,}\") for length in results.values()))\n",
    "\n",
    "# Largest extraction first.\n",
    "for name, length in sorted(results.items(), key=lambda x: -x[1]):\n",
    "    print(f\"{name:25s}: {length:>{count_width},} chars\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
No results found