Last active
December 14, 2025 04:38
-
-
Save jmcph4/4012ff4d7966b6a901d8228539fb93de to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| use eyre::Result; | |
| use futures::{stream, StreamExt, TryStreamExt}; | |
| use playwright::{api::DocumentLoadState, Playwright}; | |
| use std::{sync::Arc, time::Duration}; | |
| use tokio::sync::Semaphore; | |
| use url::Url; | |
| pub struct HtmlFetcher { | |
| context: playwright::api::BrowserContext, | |
| } | |
| impl HtmlFetcher { | |
| /// Create one browser + one context and reuse it across many fetches. | |
| pub async fn new() -> Result<Self> { | |
| let playwright = Playwright::initialize().await?; | |
| playwright.prepare()?; // installs browsers on first run | |
| let browser = playwright | |
| .chromium() | |
| .launcher() | |
| .headless(true) | |
| .launch() | |
| .await?; | |
| let context = browser | |
| .context_builder() | |
| .user_agent( | |
| "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 \ | |
| (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", | |
| ) | |
| .build() | |
| .await?; | |
| context.set_default_navigation_timeout(45_000).await?; | |
| Ok(Self { context }) | |
| } | |
| /// Fetch a single URL using a fresh Page (safe to do concurrently). | |
| pub async fn fetch_html(&self, url: Url) -> Result<String> { | |
| match url.scheme() { | |
| "http" | "https" => {} | |
| s => return Err(eyre::eyre!("unsupported URL scheme: {s}")), | |
| } | |
| let page = self.context.new_page().await?; | |
| let target = url.as_str(); | |
| // Prefer network-idle for JS rendering, fallback to "load" for pages that never settle. | |
| if page | |
| .goto_builder(target) | |
| .wait_until(DocumentLoadState::NetworkIdle) | |
| .timeout(45_000.0) | |
| .goto() | |
| .await | |
| .is_err() | |
| { | |
| page.goto_builder(target) | |
| .wait_until(DocumentLoadState::Load) | |
| .timeout(45_000.0) | |
| .goto() | |
| .await?; | |
| } | |
| // Let hydration / microtasks settle briefly. | |
| let _: bool = page | |
| .eval("() => new Promise(r => setTimeout(() => r(true), 250))") | |
| .await?; | |
| let html = page.content().await?; | |
| // Close the page to free resources. | |
| let _ = page.close(None).await; | |
| Ok(html) | |
| } | |
| } | |
| /// Run up to `concurrency` fetches at a time over ~5,800 URLs. | |
| /// | |
| /// Notes: | |
| /// - “As parallel as possible” is limited by CPU/RAM and target sites. | |
| /// - With Playwright/Chromium, 16–64 concurrency is common; tune based on machine. | |
| pub async fn fetch_all_html( | |
| urls: Vec<Url>, | |
| concurrency: usize, | |
| ) -> Result<Vec<(Url, Result<String>)>> { | |
| let fetcher = Arc::new(HtmlFetcher::new().await?); | |
| let sem = Arc::new(Semaphore::new(concurrency)); | |
| stream::iter(urls.into_iter().map(|url| { | |
| let fetcher = Arc::clone(&fetcher); | |
| let sem = Arc::clone(&sem); | |
| async move { | |
| let _permit = sem.acquire_owned().await.expect("semaphore closed"); | |
| // If you want per-request hard timeouts: | |
| let res = tokio::time::timeout(Duration::from_secs(60), fetcher.fetch_html(url.clone())) | |
| .await | |
| .map_err(|_| eyre::eyre!("timeout"))? | |
| .map_err(Into::into); | |
| Ok::<_, eyre::Report>((url, res)) | |
| } | |
| })) | |
| .buffer_unordered(concurrency) | |
| .try_collect::<Vec<_>>() | |
| .await | |
| } | |
| // Example usage: | |
| // #[tokio::main] | |
| // async fn main() -> eyre::Result<()> { | |
| // let urls: Vec<Url> = vec![ | |
| // "https://example.com".parse()?, | |
| // "https://news.ycombinator.com".parse()?, | |
| // ]; | |
| // let out = fetch_all_html(urls, 32).await?; | |
| // for (u, res) in out { | |
| // match res { | |
| // Ok(html) => println!("{} -> {} bytes", u, html.len()), | |
| // Err(e) => eprintln!("{} -> error: {e}", u), | |
| // } | |
| // } | |
| // Ok(()) | |
| // } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment