Skip to content

Instantly share code, notes, and snippets.

@jmcph4
Last active December 14, 2025 04:38
Show Gist options
  • Select an option

  • Save jmcph4/4012ff4d7966b6a901d8228539fb93de to your computer and use it in GitHub Desktop.

Select an option

Save jmcph4/4012ff4d7966b6a901d8228539fb93de to your computer and use it in GitHub Desktop.
use eyre::Result;
use futures::{stream, StreamExt, TryStreamExt};
use playwright::{api::DocumentLoadState, Playwright};
use std::{sync::Arc, time::Duration};
use tokio::sync::Semaphore;
use url::Url;
/// Fetches fully rendered HTML pages through a single shared headless
/// Chromium browser context (see [`HtmlFetcher::new`]).
pub struct HtmlFetcher {
    // One long-lived browser context reused across fetches; each call to
    // `fetch_html` opens its own Page, so concurrent use is safe.
    context: playwright::api::BrowserContext,
}
impl HtmlFetcher {
    /// Create one browser + one context and reuse it across many fetches.
    ///
    /// # Errors
    /// Fails if Playwright cannot initialize, install its browsers, launch
    /// Chromium, or build the browser context.
    pub async fn new() -> Result<Self> {
        let playwright = Playwright::initialize().await?;
        playwright.prepare()?; // installs browsers on first run
        let browser = playwright
            .chromium()
            .launcher()
            .headless(true)
            .launch()
            .await?;
        // A realistic desktop UA avoids trivial bot blocks on some sites.
        // NOTE: the `\` line continuation strips the following indentation,
        // so the UA string stays a single line at runtime.
        let context = browser
            .context_builder()
            .user_agent(
                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 \
                 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            )
            .build()
            .await?;
        context.set_default_navigation_timeout(45_000).await?;
        Ok(Self { context })
    }

    /// Fetch a single URL using a fresh Page (safe to do concurrently).
    ///
    /// # Errors
    /// Fails for non-HTTP(S) schemes, if both navigation strategies
    /// (network-idle, then plain "load") fail, or if the rendered content
    /// cannot be read.
    pub async fn fetch_html(&self, url: Url) -> Result<String> {
        match url.scheme() {
            "http" | "https" => {}
            s => return Err(eyre::eyre!("unsupported URL scheme: {s}")),
        }
        let page = self.context.new_page().await?;
        let target = url.as_str();
        // Run all navigation inside a block so the page is closed on EVERY
        // path: previously an error from goto/eval/content returned early
        // and leaked the Chromium page for the lifetime of the context.
        let result: Result<String> = async {
            // Prefer network-idle for JS rendering, fall back to "load" for
            // pages that never settle (long-polling, analytics beacons, ...).
            if page
                .goto_builder(target)
                .wait_until(DocumentLoadState::NetworkIdle)
                .timeout(45_000.0)
                .goto()
                .await
                .is_err()
            {
                page.goto_builder(target)
                    .wait_until(DocumentLoadState::Load)
                    .timeout(45_000.0)
                    .goto()
                    .await?;
            }
            // Let hydration / microtasks settle briefly (250 ms).
            let _: bool = page
                .eval("() => new Promise(r => setTimeout(() => r(true), 250))")
                .await?;
            Ok(page.content().await?)
        }
        .await;
        // Best-effort close; a close failure must not mask the fetch result.
        let _ = page.close(None).await;
        result
    }
}
/// Run up to `concurrency` fetches at a time over ~5,800 URLs.
///
/// Per-URL failures — including the 60 s hard timeout — are recorded in the
/// returned vector instead of aborting the whole batch; the outer `Result`
/// only fails if the shared browser cannot be created.
///
/// Notes:
/// - “As parallel as possible” is limited by CPU/RAM and target sites.
/// - With Playwright/Chromium, 16–64 concurrency is common; tune based on machine.
pub async fn fetch_all_html(
    urls: Vec<Url>,
    concurrency: usize,
) -> Result<Vec<(Url, Result<String>)>> {
    let fetcher = Arc::new(HtmlFetcher::new().await?);
    // `buffer_unordered` already bounds in-flight futures; the semaphore
    // keeps the limit explicit even if the buffering strategy changes.
    let sem = Arc::new(Semaphore::new(concurrency));
    let results = stream::iter(urls.into_iter().map(|url| {
        let fetcher = Arc::clone(&fetcher);
        let sem = Arc::clone(&sem);
        async move {
            let _permit = sem.acquire_owned().await.expect("semaphore closed");
            // Hard per-request timeout. A timeout becomes THIS url's error
            // entry; the previous `?` turned one slow page into a stream-level
            // Err that cancelled every remaining fetch via `try_collect`.
            let res = match tokio::time::timeout(
                Duration::from_secs(60),
                fetcher.fetch_html(url.clone()),
            )
            .await
            {
                Ok(inner) => inner,
                Err(_) => Err(eyre::eyre!("timeout")),
            };
            (url, res)
        }
    }))
    .buffer_unordered(concurrency)
    .collect::<Vec<_>>()
    .await;
    Ok(results)
}
// Example usage:
// #[tokio::main]
// async fn main() -> eyre::Result<()> {
// let urls: Vec<Url> = vec![
// "https://example.com".parse()?,
// "https://news.ycombinator.com".parse()?,
// ];
// let out = fetch_all_html(urls, 32).await?;
// for (u, res) in out {
// match res {
// Ok(html) => println!("{} -> {} bytes", u, html.len()),
// Err(e) => eprintln!("{} -> error: {e}", u),
// }
// }
// Ok(())
// }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment