Skip to content

Instantly share code, notes, and snippets.

@jmcph4
Last active December 14, 2025 04:38
Show Gist options
  • Select an option

  • Save jmcph4/4012ff4d7966b6a901d8228539fb93de to your computer and use it in GitHub Desktop.

Select an option

Save jmcph4/4012ff4d7966b6a901d8228539fb93de to your computer and use it in GitHub Desktop.
use eyre::Result;
use futures::{stream, StreamExt, TryStreamExt};
use playwright::{api::DocumentLoadState, Playwright};
use std::{sync::Arc, time::Duration};
use tokio::sync::Semaphore;
use url::Url;
/// Fetches fully rendered HTML pages through a single shared headless
/// Chromium browser context (see [`HtmlFetcher::new`]).
pub struct HtmlFetcher {
    // One long-lived browser context reused across fetches; each call to
    // `fetch_html` opens its own Page, so concurrent use is safe.
    context: playwright::api::BrowserContext,
}
impl HtmlFetcher {
    /// Create one browser + one context and reuse it across many fetches.
    ///
    /// # Errors
    /// Fails if Playwright cannot initialize, install its browsers, launch
    /// Chromium, or build the browser context.
    pub async fn new() -> Result<Self> {
        let playwright = Playwright::initialize().await?;
        playwright.prepare()?; // installs browsers on first run
        let browser = playwright
            .chromium()
            .launcher()
            .headless(true)
            .launch()
            .await?;
        // A realistic desktop UA avoids trivial bot blocks on some sites.
        // NOTE: the `\` line continuation strips the following indentation,
        // so the UA string stays a single line at runtime.
        let context = browser
            .context_builder()
            .user_agent(
                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 \
                 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            )
            .build()
            .await?;
        context.set_default_navigation_timeout(45_000).await?;
        Ok(Self { context })
    }

    /// Fetch a single URL using a fresh Page (safe to do concurrently).
    ///
    /// # Errors
    /// Fails for non-HTTP(S) schemes, if both navigation strategies
    /// (network-idle, then plain "load") fail, or if the rendered content
    /// cannot be read.
    pub async fn fetch_html(&self, url: Url) -> Result<String> {
        match url.scheme() {
            "http" | "https" => {}
            s => return Err(eyre::eyre!("unsupported URL scheme: {s}")),
        }
        let page = self.context.new_page().await?;
        let target = url.as_str();
        // Run all navigation inside a block so the page is closed on EVERY
        // path: previously an error from goto/eval/content returned early
        // and leaked the Chromium page for the lifetime of the context.
        let result: Result<String> = async {
            // Prefer network-idle for JS rendering, fall back to "load" for
            // pages that never settle (long-polling, analytics beacons, ...).
            if page
                .goto_builder(target)
                .wait_until(DocumentLoadState::NetworkIdle)
                .timeout(45_000.0)
                .goto()
                .await
                .is_err()
            {
                page.goto_builder(target)
                    .wait_until(DocumentLoadState::Load)
                    .timeout(45_000.0)
                    .goto()
                    .await?;
            }
            // Let hydration / microtasks settle briefly (250 ms).
            let _: bool = page
                .eval("() => new Promise(r => setTimeout(() => r(true), 250))")
                .await?;
            Ok(page.content().await?)
        }
        .await;
        // Best-effort close; a close failure must not mask the fetch result.
        let _ = page.close(None).await;
        result
    }
}
/// Run up to `concurrency` fetches at a time over ~5,800 URLs.
///
/// Per-URL failures — including the 60 s hard timeout — are recorded in the
/// returned vector instead of aborting the whole batch; the outer `Result`
/// only fails if the shared browser cannot be created.
///
/// Notes:
/// - “As parallel as possible” is limited by CPU/RAM and target sites.
/// - With Playwright/Chromium, 16–64 concurrency is common; tune based on machine.
pub async fn fetch_all_html(
    urls: Vec<Url>,
    concurrency: usize,
) -> Result<Vec<(Url, Result<String>)>> {
    let fetcher = Arc::new(HtmlFetcher::new().await?);
    // `buffer_unordered` already bounds in-flight futures; the semaphore
    // keeps the limit explicit even if the buffering strategy changes.
    let sem = Arc::new(Semaphore::new(concurrency));
    let results = stream::iter(urls.into_iter().map(|url| {
        let fetcher = Arc::clone(&fetcher);
        let sem = Arc::clone(&sem);
        async move {
            let _permit = sem.acquire_owned().await.expect("semaphore closed");
            // Hard per-request timeout. A timeout becomes THIS url's error
            // entry; the previous `?` turned one slow page into a stream-level
            // Err that cancelled every remaining fetch via `try_collect`.
            let res = match tokio::time::timeout(
                Duration::from_secs(60),
                fetcher.fetch_html(url.clone()),
            )
            .await
            {
                Ok(inner) => inner,
                Err(_) => Err(eyre::eyre!("timeout")),
            };
            (url, res)
        }
    }))
    .buffer_unordered(concurrency)
    .collect::<Vec<_>>()
    .await;
    Ok(results)
}
// Example usage:
// #[tokio::main]
// async fn main() -> eyre::Result<()> {
// let urls: Vec<Url> = vec![
// "https://example.com".parse()?,
// "https://news.ycombinator.com".parse()?,
// ];
// let out = fetch_all_html(urls, 32).await?;
// for (u, res) in out {
// match res {
// Ok(html) => println!("{} -> {} bytes", u, html.len()),
// Err(e) => eprintln!("{} -> error: {e}", u),
// }
// }
// Ok(())
// }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment