Created
December 29, 2025 10:04
-
-
Save me-suzy/6d2e7f717c238de116b8b6fc56fcc08c to your computer and use it in GitHub Desktop.
descarca_biblioteca_TOATA.ps1
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# PowerShell script that fetches ALL the PDFs from biblioteca-digitala.ro.
# It uses the Wayback Machine CDX API to discover all ~100,000 PDFs.

# Root folder that receives the URL list, the progress file and the PDFs.
$OutputDir = "G:\biblioteca-digitala-COMPLET"

# Create the destination folder when it is not there yet.
if (-not (Test-Path $OutputDir)) {
    $null = New-Item -ItemType Directory -Path $OutputDir
}

# Startup banner: four cyan lines followed by one blank line.
@(
    "============================================================"
    "Scraper pentru TOATE PDF-urile de pe biblioteca-digitala.ro"
    "Destinatie: $OutputDir"
    "============================================================"
) | ForEach-Object { Write-Host $_ -ForegroundColor Cyan }
Write-Host ""
# Step 1: extract ALL PDF URLs for the domain from the Wayback Machine.
#
# Produces:
#   $pdfUrls  - array (possibly empty) of unique PDF URLs, forced to https://
#   $urlsFile - path of the text file the URL list is saved to
# Any terminating error in this step is reported and ends the script with
# exit code 1.
Write-Host "[1/2] Extragere URL-uri din Wayback Machine..." -ForegroundColor Yellow
Write-Host " Aceasta poate dura cateva minute pentru ~100.000 fisiere..." -ForegroundColor Gray

# CDX query for the WHOLE biblioteca-digitala.ro domain: JSON output, only
# captures whose mimetype is application/pdf, collapsed on urlkey, returning
# just the "original" field, capped at 200000 rows.
$cdxUrl = "https://web.archive.org/cdx/search/cdx?url=biblioteca-digitala.ro/*&output=json&filter=mimetype:application/pdf&collapse=urlkey&fl=original&limit=200000"

try {
    Write-Host " Interogare Wayback Machine API..." -ForegroundColor Gray
    $response = Invoke-RestMethod -Uri $cdxUrl -Method Get -TimeoutSec 600

    # Collect matches in a growable list. Appending to a PowerShell array with
    # += copies the whole array every time, which is quadratic for ~100k rows.
    $matchedUrls = New-Object 'System.Collections.Generic.List[string]'

    # Row 0 is skipped: in the CDX JSON output it is the header row holding
    # the field names, not a capture.
    for ($i = 1; $i -lt $response.Count; $i++) {
        $url = $response[$i][0]
        if ($url -match '\.pdf$') {
            $matchedUrls.Add(($url -replace '^http://', 'https://'))
        }
    }

    # @() keeps the result an array even for zero or one match, so .Count and
    # foreach behave the same for every result size.
    $pdfUrls = @($matchedUrls | Sort-Object -Unique)

    Write-Host ""
    Write-Host "GASIT: $($pdfUrls.Count) URL-uri PDF unice!" -ForegroundColor Green
    Write-Host ""

    # Save the list
    $urlsFile = Join-Path $OutputDir "TOATE_pdf_urls.txt"
    $pdfUrls | Out-File -FilePath $urlsFile -Encoding UTF8
    Write-Host "Lista salvata in: $urlsFile" -ForegroundColor Gray

    # Per-section statistics
    Write-Host ""
    Write-Host "Statistici pe sectiuni:" -ForegroundColor Yellow
    $revisteCarte = @($pdfUrls | Where-Object { $_ -match '/reviste/carte/' }).Count
    $reviste = @($pdfUrls | Where-Object { $_ -match '/reviste/' -and $_ -notmatch '/reviste/carte/' }).Count
    $altele = $pdfUrls.Count - $revisteCarte - $reviste
    Write-Host " /reviste/carte/ (carti): $revisteCarte" -ForegroundColor White
    Write-Host " /reviste/ (periodice): $reviste" -ForegroundColor White
    Write-Host " alte sectiuni: $altele" -ForegroundColor White
    Write-Host ""
} catch {
    Write-Host "Eroare la accesarea Wayback Machine: $_" -ForegroundColor Red
    Read-Host "Apasa Enter pentru a inchide"
    exit 1
}
# Ask whether the user wants to go on with the download; anything other than
# "da" cancels and ends the script with exit code 0.
@(
    "============================================================"
    "ATENTIE: Descarcarea a $($pdfUrls.Count) fisiere poate dura"
    " multe ore sau chiar zile!"
    "============================================================"
) | ForEach-Object { Write-Host $_ -ForegroundColor Yellow }
Write-Host ""

$confirm = Read-Host "Vrei sa incepi descarcarea? (da/nu)"
if (-not ($confirm -eq "da")) {
    # Cancelled: point the user at the saved URL list and stop.
    Write-Host ""
    Write-Host "Descarcarea anulata. Lista URL-uri este salvata in:" -ForegroundColor Gray
    Write-Host "$urlsFile" -ForegroundColor White
    Write-Host ""
    Write-Host "Poti relua descarcarea mai tarziu ruland din nou scriptul." -ForegroundColor Gray
    Read-Host "Apasa Enter pentru a inchide"
    exit 0
}
# Step 2: download the PDFs, mirroring the site's folder structure under
# $OutputDir.
#
# Reads:  $pdfUrls, $OutputDir
# Writes: $downloaded, $skipped, $errors, $total, $current, $startTime,
#         $elapsed, plus PROGRES.txt in $OutputDir every 100 processed URLs.
# A file that already exists locally and is larger than 1000 bytes is treated
# as complete and skipped, so the script can be re-run to resume.
Write-Host ""
Write-Host "[2/2] Descarcare PDF-uri in $OutputDir ..." -ForegroundColor Yellow
Write-Host ""

# Rendering the Invoke-WebRequest progress bar slows downloads noticeably in
# Windows PowerShell; the script prints its own per-file progress instead.
$ProgressPreference = 'SilentlyContinue'

$downloaded = 0
$skipped = 0
$errors = 0
$total = $pdfUrls.Count
$current = 0
$startTime = Get-Date

foreach ($url in $pdfUrls) {
    $current++

    # Relative path used to recreate the sub-folders locally. The scheme and
    # host are stripped whatever they are (bare host, "www." or an explicit
    # port), so every variant of the same document maps to the same file.
    $relativePath = $url -replace '^https?://[^/]+/', ''
    # System.Uri needs no extra assembly (System.Web.HttpUtility is not loaded
    # by default in Windows PowerShell). Unlike UrlDecode it leaves "+" as a
    # literal plus, which is what "+" means inside a URL path.
    $relativePath = [System.Uri]::UnescapeDataString($relativePath)
    # Replace characters that are not allowed in Windows file names.
    $relativePath = $relativePath -replace '[<>:"|?*]', '_'

    # Full local path (keeps the folder structure)
    $localPath = Join-Path $OutputDir $relativePath
    $localDir = Split-Path $localPath -Parent
    $fileName = Split-Path $localPath -Leaf

    # Create the sub-folders when missing. -LiteralPath keeps "[" and "]" in
    # names from being interpreted as wildcard patterns.
    if (-not (Test-Path -LiteralPath $localDir)) {
        New-Item -ItemType Directory -Path $localDir -Force | Out-Null
    }

    # Skip files that are already present
    if (Test-Path -LiteralPath $localPath) {
        $size = (Get-Item -LiteralPath $localPath).Length
        if ($size -gt 1000) {
            # Count first so the periodic message includes this file.
            $skipped++
            if ($current % 100 -eq 0) {
                Write-Host "[$current/$total] Sarit $skipped existente..." -ForegroundColor DarkGray
            }
            continue
        }
    }

    # Estimate the remaining time from the average time per downloaded file
    $elapsed = (Get-Date) - $startTime
    if ($downloaded -gt 0) {
        $avgTime = $elapsed.TotalSeconds / $downloaded
        $remaining = ($total - $current) * $avgTime
        $eta = [TimeSpan]::FromSeconds($remaining)
        # Total hours, not the 0-23 "hh" component, so an ETA longer than a
        # day is not silently truncated.
        $etaStr = '{0:00}:{1:00}:{2:00}' -f [math]::Floor($eta.TotalHours), $eta.Minutes, $eta.Seconds
    } else {
        $etaStr = "calculare..."
    }

    # Download
    try {
        Write-Host "[$current/$total] ETA:$etaStr - $fileName" -ForegroundColor White
        Invoke-WebRequest -Uri $url -OutFile $localPath -TimeoutSec 120 -UseBasicParsing
        $downloaded++
        # Short pause between requests.
        Start-Sleep -Milliseconds 200
    } catch {
        Write-Host "[$current/$total] EROARE: $fileName - $($_.Exception.Message)" -ForegroundColor Red
        $errors++
        # Remove whatever was written so a truncated file larger than 1000
        # bytes is not mistaken for a finished download on the next run.
        if (Test-Path -LiteralPath $localPath) {
            Remove-Item -LiteralPath $localPath -Force -ErrorAction SilentlyContinue
        }
    }

    # Save the progress every 100 files
    if ($current % 100 -eq 0) {
        $progressFile = Join-Path $OutputDir "PROGRES.txt"
        "Progres: $current / $total`nDescarcat: $downloaded`nSarit: $skipped`nErori: $errors`nUltima actualizare: $(Get-Date)" | Out-File $progressFile -Encoding UTF8
    }
}
# Final report: totals for the run, then wait for Enter before closing.
#
# The elapsed time is measured here from $startTime. The value left over from
# the download loop is taken before the last download starts and is never
# assigned at all when every file was skipped or the URL list was empty,
# which made the .ToString() call fail on $null.
$elapsed = (Get-Date) - $startTime
# Total hours rather than the 0-23 "hh" component, so runs longer than a day
# are reported in full.
$elapsedStr = '{0:00}:{1:00}:{2:00}' -f [math]::Floor($elapsed.TotalHours), $elapsed.Minutes, $elapsed.Seconds

Write-Host ""
Write-Host "============================================================" -ForegroundColor Cyan
Write-Host "REZULTAT FINAL:" -ForegroundColor Cyan
Write-Host " Total fisiere gasite: $total" -ForegroundColor White
Write-Host " Descarcat nou: $downloaded" -ForegroundColor Green
Write-Host " Existau deja: $skipped" -ForegroundColor Yellow
# Errors are shown in red only when there were any.
Write-Host " Erori: $errors" -ForegroundColor $(if ($errors -gt 0) {"Red"} else {"Gray"})
Write-Host " Timp total: $elapsedStr" -ForegroundColor White
Write-Host "============================================================" -ForegroundColor Cyan
Write-Host ""
Write-Host "Fisierele sunt in: $OutputDir" -ForegroundColor Green
Write-Host ""
Read-Host "Apasa Enter pentru a inchide"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment