Skip to content

Instantly share code, notes, and snippets.

@me-suzy
Created December 29, 2025 10:04
Show Gist options
  • Select an option

  • Save me-suzy/6d2e7f717c238de116b8b6fc56fcc08c to your computer and use it in GitHub Desktop.

Select an option

Save me-suzy/6d2e7f717c238de116b8b6fc56fcc08c to your computer and use it in GitHub Desktop.
descarca_biblioteca_TOATA.ps1
# PowerShell script that fetches ALL the PDFs hosted on biblioteca-digitala.ro.
# The Wayback Machine CDX API is used to discover ALL of the ~100,000 PDFs.
$OutputDir = "G:\biblioteca-digitala-COMPLET"

# Make sure the destination directory exists before anything is written to it.
if (-not (Test-Path $OutputDir)) {
    New-Item -ItemType Directory -Path $OutputDir | Out-Null
}

# Startup banner: four cyan lines followed by one blank line.
@(
    "============================================================",
    "Scraper pentru TOATE PDF-urile de pe biblioteca-digitala.ro",
    "Destinatie: $OutputDir",
    "============================================================"
) | ForEach-Object { Write-Host $_ -ForegroundColor Cyan }
Write-Host ""
# Step 1: collect EVERY PDF URL the Wayback Machine has indexed for the site.
# Produces $pdfUrls (sorted, de-duplicated array of https URLs) and $urlsFile
# (path of the text file the list is saved to); both are used later on.
Write-Host "[1/2] Extragere URL-uri din Wayback Machine..." -ForegroundColor Yellow
Write-Host " Aceasta poate dura cateva minute pentru ~100.000 fisiere..." -ForegroundColor Gray

# CDX query over the WHOLE biblioteca-digitala.ro domain: PDF mimetype only,
# collapsed by urlkey, only the "original" column, at most 200000 rows.
$cdxUrl = "https://web.archive.org/cdx/search/cdx?url=biblioteca-digitala.ro/*&output=json&filter=mimetype:application/pdf&collapse=urlkey&fl=original&limit=200000"

try {
    Write-Host " Interogare Wayback Machine API..." -ForegroundColor Gray
    $response = Invoke-RestMethod -Uri $cdxUrl -Method Get -TimeoutSec 600

    # A growable list is used instead of `$array += $item`, which copies the
    # whole array on every append and is quadratic for ~100,000 rows.
    $collected = New-Object 'System.Collections.Generic.List[string]'

    # The loop starts at 1 because row 0 is skipped (the CDX JSON output puts
    # the column names in the first row).
    for ($i = 1; $i -lt $response.Count; $i++) {
        $url = $response[$i][0]
        if ($url -match '\.pdf$') {
            $url = $url -replace '^http://', 'https://'
            # Captures may be recorded with an explicit ":80" port; once the
            # scheme is switched to https that port has to go, otherwise the
            # download would attempt TLS against the plain-HTTP port.
            $url = $url -replace '^(https://[^/:]+):80(?=/)', '$1'
            $collected.Add($url)
        }
    }

    # @() keeps $pdfUrls an array even when there are zero or one results.
    $pdfUrls = @($collected | Sort-Object -Unique)

    Write-Host ""
    Write-Host "GASIT: $($pdfUrls.Count) URL-uri PDF unice!" -ForegroundColor Green
    Write-Host ""

    # Save the list
    $urlsFile = Join-Path $OutputDir "TOATE_pdf_urls.txt"
    $pdfUrls | Out-File -FilePath $urlsFile -Encoding UTF8
    Write-Host "Lista salvata in: $urlsFile" -ForegroundColor Gray

    # Per-section statistics
    Write-Host ""
    Write-Host "Statistici pe sectiuni:" -ForegroundColor Yellow
    $revisteCarte = @($pdfUrls | Where-Object { $_ -match '/reviste/carte/' }).Count
    $reviste = @($pdfUrls | Where-Object { $_ -match '/reviste/' -and $_ -notmatch '/reviste/carte/' }).Count
    $altele = $pdfUrls.Count - $revisteCarte - $reviste
    Write-Host " /reviste/carte/ (carti): $revisteCarte" -ForegroundColor White
    Write-Host " /reviste/ (periodice): $reviste" -ForegroundColor White
    Write-Host " alte sectiuni: $altele" -ForegroundColor White
    Write-Host ""
} catch {
    # Any failure while querying or saving the list aborts the script.
    Write-Host "Eroare la accesarea Wayback Machine: $_" -ForegroundColor Red
    Read-Host "Apasa Enter pentru a inchide"
    exit 1
}
# Ask the user whether to go ahead with the (potentially very long) download.
foreach ($warningLine in @(
        "============================================================",
        "ATENTIE: Descarcarea a $($pdfUrls.Count) fisiere poate dura",
        " multe ore sau chiar zile!",
        "============================================================"
    )) {
    Write-Host $warningLine -ForegroundColor Yellow
}
Write-Host ""

$confirm = Read-Host "Vrei sa incepi descarcarea? (da/nu)"
$userAccepted = ($confirm -eq "da")

# Anything other than "da" cancels; the URL list saved earlier stays on disk.
if (-not $userAccepted) {
    Write-Host ""
    Write-Host "Descarcarea anulata. Lista URL-uri este salvata in:" -ForegroundColor Gray
    Write-Host "$urlsFile" -ForegroundColor White
    Write-Host ""
    Write-Host "Poti relua descarcarea mai tarziu ruland din nou scriptul." -ForegroundColor Gray
    Read-Host "Apasa Enter pentru a inchide"
    exit 0
}
# Step 2: download the PDFs, mirroring the site's folder structure under
# $OutputDir. Files that already exist locally with more than 1000 bytes are
# skipped, so the script can be re-run to resume an interrupted session.
# Sets $downloaded, $skipped, $errors, $total, $startTime and $elapsed, which
# the final summary reads.
Write-Host ""
Write-Host "[2/2] Descarcare PDF-uri in $OutputDir ..." -ForegroundColor Yellow
Write-Host ""

$downloaded = 0
$skipped = 0
$errors = 0
$total = $pdfUrls.Count
$current = 0
$startTime = Get-Date
# Initialised up front so it is never $null, even if every file is skipped.
$elapsed = [TimeSpan]::Zero
$progressFile = Join-Path $OutputDir "PROGRES.txt"

# Each file is downloaded to this scratch path and only moved to its final
# name after the transfer succeeded. That way an interrupted transfer can never
# leave a truncated file that a later run would mistake for a finished one.
$tempPath = Join-Path $OutputDir "descarcare_in_curs.part"

# The per-request progress bar is known to slow Invoke-WebRequest down
# considerably in Windows PowerShell; silence it for the loop and restore after.
$previousProgressPreference = $ProgressPreference
$ProgressPreference = 'SilentlyContinue'

foreach ($url in $pdfUrls) {
    $current++

    # Derive the relative path used to recreate the folder structure. The host
    # is matched with an optional "www." prefix and an optional port so those
    # URL variants do not end up with the whole URL as their local path.
    $relativePath = $url -replace '^https?://(www\.)?biblioteca-digitala\.ro(:\d+)?/', ''
    # System.Net.WebUtility is used instead of System.Web.HttpUtility because
    # the System.Web assembly is not loaded by default in Windows PowerShell.
    $relativePath = [System.Net.WebUtility]::UrlDecode($relativePath)
    $relativePath = $relativePath -replace '[<>:"|?*]', '_'
    # Drop empty, "." and ".." segments so a decoded path can never point
    # outside of $OutputDir.
    $relativePath = @($relativePath -split '[\\/]+' | Where-Object { $_ -and $_ -ne '.' -and $_ -ne '..' }) -join '\'

    # Full local path (keeps the folder structure)
    $localPath = Join-Path $OutputDir $relativePath
    $localDir = [System.IO.Path]::GetDirectoryName($localPath)
    $fileName = [System.IO.Path]::GetFileName($localPath)

    # Create the sub-directories if needed (no-op when they already exist).
    [System.IO.Directory]::CreateDirectory($localDir) | Out-Null

    # Skip files that are already present. -LiteralPath is required because
    # decoded names may contain "[" / "]", which -Path treats as wildcards.
    if (Test-Path -LiteralPath $localPath -PathType Leaf) {
        $size = (Get-Item -LiteralPath $localPath).Length
        if ($size -gt 1000) {
            # Count first so the message reports the current file as well.
            $skipped++
            if ($current % 100 -eq 0) {
                Write-Host "[$current/$total] Sarit $skipped existente..." -ForegroundColor DarkGray
            }
            continue
        }
    }

    # Estimate the remaining time from the average time per downloaded file.
    $elapsed = (Get-Date) - $startTime
    if ($downloaded -gt 0) {
        $avgTime = $elapsed.TotalSeconds / $downloaded
        $remaining = ($total - $current) * $avgTime
        $eta = [TimeSpan]::FromSeconds($remaining)
        # Total hours are printed explicitly: the "hh" specifier only shows the
        # hours component and would silently drop whole days.
        $etaStr = '{0:00}:{1:mm\:ss}' -f [math]::Floor($eta.TotalHours), $eta
    } else {
        $etaStr = "calculare..."
    }

    # Download
    try {
        Write-Host "[$current/$total] ETA:$etaStr - $fileName" -ForegroundColor White
        Invoke-WebRequest -Uri $url -OutFile $tempPath -TimeoutSec 120 -UseBasicParsing -ErrorAction Stop
        # Replace any too-small leftover and publish the finished file.
        if ([System.IO.File]::Exists($localPath)) {
            [System.IO.File]::Delete($localPath)
        }
        [System.IO.File]::Move($tempPath, $localPath)
        $downloaded++
        # Short pause between requests.
        Start-Sleep -Milliseconds 200
    } catch {
        Write-Host "[$current/$total] EROARE: $fileName - $($_.Exception.Message)" -ForegroundColor Red
        $errors++
        # Best-effort removal of the partial download.
        Remove-Item -LiteralPath $tempPath -Force -ErrorAction SilentlyContinue
    }

    # Save the progress every 100 files
    if ($current % 100 -eq 0) {
        "Progres: $current / $total`nDescarcat: $downloaded`nSarit: $skipped`nErori: $errors`nUltima actualizare: $(Get-Date)" | Out-File $progressFile -Encoding UTF8
    }
}

# Final wall-clock duration, valid even when nothing had to be downloaded.
$elapsed = (Get-Date) - $startTime
$ProgressPreference = $previousProgressPreference
# Final summary of the download session.
# The total time is measured here from $startTime rather than read from the
# loop's $elapsed: that variable is only assigned when a download is attempted,
# so it is $null (and .ToString() fails) when every file was skipped.
$totalElapsed = (Get-Date) - $startTime
# Total hours are printed explicitly because the "hh" specifier would drop
# whole days from a multi-day run.
$totalElapsedText = '{0:00}:{1:mm\:ss}' -f [math]::Floor($totalElapsed.TotalHours), $totalElapsed

Write-Host ""
Write-Host "============================================================" -ForegroundColor Cyan
Write-Host "REZULTAT FINAL:" -ForegroundColor Cyan
Write-Host " Total fisiere gasite: $total" -ForegroundColor White
Write-Host " Descarcat nou: $downloaded" -ForegroundColor Green
Write-Host " Existau deja: $skipped" -ForegroundColor Yellow
# Errors are highlighted in red only when there were any.
Write-Host " Erori: $errors" -ForegroundColor $(if ($errors -gt 0) {"Red"} else {"Gray"})
Write-Host " Timp total: $totalElapsedText" -ForegroundColor White
Write-Host "============================================================" -ForegroundColor Cyan
Write-Host ""
Write-Host "Fisierele sunt in: $OutputDir" -ForegroundColor Green
Write-Host ""
# Keep the console window open until the user acknowledges.
Read-Host "Apasa Enter pentru a inchide"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment