Created
December 29, 2025 09:41
-
-
Save me-suzy/13fb68265e5bb15770da5225981a8442 to your computer and use it in GitHub Desktop.
Download biblioteca.sh
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Download the PDFs published under biblioteca-digitala.ro/reviste/carte/.
#
# The list of PDF URLs is obtained from the Wayback Machine CDX API; each
# file is then fetched with curl from the URL recorded there (rewritten to
# https) and stored under the output directory, mirroring the remote
# directory layout.
#
# Usage:    biblioteca.sh [OUTPUT_DIR]
#   OUTPUT_DIR  destination directory (default: biblioteca_digitala_pdfs)
# Requires: curl, jq. python3 is optional (used to percent-decode paths).
# Exit:     1 when a required tool is missing, the output directory cannot
#           be created, or no URLs could be obtained; 0 otherwise (per-file
#           download failures are only reported in the final summary).

readonly OUTPUT_DIR="${1:-biblioteca_digitala_pdfs}"
readonly URLS_FILE="$OUTPUT_DIR/pdf_urls.txt"
readonly CDX_URL="https://web.archive.org/cdx/search/cdx?url=biblioteca-digitala.ro/reviste/carte/*&output=json&filter=mimetype:application/pdf&collapse=urlkey&fl=original"
# An existing local file is only considered complete when larger than this.
readonly MIN_VALID_BYTES=1000

#######################################
# Percent-decode a URL path.
# Arguments: $1 - encoded path
# Outputs:   decoded path on stdout; the input unchanged when python3 is
#            unavailable or fails
#######################################
decode_path() {
  # The value is passed as an argument, never spliced into the Python source:
  # the URLs come from an external index and may contain quotes.
  python3 -c 'import sys, urllib.parse; print(urllib.parse.unquote(sys.argv[1]))' \
    "$1" 2>/dev/null || printf '%s\n' "$1"
}

#######################################
# Map a PDF URL to its path relative to the output directory.
# Arguments: $1 - full URL
# Outputs:   decoded relative path on stdout
#######################################
url_to_relative_path() {
  local rel=$1
  rel=${rel#*://}            # drop the scheme
  rel=${rel#*/}              # drop the host (handles www. and :port variants)
  rel=${rel#reviste/carte/}  # drop the section prefix
  decode_path "$rel"
}

#######################################
# Query the CDX API and write the sorted, de-duplicated PDF URLs.
# Globals:   CDX_URL (read), URLS_FILE (written)
#######################################
fetch_url_list() {
  # --fail keeps HTTP error pages away from jq; the header row returned by
  # the API is discarded by the .pdf filter.
  curl -fsS "$CDX_URL" \
    | jq -r '.[] | if type == "array" then .[0] else empty end' \
    | grep -E '\.pdf$' \
    | sed 's|^http://|https://|' \
    | sort -u > "$URLS_FILE"
}

main() {
  local cmd
  for cmd in curl jq; do
    if ! command -v "$cmd" >/dev/null 2>&1; then
      printf 'Eroare: comanda necesară "%s" nu este instalată\n' "$cmd" >&2
      exit 1
    fi
  done

  echo "============================================================"
  echo "Scraper pentru biblioteca-digitala.ro/reviste/carte/"
  echo "============================================================"
  echo ""

  # Create the output directory
  if ! mkdir -p -- "$OUTPUT_DIR"; then
    printf 'Eroare: nu se poate crea directorul %s\n' "$OUTPUT_DIR" >&2
    exit 1
  fi

  # Step 1: extract the URLs from the Wayback Machine CDX API
  echo "[1/2] Extragere URL-uri din Wayback Machine..."
  fetch_url_list

  if [[ ! -s "$URLS_FILE" ]]; then
    {
      echo "Eroare: Nu s-au găsit URL-uri sau problema cu Wayback Machine"
      echo ""
      echo "Încercați manual în browser:"
      printf '%s\n' "$CDX_URL"
    } >&2
    exit 1
  fi

  local count
  count=$(wc -l < "$URLS_FILE")
  count=$((count))  # strips the padding BSD wc puts around the number
  echo "Găsite $count URL-uri PDF unice"
  echo "Lista salvată în: $URLS_FILE"
  echo ""

  # Step 2: download the PDFs
  echo "[2/2] Descărcare PDF-uri..."
  echo ""

  local downloaded=0 skipped=0 errors=0
  local url relative_path local_path filename tmp_path size

  while IFS= read -r url; do
    relative_path=$(url_to_relative_path "$url")

    # A decoded ".." component could place the file outside OUTPUT_DIR.
    case "/$relative_path/" in
      */../*)
        printf ' Eroare: cale nesigură, ignorat: %s\n' "$url" >&2
        errors=$((errors + 1))
        continue
        ;;
    esac

    local_path="$OUTPUT_DIR/$relative_path"
    filename=${local_path##*/}
    mkdir -p -- "${local_path%/*}"

    # Skip files that are already present and plausibly complete
    if [[ -f "$local_path" ]]; then
      size=$(wc -c < "$local_path")
      if (( size > MIN_VALID_BYTES )); then
        printf ' Există: %s\n' "$filename"
        skipped=$((skipped + 1))
        continue
      fi
    fi

    printf ' Descărcare: %s\n' "$filename"
    # Download to a temporary name so an interrupted transfer is never
    # mistaken for a finished file on the next run; --fail turns HTTP
    # errors into a failure instead of saving the error page as a PDF.
    tmp_path="$local_path.part"
    if curl -fsS -L -o "$tmp_path" "$url" \
      && mv -f -- "$tmp_path" "$local_path"; then
      downloaded=$((downloaded + 1))
    else
      rm -f -- "$tmp_path"
      printf ' Eroare: %s\n' "$filename" >&2
      errors=$((errors + 1))
    fi

    sleep 0.5  # Rate limiting
  done < "$URLS_FILE"

  echo ""
  echo "============================================================"
  echo "REZULTAT:"
  echo " Descărcate nou: $downloaded"
  echo " Existau deja: $skipped"
  echo " Erori: $errors"
  echo "============================================================"
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment