Skip to content

Instantly share code, notes, and snippets.

@me-suzy
Created December 29, 2025 09:41
Show Gist options
  • Select an option

  • Save me-suzy/13fb68265e5bb15770da5225981a8442 to your computer and use it in GitHub Desktop.

Select an option

Save me-suzy/13fb68265e5bb15770da5225981a8442 to your computer and use it in GitHub Desktop.
Download biblioteca.sh
#!/bin/bash
# Download PDFs from biblioteca-digitala.ro/reviste/carte/
# using the Wayback Machine CDX API + curl.
#
# Usage: biblioteca.sh [output_dir]   (default: biblioteca_digitala_pdfs)

# -u: error on unset variables; -o pipefail: a pipeline fails if any stage fails.
# -e is deliberately omitted: the download loop below tolerates per-file failures.
set -uo pipefail

OUTPUT_DIR="${1:-biblioteca_digitala_pdfs}"
URLS_FILE="$OUTPUT_DIR/pdf_urls.txt"
MAX_PARALLEL=3  # reserved for future parallel downloads — currently unused

echo "============================================================"
echo "Scraper pentru biblioteca-digitala.ro/reviste/carte/"
echo "============================================================"
echo ""

# Create the output directory; abort early if that is impossible,
# otherwise every later download would fail one by one.
mkdir -p "$OUTPUT_DIR" || { echo "Eroare: nu pot crea $OUTPUT_DIR" >&2; exit 1; }
# Step 1: collect PDF URLs from the Wayback Machine CDX API.
echo "[1/2] Extragere URL-uri din Wayback Machine..."

# CDX query: every capture under .../reviste/carte/, PDF mimetype only,
# collapsed to one row per unique URL, returning only the original-URL column.
CDX_URL="https://web.archive.org/cdx/search/cdx?url=biblioteca-digitala.ro/reviste/carte/*&output=json&filter=mimetype:application/pdf&collapse=urlkey&fl=original"

# Fetch the JSON list and extract the URLs.
# -f: fail on HTTP errors instead of piping an HTML error page into jq.
# The first JSON row is the header (["original"]); the grep filter below
# drops it, since the literal string "original" does not end in ".pdf".
curl -sf "$CDX_URL" | \
  jq -r '.[] | if type == "array" then .[0] else empty end' | \
  grep -E '\.pdf$' | \
  sed 's|^http://|https://|' | \
  sort -u > "$URLS_FILE"

if [ ! -s "$URLS_FILE" ]; then
  echo "Eroare: Nu s-au găsit URL-uri sau problema cu Wayback Machine"
  echo ""
  echo "Încercați manual în browser:"
  echo "$CDX_URL"
  exit 1
fi
# Report how many unique PDF URLs were collected and where the list lives.
COUNT=$(wc -l < "$URLS_FILE")
printf '%s\n' "Găsite $COUNT URL-uri PDF unice" "Lista salvată în: $URLS_FILE" ""

# Step 2: download the PDFs.
printf '%s\n' "[2/2] Descărcare PDF-uri..." ""

# Per-run counters for the final summary.
downloaded=0 skipped=0 errors=0
# Step 2 loop: download each URL, skipping files already present locally.
while IFS= read -r url; do
  # Path of the PDF relative to .../reviste/carte/ on the site.
  relative_path=${url#https://biblioteca-digitala.ro/reviste/carte/}

  # Percent-decode the path (e.g. %20 -> space). Pass it as an argv
  # argument instead of interpolating it into the Python source, which
  # broke (and allowed code injection) on any URL containing a quote.
  # Fall back to the raw path if python3 is unavailable.
  relative_path=$(python3 -c 'import sys, urllib.parse; print(urllib.parse.unquote(sys.argv[1]))' "$relative_path" 2>/dev/null || echo "$relative_path")

  # Local destination, mirroring the site's directory layout.
  local_path="$OUTPUT_DIR/$relative_path"
  local_dir=$(dirname "$local_path")
  mkdir -p "$local_dir"

  # Skip files already present and plausibly complete (>1000 bytes).
  # stat -f%z is BSD/macOS, stat -c%s is GNU; default to 0 if both fail
  # so the quoted numeric test below never sees an empty operand.
  size=$(stat -f%z "$local_path" 2>/dev/null || stat -c%s "$local_path" 2>/dev/null || echo 0)
  if [ -f "$local_path" ] && [ "$size" -gt 1000 ]; then
    echo " Există: $(basename "$local_path")"
    skipped=$((skipped + 1))
    continue
  fi

  filename=$(basename "$local_path")
  echo " Descărcare: $filename"
  # -f: treat HTTP errors (404/500) as failures instead of saving the
  # error page as a "PDF"; -L: follow redirects.
  if curl -sfL -o "$local_path" "$url"; then
    downloaded=$((downloaded + 1))
  else
    echo " Eroare: $filename"
    errors=$((errors + 1))
    # Remove any partial/empty file curl left behind, so the size check
    # above retries it on the next run instead of treating it as done.
    rm -f -- "$local_path"
  fi
  sleep 0.5 # rate limiting — be polite to the server
done < "$URLS_FILE"
# Final summary of this run.
printf '\n'
printf '%s\n' "============================================================"
printf '%s\n' "REZULTAT:"
printf '%s\n' " Descărcate nou: $downloaded"
printf '%s\n' " Existau deja: $skipped"
printf '%s\n' " Erori: $errors"
printf '%s\n' "============================================================"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment