Skip to content

Instantly share code, notes, and snippets.

@me-suzy
Created December 29, 2025 17:24
Show Gist options
  • Select an option

  • Save me-suzy/ff9068307330cddc8fd7cab8833a6a00 to your computer and use it in GitHub Desktop.

Select an option

Save me-suzy/ff9068307330cddc8fd7cab8833a6a00 to your computer and use it in GitHub Desktop.
CĂUTARE ÎN WAYBACK MACHINE.py
import requests
import sys
import os
from urllib.parse import quote, unquote
def cauta_in_wayback(url_pattern):
    """Search the Wayback Machine CDX index for snapshots of a URL pattern.

    Args:
        url_pattern: URL or wildcard pattern sent as the ``url`` query
            parameter. It is fully percent-encoded before being sent.

    Returns:
        The rows of the JSON response with the first (header) row removed.
        The request asks for the fields timestamp, original, statuscode,
        mimetype and length. An empty list is returned when there are no
        rows, when the server answers with a non-200 status, or when the
        request or the JSON decoding fails.
    """
    print(f"\nCaut: {url_pattern}")
    cdx_url = (
        "https://web.archive.org/cdx/search/cdx"
        f"?url={quote(url_pattern, safe='')}"
        "&output=json&fl=timestamp,original,statuscode,mimetype,length"
    )
    try:
        response = requests.get(cdx_url, timeout=60)
        if response.status_code != 200:
            # Report the failure instead of silently looking like "no results".
            print(f" Eroare HTTP: {response.status_code}")
            return []
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # Network failures and undecodable JSON bodies end up here.
        print(f" Eroare: {e}")
        return []
    if not isinstance(data, list):
        return []
    # Skip the header row; slicing an empty or one-row list yields [].
    return data[1:]
def descarca_din_wayback(timestamp, original_url, output_filename=None):
    """Download one archived file from the Wayback Machine to local disk.

    Args:
        timestamp: Snapshot timestamp inserted into the Wayback URL.
        original_url: Original URL of the archived resource.
        output_filename: Destination path. When omitted, the name is derived
            from the percent-decoded last path segment of ``original_url``,
            with characters that are invalid in file names replaced by
            ``_``. If that leaves nothing (the URL ends with ``/``), a name
            based on the timestamp is used.

    Returns:
        True when the server answered 200 and the body was written to disk;
        False on any other HTTP status or on a request or file error.
    """
    # The "if_" suffix asks for the raw file, without the Wayback banner.
    wayback_url = f"https://web.archive.org/web/{timestamp}if_/{original_url}"
    print(f"\nDescarc de la:\n{wayback_url}")
    try:
        # The context manager releases the streamed connection on every path.
        with requests.get(wayback_url, timeout=300, stream=True) as response:
            if response.status_code != 200:
                print(f"Eroare HTTP: {response.status_code}")
                return False

            if not output_filename:
                output_filename = unquote(original_url.split("/")[-1])
                # Replace characters that are invalid in file names.
                output_filename = "".join(
                    "_" if c in '<>:"/\\|?*' else c for c in output_filename
                )
                if not output_filename:
                    # The URL ended with "/", so there is no segment to use.
                    output_filename = f"wayback_{timestamp}"

            try:
                total_size = int(response.headers.get('content-length', 0))
            except ValueError:
                # Unparseable header: download anyway, just without progress.
                total_size = 0

            downloaded = 0
            with open(output_filename, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
                    downloaded += len(chunk)
                    if total_size > 0:
                        pct = (downloaded / total_size) * 100
                        print(f"\r Progres: {pct:.1f}% ({downloaded:,} / {total_size:,} bytes)", end='')

            print(f"\n\n✓ SALVAT: {output_filename}")
            print(f" Dimensiune: {os.path.getsize(output_filename):,} bytes")
            return True
    except (requests.RequestException, OSError) as e:
        print(f"Eroare la descărcare: {e}")
        return False
def _format_snapshot_date(timestamp):
    """Format the first eight characters of a timestamp as YYYY-MM-DD."""
    return f"{timestamp[:4]}-{timestamp[4:6]}-{timestamp[6:8]}"


def _latest_snapshot_per_url(rows):
    """Keep the newest row for each original URL, sorted newest first.

    Rows with fewer than two fields are skipped. Missing trailing fields get
    the defaults "200", "" and "0"; a null mimetype or size is normalised to
    "" / "0" so later string operations cannot fail.
    """
    latest = {}
    for row in rows:
        if len(row) < 2:
            continue
        timestamp, original = row[0], row[1]
        status = row[2] if len(row) > 2 else "200"
        mimetype = row[3] if len(row) > 3 else ""
        size = row[4] if len(row) > 4 else "0"
        current = latest.get(original)
        if current is None or timestamp > current[0]:
            latest[original] = (timestamp, original, status, mimetype or "", size or "0")
    return sorted(latest.values(), key=lambda snap: snap[0], reverse=True)


def recupereaza_de_pe_archive_org(item_name, filename=None):
    """Recover a deleted archive.org file through the Wayback Machine.

    Queries several URL patterns built from the item name (and the file name
    when given), keeps the newest snapshot per URL, lists the snapshots that
    have status "200" and look like PDFs, and downloads the most recent one.

    Args:
        item_name: Name of the archive.org item
            (e.g. ``matei-horia-enciclopedia-statelor-lumii``).
        filename: Optional file name inside the item
            (e.g. ``Matei Horia Enciclopedia statelor lumii.pdf``).

    Returns:
        The result of ``descarca_din_wayback`` for the most recent PDF
        snapshot, or False when no snapshot or no valid PDF was found.
    """
    print("=" * 70)
    print("RECUPERARE FIȘIER DIN WAYBACK MACHINE")
    print("=" * 70)
    print(f"\nItem: {item_name}")
    if filename:
        print(f"Fișier: {filename}")

    # Candidate archive.org URL patterns for the item.
    url_patterns = [
        f"archive.org/details/{item_name}*",
        f"archive.org/download/{item_name}*",
        f"ia800*.us.archive.org/*/{item_name}*",
        f"ia600*.us.archive.org/*/{item_name}*",
    ]
    if filename:
        encoded_filename = quote(filename)
        url_patterns.extend([
            f"archive.org/download/{item_name}/{encoded_filename}",
            f"archive.org/download/{item_name}/{filename}",
        ])
    # When quoting leaves the file name unchanged, the last two patterns are
    # identical; drop repeats (order preserved) to avoid a redundant query.
    url_patterns = list(dict.fromkeys(url_patterns))

    all_snapshots = []
    print("\n" + "-" * 70)
    print("CĂUTARE ÎN WAYBACK MACHINE...")
    print("-" * 70)
    for pattern in url_patterns:
        results = cauta_in_wayback(pattern)
        if results:
            print(f" → Găsite {len(results)} rezultate")
            all_snapshots.extend(results)

    if not all_snapshots:
        print("\n❌ Nu s-a găsit nimic în Wayback Machine.")
        print("\nÎncearcă să cauți manual:")
        print(f" https://web.archive.org/web/*/archive.org/download/{item_name}/*")
        return False

    # De-duplicate by original URL and sort newest first.
    snapshots = _latest_snapshot_per_url(all_snapshots)

    # Keep only status-200 entries whose URL or mimetype mentions PDF.
    pdf_snapshots = [
        s for s in snapshots
        if s[2] == "200" and ('.pdf' in s[1].lower() or 'pdf' in s[3].lower())
    ]
    print("\n" + "-" * 70)
    print(f"REZULTATE: {len(snapshots)} snapshot-uri totale, {len(pdf_snapshots)} PDF-uri valide")
    print("-" * 70)

    if not pdf_snapshots:
        # Nothing downloadable: show everything found for manual inspection.
        print("\nToate snapshot-urile găsite:")
        for i, (ts, orig, status, mime, _) in enumerate(snapshots[:30], 1):
            date_str = _format_snapshot_date(ts)
            print(f" [{i}] {date_str} | Status:{status} | {mime} | {orig[:60]}")
        print("\n⚠ Nu s-au găsit PDF-uri cu status 200.")
        print("Verifică manual URL-urile de mai sus.")
        return False

    print("\nPDF-uri găsite:")
    for i, (ts, orig, _, _, size) in enumerate(pdf_snapshots[:20], 1):
        date_str = _format_snapshot_date(ts)
        fname = unquote(orig.split("/")[-1])[:50]
        size_text = str(size)
        size_mb = f"{int(size_text)/1024/1024:.1f}MB" if size_text.isdigit() and int(size_text) > 0 else "?"
        print(f" [{i}] {date_str} | {size_mb:>8} | {fname}")

    # Download the first entry, which is the most recent one.
    print("\n" + "-" * 70)
    ts, orig, _, _, _ = pdf_snapshots[0]
    print(f"Descarc cel mai recent snapshot ({_format_snapshot_date(ts)})...")
    return descarca_din_wayback(ts, orig)
if __name__ == "__main__":
    # ============================================
    # CONFIGURATION - edit the values below:
    # ============================================
    ITEM_NAME = "matei-horia-enciclopedia-statelor-lumii"
    FILE_NAME = "Matei Horia Enciclopedia statelor lumii.pdf"
    # Or the variant with .pdf in the item name, if that is how it was:
    # ITEM_NAME = "matei-horia-enciclopedia-statelor-lumii.pdf"
    # FILE_NAME = None
    # ============================================
    succeeded = recupereaza_de_pe_archive_org(ITEM_NAME, FILE_NAME)
    print("\n" + "=" * 70)
    try:
        input("Apasă Enter pentru a închide...")
    except EOFError:
        # No interactive stdin (piped or scheduled run): nothing to wait for.
        pass
    # Report the outcome to the caller instead of always exiting with 0.
    sys.exit(0 if succeeded else 1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment