Created
December 29, 2025 17:24
-
-
Save me-suzy/ff9068307330cddc8fd7cab8833a6a00 to your computer and use it in GitHub Desktop.
CĂUTARE ÎN WAYBACK MACHINE.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import requests | |
| import sys | |
| import os | |
| from urllib.parse import quote, unquote | |
def cauta_in_wayback(url_pattern):
    """Search the Wayback Machine CDX index for captures of a URL pattern.

    Args:
        url_pattern: URL (or URL pattern) to look up. It is percent-encoded
            in full before being placed in the query string.

    Returns:
        A list of result rows with the fields requested through ``fl``
        (timestamp, original, statuscode, mimetype, length), without the
        header row of the JSON response. An empty list is returned when
        there are no rows, when the server answers with a non-200 status,
        or when the request or the JSON decoding fails.
    """
    print(f"\nCaut: {url_pattern}")
    cdx_url = (
        "https://web.archive.org/cdx/search/cdx"
        f"?url={quote(url_pattern, safe='')}"
        "&output=json&fl=timestamp,original,statuscode,mimetype,length"
    )
    try:
        response = requests.get(cdx_url, timeout=60)
        if response.status_code != 200:
            # Report the failure instead of silently treating it as
            # "no results".
            print(f" Eroare HTTP: {response.status_code}")
            return []
        if not response.text.strip():
            # Treat an empty body as "no captures" rather than reporting a
            # JSON decode error.
            return []
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures.
        print(f" Eroare: {e}")
        return []
    if not isinstance(data, list):
        return []
    # The first row is the header (field names); slicing an empty or
    # one-row list yields [].
    return data[1:]
def descarca_din_wayback(timestamp, original_url, output_filename=None):
    """Download one archived capture from the Wayback Machine to a local file.

    Args:
        timestamp: Capture timestamp, inserted into the Wayback URL.
        original_url: Original URL of the captured resource.
        output_filename: Destination path. When omitted, the name is taken
            from the last path segment of ``original_url`` (URL-decoded),
            with characters from the invalid-character set below replaced
            by underscores.

    Returns:
        True when the file was written; False on a non-200 response or on
        a network / filesystem error.
    """
    # The "if_" suffix requests the raw file, without the Wayback banner.
    wayback_url = f"https://web.archive.org/web/{timestamp}if_/{original_url}"
    print(f"\nDescarc de la:\n{wayback_url}")
    try:
        # A streamed response keeps its connection open until closed, so it
        # is used as a context manager.
        with requests.get(wayback_url, timeout=300, stream=True) as response:
            if response.status_code != 200:
                print(f"Eroare HTTP: {response.status_code}")
                return False

            if not output_filename:
                output_filename = unquote(original_url.split("/")[-1])
                # Strip characters that are invalid in file names.
                output_filename = "".join(
                    '_' if c in '<>:"/\\|?*' else c for c in output_filename
                )
                if not output_filename:
                    # The URL ended with "/": there is no segment to use as
                    # a name, so fall back to one built from the timestamp.
                    output_filename = f"wayback_{timestamp}"

            # A missing or malformed Content-Length only disables the
            # progress display; it must not abort the download.
            length_header = str(response.headers.get('content-length', ''))
            total_size = int(length_header) if length_header.isdigit() else 0

            downloaded = 0
            with open(output_filename, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if not chunk:
                        continue
                    f.write(chunk)
                    downloaded += len(chunk)
                    if total_size > 0:
                        pct = (downloaded / total_size) * 100
                        print(f"\r Progres: {pct:.1f}% ({downloaded:,} / {total_size:,} bytes)", end='')

        print(f"\n\n✓ SALVAT: {output_filename}")
        print(f" Dimensiune: {os.path.getsize(output_filename):,} bytes")
        return True
    except (requests.RequestException, OSError) as e:
        print(f"Eroare la descărcare: {e}")
        return False
def _alege_snapshoturi_unice(all_snapshots):
    """Reduce raw CDX rows to one capture per original URL, newest first.

    For each original URL a capture with status "200" is preferred over any
    other status; among captures of the same preference the newest
    timestamp wins. Rows with fewer than two fields are skipped. Missing
    trailing fields default to status "200", empty mimetype and size "0".

    Returns:
        A list of ``(timestamp, original, status, mimetype, size)`` tuples
        sorted by timestamp, newest first.
    """
    def rank(entry):
        # Tuples compare element-wise: successful captures first, then the
        # newest timestamp (14-digit strings sort chronologically).
        return (entry[2] == "200", entry[0])

    best_by_url = {}
    for snap in all_snapshots:
        if len(snap) < 2:
            continue
        candidate = (
            snap[0],
            snap[1],
            snap[2] if len(snap) > 2 else "200",
            snap[3] if len(snap) > 3 else "",
            snap[4] if len(snap) > 4 else "0",
        )
        current = best_by_url.get(candidate[1])
        if current is None or rank(candidate) > rank(current):
            best_by_url[candidate[1]] = candidate
    return sorted(best_by_url.values(), key=lambda s: s[0], reverse=True)


def recupereaza_de_pe_archive_org(item_name, filename=None):
    """Recover a file deleted from archive.org through the Wayback Machine.

    Searches the Wayback Machine for captures of several archive.org URL
    patterns built from the item name, lists what was found, and downloads
    the most recent PDF capture that has status 200.

    Args:
        item_name: Name of the archive.org item
            (e.g. ``matei-horia-enciclopedia-statelor-lumii``).
        filename: Optional file name inside the item
            (e.g. ``Matei Horia Enciclopedia statelor lumii.pdf``).

    Returns:
        The result of the download (True on success) when a PDF capture
        was found; False when nothing was found or no PDF capture with
        status 200 exists.
    """
    print("=" * 70)
    print("RECUPERARE FIȘIER DIN WAYBACK MACHINE")
    print("=" * 70)
    print(f"\nItem: {item_name}")
    if filename:
        print(f"Fișier: {filename}")

    # Candidate archive.org URL patterns for the item.
    # NOTE(review): the host patterns with "*" in the middle may not be
    # supported by the CDX API — confirm that they actually match anything.
    url_patterns = [
        f"archive.org/details/{item_name}*",
        f"archive.org/download/{item_name}*",
        f"ia800*.us.archive.org/*/{item_name}*",
        f"ia600*.us.archive.org/*/{item_name}*",
    ]
    if filename:
        encoded_filename = quote(filename)
        url_patterns.extend([
            f"archive.org/download/{item_name}/{encoded_filename}",
            f"archive.org/download/{item_name}/{filename}",
        ])
    # The encoded and raw file-name patterns are identical when the name
    # needs no escaping; drop duplicates (order preserved) to avoid a
    # redundant query.
    url_patterns = list(dict.fromkeys(url_patterns))

    all_snapshots = []
    print("\n" + "-" * 70)
    print("CĂUTARE ÎN WAYBACK MACHINE...")
    print("-" * 70)
    for pattern in url_patterns:
        results = cauta_in_wayback(pattern)
        if results:
            print(f" → Găsite {len(results)} rezultate")
            all_snapshots.extend(results)

    if not all_snapshots:
        print("\n❌ Nu s-a găsit nimic în Wayback Machine.")
        print("\nÎncearcă să cauți manual:")
        print(f" https://web.archive.org/web/*/archive.org/download/{item_name}/*")
        return False

    # De-duplicate per URL. A newer non-200 capture must not hide an older
    # successful capture of the same URL, otherwise the status filter below
    # would discard the URL entirely.
    snapshots = _alege_snapshoturi_unice(all_snapshots)

    # Keep only PDFs with status 200.
    pdf_snapshots = [
        s for s in snapshots
        if s[2] == "200" and ('.pdf' in s[1].lower() or 'pdf' in s[3].lower())
    ]

    print("\n" + "-" * 70)
    print(f"REZULTATE: {len(snapshots)} snapshot-uri totale, {len(pdf_snapshots)} PDF-uri valide")
    print("-" * 70)

    if pdf_snapshots:
        print("\nPDF-uri găsite:")
        for i, (ts, orig, status, mime, size) in enumerate(pdf_snapshots[:20], 1):
            date_str = f"{ts[:4]}-{ts[4:6]}-{ts[6:8]}"
            fname = unquote(orig.split("/")[-1])[:50]
            size_mb = f"{int(size)/1024/1024:.1f}MB" if size.isdigit() and int(size) > 0 else "?"
            print(f" [{i}] {date_str} | {size_mb:>8} | {fname}")
        # Download the first (most recent) one.
        print("\n" + "-" * 70)
        ts, orig, _, _, _ = pdf_snapshots[0]
        print(f"Descarc cel mai recent snapshot ({ts[:4]}-{ts[4:6]}-{ts[6:8]})...")
        return descarca_din_wayback(ts, orig)

    # No usable PDF: show everything that was found.
    print("\nToate snapshot-urile găsite:")
    for i, (ts, orig, status, mime, size) in enumerate(snapshots[:30], 1):
        date_str = f"{ts[:4]}-{ts[4:6]}-{ts[6:8]}"
        print(f" [{i}] {date_str} | Status:{status} | {mime} | {orig[:60]}")
    print("\n⚠ Nu s-au găsit PDF-uri cu status 200.")
    print("Verifică manual URL-urile de mai sus.")
    return False
if __name__ == "__main__":
    # --------------------------------------------
    # CONFIGURATION - edit the values below:
    # --------------------------------------------
    ITEM_NAME = "matei-horia-enciclopedia-statelor-lumii"
    FILE_NAME = "Matei Horia Enciclopedia statelor lumii.pdf"
    # Alternative, if the item name itself ended in .pdf:
    # ITEM_NAME = "matei-horia-enciclopedia-statelor-lumii.pdf"
    # FILE_NAME = None
    # --------------------------------------------

    recupereaza_de_pe_archive_org(item_name=ITEM_NAME, filename=FILE_NAME)

    # Closing separator, then wait so a double-clicked console window
    # stays open until the user presses Enter.
    separator = "=" * 70
    print()
    print(separator)
    input("Apasă Enter pentru a închide...")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment