Created
December 29, 2025 09:40
-
-
Save me-suzy/743098841bd65233699b2239769d98a6 to your computer and use it in GitHub Desktop.
6899.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Scraper pentru descărcarea PDF-urilor de pe biblioteca-digitala.ro/reviste/carte/ | |
| Versiune îmbunătățită cu URL-uri pre-descoperite și suport Wayback Machine. | |
| METODĂ RECOMANDATĂ: | |
| 1. Rulează mai întâi: python3 scraper.py --wayback | |
| Aceasta va extrage toate URL-urile din Wayback Machine CDX API | |
| 2. Apoi descarcă: python3 scraper.py --download | |
| """ | |
| import requests | |
| from bs4 import BeautifulSoup | |
| import os | |
| import re | |
| import time | |
| import json | |
| from urllib.parse import urljoin, unquote, quote | |
| from concurrent.futures import ThreadPoolExecutor, as_completed | |
| import logging | |
| import argparse | |
| # Configurare logging | |
| logging.basicConfig( | |
| level=logging.INFO, | |
| format='%(asctime)s - %(levelname)s - %(message)s' | |
| ) | |
| logger = logging.getLogger(__name__) | |
| # Configurări | |
| BASE_URL = "https://biblioteca-digitala.ro/reviste/carte/" | |
| OUTPUT_DIR = "biblioteca_digitala_pdfs" | |
| HEADERS = { | |
| 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', | |
| 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', | |
| 'Accept-Language': 'ro-RO,ro;q=0.9,en;q=0.8', | |
| } | |
| # URL-uri PDF cunoscute (din căutări anterioare) | |
| KNOWN_PDF_URLS = [ | |
| "https://biblioteca-digitala.ro/reviste/carte/Crisan_burebista-si-epoca-sa_1977.pdf", | |
| "https://biblioteca-digitala.ro/reviste/carte/ISISP/august-1944-mai-1945_1969.pdf", | |
| "https://biblioteca-digitala.ro/reviste/carte/Institutul-Sociologie/Dictionar-sociologie-rurala_2005.pdf", | |
| "https://biblioteca-digitala.ro/reviste/carte/gaius_institutiunile-dreptului-privat-roman_1982.pdf", | |
| "https://biblioteca-digitala.ro/reviste/carte/curtea-domneasca-targoviste/Gavrila-Ion_Ioan-AL-Bratescu–Voinesti-2004.pdf", | |
| "https://biblioteca-digitala.ro/reviste/carte/turlea-petre_Carol-II-si-Camarila-Regala_2010.pdf", | |
| "https://biblioteca-digitala.ro/reviste/carte/Platon_opere_V_1986.pdf", | |
| "https://biblioteca-digitala.ro/reviste/carte/ISSEE/Dutu_Dimensiunea-umana-a-istoriei_1986.pdf", | |
| "https://biblioteca-digitala.ro/reviste/carte/nicolae-titulescu-documente-diplomatice_1967.pdf", | |
| "https://biblioteca-digitala.ro/reviste/carte/Terentiu-seneca_eunucul-medeea.pdf", | |
| "https://biblioteca-digitala.ro/reviste/carte/Institutul-Studii-Banatice-Titu-Maiorescu/TODORAN_Scrieri_vol_III_2018.pdf", | |
| "https://biblioteca-digitala.ro/reviste/carte/Nicolae-Constantinescu-Citite-de-mine-Folclor-Etnologie-Antropologie.pdf", | |
| "https://biblioteca-digitala.ro/reviste/carte/institutul-calinescu/Cronologia-vietii-literare/01_Cronologia-vietii-literare-romanesti_perioada-postcomunista_I_1990.pdf", | |
| "https://biblioteca-digitala.ro/reviste/carte/satu-mare/Istoria-colegiului-National-Mihai-Eminescu-Satu-Mare_2021.pdf", | |
| ] | |
| def get_wayback_cdx_urls(): | |
| """ | |
| Extrage TOATE URL-urile PDF din Wayback Machine CDX API. | |
| Aceasta este cea mai completă metodă! | |
| """ | |
| pdf_urls = set() | |
| # CDX API query pentru toate PDF-urile din directorul carte | |
| cdx_url = ( | |
| "https://web.archive.org/cdx/search/cdx" | |
| "?url=biblioteca-digitala.ro/reviste/carte/*" | |
| "&output=json" | |
| "&filter=mimetype:application/pdf" | |
| "&collapse=urlkey" | |
| "&fl=original,timestamp,statuscode" | |
| "&limit=10000" | |
| ) | |
| logger.info("Interogare Wayback Machine CDX API...") | |
| logger.info(f"URL: {cdx_url}") | |
| try: | |
| response = requests.get(cdx_url, headers=HEADERS, timeout=120) | |
| if response.status_code == 200: | |
| data = response.json() | |
| if len(data) > 1: # Prima linie e header | |
| logger.info(f"Găsite {len(data) - 1} intrări în Wayback Machine") | |
| for row in data[1:]: # Skip header | |
| original_url = row[0] | |
| status = row[2] if len(row) > 2 else "200" | |
| if status == "200" and original_url.endswith('.pdf'): | |
| # Normalizează URL-ul | |
| if original_url.startswith('http://'): | |
| original_url = original_url.replace('http://', 'https://') | |
| pdf_urls.add(original_url) | |
| logger.info(f"URL-uri PDF unice: {len(pdf_urls)}") | |
| else: | |
| logger.warning("Nu s-au găsit rezultate în Wayback Machine") | |
| else: | |
| logger.error(f"Eroare HTTP {response.status_code}") | |
| except Exception as e: | |
| logger.error(f"Eroare Wayback Machine: {e}") | |
| return pdf_urls | |
| def get_common_crawl_urls(): | |
| """ | |
| Extrage URL-uri din Common Crawl Index. | |
| """ | |
| pdf_urls = set() | |
| # Încearcă mai multe indexuri Common Crawl | |
| cc_indexes = [ | |
| "CC-MAIN-2024-51", | |
| "CC-MAIN-2024-46", | |
| "CC-MAIN-2024-42", | |
| "CC-MAIN-2024-38", | |
| "CC-MAIN-2024-33", | |
| "CC-MAIN-2024-26", | |
| "CC-MAIN-2024-22", | |
| "CC-MAIN-2024-18", | |
| "CC-MAIN-2024-10", | |
| ] | |
| for index in cc_indexes: | |
| try: | |
| cc_url = f"https://index.commoncrawl.org/{index}-index?url=biblioteca-digitala.ro/reviste/carte/*&output=json&limit=1000" | |
| logger.info(f"Verificare Common Crawl index: {index}") | |
| response = requests.get(cc_url, headers=HEADERS, timeout=60) | |
| if response.status_code == 200: | |
| for line in response.text.strip().split('\n'): | |
| try: | |
| data = json.loads(line) | |
| url = data.get('url', '') | |
| if url.endswith('.pdf'): | |
| if url.startswith('http://'): | |
| url = url.replace('http://', 'https://') | |
| pdf_urls.add(url) | |
| except: | |
| pass | |
| time.sleep(0.5) # Rate limiting | |
| except Exception as e: | |
| logger.warning(f"Eroare Common Crawl {index}: {e}") | |
| logger.info(f"URL-uri din Common Crawl: {len(pdf_urls)}") | |
| return pdf_urls | |
| def search_duckduckgo(query): | |
| """Caută pe DuckDuckGo.""" | |
| pdf_urls = set() | |
| try: | |
| url = f"https://html.duckduckgo.com/html/?q={quote(query)}" | |
| response = requests.get(url, headers=HEADERS, timeout=30) | |
| soup = BeautifulSoup(response.text, 'html.parser') | |
| for link in soup.find_all('a', class_='result__a'): | |
| href = link.get('href', '') | |
| if 'biblioteca-digitala.ro/reviste/carte/' in href and '.pdf' in href: | |
| if 'uddg=' in href: | |
| real_url = unquote(href.split('uddg=')[-1]) | |
| else: | |
| real_url = href | |
| pdf_urls.add(real_url) | |
| except Exception as e: | |
| logger.warning(f"Eroare DuckDuckGo: {e}") | |
| return pdf_urls | |
| def search_with_terms(): | |
| """Căutări cu termeni variați.""" | |
| pdf_urls = set() | |
| search_terms = [ | |
| 'site:biblioteca-digitala.ro/reviste/carte/ filetype:pdf', | |
| 'site:biblioteca-digitala.ro inurl:/reviste/carte/ .pdf', | |
| '"biblioteca-digitala.ro" "reviste/carte" pdf', | |
| ] | |
| for term in search_terms: | |
| logger.info(f"Căutare: {term}") | |
| pdf_urls.update(search_duckduckgo(term)) | |
| time.sleep(2) | |
| return pdf_urls | |
| def download_pdf(url, output_dir, session=None): | |
| """Descarcă un PDF.""" | |
| if session is None: | |
| session = requests.Session() | |
| try: | |
| # Extrage numele fișierului | |
| filename = unquote(url.split('/')[-1]) | |
| if not filename.endswith('.pdf'): | |
| filename += '.pdf' | |
| # Sanitizează numele | |
| filename = re.sub(r'[<>:"/\\|?*]', '_', filename) | |
| # Crează subdirectoare | |
| path_parts = url.replace('https://biblioteca-digitala.ro/reviste/carte/', '').split('/') | |
| if len(path_parts) > 1: | |
| subdir = os.path.join(output_dir, *path_parts[:-1]) | |
| os.makedirs(subdir, exist_ok=True) | |
| filepath = os.path.join(subdir, filename) | |
| else: | |
| filepath = os.path.join(output_dir, filename) | |
| # Verifică dacă există | |
| if os.path.exists(filepath): | |
| size = os.path.getsize(filepath) | |
| if size > 1000: # Mai mult de 1KB | |
| logger.info(f"Există deja: {filename} ({size} bytes)") | |
| return True, "exists" | |
| # Descarcă | |
| response = session.get(url, headers=HEADERS, timeout=120, stream=True) | |
| if response.status_code == 200: | |
| total_size = int(response.headers.get('content-length', 0)) | |
| with open(filepath, 'wb') as f: | |
| downloaded = 0 | |
| for chunk in response.iter_content(chunk_size=8192): | |
| f.write(chunk) | |
| downloaded += len(chunk) | |
| logger.info(f"Descărcat: {filename} ({downloaded} bytes)") | |
| return True, "downloaded" | |
| else: | |
| logger.warning(f"Eroare {response.status_code}: {url}") | |
| return False, f"error_{response.status_code}" | |
| except Exception as e: | |
| logger.error(f"Eroare descărcare {url}: {e}") | |
| return False, str(e) | |
| def main(): | |
| parser = argparse.ArgumentParser( | |
| description='Scraper pentru biblioteca-digitala.ro/reviste/carte/', | |
| formatter_class=argparse.RawDescriptionHelpFormatter, | |
| epilog=""" | |
| Exemple de utilizare: | |
| python3 scraper.py --wayback # Extrage URL-uri din Wayback Machine (RECOMANDAT) | |
| python3 scraper.py --commoncrawl # Extrage URL-uri din Common Crawl | |
| python3 scraper.py --search # Caută în motoare de căutare | |
| python3 scraper.py --download # Descarcă PDF-urile din lista salvată | |
| python3 scraper.py --all # Toate metodele + descărcare | |
| """ | |
| ) | |
| parser.add_argument('--wayback', action='store_true', | |
| help='Extrage URL-uri din Wayback Machine CDX API') | |
| parser.add_argument('--commoncrawl', action='store_true', | |
| help='Extrage URL-uri din Common Crawl') | |
| parser.add_argument('--search', action='store_true', | |
| help='Caută în motoare de căutare') | |
| parser.add_argument('--download', action='store_true', | |
| help='Descarcă PDF-urile') | |
| parser.add_argument('--all', action='store_true', | |
| help='Toate metodele + descărcare') | |
| parser.add_argument('--workers', type=int, default=3, | |
| help='Număr de workers pentru descărcare (default: 3)') | |
| parser.add_argument('--output', type=str, default=OUTPUT_DIR, | |
| help=f'Director output (default: {OUTPUT_DIR})') | |
| args = parser.parse_args() | |
| output_dir = args.output | |
| urls_file = os.path.join(output_dir, "pdf_urls.txt") | |
| os.makedirs(output_dir, exist_ok=True) | |
| # Dacă nu sunt argumente, arată help | |
| if not any([args.wayback, args.commoncrawl, args.search, args.download, args.all]): | |
| parser.print_help() | |
| print("\n" + "=" * 60) | |
| print("NOTĂ: Metoda recomandată este --wayback pentru a obține") | |
| print("lista completă de PDF-uri indexate.") | |
| print("=" * 60) | |
| return | |
| all_pdfs = set(KNOWN_PDF_URLS) # Începe cu URL-urile cunoscute | |
| # Încarcă URL-uri existente | |
| if os.path.exists(urls_file): | |
| with open(urls_file, 'r') as f: | |
| existing = [line.strip() for line in f if line.strip()] | |
| all_pdfs.update(existing) | |
| logger.info(f"Încărcate {len(existing)} URL-uri existente") | |
| # Colectare URL-uri | |
| if args.wayback or args.all: | |
| logger.info("\n" + "=" * 50) | |
| logger.info("WAYBACK MACHINE CDX API") | |
| logger.info("=" * 50) | |
| all_pdfs.update(get_wayback_cdx_urls()) | |
| if args.commoncrawl or args.all: | |
| logger.info("\n" + "=" * 50) | |
| logger.info("COMMON CRAWL") | |
| logger.info("=" * 50) | |
| all_pdfs.update(get_common_crawl_urls()) | |
| if args.search or args.all: | |
| logger.info("\n" + "=" * 50) | |
| logger.info("CĂUTARE MOTOARE") | |
| logger.info("=" * 50) | |
| all_pdfs.update(search_with_terms()) | |
| # Filtrează și salvează | |
| valid_pdfs = sorted([ | |
| url for url in all_pdfs | |
| if 'biblioteca-digitala.ro/reviste/carte/' in url and url.endswith('.pdf') | |
| ]) | |
| with open(urls_file, 'w', encoding='utf-8') as f: | |
| for url in valid_pdfs: | |
| f.write(url + '\n') | |
| logger.info(f"\n{'=' * 60}") | |
| logger.info(f"Total URL-uri PDF: {len(valid_pdfs)}") | |
| logger.info(f"Salvate în: {urls_file}") | |
| logger.info(f"{'=' * 60}") | |
| # Descărcare | |
| if args.download or args.all: | |
| if valid_pdfs: | |
| logger.info(f"\nDescărcare {len(valid_pdfs)} PDF-uri cu {args.workers} workers...") | |
| session = requests.Session() | |
| stats = {'downloaded': 0, 'exists': 0, 'errors': 0} | |
| with ThreadPoolExecutor(max_workers=args.workers) as executor: | |
| futures = { | |
| executor.submit(download_pdf, url, output_dir, session): url | |
| for url in valid_pdfs | |
| } | |
| for future in as_completed(futures): | |
| success, status = future.result() | |
| if success: | |
| if status == 'exists': | |
| stats['exists'] += 1 | |
| else: | |
| stats['downloaded'] += 1 | |
| else: | |
| stats['errors'] += 1 | |
| logger.info(f"\n{'=' * 60}") | |
| logger.info(f"REZULTAT FINAL:") | |
| logger.info(f" Descărcate nou: {stats['downloaded']}") | |
| logger.info(f" Existau deja: {stats['exists']}") | |
| logger.info(f" Erori: {stats['errors']}") | |
| logger.info(f"{'=' * 60}") | |
| else: | |
| logger.warning("Nu există URL-uri de descărcat!") | |
| if __name__ == "__main__": | |
| main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment