Created
January 5, 2025 12:12
-
-
Save poojarsn/378847774d80f80f84c94c468d31229f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| const axios = require('axios'); | |
| const cheerio = require('cheerio'); | |
| const fs = require('fs'); | |
| const path = require('path'); | |
| const { URL } = require('url'); | |
// Remote root of the WACE exam listing; the year is appended to this to form each directory URL.
const BASE_URL = 'https://www.vaultofthewace.xyz/biology/WACE%20exams/';

// Local root under which one folder per year is created.
const LOCAL_BASE_DIR = './Biology/WACE_exams/';

// Pause, in milliseconds, inserted after every file download attempt (1 second).
const DOWNLOAD_DELAY = 1000;

// Running count of HTTP requests issued (directory listings and file downloads).
let serverLoad = 0;

// Directory URLs that have already been requested, so each one is fetched at most once.
const visited = new Set();
// Print the current value of the request counter to stdout.
const logServerLoad = () => {
  const message = `Server Load: ${serverLoad} requests made.`;
  console.log(message);
};
/**
 * Download a single remote file and write it to disk.
 *
 * The request counter is incremented and logged before the request is sent.
 * Errors from the HTTP request or from writing the file are logged and not
 * propagated, so one bad file does not stop the caller's loop. The returned
 * promise settles only after an additional DOWNLOAD_DELAY milliseconds,
 * whether the attempt succeeded or failed.
 *
 * @param {string} fileUrl - URL of the file to fetch.
 * @param {string} savePath - Local path the response body is written to.
 * @returns {Promise<void>} Resolves once the attempt and the delay are over.
 */
const downloadFile = async (fileUrl, savePath) => {
  serverLoad++;
  logServerLoad();

  try {
    // 'arraybuffer' keeps the body as raw bytes so binary files are not corrupted.
    const response = await axios.get(fileUrl, { responseType: 'arraybuffer' });
    // Asynchronous write: does not block the event loop while the file is flushed.
    await fs.promises.writeFile(savePath, response.data);
    console.log(`Downloaded: ${savePath}`);
  } catch (err) {
    console.error(`Failed to download file ${fileUrl}: ${err.message}`);
  } finally {
    // Pause after every attempt, successful or not, before the caller continues.
    await new Promise((resolve) => setTimeout(resolve, DOWNLOAD_DELAY));
  }
};
// Ensure a directory exists at folderPath, creating missing parent directories too.
// Nothing is done when something already exists at that path.
const createFolder = (folderPath) => {
  const alreadyPresent = fs.existsSync(folderPath);
  if (alreadyPresent) {
    return;
  }
  fs.mkdirSync(folderPath, { recursive: true });
};
// True when href is a non-empty string ending in .pdf or .docx, in any letter case.
const isDownloadableHref = (href) => {
  if (!href) {
    return false;
  }
  const lowered = href.toLowerCase();
  return lowered.endsWith('.pdf') || lowered.endsWith('.docx');
};

/**
 * Fetch one directory listing page and download every PDF/DOCX file it links to.
 *
 * A URL that was already processed is skipped. Files are downloaded strictly
 * one after another. A failure to fetch or parse the listing is logged and
 * ends the call; a failure while handling one link is logged and the
 * remaining links are still processed, so the returned promise always settles.
 *
 * @param {string} url - URL of the directory listing page.
 * @param {string} localPath - Local folder the files are saved into (created if missing).
 * @returns {Promise<void>} Resolves when every matching link has been attempted.
 */
const parseDirectory = async (url, localPath) => {
  if (visited.has(url)) {
    return;
  }
  visited.add(url);

  serverLoad++;
  logServerLoad();

  let hrefs;
  try {
    const response = await axios.get(url);
    const $ = cheerio.load(response.data);
    createFolder(localPath);
    hrefs = $('a')
      .toArray()
      .map((el) => $(el).attr('href'))
      .filter(isDownloadableHref);
  } catch (err) {
    console.error(`Failed to parse directory ${url}: ${err.message}`);
    return;
  }

  for (const href of hrefs) {
    try {
      // NOTE(review): relative hrefs are resolved against `url`. If `url` has no
      // trailing slash, URL resolution replaces its last path segment, so confirm
      // that the listing emits absolute links or that callers pass a
      // slash-terminated URL.
      const fullUrl = href.startsWith('http') ? href : new URL(href, url).href;
      console.log(`Preparing to download file: ${fullUrl}`);
      // The file name is the last URL segment as written, so percent-escapes are kept.
      const filePath = path.join(localPath, path.basename(fullUrl));
      await downloadFile(fullUrl, filePath);
    } catch (err) {
      // One unusable link must not stall or abort the rest of the directory.
      console.error(`Skipping link ${href} in ${url}: ${err.message}`);
    }
  }
};
/**
 * Download the exam files for every year in an inclusive range.
 *
 * Years are processed one at a time: the next year's directory is requested
 * only after the previous year has finished, so only one directory is being
 * fetched at a time. An unexpected failure for one year is logged and the
 * remaining years are still attempted.
 *
 * @param {number} [startYear=2010] - First year to download.
 * @param {number} [endYear=2022] - Last year to download (inclusive).
 * @returns {Promise<void>} Resolves once every year has been attempted.
 */
const downloadWaceExams = async (startYear = 2010, endYear = 2022) => {
  for (let year = startYear; year <= endYear; year++) {
    const yearUrl = `${BASE_URL}${year}`;
    const localYearPath = path.join(LOCAL_BASE_DIR, String(year));
    console.log(`Starting download for year ${year}...`);
    try {
      await parseDirectory(yearUrl, localYearPath);
      console.log(`Download complete for year ${year}.`);
    } catch (err) {
      console.error(`Download failed for year ${year}: ${err.message}`);
    }
  }
};
// Entry point: begin the crawl as soon as the script is loaded.
// The value returned by the call is intentionally discarded.
void downloadWaceExams();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment