// Source: GitHub gist poojarsn/378847774d80f80f84c94c468d31229f
// Author: @poojarsn
// Created: January 5, 2025 12:12
//
// (GitHub page navigation text — "Skip to content", "Show Gist options",
// "Save ... to your computer and use it in GitHub Desktop" — was captured with
// the code; it is kept here only as this comment so the file parses as JavaScript.)
const axios = require('axios');
const cheerio = require('cheerio');
const fs = require('fs');
const path = require('path');
const { URL } = require('url');
// Base URL of the WACE exams directory listing; the year is appended to it to
// form each per-year directory URL.
const BASE_URL = 'https://www.vaultofthewace.xyz/biology/WACE%20exams/';
// Local base directory; downloaded files are saved in one sub-folder per year below it.
const LOCAL_BASE_DIR = './Biology/WACE_exams/';
// Pause applied after every file download attempt (successful or failed), in milliseconds.
const DOWNLOAD_DELAY = 1000; // 1 second
// Running count of HTTP requests issued so far (directory listings and file
// downloads); despite the name it is not a measurement taken from the server.
let serverLoad = 0;
// Directory URLs that have already been requested; parseDirectory skips any URL
// found here, which avoids fetching the same directory twice.
const visited = new Set();
// Print the running total of HTTP requests issued so far.
// Reads the module-level `serverLoad` counter and does not modify it.
function logServerLoad() {
  const message = `Server Load: ${serverLoad} requests made.`;
  console.log(message);
}
// Download one file and save it to disk, then pause for DOWNLOAD_DELAY ms.
//
//   fileUrl  - absolute URL of the file to fetch.
//   savePath - local path the response body is written to (the parent folder
//              must already exist).
//
// Returns a promise that never rejects: it resolves to `true` when the file was
// fetched and written, and to `false` when the request or the write failed (the
// failure is logged). The pause happens in both cases so that consecutive calls
// stay spaced out even when downloads fail.
const downloadFile = async (fileUrl, savePath) => {
  // Count the request before it is sent so that failed attempts are counted too.
  serverLoad += 1;
  logServerLoad();

  let succeeded = false;
  try {
    // 'arraybuffer' keeps the body binary so PDF/DOCX bytes are written unmodified.
    const response = await axios.get(fileUrl, { responseType: 'arraybuffer' });
    fs.writeFileSync(savePath, response.data);
    console.log(`Downloaded: ${savePath}`);
    succeeded = true;
  } catch (err) {
    // Best effort: report and carry on. A rejection is not guaranteed to be an
    // Error instance, so do not dereference `.message` blindly.
    const reason = err instanceof Error ? err.message : String(err);
    console.error(`Failed to download file ${fileUrl}: ${reason}`);
  } finally {
    // Throttle: wait before letting the caller start its next request.
    await new Promise((wake) => setTimeout(wake, DOWNLOAD_DELAY));
  }
  return succeeded;
};
// Ensure a directory exists at `folderPath`, creating any missing parent
// directories along the way. Nothing is done when something already exists at
// that path.
function createFolder(folderPath) {
  const alreadyPresent = fs.existsSync(folderPath);
  if (alreadyPresent) {
    return;
  }
  fs.mkdirSync(folderPath, { recursive: true });
}
// Matches hrefs that end in a PDF or DOCX extension, in any letter case.
const DOWNLOADABLE_HREF = /\.(?:pdf|docx)$/i;

// Derive the local file name for a file URL: the last path segment with its
// percent-encoding decoded (so "Paper%201.pdf" is saved as "Paper 1.pdf").
const localFileName = (fileUrl) => {
  const encodedName = path.posix.basename(new URL(fileUrl).pathname);
  try {
    // basename is applied again after decoding so that an encoded separator
    // (%2F) cannot move the file outside the target folder.
    return path.basename(decodeURIComponent(encodedName));
  } catch (err) {
    // Malformed percent-encoding: fall back to the name exactly as it appears.
    return encodedName;
  }
};

// Fetch a directory listing page and download every PDF / DOCX file it links
// to, one file at a time.
//
//   url       - URL of the directory listing page.
//   localPath - local folder the files are saved into (created if missing).
//
// Returns a promise that always resolves to undefined and never rejects:
//   - a URL already present in `visited` is skipped without any request;
//   - a failure to fetch or parse the listing is logged and ends the call;
//   - a link that cannot be turned into a URL is logged and skipped, and the
//     remaining links are still processed.
const parseDirectory = async (url, localPath) => {
  // Skip directories that were already requested.
  if (visited.has(url)) {
    return;
  }
  visited.add(url);
  serverLoad += 1;
  logServerLoad();

  let hrefs;
  let baseUrl = url;
  try {
    const response = await axios.get(url);
    const $ = cheerio.load(response.data);
    createFolder(localPath);

    // Relative links must be resolved against the URL the listing was finally
    // served from. When `url` has no trailing slash and the server redirects to
    // the slash-terminated form, resolving against `url` would drop the last
    // path segment. NOTE(review): `request.res.responseUrl` is how axios'
    // Node adapter exposes the post-redirect URL; when it is absent the
    // requested URL is used unchanged.
    const finalUrl =
      response.request && response.request.res && response.request.res.responseUrl;
    if (finalUrl) {
      baseUrl = finalUrl;
    }

    hrefs = $('a')
      .toArray()
      .map((anchor) => $(anchor).attr('href'))
      .filter((href) => Boolean(href) && DOWNLOADABLE_HREF.test(href));
  } catch (err) {
    console.error(`Failed to parse directory ${url}: ${err.message}`);
    return;
  }

  // Strictly sequential: each download (including its built-in delay) finishes
  // before the next one starts.
  for (const href of hrefs) {
    try {
      // `new URL` handles absolute hrefs as well; the base is then ignored.
      const fullUrl = new URL(href, baseUrl).href;
      console.log(`Preparing to download file: ${fullUrl}`);
      const filePath = path.join(localPath, localFileName(fullUrl));
      await downloadFile(fullUrl, filePath);
    } catch (err) {
      // One bad link must not stall the rest of the directory.
      console.error(`Skipping link ${href}: ${err.message}`);
    }
  }
};
// Main function: download the exam files for every year from `firstYear` to
// `lastYear`, inclusive (defaults: 2010 to 2022).
//
// Years are processed one after another. Starting them all at once would run
// one download chain per year in parallel and defeat the per-download delay
// that is meant to keep the request rate low.
//
// Returns a promise that resolves once every year has been attempted. A failure
// in one year is logged and does not stop the remaining years.
const downloadWaceExams = async (firstYear = 2010, lastYear = 2022) => {
  for (let year = firstYear; year <= lastYear; year += 1) {
    const yearUrl = `${BASE_URL}${year}`;
    const localYearPath = path.join(LOCAL_BASE_DIR, String(year));
    console.log(`Starting download for year ${year}...`);
    try {
      await parseDirectory(yearUrl, localYearPath);
      console.log(`Download complete for year ${year}.`);
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      console.error(`Download failed for year ${year}: ${reason}`);
    }
  }
};
// Script entry point: start the download run as soon as this file is executed.
downloadWaceExams();
// (GitHub page footer captured with the code: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment".)