Skip to content

Instantly share code, notes, and snippets.

@simonswiss
Last active January 26, 2017 22:56
Show Gist options
  • Select an option

  • Save simonswiss/2fe2ff41cd4a946a0e8a11f971b79732 to your computer and use it in GitHub Desktop.

Select an option

Save simonswiss/2fe2ff41cd4a946a0e8a11f971b79732 to your computer and use it in GitHub Desktop.
Hipster Scraper
// Node core + third-party dependencies.
const fs = require('fs')
const axios = require('axios')
const cheerio = require('cheerio')
// Array of page URLs to scrape, exported by the sibling ./urls module.
const URLS = require('./urls')
// Destination file for the scraped JSON output.
const TARGET_FILE = './data.js'
/**
 * Fetch one page, extract its review entries, append them to the shared
 * module-level `data` array, then advance the generator (`runLoop`) so the
 * next URL is scraped. When the generator reports `done`, the accumulated
 * data is written to TARGET_FILE as pretty-printed JSON.
 *
 * Fixes over the original:
 *  - the axios promise was floating (no .catch): a single failed request
 *    would both go unreported AND stall the whole crawl, because the
 *    generator was only advanced inside .then. Failures are now logged and
 *    the loop still advances to the next URL.
 *  - parseInt is called with an explicit radix of 10.
 *
 * @param {string} url - The page URL to fetch and parse.
 */
function scrapeData(url) {
  axios.get(url).then(response => {
    // Each page may have multiple entries; collect them here first.
    const entries = []
    // cheerio setup
    const $ = cheerio.load(response.data)
    /*
    #################################
    ##### Custom Business Logic #####
    ################################# */
    $('.review').each(function () {
      // Grab the values of the review and map them to Craft field names.
      entries.push({
        title: $(this).find('.review-author h6 a').text().trim(),
        reviewTitle: $(this).find('.review-content h3').text(),
        reviewText: $(this).find('.review-overall').text().trim(),
        reviewAvatar: $(this).find('.review-author img').attr('src'),
        reviewDate: $(this).find('.rating-md p meta[itemprop="datePublished"]').attr('content'),
        // The rating text starts with the numeric score; take its first
        // character and parse with an explicit base-10 radix.
        reviewRating: parseInt($(this).find('.rating-md span').text().trim().substring(0, 1), 10),
        reviewBestRating: 5
      })
    })
    /* ################################## */
    // Push all the page entries into our shared data array.
    data.push(...entries)
    // Log progress to the terminal.
    console.info(`Scraped ${entries.length} entries from ${url}`)
    // Call the next iteration of the generator.
    const nextItem = runLoop.next(data)
    // When the generator has finished iterating, persist the results.
    if (nextItem.done === true) {
      // Write JSON data to file.
      fs.writeFile(TARGET_FILE, JSON.stringify(data, null, 2), (err) => {
        if (err) { return console.log(err); }
        console.info(`##########
${data.length} entries written to "${TARGET_FILE}"!
##########`
        )
      })
    }
  }).catch(err => {
    // Surface the failure and keep the crawl moving to the next URL
    // instead of silently stalling the generator loop.
    console.error(`Failed to scrape ${url}: ${err.message}`)
    runLoop.next(data)
  })
}
/**
 * Generator that walks the URL list one page at a time.
 * Each `next()` call triggers a scrape of the next URL; the scraper's
 * completion handler re-advances the generator, so pages are fetched
 * strictly sequentially rather than all at once.
 */
function* loop() {
  let index = 0
  while (index < URLS.length) {
    yield scrapeData(URLS[index])
    index += 1
  }
}
// Shared accumulator for all scraped entries. Declared explicitly here:
// the original `runLoop.next(data = [])` created `data` as an implicit
// global, which throws in strict mode and hides the variable's origin.
const data = []
// Kick off the generator-driven loop.
const runLoop = loop()
runLoop.next(data)
@dominikwilkowski
Copy link

This is great! Love it. Thanks Simon

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment