Here is a short web scraping program written in Node.js. I'm just getting to grips with Node and this is the first thing I've written with it. I'm liking it so far, though I guess I'm kind of missing the point of the whole asynchronous aspect.
This is supposed to be an extremely basic project. I'm just a beginner with this stuff. I know the program is pretty brittle in terms of what it could do with a real scrape but I'm happy that I've managed to put things together in not too much time (I only just started coding a few months ago).
However, I'm having an absolute nightmare getting my head around promises and how I can make them work with this project with minimal libraries. So, I'm probably going to offend some of you with my 'band-aid-like' timeout functions.
How would I rework this with promises without completely rewriting my code?
//TASK: Create a command line application that goes to an ecommerce site to get the latest prices. //Save the scraped data in a spreadsheet (CSV format).'use strict';//Modules being used:var cheerio = require('cheerio');var json2csv = require('json2csv');var request = require('request');var moment = require('moment');var fs = require('fs');//harcoded urlvar url = 'http://shirts4mike.com/';//url for tshirt pagesvar urlSet = new Set();var remainder;var tshirtArray = [];// Load front page of shirts4mikefunction firstScrape(){ request(url, function(error, response, html) { if(!error && response.statusCode == 200){ var $ = cheerio.load(html); //iterate over links with 'shirt' $('a[href*=shirt]').each(function(){ var a = $(this).attr('href'); //create new link var scrapeLink = url + a; //for each new link, go in and find out if there is a submit button. //If there, add it to the set request(scrapeLink, function(error,response, html){ if(!error && response.statusCode == 200) { var $ = cheerio.load(html); //if page has a submit it must be a product page if($('[type=submit]').length !== 0){ //add page to set urlSet.add(scrapeLink); } else if(remainder == undefined) { //if not a product page, add it to remainder so it another scrape can be performed. 
remainder = scrapeLink; } } }); }); } }); secondScraper();}firstScrape();function secondScraper(){ setTimeout(function () { request(remainder, function(error, response, html) { if(!error && response.statusCode == 200){ var $ = cheerio.load(html); $('a[href*=shirt]').each(function(){ var a = $(this).attr('href'); //create new link var scrapeLink = url + a; request(scrapeLink, function(error,response, html){ if(!error && response.statusCode == 200){ var $ = cheerio.load(html); //collect remaining product pages and add to set if($('[type=submit]').length !== 0){ urlSet.add(scrapeLink); } } }); }); } }); lastScraper(); }, 2000);}function lastScraper(){ //call lastScraper so we can grab data from the set (product pages) setTimeout(function(){ //scrape set, product pages for(var item of urlSet){ var url = item; request(url, function(error, response, html){ if(!error && response.statusCode == 200){ var $ = cheerio.load(html); //grab data and store as variables var price = $('.price').text(); var imgURL = $('.shirt-picture').find('img').attr('src'); var title = $('body').find('.shirt-details > h1').text().slice(4); var tshirtObject = {}; //add values into tshirt object tshirtObject.Title = title; tshirtObject.Price = price; tshirtObject.ImageURL = imgURL; tshirtObject.URL = url; tshirtObject.Date = moment().format('MMMM Do YYYY, h:mm:ss a'); //add the object into the array of tshirts tshirtArray.push(tshirtObject); } }); } convertJson2Csv(); }, 2000);}function convertJson2Csv(){ setTimeout(function(){ //The scraper should generate a folder called `data` if it doesn’t exist. 
var dir ='./data'; if(!fs.existsSync(dir)){ fs.mkdirSync(dir); } var fields = ['Title', 'Price', 'ImageURL', 'URL', 'Date']; //convert tshirt data into CSV and pass in fields var csv = json2csv({ data: tshirtArray, fields: fields }); //Name of file will be the date var fileDate = moment().format('MM-DD-YY'); var fileName = dir + '/' + fileDate + '.csv'; //Write file fs.writeFile(fileName, csv, {overwrite: true}, function(err) { console.log('file saved'); if (err) throw err; }); }, 2000);}1 Answer1
The basic idea is to return promises from your functions and to resolve (or reject) those promises when you're done with whatever those functions are waiting for. Something like this:
function firstScrape () { return new Promise(function (resolve, reject) { request(url, function(error, response, html) { if (!error && response.statusCode == 200) { var $ = cheerio.load(html); // ... resolve(); } else { reject(); } }); });}function secondScraper () { return new Promise(function (resolve, reject) { request(remainder, function(error, response, html) { if (!error && response.statusCode == 200) { // ... resolve(); } else { reject(); } }); });}And then in your main code:
firstScrape().then(function (val) { return secondScraper();}).then(function (val) { return lastScraper();}).catch(function (error) { // there was some error});With one of your functions it's more complicated because you are iterating over some values and making more requests than one, but in that case you can do something like this:
var array; // some urls that you want to iterate overvar promises = array.map(function (element) { return new Promise(function (resolve, reject) { request(element, function (error, response, html) { if(!error && response.statusCode == 200){ // ... resolve(); } else { // ... reject(); } }); });});This will give you an array of promises, which you can use withPromise.all:
Promise.all(promises).then(function (values) { // you have all values}).catch(function (error) { // you have some error});See:https://developer.mozilla.org/en/docs/Web/JavaScript/Reference/Global_Objects/Promise/all
You can get some more useful helpers from modules like Bluebird: http://bluebirdjs.com/
- \$\begingroup\$Thank you for taking the time. For the functions in which I make more than one request, do I need to be creating an additional function? Do I not do that in both of the first two?\$\endgroup\$bloppit– bloppit2016-09-26 18:48:32 +00:00CommentedSep 26, 2016 at 18:48
You mustlog in to answer this question.
Explore related questions
See similar questions with these tags.