node package manager


Boilerplate code for a Node.js scraper with CLI


npm install scrapebp

This module can be forked or depended upon for future scraping projects.
Callers only need to specify opts and implement the custom scraper and scrape callback functions.

See bin/scrapebp for details.

var ScrapeBp = require('scrapebp');
// DemoScraper and scrapeCallback are defined 
// opts for ScrapeBp is prepared 
var scrapebp = ScrapeBp(opts);
scrapebp.on('headers', function (headers) {
  console.log("- %s headers ready", opts.method);
  if (argv.dumpHeader) {
    console.log(headers);
  }
});
scrapebp.on('redirect', function (url, remaining) {
  console.log("- redirects to: %s (%d remaining)", url, remaining);
});
scrapebp.on('error', function (err) {
  console.error(err);
});
scrapebp.on('$ready', function (url, $) {
  console.log("- $ ready");
  // $ is the cheerio object
  // use $.html() to get the response body
  // useful if the response is not html/xml
  if (argv.dumpBody) {
    console.log($.html());
  }
  // invoke our scraper
  DemoScraper.scrape(url, $, scrapeCallback);
});

Following needle, scrapebp uses visionmedia/debug.

DEBUG=scrapebp bin/scrapebp

Originally hyperquest, hyperdirect and hyperzip were used as the HTTP stack. Then I switched to tomas/needle, which supports all of the above plus iconv conversion.



Write tests that cover:

  • GET with query string
  • POST with payload
  • redirects
  • use of compression (-z and check response header and decoded body)
  • error handling


  • character set detection with aadsm/jschardet? (in case the HTTP header and HTML meta do not signal the charset)
  • promisify?
  • browserify


  • multi-byte cut-off (