scr-api

Turn web pages into APIs.

scr-api

A schema-based HTML scraping library for Node.js.

scr-api takes a JSON document called a schema and some HTML, and returns JSON objects built according to that schema.

A simple example

var scrapi = require("scr-api");

var html = "<div id='time-and-date'>" +
  "<p>Today is <span class='date'>14 May, 2013</span>.<br/> " +
  "The time is <span class='time'>11:53 AM</span></p>" +
  "</div>";

var schema = {
  "The current date": {
    "selector": ".date",
    "is":""
  },
  "The current time": {
    "selector": ".time",
    "is":""
  }
}

console.log(scrapi.scrape(html, schema));

Would return

{ 
	'The current date': '14 May, 2013',
	'The current time': '11:53 AM' 
}

Schemas are of the form

{
	"I'm an object key name": 					// This value is the name of the 
												// property that will be scraped
	{ 
		"selector": "a #valid .css .selector", 	// A selector to filter HTML with,
												// these are compound

		"attr": "href", 						// The property of the tag to 
												// extract a value from, optional
		 
		"is": "", 								// either "", [], or {} 	

		"type":	"int",							// how to interpret the value of 
												// the text found by the selector
		 										// Either: 	string
		 										// 			int
		 										// 			float
		 										// 			bool
		 										// 			boolTextExists
		 										// 			boolHtmlExists
		 										

		"of": < anotherSchemaDefinition > 		// Schema definitions are recursive,
												// they can be arbitrarily nested
	}
}

Scrapi schemas can be arbitrarily nested

{
	"I'm an object key name": {
		"selector": "a #valid .css",
		"is": [],
		"of": {
			"So am I": {
				"selector": ".selector",
				"is": {},
				"of": {
					"I'm a turtle": {
						"selector": "a #valid .css .selector",
						"is": [],
						"of": < turtlesAllTheWayDown >
					}
				}
			}
		}
	}
}

For a more complex example, let's scrape the Hacker News headlines:

var request = require('request'),
	scrapi = require("scr-api");

request({
	uri: "http://news.ycombinator.com",
}, function(err, res, body) {

	var schema = {
		"Headlines": {
			"selector": "table table .title a",
			"is": [],
			"of": {
				"Headline": {
					"is": ""
				},
				"Link": {
					"is": "",
					"attr": "href"
				}
			}
		}
	};

	console.log(scrapi.scrape(body, schema));
});

Would then return

{
	Headlines: [{
		Headline: 'Go 1.1 is released',
		Link: 'http://blog.golang.org/2013/05/go-11-is-released.html'
	}, {
		Headline: 'U.S. Secretly Obtains Two Months of A.P. Phone Records',
		Link: 'http://www.nytimes.com/aponline/2013/05/13/us/politics/ap-us-ap-phone-records-subpoena.html?pagewanted=all'
	}, 
	... 
	... 
	...]
}