scr-api
A schema-based HTML scraping library for Node.js
scr-api takes a JSON document called a schema and some HTML, and returns JSON objects based on the schema.
Example
A simple example
var scrapi = require("scr-api");
var html = "<div id='time-and-date'>" +
"<p>Today is <span class='date'>14 May, 2013</span>.<br/> " +
"The time is <span class='time'>11:53 AM</span></p>" +
"</div>";
var schema = {
"The current date": {
"selector": ".date",
"is":""
},
"The current time": {
"selector": ".time",
"is":""
}
}
console.log(scrapi.scrape(html, schema));
Would return
{
'The current date': '14 May, 2013',
'The current time': '11:53 AM'
}
Schema syntax
Schemas are of the form
{
"I'm an object key name": // This key is the name of the
// property that will be scraped
{
"selector": "a #valid .css .selector", // A selector to filter the HTML with;
// selectors are compound
"attr": "href", // The attribute of the tag to
// extract a value from (optional)
"is": "", // either "", [], or {}
"type": "int", // how to interpret the value of
// the text found by the selector
// Either: string
// int
// float
// bool
// boolTextExists
// boolHtmlExists
"of": < anotherSchemaDefinition > // Schema definitions are recursive,
// they can be arbitrarily nested
}
}
Scrapi schemas can be arbitrarily nested
{
"I'm an object key name": {
"selector": "a #valid .css",
"is": [],
"of": {
"So am I": {
"selector": ".selector",
"is": {},
"of": {
"I'm a turtle": {
"selector": "a #valid .css .selector",
"is": [],
"of": < turtlesAllTheWayDown >
}
}
}
}
}
}
Complex example
For a more complex example, let's scrape the Hacker News headlines
var request = require('request'),
scrapi = require("scr-api");
request({
uri: "http://news.ycombinator.com",
}, function(err, res, body) {
var schema = {
"Headlines": {
"selector": "table table .title a",
"is": [],
"of": {
"Headline": {
"is": ""
},
"Link": {
"is": "",
"attr": "href"
}
}
}
};
console.log(scrapi.scrape(body, schema));
});
Would then return
{
Headlines: [{
Headline: 'Go 1.1 is released',
Link: 'http://blog.golang.org/2013/05/go-11-is-released.html'
}, {
Headline: 'U.S. Secretly Obtains Two Months of A.P. Phone Records',
Link: 'http://www.nytimes.com/aponline/2013/05/13/us/politics/ap-us-ap-phone-records-subpoena.html?pagewanted=all'
},
...
...
...]
}