node package manager

rdfa-parser

rdfa-parser

install

npm install rdfa-parser

Usage

let html = '\
<div vocab="http://xmlns.com/foaf/0.1/">\
    <div resource="#manu" typeof="Person">\
    <span property="name">Manu Sporny</span> knows\
<a property="knows" href="#alex">Alex</a> and\
    <a property="knows" href="#brian">Brian</a>.\
</div>\
<div resource="#alex" typeof="Person">\
    <span property="name">Alex Milowski</span> wrote the RDFa processor for this page.\
</div>\
<div resource="#brian" typeof="Person">\
    <span property="name">Brian Sletten</span> wrote the syntax highlighting for the raw data.\
</div>\
</div>'
 
let triples = rdfaParser.parseRDFa(html);

Data structure of the parsed triples (example)

  {
    "subject": "http://localhost/index.html",
    "predicates": [
      {
        "predicate": {
          "nominalValue": "http://www.w3.org/ns/rdfa#usesVocabulary"
        },
        "objects": [
          {
            "nominalValue": "http://xmlns.com/foaf/0.1/"
          }
        ]
    
      }
    ]
  }

Print the triples in Turtle syntax

for (let i = 0; i < triples.length; i++) {
    console.log(triples[i].toString());
}

Download and parse a single web page

let base = "http://booking.com";
request(base, function (error, response, html) {
    let triples = rdfaParser.parseRDFa(html, base);
 
    for (let i = 0; i < triples.length; i++) {
        console.log(triples[i].toString());
    }
});

Crawl a website

let start = "http://booking.com";
let depth = 2;
 
rdfaParser.crawler(start, depth, function (base) {
    request(base, function (error, response, html) {
        let triples = rdfaParser.parseRDFa(html, base);
 
        for (let i = 0; i < triples.length; i++) {
            console.log(triples[i].toString());
        }
    });
});

more

You can find the whole project, including the web interface, test harness, and triple store, here.