linkscrape

1.0.0 • Public • Published

Node.js - linkscrape

build status

This module allows scrapes links from an HTML string and normalizes them. It does not actually perform the HTTP request. Use superagent or request for that.

Installation

npm install linkscrape

Example

HTML string:

<html>
  <head>
    <title>
      Test File
    </title>
  </head>
  <body>
    <p id="wat">
      <a href="http://google.com"><b>Google</b></a>
    </p>
    <p>
      <a href="#wat" class="pretty">Link in page</a>
      <a href="javascript:alert('hi');">hi</a>
      <a href="alert('hello')">hello</a>
      <a href="/faq/questions">Faq</a>
      <a href="aboutus">About Us</a>
    </p>
  </body>
</html>

You must pass in the URL (of where the HTML string came from) to the scrape() method so that it can normalize the links.

var linkscrape = require('linkscrape');

linkscrape('http://someserver.com/mypage', htmlString, function(links, $){
  console.log(links.length);// is 6

  console.log(links[0].href); //is 'http://google.com'
  console.log(links[0].text); //is 'Google'
  console.log(links[0].html); //is '<b>Google</b>'
  console.log(links[0].element); //object
  console.log(links[0].link); //is 'http://google.com'

  console.log(links[1].href); //is '#wat'
  console.log(links[1].text); //is 'Link in page'
  console.log(links[1].html); //is 'Link in page'
  console.log(links[1].element); //object
  console.log(links[1].link); //is null
  console.log($(links[1].element).attr('class')); //is 'pretty'

  console.log(links[2].href); //is "javascript:alert('hi');"
  console.log(links[2].text); //is 'hi'
  console.log(links[2].html); //is 'hi'
  console.log(links[2].element); //object
  console.log(links[2].link); //is null

  console.log(links[3].href); //is "alert('hello')"
  console.log(links[3].text); //is 'hello'
  console.log(links[3].html); //is 'hello'
  console.log(links[3].element); //object
  console.log(links[3].link); //is null

  console.log(links[4].href); //is "/faq/questions"
  console.log(links[4].text); //is 'Faq'
  console.log(links[4].html); //is 'Faq'
  console.log(links[4].element); //object
  console.log(links[4].link); //is 'http://someserver.com/faq/questions'

  console.log(links[5].href); //is "aboutus"
  console.log(links[5].text); //is 'About Us'
  console.log(links[5].html); //is 'About Us'
  console.log(links[5].element); //object
  console.log(links[5].link); //is 'http://someserver.com/aboutus'
});

It's currently backed by cheerio. So you can use the $ with the jQuery selectors. See cheerio docs for more details.

Test

npm test

or...

mocha test

License

Licensed under MIT. See LICENSE for more details.

Copyright (c) 2012 JP Richardson

Package Sidebar

Install

npm i linkscrape

Weekly Downloads

5

Version

1.0.0

License

none

Last publish

Collaborators

  • jprichardson