What's this
An HTTP crawler for Node.js, based on crawlee and puppeteer. It automatically caches responses on the filesystem and makes it easy to plan your requests in a queue.
How to use
Install
npm i hunt-crawler
Sample
const { Hunter } = require('hunt-crawler')
const $hunter = new Hunter({
// options for Hunter
cacheFolder : __dirname + '/../caches' // where to save cache files
log : false // show logs
},{
// options for PuppeteerCrawler
keepAlive: false, // whether the crawler should stay alive after the request queue is drained
})
/**
 * Crawl the url
 * - Starts crawling the url and caches its content to a file
 * - Launches the browser if it is not already launched
 * Note: run this inside an async function — top-level await is not available in CommonJS modules.
 */
await $hunter.run(
'https://111.com/somepage', // url to crawl
/**
* Callback
* handle the content( already cached to file )
* or start new requests ( $hunter.run )
*/
async ( url, content ) => {
console.log( content )
},
/**
* Option
*/
{
/**
* suffix
* optional, default '.cache'
* - the cache file name = md5(url).suffix
* - $hunter.run prefers to use the cache (if it exists), except when force=true
* - You can use this to manage the caches of this page
*/
suffix : moment.utc().format('YYYY_MM_DD') + '.html', // uses moment — require('moment') yourself
/**
* removeTags
* optional, default []
* remove tags from the returned page.content() when save to cache
*/
removeTags : ['script','svg'],
/**
* play
* Do actions like scroll, wait or anything else on the page in the browser.
*/
async play( url, ctx, $hunter ){
await new Promise( r => setTimeout(r,200) ) // wait for 200 ms
},
}
)
More Sample and Test
see /test folder
You can test by running:
node test/simpleCrawl {url?} {force?}
node test/batchCrawl {url1} {url2} {url3} ...