downloadcollector
Downloads the given URL(s) and stores the results in files or in MongoDB; items that already exist are not downloaded again.
Install
npm install downloadcollector --save
Usage
var DownloadCollector = require('downloadcollector').DownloadCollector;
var dc = new DownloadCollector({path: '/home/rawdata/'})
dc.download({filename: 'google.html', uri: 'http://google.com'}).then(function (html) {
console.log('html:', html);
}, function (err) {
console.error(err);
});
Multi page download
var DownloadCollector = require('downloadcollector').DownloadCollector;
var dc = new DownloadCollector({path: '/home/rawdata/'});
dc.simultaneouslyDownload = 10;
for (var i = 0; i < 1000; i += 1) {
dc.download({filename: 'google' + i + '.html', uri: 'https://www.google.com.tr/search?q=test&start=' + (i * 10).toString()}).then(function (html) {
console.log('html:', html);
}, function (err) {
console.error(err);
});
}
DownloadCollector Options
path: Directory where downloaded files are stored. default: /temp/
simultaneouslyDownload: Maximum number of queued downloads that run at the same time (concurrent downloads). default: 100
autoGenerateFilename: Generates the filename from the URL (some special characters are replaced with underscores). default: false
dbname: MongoDB database name (must be set if MongoDB is used). default: undefined
mongoserver: MongoDB connection string. default: mongodb://localhost:27017/
colname: MongoDB collection name. default: data
field: Name of the field that holds the HTML data. default: data
key: Value looked up in the _id field; if a matching record exists, its stored field is returned, otherwise the URL is downloaded.
resulttype: (html/object) Whether download returns the HTML or a JSON object. default: html
debug: Shows console messages. default: false
useragent: Browser user agent used for downloading. default: randomly selected from a list of browser user agents
autoGenerateFilename: Generates the file name from the URL automatically.
simultaneouslyDownload: Limit on simultaneous downloads (the remaining URLs wait in the list, and the next one starts whenever a running download finishes). default: 100
License
MIT License
Copyright 2017 Kutlay Özger