Server for parsing documents scraped from supplier websites

Supplier Bill Parser

Parse bills scraped from a supplier website into bill and invoice documents. This module provides the general framework for setting up a supplier-specific parser in the docparse system. Each supplier-specific module will need to implement the actual parsing. This module subscribes to the changes feed and calls the supplier-specific parsing code when a new document is found.


Each supplier parser is registered with seaport under the role parseScrapedFGS, where FGS is the supplier code.


When the parsing service starts up, it sets up a simple express server that responds to POST requests at the /parse/:scrapedID route. The service registers with seaport using the role parseScrapedFGS. Seaport sends back a port and the parser server listens on this port.


To parse a scraped document, issue an HTTP POST request to the /parse/:scrapedID route at the appropriate host and port, where :scrapedID is the _id of a scraped document in the database.

// Example client: look up the NGA parser service in seaport and ask it, via an
// HTTP POST to /parse/:scrapedID, to parse one scraped document.
// NOTE(review): `config`, `request`, `inspect` and `scrapedID` are not defined
// in this snippet; they are assumed to be provided by the surrounding code.
var should = require('should')
var seaport = require('seaport')

// Connect to the seaport registry described by the 'seaport' config entry.
var seaConfig = config.get('seaport')
// NOTE(review): the initializer was missing here; `.host` is assumed by
// analogy with `seaConfig.port` below — confirm against the config schema.
var seaHost = seaConfig.host
var seaPort = seaConfig.port
var ports = seaport.connect(seaHost, seaPort)

// Find the services registered under the supplier-specific role.
// NOTE(review): query() is used synchronously right after connect(); if the
// registry data may not have arrived yet, the callback-based lookup is
// presumably the safer choice — verify against the seaport documentation.
var role = 'parseScrapedNGA'
var services = ports.query(role)
var service = services[0]
// Fail with an explicit message rather than a TypeError on `service.port`
// when nothing is registered under the role.
should.exist(service, 'failed to parse scraped document, no services registered with the role parseScrapedNGA')
// NOTE(review): the initializer was missing here; `.host` is assumed by
// analogy with `service.port` — confirm against the seaport service record.
var serviceHost = service.host
var servicePort = service.port

// Build the parse request for the scraped document's _id.
var url = 'http://' + serviceHost + ':' + servicePort + '/parse/' + scrapedID
var opts = {
  json: true,
  url: url,
  method: 'post'
}

request(opts, function (err, res, body) {
  if (err) {
    // Drop the stack so only the error's own fields are printed.
    delete err.stack
    inspect(err, 'error requesting parse via post request to service with role parseScrapedNGA')
    // Stop here so a failed request is not also reported as a success.
    return
  }
  inspect(body, 'parsed scraped NGA document correctly')
})