textiijs

Text inverted index creator

Text inverted index generator for Node.js.

  • excludes "stop words"
  • normalizes words with porter-stemmer
  • converts words to lowercase
  • excludes words of length less than a specified value - default: 3
  • reports each word's position within a text file, counting excluded words as well
  • supports section indexing
  • splits text with regexp word separator - default: /\W+/
  • supports text encoding - default: 'utf8'

Install via npm:

$ npm install textiijs

Usage with default options:

var textii = require('textiijs'),
    sample_text = "Zero, one and three or five, six, seven... seven...";
 
var pii = new textii(sample_text, null, function(err, data) {
  if (err) {
    return console.log("Error: " + err);
  }
  console.log(data);
});
 
pii.get();

Usage with custom options and a section name:

var textii = require('textiijs'),
    sample_text = "Zero, one and three or five, six, seven... seven...",
    options = { "word_separator": /\W+/, "min_word_length": 3, "encoding": "utf8" },
    get_options = { "section": "page1" };
 
var pii = new textii(sample_text, options, function(err, data) {
  if (err) {
    return console.log("Error: " + err);
  }
  console.log(data);
});
 
pii.get(get_options);
Run tests

$ make test

Coverage report

$ make test-cov