Text inverted index creator

Text inverted index generator for node.

  • excludes "stop words"
  • normalizes words with snowball-js
  • converts words to lowercase
  • excludes words of length less than a specified value - default: 3
  • reports each word's position within a text file, counting excluded words as well
  • supports section indexing
  • splits text with regexp word separator - default: /\W+/
  • supports text encoding - default: 'utf8'
  • supports multiple languages - currently English (the default) and Norwegian

Install via npm:

$ npm install textiijs
var textii = require('textiijs'),
    sample_text = "Zero, one and three or five, six, seven... seven...";
var pii = new textii(sample_text);
pii.get(null, function(data) {
// var textii = require('textiijs'),
var textii = require('../index'),
    sample_text = "Zero, one and three or five, six, seven... seven...",
    options = { "word_separator": /\W+/, "min_word_length": 3, "encoding": "utf8", language: "Norwegian" },
    get_options = { "section": "page1" };
var pii = new textii(sample_text, options);
pii.get(get_options, function(data) {
npm install textiijs -g

Then you can either pipe in data or provide a filename:

echo "hello world" | textiijs  
# or 
textiijs text.txt
$ make test

Coverage report

$ make test-cov