data-projector

0.2.0 • Public • Published

data-projector

Load CSV datasets and map / transform the data for use.

  • Load CSV datasets
  • Guess types and cast data fields to types
  • Calculate stats: global, fields and pairwise (field by field correlations etc)
  • Map datasets to other datasets using transform functions

It is designed to take a specification object (JSON), load a dataset, and optionally map values to requested ranges.

The JSON specification objects can be saved in your application for use in presets.

Status: ALPHA

// Simple example with only the default statistics and no field mappings requested.
const dp = require('data-projector');
 
const datasetPath = '/Users/crucial/code/idmx/playsplom/app/vendor/datasets/Nightingale.csv';
const functions = {};
const calculateStatsParams = {};
const mapFieldsParams = {};
 
const x = dp.project(functions, datasetPath, calculateStatsParams, mapFieldsParams);
 
x.then(console.log, console.error);

// Logs a dataset object such as:
{
  "data": [
    {
      "0": 1,
      "Date": "1854-04-01",
      "Month": "Apr",
      "Year": 1854,
      "Army": 8571,
      "Disease": 1,
      "Wounds": 0,
      "Other": 5,
      "Disease.rate": 1.4,
      "Wounds.rate": 0,
      "Other.rate": 7
    },
    {
      "0": 2,
      "Date": "1854-05-01",
      "Month": "May",
      "Year": 1854,
      "Army": 23333,
      "Disease": 12,
      "Wounds": 0,
      "Other": 9,
      "Disease.rate": 6.2,
      "Wounds.rate": 0,
      "Other.rate": 4.6
    },
    // etc etc
  ],
  "fields": [
    "0",  // first column header did not have a name
    "Date",
    "Month",
    "Year",
    "Army",
    "Disease",
    "Wounds",
    "Other",
    "Disease.rate",
    "Wounds.rate",
    "Other.rate"
  ],
  "path": "/Users/crucial/code/idmx/playsplom/app/vendor/datasets/Nightingale.csv",
  "stats": {
    "fields": {
      "0": {
        "minval": 1,
        "maxval": 24,
        "type": {
          "type": "number",
          "null": false
        }
      },
      "Date": {
        "minval": null,  // bug: didn't calculate minval max for the date column
        "maxval": null,
        "type": {
          "type": "date",
          "null": false,
          "dateFormat": "YYYY-MM-DD"  // auto-detected the date format
        }
      },
      "Month": {
        "minval": null,
        "maxval": null,
        "type": {
          "type": "string",
          "null": false
        }
      },
      "Year": {
        "minval": 1854,
        "maxval": 1856,
        "type": {
          "type": "number",
          "null": false
        }
      },
      "Army": {
        "minval": 8571,
        "maxval": 47751,
        "type": {
          "type": "number",
          "null": false
        }
      },
      "Disease": {
        "minval": 1,
        "maxval": 2761,
        "type": {
          "type": "number",
          "null": false
        }
      },
      "Wounds": {
        "minval": 0,
        "maxval": 287,
        "type": {
          "type": "number",
          "null": false
        }
      },
      "Other": {
        "minval": 5,
        "maxval": 361,
        "type": {
          "type": "number",
          "null": false
        }
      },
      "Disease.rate": {
        "minval": 1.4,
        "maxval": 1022.8,
        "type": {
          "type": "number",
          "null": false
        }
      },
      "Wounds.rate": {
        "minval": 0,
        "maxval": 115.8,
        "type": {
          "type": "number",
          "null": false
        }
      },
      "Other.rate": {
        "minval": 2.5,
        "maxval": 140.1,
        "type": {
          "type": "number",
          "null": false
        }
      }
    }
  }
}
 
 
## API
 
<a name="project"></a>
 
## project(functions, path, calculateStatsParams, mapFieldsParams) ⇒ <code>Object</code>
Load a dataset from disk, calculate statistics and apply transformations
 
 
**Returns**: <code>Object</code> - Dataset (the usage example above calls `.then` on the result, so the dataset is delivered via a Promise)
 
| Param | Type | Description |
| --- | --- | --- |
| functions | <code>Object</code> | Named function registry |
| path | <code>String</code> |  |
| calculateStatsParams | <code>Object</code> |  |
| mapFieldsParams | <code>Array.&lt;Object&gt;</code> |  |
 
<a name="readParseDataset"></a>
 
## readParseDataset(path) ⇒ <code>Promise.&lt;Object&gt;</code>
Load and parse a dataset from path.
Stats are not yet calculated so types are unknown
and all fields are strings.
 
 
**Returns**: <code>Promise.&lt;Object&gt;</code> - Promise for a dataset
 
| Param | Type | Description |
| --- | --- | --- |
| path | <code>String</code> | Absolute path to file |
 
<a name="loadDataset"></a>
 
## loadDataset(path, functions, calculateStatsParams) ⇒ <code>Promise.&lt;Object&gt;</code>
Load and parse a dataset and calculate stats and coerce types of field values.
 
 
**Returns**: <code>Promise.&lt;Object&gt;</code> - Promise for a dataset
 
| Param | Type | Description |
| --- | --- | --- |
| path | <code>String</code> | Absolute path to file |
| functions | <code>Object</code> | Named function registry |
| calculateStatsParams | <code>Object</code> | The `stats` object from params |
 
<a name="createDataset"></a>
 
## createDataset(data, fields, path) ⇒ <code>Object</code>
Create a dataset object from an array of objects
 
 
**Returns**: <code>Object</code> - dataset - {data, fields, path}
 
| Param | Type | Description |
| --- | --- | --- |
| data | <code>Object</code> | [{field: value, field2: value}, ...] |
| fields | <code>Array.&lt;String&gt;</code> | Field names |
| path | <code>String</code> |  |
 
<a name="_calculateStats"></a>
 
## _calculateStats(functions, calculateStatsParams, dataset) ⇒ <code>Object</code>
Calculate statistics (minval, maxval, avg etc.) for a dataset using a stats specification.
 
 
**Returns**: <code>Object</code> - stats
 
| Param | Type | Description |
| --- | --- | --- |
| functions | <code>Object</code> | Named function registry |
| calculateStatsParams | <code>Object</code> | The `stats` object from params |
| dataset | <code>Object</code> | As returned by loadDataset or from a previous transformation. |
 
<a name="calculateStats"></a>
 
## calculateStats(functions, calculateStatsParams, dataset) ⇒ <code>Object</code>
Calculate statistics and return a new dataset object with .stats set
 
 
**Returns**: <code>Object</code> - dataset
 
| Param | Type | Description |
| --- | --- | --- |
| functions | <code>Object</code> | Named function registry |
| calculateStatsParams | <code>Object</code> |  |
| dataset | <code>Object</code> |  |
 
<a name="castTypes"></a>
 
## castTypes(dataset) ⇒ <code>Object</code>
Having guessed types with calculateStats, cast all fields to the guessed types.
 
- This converts '1.1' to 1.1
- Enums of strings to their integer indices
- Date strings to Date objects
- String fields with high cardinality remain strings
 
 
**Returns**: <code>Object</code> - Dataset object with values cast to guessed types
 
| Param | Type | Description |
| --- | --- | --- |
| dataset | <code>Object</code> | Dataset object |
 
<a name="mapFields"></a>
 
## mapFields(functions, mapFieldsParams, dataset)
mapFields
 
Map input fields to output fields using mapping functions as specified in
mapFieldsParams
 
```js
{
   input: 'inFieldName',
   output: 'outFieldName',
   fn: 'linear',  // named function in functions registry
   args: [0, 1]   // parameters for linear mapping function
}
```

fn may be either a String key naming a function in the functions registry, or a function with the signature (stats, fieldName, ...args, value)

Param Type Description
functions Object Named function registry
mapFieldsParams Array.<Object>
dataset Object

makeMapFunction(functions, stats, mapParam) ⇒ function

makeMapFunction from mapParam

mapParam: .fn .args

Where fn is a Function or a String key to lookup Function in functions

Function should accept: (stats, fieldName, ...args, value)

Args is an optional array of params used to configure your mapping function, e.g. [minval, maxval]

This curries the function, calls it with (stats, fieldName, ...args), and returns the resulting mapping function, which accepts just a value and returns the mapped value.

Returns: function - any => any

Param Type Description
functions Object Named function registry
stats Object
mapParam Object

getRow(dataset, fields) ⇒ Object

Get a single row as an Object.

As this function is curried you can bake in dataset and fields:

getter = getRow(dataset, null);  // returns a function with first two args satisfied
getter(12);  // get row 12

Returns: Object - The object for this row.

Param Type Description
dataset Object
fields Array.<string> | null Optionally select just the fields you need. null selects all fields.

getCell(dataset, field, index) ⇒ mixed

Get a single data value (row, column)

As this function is curried you can bake in dataset and field:

 getter = getCell(dataset, 'sepalLength');
 getter(12);  // get value at row 12, field 'sepalLength'

Returns: mixed - The value for this cell.

Param Type Description
dataset Object
field String key of the field to select
index Number integer index of row

getColumn(dataset, field) ⇒ Array.<mixed>

Get all values for a column

As this function is curried you can bake in dataset:

 getter = getColumn(dataset);
 getter('sepalLength');  // get the array of values for the sepalLength field

Returns: Array.<mixed> - Array of values for this field

Param Type Description
dataset Object
field String key of the field to select

Package Sidebar

Install

npm i data-projector

Weekly Downloads

3

Version

0.2.0

License

MIT

Last publish

Collaborators

  • crucialfelix