@ridi/pdf-parser
Common PDF data parser for Ridibooks services
Features
- [x] Structure parsing
- [ ] Read files
- [ ] Read cover page
- [x] Encryption and decryption support when parsing or reading
- [ ] Debug mode
- [ ] Environment
- [x] Node
- [ ] CLI
- [ ] Browser
- [ ] Online demo
Install
npm install @ridi/pdf-parser
Usage
Basic:
import { PdfParser } from '@ridi/pdf-parser';
// or const { PdfParser } = require('@ridi/pdf-parser');
const parser = new PdfParser('./foo/bar.pdf');
parser.parse().then((book) => {
...
});
parser.read().then((pdfFileBuffer) => {
...
});
with AesCryptor:
import { CryptoProvider, AesCryptor } from '@ridi/pdf-parser';
// or const { CryptoProvider, AesCryptor } = require('@ridi/pdf-parser');
const { Purpose } = CryptoProvider;
const { Mode, Padding } = AesCryptor;
class ContentCryptoProvider extends CryptoProvider {
constructor(key) {
super();
this.cryptor = new AesCryptor(Mode.ECB, { key });
}
getCryptor(filePath, purpose) {
return this.cryptor;
}
// If used as follows:
// const provider = new ContentCryptoProvider(...);
// const parser = new PdfParser('encrypted.pdf', provider);
// const book = await parser.parse();
// const pdfFileBuffer = await parser.read();
//
// It will be called as follows:
// 1. run(data, 'encrypted.pdf', Purpose.READ_IN_DIR)
// 2. run(data, 'encrypted.pdf', Purpose.READ_IN_DIR)
//
run(data, filePath, purpose) {
const cryptor = this.getCryptor(filePath, purpose);
if (purpose === Purpose.READ_IN_DIR) {
return cryptor.decrypt(data, { padding: Padding.AUTO });
}
return data;
}
}
const cryptoProvider = new ContentCryptoProvider(key);
const parser = new PdfParser('./foo/bar.pdf', cryptoProvider);
Log level setting:
import { LogLevel, ... } from '@ridi/pdf-parser';
const parser = new PdfParser(/* path */, /* cryptoProvider */, /* logLevel */)
// or const parser = new PdfParser(/* path */, /* logLevel */)
parser.logger.logLevel = LogLevel.VERBOSE; // SILENT, ERROR, WARN(default), INFO, DEBUG, VERBOSE
API
parse(parseOptions)
Returns Promise<PdfBook>
with:
- PdfBook: Instance with pages info.
Or throws an exception.
parseOptions: ?object
read()
Returns a Promise that resolves to the PDF file as a Buffer.
onProgress = callback(step, totalStep, action)
Reports the parser's progress through a callback.
const { Action } = PdfParser; // PARSE, READ_ITEMS
parser.onProgress = (step, totalStep, action) => {
console.log(`[${action}] ${step} / ${totalStep}`);
}
Model
PdfBook
- version: Version
- title: string
- author: string
- subject: string
- keywords: string
- creator: string
- producer: string
- creationDate: ?string
- modificationDate: ?string
- outlineItems: OutlineItem[]
- isLinearized: boolean
- isAcroFormPresent: boolean
- isXFAPresent: boolean
- isCollectionPresent: boolean
- userInfo: object
- pageCount: number
- permissions: Permissions
- toRaw(): object
Version
- major: number
- minor: number
- patch: number
- toString(): string
OutlineItem
- dest: ?string|*[]
- url: ?string
- title: string
- color: Color
- bold: boolean
- italic: boolean
- depth: number (Default: 0)
- children: OutlineItem[]
- page: ?number
- toRaw(): object
Color
- red: number
- green: number
- blue: number
- intValue: number (ex: 7237488)
- hexString: string (ex: '#6e6f70')
- rgbString: string (ex: 'rgb(110, 111, 112)')
- toRaw(): object
Permissions
- allowPrinting: boolean
- allowContentsModifying: boolean
- allowCopying: boolean
- allowAnnotationsModifying: boolean
- allowInteractiveFormsModifying: boolean
- allowCopyingForAccessibility: boolean
- allowAssembling: boolean
- allowHighQualityPrinting: boolean
- toRaw(): ?number[]
Parse Options
fakeWorker: boolean
Use a fake worker when running in a browser environment such as the Electron Renderer Process.
Default: false