@ridi/epub-parser
Common EPUB2 data parser for Ridibooks services
Features
- [x] EPUB2 parsing
- [ ] EPUB3 parsing
- [x] Package validation with option
- [x] Unzip epub file when parsing with options
- [x] Read files
- [x] Extract inner HTML of body in Spine with option
- [x] Change base path of Spine, CSS and Inline style with option
- [x] Customize CSS, Inline Style with options
- [ ] Truncate inner HTML of body in Spine with options
- [ ] Minify HTML, CSS, Inline Style with options
- [x] Encrypt and decrypt function when parsing or reading or unzipping
- [ ] More spec
- [ ] encryption.xml
- [ ] manifest.xml
- [ ] metadata.xml
- [ ] rights.xml
- [ ] signatures.xml
- [ ] Debug mode
- [ ] Environment
- [x] Node
- [ ] CLI
- [ ] Browser
- [ ] Online demo
Install
npm install @ridi/epub-parser
Usage
Basic:
import { EpubParser } from '@ridi/epub-parser';
// or const { EpubParser } = require('@ridi/epub-parser');
const parser = new EpubParser('./foo/bar.epub' or './unzippedPath');
parser.parse(/* { parseOptions } */).then((book) => {
parser.readItems(book.spines/*, { readOptions } */).then((results) => {
...
});
...
});
with AesCryptor:
import { CryptoProvider, AesCryptor } from '@ridi/epub-parser';
// or const { CryptoProvider, AesCryptor } = require('@ridi/epub-parser');
const { Purpose } = CryptoProvider;
const { Mode, Padding } = AesCryptor;
class ContentCryptoProvider extends CryptoProvider {
constructor(key) {
super();
this.cryptor = new AesCryptor(Mode.ECB, { key });
}
getCryptor(filePath, purpose) {
return this.cryptor;
}
// If use as follows:
// const provider = new ContentCryptoProvider(...);
// const parser = new EpubParser('encrypted.epub', provider);
// const book = await parser.parse({ unzipPath: ... });
// const firstSpine = await parser.readItem(book.spines[0]);
//
// It will be called as follows:
// 1. run(data, 'encrypted.epub', Purpose.READ_IN_DIR)
// 2. run(data, 'META-INF/container.xml', Purpose.READ_IN_ZIP)
// 3. run(data, 'OEBPS/content.opf', Purpose.READ_IN_ZIP)
// ...
// 4. run(data, 'mimetype', Purpose.WRITE)
// ...
// 5. run(data, 'OEBPS/Text/Section0001.xhtml', Purpose.READ_IN_DIR)
//
run(data, filePath, purpose) {
const cryptor = this.getCryptor(filePath, purpose);
const padding = Padding.AUTO;
if (purpose === Purpose.READ_IN_DIR) {
return cryptor.decrypt(data, { padding });
} else if (purpose === Purpose.WRITE) {
return cryptor.encrypt(data, { padding });
}
return data;
}
}
const cryptoProvider = new ContentCryptoProvider(key);
const parser = new EpubParser('./encrypted.epub' or './unzippedPath', cryptoProvider);
Log level setting:
import { LogLevel, ... } from '@ridi/epub-parser';
const parser = new EpubParser(/* path */, /* cryptoProvider */, /* logLevel */)
// or const parser = new EpubParser(/* path */, /* logLevel */)
parser.logger.logLevel = LogLevel.VERBOSE; // SILENT, ERROR, WARN(default), INFO, DEBUG, VERBOSE
API
parse(parseOptions)
Returns Promise<EpubBook>
with:
- EpubBook: Instance with metadata, spine list, table of contents, etc.
Or throw exception.
parseOptions: ?object
readItem(item, readOptions)
Returns string or Buffer in Promise with:
- SpineItem, CssItem, InlineCssItem, NcxItem, SvgItem: string
- Other items: Buffer
Or throw exception.
item: Item (see: Item Types)
readOptions: ?object
readItems(items, readOptions)
Returns string[] or Buffer[] in Promise with:
- SpineItem, CssItem, InlineCssItem, NcxItem, SvgItem: string[]
- Other items: Buffer[]
Or throw exception.
items: Item[] (see: Item Types)
readOptions: ?object
unzip(unzipPath, overwrite)
Returns Promise<boolean>
with:
- If result is true, unzip is successful or has already been unzipped.
Or throw exception.
unzipPath: string
overwrite: boolean
onProgress = callback(step, totalStep, action)
Tells the progress of parser through callback.
const { Action } = EpubParser; // PARSE, READ_ITEMS
parser.onProgress = (step, totalStep, action) => {
console.log(`[${action}] ${step} / ${totalStep}`);
}
Model
EpubBook
- titles: string[]
- creators: Author[]
- subjects: string[]
- description: ?string
- publisher: ?string
- contributors: Author[]
- dates: DateTime[]
- type: ?string
- format: ?string
- identifiers: Identifier[]
- source: ?string
- languages: string[]
- relation: ?string
- rights: ?string
- version: Version
- metas: Meta[]
- items: Item[]
- spines: SpineItem[]
- ncx: ?NcxItem
- fonts: FontItem[]
- cover: ?ImageItem
- images: ImageItem[]
- styles: CssItem[]
- guides: Guide[]
- deadItems: DeadItem[]
- toRaw(): object
Author
- name: ?string
- fileAs: ?string
- role: string (Default: Author.Roles.UNDEFINED)
- toRaw(): object
Author.Roles
Type | Value |
---|---|
UNDEFINED | undefined |
UNKNOWN | unknown |
ADAPTER | adp |
ANNOTATOR | ann |
ARRANGER | arr |
ARTIST | art |
ASSOCIATEDNAME | asn |
AUTHOR | aut |
AUTHOR_IN_QUOTATIONS_OR_TEXT_EXTRACTS | aqt |
AUTHOR_OF_AFTER_WORD_OR_COLOPHON_OR_ETC | aft |
AUTHOR_OF_INTRODUCTIONOR_ETC | aui |
BIBLIOGRAPHIC_ANTECEDENT | ant |
BOOK_PRODUCER | bkp |
COLLABORATOR | clb |
COMMENTATOR | cmm |
DESIGNER | dsr |
EDITOR | edt |
ILLUSTRATOR | ill |
LYRICIST | lyr |
METADATA_CONTACT | mdc |
MUSICIAN | mus |
NARRATOR | nrt |
OTHER | oth |
PHOTOGRAPHER | pht |
PRINTER | prt |
REDACTOR | red |
REVIEWER | rev |
SPONSOR | spn |
THESIS_ADVISOR | ths |
TRANSCRIBER | trc |
TRANSLATOR | trl |
DateTime
- value: ?string
- event: string (Default: DateTime.Events.UNDEFINED)
- toRaw(): object
DateTime.Events
Type | Value |
---|---|
UNDEFINED | undefined |
UNKNOWN | unknown |
CREATION | creation |
MODIFICATION | modification |
PUBLICATION | publication |
Identifier
- value: ?string
- scheme: string (Default: Identifier.Schemes.UNDEFINED)
- toRaw(): object
Identifier.Schemes
Type | Value |
---|---|
UNDEFINED | undefined |
UNKNOWN | unknown |
DOI | doi |
ISBN | isbn |
ISBN13 | isbn13 |
ISBN10 | isbn10 |
ISSN | issn |
UUID | uuid |
URI | uri |
Meta
- name: ?string
- content: ?string
- toRaw(): object
Guide
- title: ?string
- type: string (Default: Guide.Types.UNDEFINED)
- href: ?string
- item: ?Item
- toRaw(): object
Guide.Types
Type | Value |
---|---|
UNDEFINED | undefined |
UNKNOWN | unknown |
COVER | cover |
TITLE_PAGE | title-page |
TOC | toc |
INDEX | index |
GLOSSARY | glossary |
ACKNOWLEDGEMENTS | acknowledgements |
BIBLIOGRAPHY | bibliography |
COLOPHON | colophon |
COPYRIGHT_PAGE | copyright-page |
DEDICATION | dedication |
EPIGRAPH | epigraph |
FOREWORD | foreword |
LOI | loi |
LOT | lot |
NOTES | notes |
PREFACE | preface |
TEXT | text |
Item Types
Item
- id: ?string
- href: ?string
- mediaType: ?string
- size: ?number
- isFileExists: boolean (size !== undefined)
- toRaw(): object
SpineItem (extend Item)
- index: number (Default: undefined)
- isLinear: boolean (Default: true)
- styles: ?CssItem[]
- first: ?SpineItem
- prev: ?SpineItem
- next: ?SpineItem
NcxItem (extend Item)
- navPoints: NavPoint[]
CssItem (extend Item)
- namespace: string
InlineCssItem (extend CssItem)
- style: string (Default: '')
ImageItem (extend Item)
- isCover: boolean (Default: false)
SvgItem (extend ImageItem)
FontItem (extend Item)
DeadItem (extend Item)
- reason: string (Default: DeadItem.Reason.UNDEFINED)
DeadItem.Reason
Type | Value |
---|---|
UNDEFINED | undefined |
UNKNOWN | unknown |
NOT_EXISTS | not_exists |
NOT_SPINE | not_spine |
NOT_NCX | not_ncx |
NOT_SUPPORT_TYPE | not_support_type |
NavPoint
- id: ?string
- label: ?string
- src: ?string
- anchor: ?string
- depth: number (Default: 0)
- children: NavPoint[]
- spine: ?SpineItem
- toRaw(): object
Version
- major: number
- minor: number
- patch: number
- toString(): string
Parse Options
- validatePackage
- allowNcxFileMissing
- unzipPath
- overwrite
- parseStyle
- styleNamespacePrefix
- additionalInlineStyle
boolean
validatePackage: If true, validate the package against the IDPF specifications listed below.
Used only if input is an EPUB file.
- Zip header should not be corrupted.
- mimetype file must be the first file in the archive.
- mimetype file should not be compressed.
- mimetype file should only contain the string application/epub+zip.
- Should not use the extra field feature of the ZIP format for the mimetype file.
Default: false
boolean
allowNcxFileMissing: If false, stop parsing when the NCX file does not exist.
Default: true
?string
unzipPath: If specified, unzip to that path.
Used only if input is an EPUB file.
Default: undefined
boolean
overwrite: If true, overwrite the contents of unzipPath when unzipping.
Used only if unzipPath is specified.
Default: true
boolean
parseStyle: If true, the styles used by each spine are described, and one namespace is given per CSS file or inline style.
Otherwise, CssItem.namespace and SpineItem.styles are undefined.
In any list, InlineCssItem is always positioned after CssItem. (EpubBook.styles, EpubBook.items, SpineItem.styles, ...)
Default: true
string
styleNamespacePrefix: Prepend given string to namespace for identification.
only available if parseStyle is true.
Default: 'ridi_style'
?string
additionalInlineStyle: If specified, add the given inline style to all spines.
only available if parseStyle is true.
Default: undefined
Read Options
- force
- basePath
- extractBody
- serializedAnchor
- ignoreScript
- removeAtrules
- removeTagSelector
- removeIdSelector
- removeClassSelector
force: boolean
If true, ignore any exceptions that occur within parser.
Default: false
?string
basePath: If specified, change base path of paths used by spine and css.
HTML: SpineItem
...
<!-- Before -->
<div>
<img src="../Images/cover.jpg">
</div>
<!-- After -->
<div>
<img src="{basePath}/OEBPS/Images/cover.jpg">
</div>
...
CSS: CssItem, InlineCssItem
/* Before */
@font-face {
font-family: NotoSansRegular;
src: url("../Fonts/NotoSans-Regular.ttf");
}
/* After */
@font-face {
font-family: NotoSansRegular;
src: url("{basePath}/OEBPS/Fonts/NotoSans-Regular.ttf");
}
Default: undefined
boolean|function
extractBody: If true, extract the body. Otherwise, return the full string. If a function is specified instead of true, that function is used to transform the body.
false
:
'<!doctype><html>\n<head>\n</head>\n<body style="background-color: #000000;">\n <p>Extract style</p>\n <img src=\"../Images/api-map.jpg\"/>\n</body>\n</html>'
true
:
'<body style="background-color: #000000;">\n <p>Extract style</p>\n <img src=\"../Images/api-map.jpg\"/>\n</body>'
function
:
readOptions.extractBody = (innerHTML, attrs) => {
const string = attrs.map((attr) => {
return ` ${attr.key}=\"${attr.value}\"`;
}).join(' ');
return `<article ${string}>${innerHTML}</article>`;
};
'<article style="background-color: #000000;">\n <p>Extract style</p>\n <img src=\"../Images/api-map.jpg\"/>\n</article>'
Default: false
boolean
serializedAnchor: If true, replace file path of anchor in spine with spine index.
...
<spine toc="ncx">
<itemref idref="Section0001.xhtml"/> <!-- index: 0 -->
<itemref idref="Section0002.xhtml"/> <!-- index: 1 -->
<itemref idref="Section0003.xhtml"/> <!-- index: 2 -->
...
</spine>
...
<!-- Before -->
<a href="./Text/Section0002.xhtml#title">Chapter 2</a>
<!-- After -->
<a href="1#title">Chapter 2</a>
Default: false
boolean
ignoreScript: Ignore all scripts from within HTML.
Default: false
string[]
removeAtrules: Remove at-rules.
Default: []
string[]
removeTagSelector: Remove selector that point to specified tags.
Default: []
string[]
removeIdSelector: Remove selector that point to specified ids.
Default: []
string[]
removeClassSelector: Remove selector that point to specified classes.
Default: []