pdf-text-img

Extract text and embedded images from a PDF, and export its pages as images.
Written in pure JavaScript.

Usage

const fs = require('fs');
const pdf = require('pdf-text-img');

// TODO: replace with the path to your own PDF
const yourPDF = 'pdf.pdf';
const dataBuffer = fs.readFileSync(yourPDF);

/**
 * Shared rejection handler: reports a failed step instead of letting it
 * surface as an unhandled promise rejection.
 */
function reportError(err) {
    console.error(err);
}

pdf.LoadingPDF(dataBuffer).then(function (pageCount) {
    // Pages are addressed from 1 up to and including pageCount.
    for (let i = 1; i <= pageCount; i++) {
        /**
         * Extract text
         */
        pdf.ExtractText(i).then(function (data) {
            console.log(data);
        }).catch(reportError);

        /**
         * Extract image
         */
        pdf.ExtractImg(i).then(function (data) {
            for (let j = 0; j < data.length; j++) {
                // width: data[j]['width']
                // height: data[j]['height']

                // Buffer.from is a static factory, so it is called without `new`.
                const imgBuffer = Buffer.from(data[j]['imgBase64'], 'base64');

                // One file per picture: a_<page>_<index>.jpg
                fs.writeFile('a_' + i + '_' + j + '.jpg', imgBuffer, function (err) {
                    if (err) throw err;
                });
            }
        }).catch(reportError);

        /**
         * Export as picture
         * Note: this relies on the native canvas, so it has to run in an
         * H5 (HTML5) environment.
         */
        pdf.ExportImg(i).then(function (data) {
            // width: data['width']
            // height: data['height']

            const pageBuffer = Buffer.from(data['imgBase64'], 'base64');

            // One file per page: b_<page>.jpg
            fs.writeFile('b_' + i + '.jpg', pageBuffer, function (err) {
                if (err) throw err;
            });
        }).catch(reportError);
    }
}).catch(reportError);

Note that the example above starts the extraction of every page at once. For a PDF with 1000 pages and 100 pictures per page, that is obviously a waste of resources. For such files, use the following "batch extraction of pictures" logic instead, which handles one page at a time:

const fs = require('fs');
const pdf = require('pdf-text-img');

// TODO: replace with the path to your own PDF
const yourPDF = 'pdf.pdf';
// The whole file is read up front; the resulting buffer is handed to LoadingPDF below.
const dataBuffer = fs.readFileSync(yourPDF);

/**
 * Extract the pictures of the PDF one page at a time, starting at
 * `startPage`, and write each picture to `pdf_<page>_<index>.jpg`.
 *
 * All pictures of a page are written before the next page is extracted.
 * Once every page up to `pdf.pdfInfo.numPages` has been handled,
 * `pdf.End()` is called.
 *
 * @param {number} [startPage=1] - Page to start from; `undefined` or `null`
 *     means the first page.
 * @returns {Promise<void>} Resolves after the last page has been processed.
 *     Rejects if extracting a page or writing a file fails.
 */
async function Batch(startPage) {
    // `== null` matches both undefined and null, like the original `== undefined` check.
    const firstPage = startPage == null ? 1 : startPage;

    for (let page = firstPage; page <= pdf.pdfInfo.numPages; page++) {
        const data = await pdf.ExtractImg(page);

        // No picture on current page
        if (data.length === 0) {
            console.log('No picture on current page, extract next page...');
            continue;
        }

        // Write every picture of this page, and only move on once all writes have finished.
        await Promise.all(data.map(function (img, i) {
            const imgBuffer = Buffer.from(img['imgBase64'], 'base64');

            // Adapt the callback-style fs.writeFile to a promise so it can be awaited.
            return new Promise(function (resolve, reject) {
                fs.writeFile('pdf_' + page + '_' + i + '.jpg', imgBuffer, function (err) {
                    if (err) {
                        reject(err);
                    } else {
                        resolve();
                    }
                });
            });
        }));

        console.log('Continue extraction...');
    }

    console.log('Completed!');
    pdf.End();
}

pdf.LoadingPDF(dataBuffer)
    .then(function () {
        /**
         * batch extraction of pictures
         *
         * Batch is an async function, so returning its promise chains it
         * into the catch handler below instead of leaving it floating.
         */
        return Batch();
    })
    .catch(function (err) {
        // Report a failed load or extraction instead of an unhandled rejection.
        console.error(err);
    });

Questions and feedback are welcome by email: yfilemail@163.com

pdf-text-img

pdf-text-img

Usage

It is worth noting that if a PDF has 1000 pages and each page has 100 pictures, it is obviously a waste of resources according to the above logic. Therefore, we can refer to the following logic of "batch extraction of pictures":

Readme

Keywords

Package Sidebar

Install

Weekly Downloads

Version

License

Unpacked Size

Total Files

Last publish

Collaborators

pdf-text-img

pdf-text-img

Usage

It is worth noting that if a PDF has 1000 pages and each page has 100 pictures, it is obviously a waste of resources according to the above logic. Therefore, we can refer to the following logic of "batch extraction of pictures":

Readme

Keywords

Package Sidebar

Install

Weekly Downloads

Version

License

Unpacked Size

Total Files

Last publish

Collaborators

Weekly Downloads