Extract text and images from a PDF, and export its pages as images.
Written in pure JS
Usage
const fs = require('fs');
const pdf = require('pdf-text-img');

// TODO: point this at your own PDF file.
const yourPDF = 'pdf.pdf';
const dataBuffer = fs.readFileSync(yourPDF);

/**
 * Log a failed extraction instead of leaving the promise rejection unhandled.
 * @param {*} err - The rejection reason.
 */
function reportError(err) {
  console.error(err);
}

pdf.LoadingPDF(dataBuffer).then(function (pageCount) {
  // Page numbers start at 1 and run through pageCount inclusive.
  for (let i = 1; i <= pageCount; i++) {
    /**
     * Extract text
     */
    pdf.ExtractText(i).then(function (data) {
      console.log(data);
    }).catch(reportError);

    /**
     * Extract image
     */
    pdf.ExtractImg(i).then(function (data) {
      for (let j = 0; j < data.length; j++) {
        // width: data[j]['width']
        // height: data[j]['height']
        // Buffer.from is a factory function, so it is called without `new`.
        const imgBuffer = Buffer.from(data[j]['imgBase64'], 'base64');
        fs.writeFile(`a_${i}_${j}.jpg`, imgBuffer, function (err) {
          if (err) throw err;
        });
      }
    }).catch(reportError);

    /**
     * Export as picture
     * Note: this relies on the native canvas, so it needs to be used in an
     * H5 (HTML5) environment.
     */
    pdf.ExportImg(i).then(function (data) {
      // width: data['width']
      // height: data['height']
      const pageBuffer = Buffer.from(data['imgBase64'], 'base64');
      fs.writeFile(`b_${i}.jpg`, pageBuffer, function (err) {
        if (err) throw err;
      });
    }).catch(reportError);
  }
}).catch(reportError);
Note that the example above starts extraction for every page at once. For a large PDF (say, 1000 pages with 100 pictures on each page) this is an obvious waste of resources. In that case, extract the pictures one page at a time instead, as in the following "batch extraction of pictures" example:
const fs = require('fs');
const pdf = require('pdf-text-img');

// TODO: point this at your own PDF file.
const yourPDF = 'pdf.pdf';
const dataBuffer = fs.readFileSync(yourPDF);

/**
 * Download pictures one page at a time, starting at `startPage`.
 *
 * All pictures of a page are written to disk before the next page is
 * extracted, so only one page's pictures are in flight at any moment.
 * Once the last page has been handled, `pdf.End()` is called.
 *
 * @param {number} [startPage=1] - 1-based page number to start from.
 * @returns {Promise<void>} Settles when every remaining page is processed;
 *   rejects if an extraction or a file write fails.
 */
async function Batch(startPage) {
  // `== null` covers both undefined and null, like the original default check.
  const firstPage = startPage == null ? 1 : startPage;

  for (let page = firstPage; page <= pdf.pdfInfo.numPages; page++) {
    const data = await pdf.ExtractImg(page);

    // No picture on current page
    if (data.length === 0) {
      console.log('No picture on current page, extract next page...');
      continue;
    }

    // Start every write for this page, then wait for all of them to finish.
    // Array.from also accepts an array-like `data` (only length/index are used).
    const writes = Array.from(data, function (img, i) {
      const imgBuffer = Buffer.from(img['imgBase64'], 'base64');
      return fs.promises.writeFile(`pdf_${page}_${i}.jpg`, imgBuffer);
    });
    await Promise.all(writes);

    console.log('Continue extraction...');
  }

  console.log('Completed!');
  pdf.End();
}

pdf.LoadingPDF(dataBuffer).then(function (pageCount) {
  /**
   * batch extraction of pictures
   */
  return Batch();
}).catch(function (err) {
  // Surface load, extraction, and write failures instead of dropping them.
  console.error(err);
});