How to Run OCR on Scanned Documents with Tesseract.js
Paper documents, such as invoices, checks and printed books, are still ubiquitous in our everyday work, and it takes effort to manage them.
With Dynamic Web TWAIN, we can build a web application to scan documents from document scanners. But further steps are required to better manage the documents, and OCR is an important part of the process. Here is how OCR can help with document management:
- Save time for manual data entry.
- Save disk space by digitizing the document images.
- Full-text search can be used to find useful data easily.
- Use the text for further editing, translating, etc.
In this article, we are going to talk about how to use Tesseract.js to run OCR on documents scanned with Dynamic Web TWAIN.
New Project
Clone a webpack starter project as the template for starting a new project:
git clone https://github.com/wbkd/webpack-starter
Install Dependencies
-
Install Dynamic Web TWAIN.
npm install dwt
In addition, we need to copy the resources of Dynamic Web TWAIN to the public folder.
-
Install ncp.
npm install --save-dev ncp
-
Modify package.json to copy the resources for the build and start commands.
"scripts": {
  "lint": "npm run lint:styles; npm run lint:scripts",
  "lint:styles": "stylelint src",
  "lint:scripts": "eslint src",
- "build": "cross-env NODE_ENV=production webpack --config webpack/webpack.config.prod.js",
- "start": "webpack serve --config webpack/webpack.config.dev.js"
+ "build": "ncp node_modules/dwt/dist public/dwt-resources && cross-env NODE_ENV=production webpack --config webpack/webpack.config.prod.js",
+ "start": "ncp node_modules/dwt/dist public/dwt-resources && webpack serve --config webpack/webpack.config.dev.js"
},
-
Modify webpack.common.js to copy the files in the public folder to the output folder instead of the public folder inside the output folder.
new CopyWebpackPlugin({
- patterns: [{ from: Path.resolve(__dirname, '../public'), to: 'public' }],
+ patterns: [{ from: Path.resolve(__dirname, '../public'), to: '' }],
}),
-
-
Install Tesseract.js.
npm install tesseract.js
Use Dynamic Web TWAIN to Scan Documents
-
In the HTML file, add a container for Dynamic Web TWAIN and several buttons to use it.
<h2>Web TWAIN + Tesseract OCR Demo</h2>
<div class="app">
  <!-- Scanner buttons plus the container the Dynamic Web TWAIN viewer is bound to -->
  <div class="document-scanner">
    <div>Document Scanner:</div>
    <button class="scan-btn">Scan</button>
    <button class="edit-btn">Edit</button>
    <button class="load-btn">Load Files</button>
    <div id="dwtcontrolcontainer"></div>
  </div>
</div>
-
In the JavaScript, initialize Dynamic Web TWAIN after the page is loaded. Here, we bind the viewer for documents to the
dwtcontrolcontainer
container.import Dynamsoft from "dwt"; let DWObject; window.onload = function(){ initDWT(); }; function initDWT(){ const containerID = "dwtcontrolcontainer"; Dynamsoft.DWT.RegisterEvent('OnWebTwainReady', () => { DWObject = Dynamsoft.DWT.GetWebTwain(containerID); DWObject.Viewer.width = "100%"; DWObject.Viewer.height = "100%"; }); Dynamsoft.DWT.ResourcesPath = "/dwt-resources"; Dynamsoft.DWT.Containers = [{ WebTwainId: 'dwtObject', ContainerId: containerID }]; Dynamsoft.DWT.Load(); }
-
Register the event for the scan button. It will show a list of scanners and let the user choose one of them to perform scanning.
// Scan button: let the user pick a source, then open it and acquire an image.
document.getElementsByClassName("scan-btn")[0].addEventListener("click", () => {
  // Nothing to do until Dynamic Web TWAIN has produced its object.
  if (!DWObject) {
    return;
  }

  const onSourceSelected = () => {
    DWObject.OpenSource();
    DWObject.AcquireImage();
  };
  const onSelectionFailed = () => {
    console.log("SelectSource failed!");
  };

  DWObject.SelectSource(onSourceSelected, onSelectionFailed);
});
-
Register the event for the load button. It allows loading local PDF and image files.
// Load button: open the file dialog and load local PDF and image files.
document.getElementsByClassName("load-btn")[0].addEventListener("click", () => {
  if (!DWObject) {
    return;
  }

  DWObject.IfShowFileDialog = true;

  // PDF Rasterizer Addon is used here to ensure PDF support
  const pdfAddon = DWObject.Addon.PDF;
  pdfAddon.SetResolution(200);
  pdfAddon.SetConvertMode(Dynamsoft.DWT.EnumDWT_ConvertMode.CM_RENDERALL);

  DWObject.LoadImageEx("", Dynamsoft.DWT.EnumDWT_ImageType.IT_ALL);
});
-
Register the event for the edit button to show an image editor.
// Edit button: create an image editor from the viewer and show it.
document.getElementsByClassName("edit-btn")[0].addEventListener("click", () => {
  if (!DWObject) {
    return;
  }
  DWObject.Viewer.createImageEditor().show();
});
Screenshot of the demo:
Use Tesseract.js for OCR
-
In the HTML file, add several elements for using Tesseract-OCR.
<!-- OCR controls, status line and the element that receives the recognized text -->
<div class="ocr">
  <span>Tesseract:</span>
  <span id="status"></span>
  <div>
    <button class="ocr-btn">OCR Selected</button>
    <button class="batch-ocr-btn">OCR All</button>
    <label for="skip-processed-chk">Skip processed
      <input id="skip-processed-chk" type="checkbox"/>
    </label>
    <button class="download-text-btn">Download Text</button>
  </div>
  <div class="text"></div>
</div>
-
Initialize Tesseract-OCR after the page is loaded.
import { createWorker } from 'tesseract.js';

// Tesseract worker. It is assigned only after initialization has finished, so
// a truthy value means the worker is ready for recognize() calls.
let worker;

window.onload = function () {
  initDWT();
  initTesseract();
};

/**
 * Creates the Tesseract worker, loads and initializes the English model, and
 * reports each step in the #status element.
 *
 * The module-level `worker` is published only once every step has succeeded.
 * On failure the error is logged, the status element shows a failure message
 * and `worker` stays undefined; the returned promise still resolves.
 *
 * NOTE(review): loadLanguage()/initialize() belong to older Tesseract.js
 * releases — confirm against the installed version.
 *
 * @returns {Promise<void>}
 */
async function initTesseract() {
  const status = document.getElementById("status");
  try {
    status.innerText = "Loading tesseract core...";
    const pendingWorker = await createWorker({
      logger: (message) => console.log(message),
    });

    status.innerText = "Loading language model...";
    await pendingWorker.loadLanguage('eng');

    status.innerText = "Initializing...";
    await pendingWorker.initialize('eng');

    // Publish the worker last so callers never see a half-initialized one.
    worker = pendingWorker;
    status.innerText = "Ready";
  } catch (error) {
    console.error("Failed to initialize Tesseract", error);
    status.innerText = "Failed to initialize Tesseract";
  }
}
-
Add a function to convert one page in Dynamic Web TWAIN’s buffer to Blob for Tesseract to recognize the text in it.
/**
 * Converts one page of the Dynamic Web TWAIN buffer to a JPEG blob and runs
 * Tesseract recognition on it.
 *
 * @param {number} index - Index of the page in the Dynamic Web TWAIN buffer.
 * @returns {Promise<object>} Resolves with the value returned by
 *   `worker.recognize`.
 *   Rejects with the string "Not initialized" when `DWObject` is not set,
 *   with the error string reported by `ConvertToBlob` when the conversion
 *   fails, or with whatever `worker.recognize` throws. The first two
 *   rejection reasons stay plain strings so existing callers see the same
 *   values as before.
 */
function OCROneImage(index) {
  return new Promise((resolve, reject) => {
    if (!DWObject) {
      reject("Not initialized");
      return;
    }

    const onBlobReady = async (blob) => {
      // Forward recognition failures to the outer promise; without this the
      // caller would wait forever and the rejection would go unhandled.
      try {
        resolve(await worker.recognize(blob));
      } catch (error) {
        reject(error);
      }
    };
    const onBlobFailed = (errorCode, errorString) => {
      reject(errorString);
    };

    DWObject.ConvertToBlob(
      [index],
      Dynamsoft.DWT.EnumDWT_ImageType.IT_JPG,
      onBlobReady,
      onBlobFailed,
    );
  });
}
-
Register events for the
OCR Selected
button and OCR All
button. If the page has already been processed and the Skip processed
checkbox is checked, skip the page. The results are saved in the resultsDict
object with the page’s image ID as the key.let resultsDict = {}; document.getElementsByClassName("ocr-btn")[0].addEventListener("click",function(){ OCRSelected(); }); document.getElementsByClassName("batch-ocr-btn")[0].addEventListener("click",function(){ BatchOCR(); }); async function OCRSelected(){ if (DWObject && worker) { const index = DWObject.CurrentImageIndexInBuffer; const skipProcessed = document.getElementById("skip-processed-chk").checked; const ImageID = DWObject.IndexToImageID(index); if (skipProcessed) { if (resultsDict[ImageID]) { console.log("Processed"); return; } } const status = document.getElementById("status"); status.innerText = "Recognizing..."; const data = await OCROneImage(index); resultsDict[ImageID] = data; status.innerText = "Done"; showTextOfPage(ImageID); } } async function BatchOCR(){ if (DWObject && worker) { const skipProcessed = document.getElementById("skip-processed-chk").checked; const status = document.getElementById("status"); for (let index = 0; index < DWObject.HowManyImagesInBuffer; index++) { const ImageID = DWObject.IndexToImageID(index); if (skipProcessed) { if (resultsDict[ImageID]) { console.log("Processed"); continue; } } status.innerText = "Recognizing page "+(index+1)+"..."; const data = await OCROneImage(index); resultsDict[ImageID] = data; } status.innerText = "Done"; } }
-
Register the
OnBufferChanged
event of Dynamic Web TWAIN. It can be triggered when the selected pages are changed. If the selected page changes, then display the extracted text of that page.DWObject.RegisterEvent('OnBufferChanged',function (bufferChangeInfo) { const selectedIds = bufferChangeInfo["selectedIds"]; if (selectedIds.length === 1) { showTextOfPage(selectedIds[0]); } }); function showTextOfPage(ImageID){ if (resultsDict[ImageID]) { const text = resultsDict[ImageID].data.text; document.getElementsByClassName("text")[0].innerText = text; }else{ document.getElementsByClassName("text")[0].innerText = ""; } }
-
Add a
DownloadText
function to download the text file of the extracted text of the scanned pages.function DownloadText(){ if (DWObject) { let text = ""; for (let index = 0; index < DWObject.HowManyImagesInBuffer; index++) { const ImageID = DWObject.IndexToImageID(index); if (resultsDict[ImageID]) { text = text + resultsDict[ImageID].data.text; } text = text + "\n\n=== "+ "Page "+ (index+1) +" ===\n\n"; } let filename = 'text.txt'; let link = document.createElement('a'); link.style.display = 'none'; link.setAttribute('target', '_blank'); link.setAttribute('href', 'data:text/plain;charset=utf-8,' + encodeURIComponent(text)); link.setAttribute('download', filename); document.body.appendChild(link); link.click(); document.body.removeChild(link); } }
All right, we can now use Tesseract-OCR to extract the text of documents scanned with Dynamic Web TWAIN.
Screenshot of the final result:
Source Code
Check out the code of the demo to have a try: