Source code for pypdfocr.pypdfocr_tesseract

#!/usr/bin/env python2.7

# Copyright 2013 Virantha Ekanayake All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
   Run Tesseract to generate hocr file
"""

import os, sys
import logging
import subprocess
import glob
from subprocess import CalledProcessError

from multiprocessing import Pool
from pypdfocr_interrupts import init_worker

def error(text):
    """Report a fatal problem on stdout and abort the process.

    Writes ``ERROR: <text>`` followed by a newline, then raises
    ``SystemExit`` with exit status -1 (exactly what ``sys.exit(-1)`` does).
    """
    message = "ERROR: %s" % text
    sys.stdout.write(message + "\n")
    raise SystemExit(-1)


# Ugly hack to pass in object method to the multiprocessing library
# (the link to where this trick came from is missing from this copy of the file)
# Basically gets passed in a pair of (self, arg), and calls the method
def unwrap_self(arg, **kwarg):
    """Module-level trampoline so a PyTesseract method can be used with a Pool.

    ``arg`` is unpacked positionally into ``PyTesseract.make_hocr_from_pnm``;
    the callers in this file pass ``(instance, image_filename)`` pairs.
    Any keyword arguments are forwarded unchanged, and the method's return
    value is handed back as-is.
    """
    target = PyTesseract.make_hocr_from_pnm
    return target(*arg, **kwarg)
class PyTesseract(object):
    """Class to wrap all the tesseract calls"""

    def __init__(self, config):
        """Detect windows tesseract location.

        :param config: dict-like settings. ``threads`` (default 4) sets the
            size of the worker pool; ``binary``, when present, overrides the
            tesseract executable used to build the shell commands.
        """
        self.lang = 'eng'
        self.required = "3.02.02"
        self.threads = config.get('threads', 4)

        if "binary" in config:  # Override location of binary
            binary = config['binary']
            if os.name == 'nt':
                # Quote the path and double the backslashes, since the value
                # is later interpolated into a shell command string.
                binary = '"%s"' % binary
                binary = binary.replace("\\", "\\\\")
            logging.info("Setting location for tesseract executable to %s" % (binary))
        else:
            # Explicit str here to get around some MagicMock stuff for testing
            # that I don't quite understand
            if str(os.name) == 'nt':
                binary = '"c:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe"'
            else:
                binary = "tesseract"

        self.binary = binary

        self.msgs = {
            'TS_MISSING': """
                Could not execute %s
                Please make sure you have Tesseract installed correctly
                """ % self.binary,
            'TS_VERSION': 'Tesseract version is too old',
            'TS_img_MISSING': 'Cannot find specified tiff file',
            'TS_FAILED': 'Tesseract-OCR execution failed!',
        }

    @staticmethod
    def _parse_version(ver_str):
        """Turn a dotted version string into a list of ints.

        A leading ``v`` is ignored. Parsing stops at the first component
        that is not purely numeric; the leading digits of that component (if
        any) are still used. So ``3.02.02dev`` -> ``[3, 2, 2]`` and
        ``4.0.0-beta.1`` -> ``[4, 0, 0]``. Returns ``[]`` when no numeric
        prefix exists.
        """
        numbers = []
        for piece in ver_str.lstrip('v').split('.'):
            digits = ''
            for ch in piece:
                if not ch.isdigit():
                    break
                digits += ch
            if not digits:
                break
            numbers.append(int(digits))
            if len(digits) != len(piece):
                # A non-numeric suffix ends the numeric part of the version.
                break
        return numbers

    def _is_version_uptodate(self):
        """Make sure the version is current

        Runs ``<binary> -v`` through the shell and compares the reported
        version against ``self.required``.

        :returns: ``(version_good, ver_str)`` where ``ver_str`` is the version
            token found in the output (a trailing ``dev`` removed), or
            ``'0.0.0'`` when none was found.
        """
        logging.info("Checking tesseract version")
        cmd = '%s -v' % (self.binary)
        ret_output = ''
        try:
            ret_output = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT)
        except CalledProcessError:
            # Could not run tesseract
            error(self.msgs['TS_MISSING'])

        # On Python 3 check_output returns bytes; the parsing below needs text.
        if not isinstance(ret_output, str):
            ret_output = ret_output.decode('utf-8', 'replace')

        ver_str = '0.0.0'
        for line in ret_output.splitlines():
            if 'tesseract' in line:
                fields = line.split(' ')
                if len(fields) > 1:
                    ver_str = fields[1]
        if ver_str.endswith('dev'):
            # Fix for version strings that end in 'dev'
            ver_str = ver_str[:-3]

        ver = self._parse_version(ver_str)
        req = self._parse_version(self.required)

        # Aargh, in windows 3.02.02 is reported as version 3.02
        if str(os.name) == 'nt':
            req = req[:2]

        # List comparison is lexicographic and treats a strict prefix as
        # smaller, which gives exactly the ordering wanted here:
        #   3.02      < 3.02.02   (missing minor component)
        #   3.02.02  == 3.02.02
        #   3.03.02   > 3.02.02,  4.0 > 3.02.02
        #   3.02.02.1 > 3.02.02   (extra trailing component)
        version_good = ver >= req

        return version_good, ver_str

    def _warn(self, msg):  # pragma: no cover
        """Print a non-fatal warning to stdout."""
        print("WARNING: %s" % msg)

    def make_hocr_from_pnms(self, fns):
        """Run OCR over several image files in parallel.

        Checks the tesseract version first and aborts through ``error`` when
        it is too old. The images are then processed by a pool of
        ``self.threads`` worker processes.

        :param fns: sequence of image filenames.
        :returns: list of ``(image_filename, hocr_filename)`` pairs in the
            order of ``fns``.
        """
        uptodate, ver = self._is_version_uptodate()
        if not uptodate:
            error(self.msgs['TS_VERSION'] + " (found %s, required %s)" % (ver, self.required))

        logging.debug("Making pool for tesseract")
        pool = Pool(processes=self.threads, initializer=init_worker)

        try:
            # Each work item is a (self, filename) pair that the module-level
            # unwrap_self helper unpacks into a method call.
            hocr_filenames = pool.map(unwrap_self, zip([self] * len(fns), fns))
            pool.close()
        except KeyboardInterrupt:
            print("Caught keyboard interrupt... terminating")
            pool.terminate()
            raise
        except Exception:
            # The pool must be closed or terminated before join() is called,
            # so stop the workers on any failure, not just Ctrl-C.
            pool.terminate()
            raise
        finally:
            pool.join()

        return list(zip(fns, hocr_filenames))

    def make_hocr_from_pnm(self, img_filename):
        """Run tesseract on one image and return the hocr file it produced.

        A non-zero exit from tesseract only prints its output and a warning;
        the result is decided by which output file exists afterwards.

        :param img_filename: path of the image to OCR.
        :returns: ``<basename>.html`` if that file exists, otherwise
            ``<basename>.hocr``. Calls ``error`` when the image is missing or
            neither output file exists.
        """
        basename = os.path.splitext(img_filename)[0]
        hocr_filename = "%s.html" % basename

        if not os.path.exists(img_filename):
            error(self.msgs['TS_img_MISSING'] + " %s" % (img_filename))

        logging.info("Running OCR on %s to create %s.html" % (img_filename, basename))
        cmd = '%s "%s" "%s" -psm 1 -c hocr_font_info=1 -l %s hocr' % (self.binary, img_filename, basename, self.lang)
        try:
            # Output is captured only so it can be shown on failure.
            subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as e:
            # Could not run tesseract
            print(e.output)
            self._warn(self.msgs['TS_FAILED'])

        if os.path.isfile(hocr_filename):
            # Output format is html for old versions of tesseract
            logging.info("Created %s.html" % basename)
            return hocr_filename

        # Try changing extension to .hocr for tesseract 3.03 and higher
        hocr_filename = "%s.hocr" % basename
        if os.path.isfile(hocr_filename):
            logging.info("Created %s.hocr" % basename)
            return hocr_filename

        error(self.msgs['TS_FAILED'])