# Source code for pypdfocr.pypdfocr_tesseract

#!/usr/bin/env python2.7

# Copyright 2013 Virantha Ekanayake All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""
   Run Tesseract to generate hocr file 
"""

import glob
import logging
import os
import re
import subprocess
import sys
from multiprocessing import Pool
from subprocess import CalledProcessError

from pypdfocr_interrupts import init_worker

def error(text):
    """Print *text* as an error message and terminate the process.

    The message is written to standard output prefixed with ``ERROR: `` and
    the interpreter is then asked to exit with status ``-1`` (this raises
    ``SystemExit``, so the function never returns normally).
    """
    print("ERROR: %s" % text)
    sys.exit(-1)

# Ugly hack to pass in object method to the multiprocessing library
# From http://www.rueckstiess.net/research/snippets/show/ca1d7d90
# Basically gets passed in a pair of (self, arg), and calls the method
def unwrap_self(arg, **kwarg):
    """Module-level trampoline that calls ``PyTesseract.make_hocr_from_pnm``.

    ``arg`` is unpacked as the positional arguments of the method, so it is
    expected to be an ``(instance, img_filename)`` pair; any keyword
    arguments are forwarded unchanged.  Returns whatever the method returns.
    """
    return PyTesseract.make_hocr_from_pnm(*arg, **kwarg)

class PyTesseract(object):
    """Class to wrap all the tesseract calls"""

    def __init__(self, config):
        """Pick the tesseract executable and set up the message table.

        Args:
            config: Mapping of options.  ``threads`` (default 4) is the
                number of worker processes used by ``make_hocr_from_pnms``;
                ``binary``, when present, overrides the executable to run.
        """
        self.lang = 'eng'
        self.required = "3.02.02"
        self.threads = config.get('threads', 4)

        if "binary" in config:  # Override location of binary
            binary = config['binary']
            if os.name == 'nt':
                # The command is run through the shell, so quote the path
                # and double the backslashes.
                binary = '"%s"' % binary
                binary = binary.replace("\\", "\\\\")
            logging.info("Setting location for tesseract executable to %s" % (binary))
        else:
            if str(os.name) == 'nt':
                # Explicit str here to get around some MagicMock stuff for testing that I don't quite understand
                binary = '"c:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe"'
            else:
                binary = "tesseract"

        self.binary = binary

        self.msgs = {
            'TS_MISSING': """ 
                Could not execute %s
                Please make sure you have Tesseract installed correctly
                """ % self.binary,
            'TS_VERSION':'Tesseract version is too old',
            'TS_img_MISSING':'Cannot find specified tiff file',
            'TS_FAILED': 'Tesseract-OCR execution failed!',
        }

    def _is_version_uptodate(self):
        """Check the installed tesseract against ``self.required``.

        Runs ``<binary> -v`` through the shell and reads the version from the
        first output line of the form ``tesseract [v]<digits.digits...>``.
        Any suffix after the dotted number (``dev``, ``-alpha...``) is
        ignored.  If the command fails, ``error`` is called with the
        ``TS_MISSING`` message.

        Returns:
            A ``(version_good, ver_str)`` tuple: whether the detected version
            is at least the required one, and the dotted version string that
            was detected (``'0.0.0'`` when none could be found).
        """
        logging.info("Checking tesseract version")
        cmd = '%s -v' % (self.binary)
        logging.info(cmd)
        ret_output = ''
        try:
            # universal_newlines=True yields text on both Python 2 and 3.
            ret_output = subprocess.check_output(
                cmd, shell=True, stderr=subprocess.STDOUT,
                universal_newlines=True)
        except CalledProcessError:
            # Could not run tesseract
            error(self.msgs['TS_MISSING'])

        ver_str = '0.0.0'
        for line in ret_output.splitlines():
            found = re.search(r'tesseract\s+v?(\d+(?:\.\d+)*)', line)
            if found:
                ver_str = found.group(1)
                break

        ver = [int(x) for x in ver_str.split('.')]
        req = [int(x) for x in self.required.split('.')]

        # Aargh, in windows 3.02.02 is reported as version 3.02
        # SFKM
        if str(os.name) == 'nt':
            req = req[:2]

        # Pad to a common length so that e.g. 3.02.02.1 >= 3.02.02 and
        # 3.02 < 3.02.02 both come out right with a plain list comparison.
        width = max(len(ver), len(req))
        ver = ver + [0] * (width - len(ver))
        req = req + [0] * (width - len(req))

        return ver >= req, ver_str

    def _warn(self, msg): # pragma: no cover
        """Print *msg* to standard output prefixed with ``WARNING: ``."""
        print("WARNING: %s" % msg)

    def make_hocr_from_pnms(self, fns):
        """Run OCR on every image in *fns* using a pool of worker processes.

        The tesseract version is checked first; if it is too old ``error`` is
        called with the ``TS_VERSION`` message.

        Args:
            fns: Sequence of image filenames.

        Returns:
            A list of ``(image_filename, hocr_filename)`` pairs, in the same
            order as *fns*.
        """
        uptodate, ver = self._is_version_uptodate()
        if not uptodate:
            error(self.msgs['TS_VERSION']+ " (found %s, required %s)" % (ver, self.required))

        logging.debug("Making pool for tesseract")
        pool = Pool(processes=self.threads, initializer=init_worker)

        try:
            hocr_filenames = pool.map(unwrap_self, [(self, fn) for fn in fns])
            pool.close()
        except KeyboardInterrupt:
            print("Caught keyboard interrupt... terminating")
            pool.terminate()
            raise
        except Exception:
            # The pool must be terminated (or closed) before join() is
            # allowed; otherwise join() itself raises and hides this error.
            pool.terminate()
            raise
        finally:
            pool.join()

        return list(zip(fns, hocr_filenames))

    def make_hocr_from_pnm(self, img_filename):
        """Run tesseract on one image and return the hOCR file it produced.

        Tesseract is asked to write its output next to the image, using the
        image path without its extension as the output base.  A failing
        tesseract run only produces a warning; the result is judged by
        whether an output file exists afterwards.

        Args:
            img_filename: Path of the image to OCR.  If it does not exist,
                ``error`` is called with the ``TS_img_MISSING`` message.

        Returns:
            The path of the generated ``.html`` file, or of the ``.hocr``
            file if no ``.html`` file was written.  If neither exists,
            ``error`` is called with the ``TS_FAILED`` message.
        """
        basename, filext = os.path.splitext(img_filename)

        if not os.path.exists(img_filename):
            error(self.msgs['TS_img_MISSING'] + " %s" % (img_filename))

        logging.info("Running OCR on %s to create %s.html" % (img_filename, basename))
        cmd = '%s "%s" "%s" -psm 1 -c hocr_font_info=1 -l %s hocr' % (self.binary, img_filename, basename, self.lang)
        logging.info(cmd)
        try:
            subprocess.check_output(
                cmd, shell=True, stderr=subprocess.STDOUT,
                universal_newlines=True)
        except subprocess.CalledProcessError as e:
            # Could not run tesseract
            print(e.output)
            self._warn(self.msgs['TS_FAILED'])

        # Output format is html for old versions of tesseract; tesseract 3.03
        # and higher use the .hocr extension instead.
        for ext in ('html', 'hocr'):
            hocr_filename = "%s.%s" % (basename, ext)
            if os.path.isfile(hocr_filename):
                logging.info("Created %s.%s" % (basename, ext))
                return hocr_filename

        error(self.msgs['TS_FAILED'])
# Navigation

# Quick search

# Source code for pypdfocr.pypdfocr_tesseract

# Navigation