Source code for pypdfocr.pypdfocr_gs

#!/usr/bin/env python2.7

# Copyright 2013 Virantha Ekanayake All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.



"""
    Wrap ghostscript calls.  Yes, this is ugly.
"""

import subprocess
import sys, os
import logging
import glob

def error(text):
    """Print an error message and terminate the program.

    :param text: Description of the failure; printed after an ``ERROR:`` prefix.
    :raises SystemExit: always, with exit status -1.
    """
    print("ERROR: %s" % text)
    # Use sys.exit instead of the bare ``exit`` builtin: ``exit`` is injected
    # by the ``site`` module and may be missing when site is not loaded
    # (e.g. ``python -S``).
    sys.exit(-1)

class PyGs(object):
    """Class to wrap all the ghostscript calls.

    An instance locates the Ghostscript executable once (``__init__``) and is
    then used to rasterise PDFs via :meth:`make_img_from_pdf`.  Fatal problems
    are reported through the module-level ``error`` function, which terminates
    the program.
    """

    # Resolution used when the real DPI of a PDF cannot be determined; it is
    # also the lower bound for any detected resolution.
    DEFAULT_DPI = 300

    def __init__(self, config):
        """Locate the Ghostscript binary and set up the rendering options.

        :param config: Mapping supporting ``in`` and ``.get``.  Recognised
            keys: ``'threads'`` (defaults to 4) and ``'binary'`` (explicit
            path of the Ghostscript executable).
        """
        self.msgs = {
                'GS_FAILED': 'Ghostscript execution failed',
                'GS_MISSING_PDF': 'Cannot find specified pdf file',
                'GS_OUTDATED': 'Your Ghostscript version is probably out of date.  Please upgrade to the latest version',
                'GS_MISSING_BINARY': 'Could not find Ghostscript in the usual place; please specify it using your config file',
            }
        self.threads = config.get('threads', 4)

        if "binary" in config:  # Override location of binary
            binary = config['binary']
            if os.name == 'nt':
                # The command is later run through the shell as one string, so
                # quote the path (it may contain spaces) and double the
                # backslashes.
                binary = '"%s"' % binary
                binary = binary.replace("\\", "\\\\")
            logging.info("Setting location for executable to %s" % (binary))
        elif os.name == 'nt':
            win_binary = self._find_windows_gs()
            binary = '"%s"' % win_binary
            logging.info("Using Ghostscript: %s" % binary)
        else:
            binary = "gs"
        self.binary = binary

        self.output_dpi = self.DEFAULT_DPI
        self.greyscale = True
        # Each entry maps a format name to [file extension, ghostscript flags].
        # The resolution is filled in at run time: it is detected with
        # pdfimages/identify when they are available, otherwise it stays at
        # the 300dpi default.
        self.gs_options = {'tiff': ['tiff', ['-sDEVICE=tiff24nc','-r%(dpi)s' ]],
                            'jpg': ['jpg', ['-sDEVICE=jpeg','-dJPEGQ=75', '-r%(dpi)s']],
                            'jpggrey': ['jpg', ['-sDEVICE=jpeggray', '-dJPEGQ=75', '-r%(dpi)s']],
                            'png': ['png', ['-sDEVICE=png16m', '-r%(dpi)s']],
                            'pnggrey': ['png', ['-sDEVICE=pngmono', '-r%(dpi)s']],
                            'tifflzw': ['tiff', ['-sDEVICE=tifflzw', '-r%(dpi)s']],
                            'tiffg4': ['tiff', ['-sDEVICE=tiffg4', '-r%(dpi)s']],
                            'pnm': ['pnm', ['-sDEVICE=pnmraw', '-r%(dpi)s']],
                            'pgm': ['pgm', ['-sDEVICE=pgm', '-r%(dpi)s']],
                        }

    @staticmethod
    def _to_text(data):
        """Return subprocess output as a native string.

        ``subprocess.check_output`` yields ``bytes`` on Python 3 and ``str``
        on Python 2; the parsing code compares against ``str`` literals, so
        bytes are decoded here (undecodable bytes are replaced).
        """
        if data is None:
            return ""
        if isinstance(data, str):
            return data
        return data.decode("utf-8", "replace")

    @staticmethod
    def _version_key(name):
        """Return a natural-sort key for *name*, so 'gs9.10' sorts after 'gs9.9'."""
        import re
        # re.split with a capturing group alternates text / digit-run tokens,
        # so keys of two names always compare str-with-str and int-with-int.
        return [int(tok) if tok.isdigit() else tok
                for tok in re.split(r'(\d+)', name)]

    def _find_windows_gs(self, windirs=None):
        """
            Searches through the Windows program files directories to find Ghostscript.
            If it finds multiple versions, the sub-directories are tried from
            the highest version number to the lowest.

            The current working directory is never changed.

            :param windirs: Directories to search; defaults to the ``gs``
                folders under both Program Files locations on drive c:.
            :returns: Absolute path of the ghostscript binary.  If none is
                found, ``error`` is called (which terminates the program).

        """
        if windirs is None:
            windirs = ["c:\\Program Files\\gs", "c:\\Program Files (x86)\\gs"]

        for d in windirs:
            if not os.path.isdir(d):
                continue

            # Find all possible gs* sub-directories, newest version first
            candidates = [x for x in os.listdir(d) if x.startswith('gs')]
            candidates.sort(key=self._version_key, reverse=True)

            for bindir in candidates:
                # Look for gswin64c.exe or gswin32c.exe (the c is for the command-line version)
                found = glob.glob(os.path.join(d, bindir, 'bin', 'gswin*c.exe'))
                if found:
                    # Just use the first matching .exe
                    return os.path.abspath(found[0])

        error(self.msgs['GS_MISSING_BINARY'])

    def _warn(self, msg):
        """Print *msg* to stdout with a ``WARNING:`` prefix."""
        print("WARNING: %s" % msg)

    def _get_dpi(self, pdf_filename):
        """Detect the resolution and colour mode of the first image in a PDF.

        Runs ``pdfimages -list`` and ImageMagick's ``identify`` and stores the
        outcome in ``self.output_dpi`` and ``self.greyscale``.  Detection is
        best effort: when a tool is missing or its output cannot be parsed, a
        warning is printed and the defaults (300dpi, greyscale) stay in place.

        :param pdf_filename: Path of the PDF to inspect.  ``error`` is called
            (terminating the program) if it does not exist.
        :returns: None
        """
        if not os.path.exists(pdf_filename):
            error(self.msgs['GS_MISSING_PDF'] + " %s" % pdf_filename)

        # Start from the defaults so values detected for a previously
        # processed PDF do not leak into this one when detection fails.
        self.output_dpi = self.DEFAULT_DPI
        self.greyscale = True

        # NOTE(review): the filename is interpolated into a shell command; a
        # name containing a double quote or shell metacharacters will break
        # (or alter) the command.
        cmd = 'pdfimages -list "%s"' % pdf_filename
        logging.info("Running pdfimages to figure out DPI...")
        logging.debug(cmd)
        try:
            out = self._to_text(subprocess.check_output(cmd, shell=True))
        except subprocess.CalledProcessError:
            self._warn("Could not execute pdfimages to calculate DPI (try installing xpdf or poppler?), so defaulting to %sdpi" % self.output_dpi)
            return

        # The first two lines are the table header; the third line describes
        # the first image.  Make sure it exists (in case this is an empty pdf)
        lines = out.splitlines()
        if len(lines) < 3:
            self._warn("Empty pdf, cannot determine dpi using pdfimages")
            return
        logging.debug(lines[2])
        fields = lines[2].split()
        try:
            if fields[2] != 'image':
                raise ValueError("unexpected object type %r" % fields[2])
            x_pt, y_pt = int(fields[3]), int(fields[4])
            greyscale = fields[5] == 'gray'
        except (IndexError, ValueError):
            # Too few columns or non-numeric sizes: treat like any other
            # unparseable output instead of crashing.
            self._warn("Could not understand output of pdfimages, please rerun with -d option and file an issue at http://github.com/virantha/pypdfocr/issues")
            return
        self.greyscale = greyscale

        # Now, run imagemagick identify to get pdf width/height/density
        cmd = 'identify -format "%%w %%x %%h %%y\n" "%s"' % pdf_filename
        try:
            out = self._to_text(subprocess.check_output(cmd, shell=True))
            first_page = out.splitlines()[0].replace("Undefined", "")
            width, xdensity, height, ydensity = [float(x) for x in first_page.split()]
            xdpi = round(x_pt/width*xdensity)
            ydpi = round(y_pt/height*ydensity)
            # Use the larger of the two, but never go below the default
            self.output_dpi = max(xdpi, ydpi, self.DEFAULT_DPI)
            if abs(xdpi-ydpi) > xdpi*.05:  # Make sure the two dpi's are within 5%
                self._warn("X-dpi is %d, Y-dpi is %d, defaulting to %d" % (xdpi, ydpi, self.output_dpi))
            else:
                print("Using %d DPI" % self.output_dpi)
        except Exception as e:
            # Deliberately broad: any failure here just means we keep the
            # default resolution.
            logging.debug(str(e))
            self._warn("Could not execute identify to calculate DPI (try installing imagemagick?), so defaulting to %sdpi" % self.output_dpi)
        return

    def _run_gs(self, options, output_filename, pdf_filename):
        """Run Ghostscript on *pdf_filename*.

        :param options: Device/resolution flags, already joined into a string.
        :param output_filename: Value for ``-sOutputFile`` (may contain ``%d``
            for the page number).
        :param pdf_filename: Path of the input PDF.

        If Ghostscript exits with a non-zero status its output is printed and
        ``error`` is called, which terminates the program.
        """
        cmd = '%s -q -dNOPAUSE %s -sOutputFile="%s" "%s" -c quit' % (self.binary, options, output_filename, pdf_filename)
        logging.info(cmd)
        try:
            subprocess.check_output(cmd, shell=True)
        except subprocess.CalledProcessError as e:
            output = self._to_text(e.output)
            print(output)
            if "undefined in .getdeviceparams" in output:
                error(self.msgs['GS_OUTDATED'])
            else:
                error(self.msgs['GS_FAILED'])

    def make_img_from_pdf(self, pdf_filename):
        """Render every page of a PDF to an image file next to the PDF.

        Pages are written as ``<pdf basename>_<page>.jpg`` (greyscale or
        colour JPEG depending on what ``_get_dpi`` detected).  Images left
        over from an earlier run that match the same pattern are deleted first.

        :param pdf_filename: Path of the PDF to convert.  ``error`` is called
            (terminating the program) if it does not exist.
        :returns: Tuple ``(dpi, glob_pattern)`` where *glob_pattern* matches
            the generated image files.
        """
        if not os.path.exists(pdf_filename):
            error(self.msgs['GS_MISSING_PDF'] + " %s" % pdf_filename)

        self._get_dpi(pdf_filename)

        filename, _ = os.path.splitext(pdf_filename)

        # Create ancillary jpeg files to use later to calculate image dpi etc
        #   We no longer use these for the final image. Instead the text is merged
        #   directly with the original PDF.  Yay!
        if self.greyscale:
            self.img_format = 'jpggrey'
            logging.info("Detected greyscale")
        else:
            self.img_format = 'jpg'
            logging.info("Detected color")

        self.img_file_ext, device_flags = self.gs_options[self.img_format]

        # The possible output files glob
        globable_filename = '%s_*.%s' % (filename, self.img_file_ext)
        # Delete any img files already existing
        for fn in glob.glob(globable_filename):
            os.remove(fn)

        options = ' '.join(device_flags) % {'dpi': self.output_dpi}
        # %%d is left as a literal %d for ghostscript to fill in the page number
        output_filename = '%s_%%d.%s' % (filename, self.img_file_ext)
        self._run_gs(options, output_filename, pdf_filename)
        for fn in glob.glob(globable_filename):
            logging.info("Created image %s" % fn)
        return (self.output_dpi, globable_filename)
Navigation

Quick search

Source code for pypdfocr.pypdfocr_gs

Navigation