Source code for pypdfocr.pypdfocr_gs

#!/usr/bin/env python2.7

# Copyright 2013 Virantha Ekanayake All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.



"""
    Wrap ghostscript calls.  Yes, this is ugly.
"""

import subprocess
import sys, os
import logging
import glob

def error(text):
    """
        Print an error message to stdout and terminate the program.

        :param text: Human-readable description of the failure; it is printed
                     prefixed with ``ERROR: ``.
        :raises SystemExit: always, with exit code -1.
    """
    print("ERROR: %s" % text)
    # sys.exit is always available; the bare exit() builtin is only injected
    # by the site module and is missing under "python -S" or frozen builds.
    sys.exit(-1)
class PyGs(object):
    """Class to wrap all the ghostscript calls"""

    def __init__(self, config):
        """
            Work out which Ghostscript executable to use and set up defaults.

            :param config: Mapping of options.  Recognized keys are
                           ``threads`` (defaults to 4; only stored here) and
                           ``binary`` (explicit path to the Ghostscript
                           executable, overriding auto-detection).
        """
        self.msgs = {
            'GS_FAILED': 'Ghostscript execution failed',
            'GS_MISSING_PDF': 'Cannot find specified pdf file',
            'GS_OUTDATED': 'Your Ghostscript version is probably out of date.  Please upgrade to the latest version',
            'GS_MISSING_BINARY': 'Could not find Ghostscript in the usual place; please specify it using your config file',
        }
        self.threads = config.get('threads', 4)

        if "binary" in config:  # Override location of binary
            binary = config['binary']
            if os.name == 'nt':
                # The binary is later spliced into a shell command line, so
                # quote it and double the backslashes.
                binary = '"%s"' % binary
                binary = binary.replace("\\", "\\\\")
            logging.info("Setting location for executable to %s" % (binary))
        else:
            if os.name == 'nt':
                win_binary = self._find_windows_gs()
                binary = '"%s"' % win_binary
                logging.info("Using Ghostscript: %s" % binary)
            else:
                binary = "gs"

        self.binary = binary
        #self.tiff_dpi = 300
        self.output_dpi = 300
        self.greyscale = True
        # Tiff is used for the ocr, so just fix it at 300dpi
        #  The other formats will be used to create the final OCR'ed image, so determine
        #  the DPI by using pdfimages if available, o/w default to 200
        # Each entry maps a format name to [file extension, ghostscript options];
        # %(dpi)s is filled in when the command line is built.
        self.gs_options = {
            'tiff': ['tiff', ['-sDEVICE=tiff24nc', '-r%(dpi)s']],
            'jpg': ['jpg', ['-sDEVICE=jpeg', '-dJPEGQ=75', '-r%(dpi)s']],
            'jpggrey': ['jpg', ['-sDEVICE=jpeggray', '-dJPEGQ=75', '-r%(dpi)s']],
            'png': ['png', ['-sDEVICE=png16m', '-r%(dpi)s']],
            'pnggrey': ['png', ['-sDEVICE=pngmono', '-r%(dpi)s']],
            'tifflzw': ['tiff', ['-sDEVICE=tifflzw', '-r%(dpi)s']],
            'tiffg4': ['tiff', ['-sDEVICE=tiffg4', '-r%(dpi)s']],
            'pnm': ['pnm', ['-sDEVICE=pnmraw', '-r%(dpi)s']],
            'pgm': ['pgm', ['-sDEVICE=pgm', '-r%(dpi)s']],
        }

    @staticmethod
    def _natural_key(name):
        """
            Build a sort key that orders embedded numbers numerically, so that
            e.g. ``gs9.10`` sorts after ``gs9.9``.

            :param name: Directory name to build the key for.
            :returns: List alternating text chunks and integers.
        """
        import re  # local import: the module does not import re at file level
        # Splitting on a capturing group always yields text at even indexes and
        # digit runs at odd indexes, so keys compare like with like.
        return [int(tok) if tok.isdigit() else tok.lower()
                for tok in re.split(r'(\d+)', name)]

    def _find_windows_gs(self, search_dirs=None):
        """
            Searches through the Windows program files directories to find Ghostscript.
            If it finds multiple versions, it picks the most recent one using a
            natural (numeric-aware) sort of the version directory names.

            The process working directory is never changed.

            :param search_dirs: Optional list of directories that contain the
                                ``gs*`` version sub-directories.  Defaults to the
                                standard ``Program Files`` locations.
            :returns: The ghostscript binary location (absolute path).  If no
                      binary is found, :func:`error` is called, which exits.
        """
        if search_dirs is None:
            search_dirs = ["c:\\Program Files\\gs", "c:\\Program Files (x86)\\gs"]

        for root in search_dirs:
            if not os.path.isdir(root):
                continue
            # Find all possible gs* sub-directories, newest version first
            versions = [x for x in os.listdir(root) if x.startswith('gs')]
            versions.sort(key=self._natural_key, reverse=True)
            for version_dir in versions:
                bin_dir = os.path.join(root, version_dir, 'bin')
                if not os.path.exists(bin_dir):
                    continue
                # Look for gswin64c.exe or gswin32c.exe (the c is for the command-line version)
                # Sorted so the choice is deterministic when several match.
                executables = sorted(glob.glob(os.path.join(bin_dir, 'gswin*c.exe')))
                if executables:
                    return os.path.abspath(executables[0])

        error(self.msgs['GS_MISSING_BINARY'])

    def _warn(self, msg):
        """Print a warning message to stdout, prefixed with ``WARNING: ``."""
        print("WARNING: %s" % msg)

    def _get_dpi(self, pdf_filename):
        """
            Estimate the resolution of the images embedded in a PDF and store
            the result in ``self.output_dpi`` (never below 300) and
            ``self.greyscale``.

            Uses ``pdfimages -list`` for the pixel size and color type of the
            first image, and ImageMagick ``identify`` for the page size and
            density.  If either tool is missing or its output cannot be
            parsed, a warning is printed and the current values are kept.

            :param pdf_filename: Path to the PDF file to inspect.
            :returns: None
        """
        if not os.path.exists(pdf_filename):
            error(self.msgs['GS_MISSING_PDF'] + " %s" % pdf_filename)

        # Argument lists (no shell) so odd characters in the file name cannot
        # break or alter the command.
        cmd = ['pdfimages', '-list', pdf_filename]
        logging.info("Running pdfimages to figure out DPI...")
        logging.debug(cmd)
        try:
            # universal_newlines gives text (not bytes) on Python 3 as well
            out = subprocess.check_output(cmd, universal_newlines=True)
        except (subprocess.CalledProcessError, OSError):
            # OSError: the executable itself could not be started
            self._warn("Could not execute pdfimages to calculate DPI (try installing xpdf or poppler?), so defaulting to %sdpi" % self.output_dpi)
            return

        # Need the third line of output (the first two are the table header)
        # Make sure it exists (in case this is an empty pdf)
        lines = out.splitlines()
        if len(lines) < 3:
            self._warn("Empty pdf, cannot determine dpi using pdfimages")
            return
        first_image = lines[2]
        logging.debug(first_image)
        fields = first_image.split()

        not_understood = "Could not understand output of pdfimages, please rerun with -d option and file an issue at http://github.com/virantha/pypdfocr/issues"
        if len(fields) < 6 or fields[2] != 'image':
            self._warn(not_understood)
            return
        try:
            x_pt, y_pt = int(fields[3]), int(fields[4])
        except ValueError:
            self._warn(not_understood)
            return
        self.greyscale = (fields[5] == 'gray')

        # Now, run imagemagick identify to get pdf width/height/density
        cmd = ['identify', '-format', '%w %x %h %y\n', pdf_filename]
        try:
            out = subprocess.check_output(cmd, universal_newlines=True)
            first_page = out.splitlines()[0].replace("Undefined", "")
            width, xdensity, height, ydensity = [float(x) for x in first_page.split()]
            # int() so the value formats as e.g. "-r300" rather than "-r300.0"
            xdpi = int(round(x_pt / width * xdensity))
            ydpi = int(round(y_pt / height * ydensity))
            # Use the larger of the two, but never go below 300
            self.output_dpi = max(xdpi, ydpi, 300)
            if abs(xdpi - ydpi) > xdpi * .05:  # Make sure the two dpi's are within 5%
                self._warn("X-dpi is %d, Y-dpi is %d, defaulting to %d" % (xdpi, ydpi, self.output_dpi))
            else:
                print("Using %d DPI" % self.output_dpi)
        except Exception as e:
            # Deliberately broad: DPI detection is best-effort only
            logging.debug(str(e))
            self._warn("Could not execute identify to calculate DPI (try installing imagemagick?), so defaulting to %sdpi" % self.output_dpi)
        return

    def _run_gs(self, options, output_filename, pdf_filename):
        """
            Run Ghostscript on a PDF.  On failure the captured output is
            printed and :func:`error` is called, which exits.

            :param options: Ghostscript options as a single string.
            :param output_filename: Value for ``-sOutputFile``.
            :param pdf_filename: Path to the input PDF.
        """
        # NOTE(review): this stays a shell command string because self.binary
        # is stored already shell-quoted; file names containing shell
        # metacharacters are therefore not safe here.
        cmd = '%s -q -dNOPAUSE %s -sOutputFile="%s" "%s" -c quit' % (self.binary, options, output_filename, pdf_filename)
        logging.info(cmd)
        try:
            # universal_newlines so e.output is text on Python 3 as well
            subprocess.check_output(cmd, shell=True, universal_newlines=True)
        except subprocess.CalledProcessError as e:
            print(e.output)
            if "undefined in .getdeviceparams" in e.output:
                error(self.msgs['GS_OUTDATED'])
            else:
                error(self.msgs['GS_FAILED'])

    def make_img_from_pdf(self, pdf_filename):
        """
            Render every page of a PDF to an image file next to the PDF.

            Output files are named ``<pdf name without extension>_<n>.jpg``;
            any existing files matching that pattern are deleted first.

            :param pdf_filename: Path to the PDF file to convert.
            :returns: Tuple ``(dpi, glob_pattern)`` where ``glob_pattern``
                      matches the generated image files.
        """
        self._get_dpi(pdf_filename)

        if not os.path.exists(pdf_filename):
            error(self.msgs['GS_MISSING_PDF'] + " %s" % pdf_filename)

        filename = os.path.splitext(pdf_filename)[0]

        # Create ancillary jpeg files to use later to calculate image dpi etc
        # We no longer use these for the final image. Instead the text is merged
        # directly with the original PDF.  Yay!
        if self.greyscale:
            self.img_format = 'jpggrey'
            logging.info("Detected greyscale")
        else:
            self.img_format = 'jpg'
            logging.info("Detected color")

        self.img_file_ext, device_options = self.gs_options[self.img_format]

        # The possible output files glob
        globable_filename = '%s_*.%s' % (filename, self.img_file_ext)
        # Delete any img files already existing
        for fn in glob.glob(globable_filename):
            os.remove(fn)

        options = ' '.join(device_options) % {'dpi': self.output_dpi}
        # %%d leaves a literal %d for ghostscript to replace with the page number
        output_filename = '%s_%%d.%s' % (filename, self.img_file_ext)
        self._run_gs(options, output_filename, pdf_filename)
        for fn in glob.glob(globable_filename):
            logging.info("Created image %s" % fn)
        return (self.output_dpi, globable_filename)