Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
#!/usr/bin/env python2.7
# Copyright 2013 Virantha Ekanayake All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
"""Wrap ghostscript calls. Yes, this is ugly. """
"""Class to wrap all the ghostscript calls"""
'GS_FAILED': 'Ghostscript execution failed', 'GS_MISSING_PDF': 'Cannot find specified pdf file', 'GS_OUTDATED': 'Your Ghostscript version is probably out of date. Please upgrade to the latest version', 'GS_MISSING_BINARY': 'Could not find Ghostscript in the usual place; please specify it using your config file', }
binary = '"%s"' % binary binary = binary.replace("\\", "\\\\") else: win_binary = self._find_windows_gs() binary = '"%s"' % win_binary logging.info("Using Ghostscript: %s" % binary) else:
#self.tiff_dpi = 300 # Tiff is used for the ocr, so just fix it at 300dpi # The other formats will be used to create the final OCR'ed image, so determine # the DPI by using pdfimages if available, o/w default to 200 'jpg': ['jpg', ['-sDEVICE=jpeg','-dJPEGQ=75', '-r%(dpi)s']], 'jpggrey': ['jpg', ['-sDEVICE=jpeggray', '-dJPEGQ=75', '-r%(dpi)s']], 'png': ['png', ['-sDEVICE=png16m', '-r%(dpi)s']], 'pnggrey': ['png', ['-sDEVICE=pngmono', '-r%(dpi)s']], 'tifflzw': ['tiff', ['-sDEVICE=tifflzw', '-r%(dpi)s']], 'tiffg4': ['tiff', ['-sDEVICE=tiffg4', '-r%(dpi)s']], 'pnm': ['pnm', ['-sDEVICE=pnmraw', '-r%(dpi)s']], 'pgm': ['pgm', ['-sDEVICE=pgm', '-r%(dpi)s']], }
""" Searches through the Windows program files directories to find Ghostscript. If it finds multiple versions, it does a naive sort for now to find the most recent.
:returns: The ghostscript binary location
""" windirs = ["c:\\Program Files\\gs", "c:\\Program Files (x86)\\gs"] gs = None for d in windirs: if not os.path.exists(d): continue cwd = os.getcwd() os.chdir(d) listing = os.listdir('.')
# Find all possible gs* sub-directories listing = [x for x in listing if x.startswith('gs')]
# TODO: Make this a natural sort listing.sort(reverse=True) for bindir in listing: binpath = os.path.join(bindir,'bin') if not os.path.exists(binpath): continue os.chdir(binpath) # Look for gswin64c.exe or gswin32c.exe (the c is for the command-line version) gswin = glob.glob('gswin*c.exe') if len(gswin) == 0: continue gs = os.path.abspath(gswin[0]) # Just use the first found .exe (Do i need to do anything more complicated here?) os.chdir(cwd) return gs
if not gs: error(self.msgs['GS_MISSING_BINARY'])
except subprocess.CalledProcessError as e: self._warn ("Could not execute pdfimages to calculate DPI (try installing xpdf or poppler?), so defaulting to %sdpi" % self.output_dpi) return
# Need the second line of output # Make sure it exists (in case this is an empty pdf) self._warn("Empty pdf, cannot determine dpi using pdfimages") return self._warn("Could not understand output of pdfimages, please rerun with -d option and file an issue at http://github.com/virantha/pypdfocr/issues") return
# Now, run imagemagick identify to get pdf width/height/density else:
except Exception as e: logging.debug(str(e)) self._warn ("Could not execute identify to calculate DPI (try installing imagemagick?), so defaulting to %sdpi" % self.output_dpi)
except subprocess.CalledProcessError as e: print e.output if "undefined in .getdeviceparams" in e.output: error(self.msgs['GS_OUTDATED']) else: error (self.msgs['GS_FAILED'])
error(self.msgs['GS_MISSING_PDF'] + " %s" % pdf_filename)
# Create ancillary jpeg files to use later to calculate image dpi etc # We no longer use these for the final image. Instead the text is merged # directly with the original PDF. Yay! #self.img_format = 'pnggrey' else: #self.img_format = 'png'
# The possible output files glob # Delete any img files already existing os.remove(fn)
|