Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
#!/usr/bin/env python2.7 # Copyright 2013 Virantha Ekanayake All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
# Replace the Popen routine to allow win32 pyinstaller to build
print("ERROR: %s" % text) sys.exit(-1)
# decorator to retry multiple times def result(*args, **kwargs): except exc_type: pass raise
def open_file_with_timeout(parser, arg):
""" Make scanned PDFs searchable using Tesseract-OCR and autofile them .. automodule:: pypdfocr :private-members: """
""" The main clas. Performs the following functions:
* Parses command line options * Optionally just watches a directory for new PDF's to OCR; once a file appears, it does the next step * Runs a single file conversion: * Runs ghostscript to get tiff/jpg * Runs Tesseract-OCR to do the actual OCR * Takes the HOCR from Tesseract and creates a new PDF with the text overlay * Files the OCR'ed file in the proper place if specified * Files the original file if specified * """
""" Initializes the GhostScript, Tesseract, and PDF helper classes. """
""" Read in the yaml config file
:param config_file: Configuration file (YAML format) :type config_file: file :returns: dict of yaml file :rtype: dict """
""" Parse the command-line options and set the following object properties:
:param argv: usually just sys.argv[1:] :returns: Nothing
:ivar debug: Enable logging debug statements :ivar verbose: Enable verbose logging :ivar enable_filing: Whether to enable post-OCR filing of PDFs :ivar pdf_filename: Filename for single conversion mode :ivar watch_dir: Directory to watch for files to convert :ivar config: Dict of the config file :ivar watch: Whether folder watching mode is turned on :ivar enable_evernote: Enable filing to evernote
""" description = "Convert scanned PDFs into their OCR equivalent. Depends on GhostScript and Tesseract-OCR being installed.", epilog = "PyPDFOCR version %s (Copyright 2013 Virantha Ekanayake)" % __version__, )
default=False, dest='debug', help='Turn on debugging')
default=False, dest='verbose', help='Turn on verbose mode')
default=False, dest='mail', help='Send email after conversion')
default='eng', dest='lang', help='Language(default eng)')
default=False, dest='preprocess', help='Enable preprocessing. Not really useful now with improved Tesseract 3.04+')
default=False, dest='skip_preprocess', help='DEPRECATED: always skips now.')
#--------- # Single or watch mode #-------- # Positional argument for single file conversion # Watch directory for watch mode dest='watch_dir', help='Watch given directory and run ocr automatically until terminated')
#----------- # Filing options #---------- default=False, dest='enable_filing', help='Enable filing of converted PDFs') #filing_group.add_argument('-c', '--config', type = argparse.FileType('r'), dest='configfile', help='Configuration file for defaults and PDF filing') default=False, dest='enable_evernote', help='Enable filing to Evernote') default=False, dest='match_using_filename', help='Use filename to match if contents did not match anything, before filing to default folder')
# Add flow option to single mode extract_images,preprocess,ocr,write
# Deprecating skip_preprocess to make skipping the default (always true). Tesseract 3.04 is so much better now # at handling non-ideal inputs and lines
# Parse configuration file (YAML) if specified
else:
else:
p.error("Please specify a configuration file(CONFIGFILE) to enable email")
""" Helper function to delete files :param files: List of files to delete :type files: list :returns: None """
""" Instance the proper PyFiler object (either :class:`pypdfocr.pypdfocr_filer_dirs.PyFilerDirs` or :class:`pypdfocr.pypdfocr_filer_evernote.PyFilerEvernote`)
TODO: Make this more generic to allow third-party plugin filing objects
:ivar filer: :class:`pypdfocr.pypdfocr_filer.PyFiler` PyFiler subclass object that is instantiated :ivar pdf_filer: :class:`pypdfocr.pypdfocr_pdffiler.PyPdfFiler` object to help with PDF reading :returns: Nothing
""" # Look at self.config and create a self.pdf_filer object
# -------------------------------------------------- # Some sanity checks # -------------------------------------------------- error ("%s must be specified in config file" % required) else: # Make sure these required folders are in abspath format # User wants to move the original after filing os.makedirs(self.config[orig]) else: # -------------------------------------------------- # Start the filing object # -------------------------------------------------- self.filer = PyFilerEvernote(self.config['evernote_developer_token']) else:
# ------------------------------ # Add all the folder names with associated keywords # to the filer object # ------------------------------ # Make sure keywords are lower-cased before adding
""" Instantiate the external tool wrappers with their config dicts """
""" Does the following:
- Convert the PDF using GhostScript to TIFF and JPG - Run Tesseract on the TIFF to extract the text into HOCR (html) - Use PDF generator to overlay the text on the JPG and output a new PDF - Clean up temporary image files
:param pdf_filename: Scanned PDF :type pdf_filename: string :returns: OCR'ed PDF :rtype: filename string """ # Make the images for Tesseract
except Exception: raise
# Preprocess else: # Run tesseract
# Generate new pdf with overlaid text #ocr_pdf_filename = self.pdf.overlay_hocr(tiff_dpi, hocr_filename, pdf_filename)
finally: # Clean up the files # Need to clean up the original image files before preprocessing
# clean up the hocr input (jpg) and output (html) files #self._clean_up_files(itertools.chain(*hocr_filenames)) # splat the hocr_filenames as it is a list of pairs # Seems like newer Tesseract (> 3.03) is now creating .txt files with the OCR text? #self._clean_up_files([x[1].replace(".hocr", ".txt") for x in hocr_filenames])
""" Move the converted file to its destination directory. Optionally also moves the original PDF.
:param ocr_pdffilename: Converted PDF file :type ocr_pdffilename: filename string :param original_pdffilename: Original scanned PDF file :type original_pdffilename: filename string :returns: Target folder name :rtype: string """
""" Send email using smtp """
PyPDFOCR Conversion: -------------------- Original file: %s Converted file: %s Filing: %s """ % (infilename, outfilename, filing)
""" The main entry point into PyPDFOCR
#. Parses options #. If filing is enabled, call :func:`_setup_filing` #. If watch is enabled, start the watcher #. :func:`run_conversion` #. if filing is enabled, call :func:`file_converted_file` """ # Read the command line options
# Setup tesseract and ghostscript
# Setup the pdf filing if enabled
# Do the actual conversion followed by optional filing and email while True: # Make sure the watcher doesn't terminate try: py_watcher = PyPdfWatcher(self.watch_dir, self.config.get('watch')) for pdf_filename in py_watcher.start(): self._convert_and_file_email(pdf_filename) except KeyboardInterrupt: break except Exception as e: print traceback.print_exc(e) py_watcher.stop()
else:
""" Helper function to run the conversion, then do the optional filing, and optional emailing. """ else:
def main():  # pragma: no cover
    """ Command-line entry point for PyPDFOCR.

        Calls ``multiprocessing.freeze_support()`` first (required by
        multiprocessing when the program has been frozen into a Windows
        executable), then creates a :class:`PyPDFOCR` object and runs it
        with the command-line arguments, excluding the program name.

        :returns: Nothing
    """
    multiprocessing.freeze_support()
    PyPDFOCR().go(sys.argv[1:])
main()
|