Source code for pypdfocr.pypdfocr

#!/usr/bin/env python2.7
# Copyright 2013 Virantha Ekanayake All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import smtplib
import argparse
import sys, os, traceback, time
import logging
import shutil, glob
import itertools
from functools import wraps

from version import __version__
from PIL import Image
import yaml

import multiprocessing
# Replace the Popen routine to allow win32 pyinstaller to build
from multiprocessing import forking
from pypdfocr_multiprocessing import _Popen
forking.Popen = _Popen

from pypdfocr_pdf import PyPdf
from pypdfocr_tesseract import PyTesseract
from pypdfocr_gs import PyGs
from pypdfocr_watcher import PyPdfWatcher
from pypdfocr_pdffiler import PyPdfFiler
from pypdfocr_filer_dirs import PyFilerDirs
from pypdfocr_filer_evernote import PyFilerEvernote
from pypdfocr_preprocess import PyPreprocess

[docs]def error(text): print("ERROR: %s" % text) sys.exit(-1) # decorator to retry multiple times
[docs]def retry(count=5, exc_type = Exception): def decorator(func): @wraps(func) def result(*args, **kwargs): for _ in range(count): try: return func(*args, **kwargs) except exc_type: pass raise return result return decorator
@retry(count=6, exc_type=IOError)
[docs]def open_file_with_timeout(parser, arg): f = open(arg, 'r') return f
""" Make scanned PDFs searchable using Tesseract-OCR and autofile them .. automodule:: pypdfocr :private-members: """
[docs]class PyPDFOCR(object): """ The main clas. Performs the following functions: * Parses command line options * Optionally just watches a directory for new PDF's to OCR; once a file appears, it does the next step * Runs a single file conversion: * Runs ghostscript to get tiff/jpg * Runs Tesseract-OCR to do the actual OCR * Takes the HOCR from Tesseract and creates a new PDF with the text overlay * Files the OCR'ed file in the proper place if specified * Files the original file if specified * """ def __init__ (self): """ Initializes the GhostScript, Tesseract, and PDF helper classes. """ self.config = {}
[docs] def _get_config_file(self, config_file): """ Read in the yaml config file :param config_file: Configuration file (YAML format) :type config_file: file :returns: dict of yaml file :rtype: dict """ with config_file: myconfig = yaml.load(config_file) return myconfig
[docs] def get_options(self, argv): """ Parse the command-line options and set the following object properties: :param argv: usually just sys.argv[1:] :returns: Nothing :ivar debug: Enable logging debug statements :ivar verbose: Enable verbose logging :ivar enable_filing: Whether to enable post-OCR filing of PDFs :ivar pdf_filename: Filename for single conversion mode :ivar watch_dir: Directory to watch for files to convert :ivar config: Dict of the config file :ivar watch: Whether folder watching mode is turned on :ivar enable_evernote: Enable filing to evernote """ p = argparse.ArgumentParser( description = "Convert scanned PDFs into their OCR equivalent. Depends on GhostScript and Tesseract-OCR being installed.", epilog = "PyPDFOCR version %s (Copyright 2013 Virantha Ekanayake)" % __version__, ) p.add_argument('-d', '--debug', action='store_true', default=False, dest='debug', help='Turn on debugging') p.add_argument('-v', '--verbose', action='store_true', default=False, dest='verbose', help='Turn on verbose mode') p.add_argument('-m', '--mail', action='store_true', default=False, dest='mail', help='Send email after conversion') p.add_argument('-l', '--lang', default='eng', dest='lang', help='Language(default eng)') p.add_argument('--preprocess', action='store_true', default=False, dest='preprocess', help='Enable preprocessing. Not really useful now with improved Tesseract 3.04+') p.add_argument('--skip-preprocess', action='store_true', default=False, dest='skip_preprocess', help='DEPRECATED: always skips now.') #--------- # Single or watch mode #-------- single_or_watch_group = p.add_mutually_exclusive_group(required=True) # Positional argument for single file conversion single_or_watch_group.add_argument("pdf_filename", nargs="?", help="Scanned pdf file to OCR") # Watch directory for watch mode single_or_watch_group.add_argument('-w', '--watch', dest='watch_dir', help='Watch given directory and run ocr automatically until terminated') #----------- # Filing options #---------- filing_group = p.add_argument_group(title="Filing optinos") filing_group.add_argument('-f', '--file', action='store_true', default=False, dest='enable_filing', help='Enable filing of converted PDFs') #filing_group.add_argument('-c', '--config', type = argparse.FileType('r'), filing_group.add_argument('-c', '--config', type = lambda x: open_file_with_timeout(p,x), dest='configfile', help='Configuration file for defaults and PDF filing') filing_group.add_argument('-e', '--evernote', action='store_true', default=False, dest='enable_evernote', help='Enable filing to Evernote') filing_group.add_argument('-n', action='store_true', default=False, dest='match_using_filename', help='Use filename to match if contents did not match anything, before filing to default folder') # Add flow option to single mode extract_images,preprocess,ocr,write args = p.parse_args(argv) self.debug = args.debug self.verbose = args.verbose self.pdf_filename = args.pdf_filename self.lang = args.lang self.watch_dir = args.watch_dir self.enable_email = args.mail self.match_using_filename = args.match_using_filename # Deprecating skip_preprocess to make skipping the default (always true). Tesseract 3.04 is so much better now # at handling non-ideal inputs and lines if args.skip_preprocess: print("Warning: --skip_preprocess is not needed anymore (defaults to skipping preprocessing). If you want to enable preprocessing, use the new --preprocess option") self.skip_preprocess = True if args.preprocess: self.skip_preprocess = False if self.debug: logging.basicConfig(level=logging.DEBUG, format='%(message)s') if self.verbose: logging.basicConfig(level=logging.INFO, format='%(message)s') # Parse configuration file (YAML) if specified if args.configfile: self.config = self._get_config_file(args.configfile) logging.debug("Read in configuration file") logging.debug(self.config) if args.enable_evernote: self.enable_evernote = True else: self.enable_evernote = False if args.enable_filing or args.enable_evernote: self.enable_filing = True if not args.configfile: p.error("Please specify a configuration file(CONFIGFILE) to enable filing") else: self.enable_filing = False self.watch = False if args.watch_dir: logging.debug("Starting to watch") self.watch = True if self.enable_email: if not args.configfile: p.error("Please specify a configuration file(CONFIGFILE) to enable email")
[docs] def _clean_up_files(self, files): """ Helper function to delete files :param files: List of files to delete :type files: list :returns: None """ for f in files: try: os.remove(f) except: logging.debug("Error removing file %s .... continuing" % f)
[docs] def _setup_filing(self): """ Instance the proper PyFiler object (either :class:`pypdfocr.pypdfocr_filer_dirs.PyFilerDirs` or :class:`pypdfocr.pypdfocr_filer_evernote.PyFilerEvernote`) TODO: Make this more generic to allow third-party plugin filing objects :ivar filer: :class:`pypdfocr.pypdfocr_filer.PyFiler` PyFiler subclass object that is instantiated :ivar pdf_filer: :class:`pypdfocr.pypdfocr_pdffiler.PyPdfFiler` object to help with PDF reading :returns: Nothing """ # Look at self.config and create a self.pdf_filer object # -------------------------------------------------- # Some sanity checks # -------------------------------------------------- assert(self.config and self.enable_filing) for required in ['target_folder', 'default_folder']: if not required in self.config: error ("%s must be specified in config file" % required) else: # Make sure these required folders are in abspath format self.config[required] = os.path.abspath(self.config[required]) if 'original_move_folder' in self.config: # User wants to move the original after filing orig = 'original_move_folder' self.config[orig] = os.path.abspath(self.config[orig]) if not os.path.exists(self.config[orig]): os.makedirs(self.config[orig]) original_move_folder = self.config[orig] else: original_move_folder = None # -------------------------------------------------- # Start the filing object # -------------------------------------------------- if self.enable_evernote: self.filer = PyFilerEvernote(self.config['evernote_developer_token']) else: self.filer = PyFilerDirs() self.filer.target_folder = self.config['target_folder'] self.filer.default_folder = self.config['default_folder'] self.filer.original_move_folder = original_move_folder self.pdf_filer = PyPdfFiler(self.filer) if self.match_using_filename: print("Matching using filename as a fallback to pdf contents") self.pdf_filer.file_using_filename = True # ------------------------------ # Add all the folder names with associated keywords # to the filer object # ------------------------------ keyword_count = 0 folder_count = 0 if 'folders' in self.config: for folder, keywords in self.config['folders'].items(): folder_count +=1 keyword_count += len(keywords) # Make sure keywords are lower-cased before adding keywords = [str(x).lower() for x in keywords] self.filer.add_folder_target(folder, keywords) print ("Filing of PDFs is enabled") print (" - %d target filing folders" % (folder_count)) print (" - %d keywords" % (keyword_count))
[docs] def _setup_external_tools(self): """ Instantiate the external tool wrappers with their config dicts """ self.gs = PyGs(self.config.get('ghostscript',{})) self.ts = PyTesseract(self.config.get('tesseract',{})) self.pdf = PyPdf(self.gs) self.preprocess = PyPreprocess(self.config.get('preprocess', {})) return
[docs] def run_conversion(self, pdf_filename): """ Does the following: - Convert the PDF using GhostScript to TIFF and JPG - Run Tesseract on the TIFF to extract the text into HOCR (html) - Use PDF generator to overlay the text on the JPG and output a new PDF - Clean up temporary image files :param pdf_filename: Scanned PDF :type pdf_filename: string :returns: OCR'ed PDF :rtype: filename string """ print ("Starting conversion of %s" % pdf_filename) try: # Make the images for Tesseract img_dpi, glob_img_filename = self.gs.make_img_from_pdf(pdf_filename) fns = glob.glob(glob_img_filename) except Exception: raise try: # Preprocess if not self.skip_preprocess: preprocess_imagefilenames = self.preprocess.preprocess(fns) else: logging.info("Skipping preprocess step") preprocess_imagefilenames = fns # Run teserract self.ts.lang = self.lang hocr_filenames = self.ts.make_hocr_from_pnms(preprocess_imagefilenames) # Generate new pdf with overlayed text #ocr_pdf_filename = self.pdf.overlay_hocr(tiff_dpi, hocr_filename, pdf_filename) ocr_pdf_filename = self.pdf.overlay_hocr_pages(img_dpi, hocr_filenames, pdf_filename) finally: # Clean up the files time.sleep(1) if not self.debug: # Need to clean up the original image files before preprocessing if locals().has_key("fns"): # Have to check if this was set before exception raised logging.info("Cleaning up %s" % fns) self._clean_up_files(fns) if locals().has_key("preprocess_imagefilenames"): # Have to check if this was set before exception raised logging.info("Cleaning up %s" % preprocess_imagefilenames) self._clean_up_files(preprocess_imagefilenames) # splat the hocr_filenames as it is a list of pairs for ext in [".hocr", ".html", ".txt"]: fns_to_remove = [os.path.splitext(fn)[0]+ext for fn in preprocess_imagefilenames] logging.info("Cleaning up %s" % fns_to_remove) self._clean_up_files(fns_to_remove) # splat the hocr_filenames as it is a list of pairs # clean up the hocr input (jpg) and output (html) files #self._clean_up_files(itertools.chain(*hocr_filenames)) # splat the hocr_filenames as it is a list of pairs # Seems like newer tessearct > 3.03 is now creating .txt files with the OCR text?/? #self._clean_up_files([x[1].replace(".hocr", ".txt") for x in hocr_filenames]) print ("Completed conversion successfully to %s" % ocr_pdf_filename) return ocr_pdf_filename
[docs] def file_converted_file(self, ocr_pdffilename, original_pdffilename): """ move the converted filename to its destiantion directory. Optionally also moves the original PDF. :param ocr_pdffilename: Converted PDF file :type ocr_pdffilename: filename string :param original_pdffilename: Original scanned PDF file :type original_pdffilename: filename string :returns: Target folder name "rtype: string """ filed_path = self.pdf_filer.move_to_matching_folder(ocr_pdffilename) print("Filed %s to %s as %s" % (ocr_pdffilename, os.path.dirname(filed_path), os.path.basename(filed_path))) tgt_path = self.pdf_filer.file_original(original_pdffilename) if tgt_path != original_pdffilename: print("Filed original file %s to %s as %s" % (original_pdffilename, os.path.dirname(tgt_path), os.path.basename(tgt_path))) return os.path.dirname(filed_path)
[docs] def _send_email(self, infilename, outfilename, filing ): """ Send email using smtp """ print("Sending email status") from_addr = self.config["mail_from_addr"] to_addr_list = self.config["mail_to_list"] smtpserver = self.config["mail_smtp_server"] login = self.config["mail_smtp_login"] password = self.config["mail_smtp_password"] subject = "PyPDFOCR converted: %s" % (os.path.basename(outfilename)) header = 'From: %s\n' % login header += 'To: %s\n' % ','.join(to_addr_list) header += 'Subject: %s\n\n' % subject message = """ PyPDFOCR Conversion: -------------------- Original file: %s Converted file: %s Filing: %s """ % (infilename, outfilename, filing) message = header + message server = smtplib.SMTP(smtpserver) server.starttls() server.login(login,password) problems = server.sendmail(from_addr, to_addr_list, message) server.quit()
[docs] def go(self, argv): """ The main entry point into PyPDFOCR #. Parses options #. If filing is enabled, call :func:`_setup_filing` #. If watch is enabled, start the watcher #. :func:`run_conversion` #. if filing is enabled, call :func:`file_converted_file` """ # Read the command line options self.get_options(argv) # Setup tesseract and ghostscript self._setup_external_tools() # Setup the pdf filing if enabled if self.enable_filing: self._setup_filing() # Do the actual conversion followed by optional filing and email if self.watch: while True: # Make sure the watcher doesn't terminate try: py_watcher = PyPdfWatcher(self.watch_dir, self.config.get('watch')) for pdf_filename in py_watcher.start(): self._convert_and_file_email(pdf_filename) except KeyboardInterrupt: break except Exception as e: print traceback.print_exc(e) py_watcher.stop() else: self._convert_and_file_email(self.pdf_filename)
[docs] def _convert_and_file_email(self, pdf_filename): """ Helper function to run the conversion, then do the optional filing, and optional emailing. """ ocr_pdffilename = self.run_conversion(pdf_filename) if self.enable_filing: filing = self.file_converted_file(ocr_pdffilename, pdf_filename) else: filing = "None" if self.enable_email: self._send_email(pdf_filename, ocr_pdffilename, filing)
[docs]def main(): # pragma: no cover multiprocessing.freeze_support() script = PyPDFOCR() script.go(sys.argv[1:])
if __name__ == '__main__': main()