Coverage for /Users/virantha/dev/ocr/pypdfocr/pypdfocr: 90%

print("Warning: --skip_preprocess is not needed anymore (defaults to skipping preprocessing). If you want to enable preprocessing, use the new --preprocess option")

self.skip_preprocess = True

if args.preprocess:

self.skip_preprocess = False

if self.debug:

logging.basicConfig(level=logging.DEBUG, format='%(message)s')

if self.verbose:

logging.basicConfig(level=logging.INFO, format='%(message)s')

# Parse configuration file (YAML) if specified

if args.configfile:

self.config = self._get_config_file(args.configfile)

logging.debug("Read in configuration file")

logging.debug(self.config)

if args.enable_evernote:

self.enable_evernote = True

else:

self.enable_evernote = False

if args.enable_filing or args.enable_evernote:

self.enable_filing = True

if not args.configfile:

p.error("Please specify a configuration file(CONFIGFILE) to enable filing")

else:

self.enable_filing = False

self.watch = False

if args.watch_dir:

logging.debug("Starting to watch")

self.watch = True

if self.enable_email:

if not args.configfile:

p.error("Please specify a configuration file(CONFIGFILE) to enable email")

def _clean_up_files(self, files):

"""

Helper function to delete files

:param files: List of files to delete

:type files: list

:returns: None

"""

for f in files:

try:

os.remove(f)

except:

logging.debug("Error removing file %s .... continuing" % f)

def _setup_filing(self):
    """
    Create the filing objects described by the configuration file.

    Instantiates the proper PyFiler object (either
    :class:`pypdfocr.pypdfocr_filer_dirs.PyFilerDirs` or
    :class:`pypdfocr.pypdfocr_filer_evernote.PyFilerEvernote`) and wraps
    it in a :class:`pypdfocr.pypdfocr_pdffiler.PyPdfFiler`.

    TODO: Make this more generic to allow third-party plugin filing objects

    :ivar filer: :class:`pypdfocr.pypdfocr_filer.PyFiler` PyFiler subclass object that is instantiated
    :ivar pdf_filer: :class:`pypdfocr.pypdfocr_pdffiler.PyPdfFiler` object to help with PDF reading
    :returns: Nothing
    """
    # --------------------------------------------------
    # Sanity checks on the configuration
    # --------------------------------------------------
    assert(self.config and self.enable_filing)

    for key in ('target_folder', 'default_folder'):
        if key in self.config:
            # Required folders are always stored as absolute paths
            self.config[key] = os.path.abspath(self.config[key])
        else:
            error("%s must be specified in config file" % key)

    # Optional folder that the original PDF is moved to after filing;
    # it is created on demand.
    move_key = 'original_move_folder'
    original_move_folder = None
    if move_key in self.config:
        move_dir = os.path.abspath(self.config[move_key])
        self.config[move_key] = move_dir
        if not os.path.exists(move_dir):
            os.makedirs(move_dir)
        original_move_folder = move_dir

    # --------------------------------------------------
    # Create the filing object and hand it its folders
    # --------------------------------------------------
    if self.enable_evernote:
        self.filer = PyFilerEvernote(self.config['evernote_developer_token'])
    else:
        self.filer = PyFilerDirs()

    self.filer.target_folder = self.config['target_folder']
    self.filer.default_folder = self.config['default_folder']
    self.filer.original_move_folder = original_move_folder

    self.pdf_filer = PyPdfFiler(self.filer)
    if self.match_using_filename:
        print("Matching using filename as a fallback to pdf contents")
        self.pdf_filer.file_using_filename = True

    # ------------------------------
    # Register every configured folder together with its
    # (lower-cased) keywords on the filer object
    # ------------------------------
    folder_count = 0
    keyword_count = 0
    if 'folders' in self.config:
        for folder_name, raw_keywords in self.config['folders'].items():
            folder_count += 1
            keyword_count += len(raw_keywords)
            lowered = [str(word).lower() for word in raw_keywords]
            self.filer.add_folder_target(folder_name, lowered)

    print ("Filing of PDFs is enabled")
    print (" - %d target filing folders" % (folder_count))
    print (" - %d keywords" % (keyword_count))

def _setup_external_tools(self):
    """
    Create the wrapper objects for the external tools.

    Each wrapper receives its own section of ``self.config`` (an empty
    dict when the section is absent); the PDF wrapper is built on top of
    the Ghostscript wrapper.
    """
    cfg = self.config
    self.gs = PyGs(cfg.get('ghostscript', {}))
    self.ts = PyTesseract(cfg.get('tesseract', {}))
    self.pdf = PyPdf(self.gs)
    self.preprocess = PyPreprocess(cfg.get('preprocess', {}))

def run_conversion(self, pdf_filename):
    """
    Does the following:

    - Convert the PDF using GhostScript to TIFF and JPG
    - Run Tesseract on the TIFF to extract the text into HOCR (html)
    - Use PDF generator to overlay the text on the JPG and output a new PDF
    - Clean up temporary image files

    Temporary files are removed even when a step fails, unless
    ``self.debug`` is set.

    :param pdf_filename: Scanned PDF
    :type pdf_filename: string
    :returns: OCR'ed PDF
    :rtype: filename string
    :raises: Any exception raised by the external tool wrappers is
             propagated after the cleanup has run.
    """
    print ("Starting conversion of %s" % pdf_filename)

    # Make the images for Tesseract.  A failure here propagates as-is;
    # the cleanup below is only entered once the image list exists.
    img_dpi, glob_img_filename = self.gs.make_img_from_pdf(pdf_filename)
    fns = glob.glob(glob_img_filename)

    # Stays None until the preprocess step has yielded a file list, so the
    # cleanup can tell whether there is anything of that stage to delete
    # (replaces the old ``locals().has_key(...)`` probing, which does not
    # exist on Python 3).
    preprocess_imagefilenames = None
    try:
        # Preprocess
        if self.skip_preprocess:
            logging.info("Skipping preprocess step")
            preprocess_imagefilenames = fns
        else:
            preprocess_imagefilenames = self.preprocess.preprocess(fns)

        # Run tesseract
        self.ts.lang = self.lang
        hocr_filenames = self.ts.make_hocr_from_pnms(preprocess_imagefilenames)

        # Generate new pdf with overlayed text
        ocr_pdf_filename = self.pdf.overlay_hocr_pages(img_dpi, hocr_filenames, pdf_filename)
    finally:
        # Clean up the files
        # NOTE(review): the one second pause presumably lets the external
        # tools release their files first -- confirm before removing.
        time.sleep(1)
        if not self.debug:
            # Original image files produced before preprocessing
            logging.info("Cleaning up %s", fns)
            self._clean_up_files(fns)

            if preprocess_imagefilenames is not None:
                logging.info("Cleaning up %s", preprocess_imagefilenames)
                self._clean_up_files(preprocess_imagefilenames)
                # Side files that share the image's base name; the .txt
                # variant covers newer tesseract releases (> 3.03) that
                # also write the plain OCR text.
                for ext in [".hocr", ".html", ".txt"]:
                    fns_to_remove = [os.path.splitext(fn)[0] + ext
                                     for fn in preprocess_imagefilenames]
                    logging.info("Cleaning up %s", fns_to_remove)
                    self._clean_up_files(fns_to_remove)

    print ("Completed conversion successfully to %s" % ocr_pdf_filename)
    return ocr_pdf_filename

def file_converted_file(self, ocr_pdffilename, original_pdffilename):
    """ Move the converted file to its destination directory. Optionally also
    moves the original PDF.

    :param ocr_pdffilename: Converted PDF file
    :type ocr_pdffilename: filename string
    :param original_pdffilename: Original scanned PDF file
    :type original_pdffilename: filename string
    :returns: Target folder name
    :rtype: string
    """
    filed_path = self.pdf_filer.move_to_matching_folder(ocr_pdffilename)
    filed_dir, filed_name = os.path.split(filed_path)
    print("Filed %s to %s as %s" % (ocr_pdffilename, filed_dir, filed_name))

    # The filer hands back the path of the original; it only differs from
    # the input when the original was actually moved.
    tgt_path = self.pdf_filer.file_original(original_pdffilename)
    if tgt_path != original_pdffilename:
        tgt_dir, tgt_name = os.path.split(tgt_path)
        print("Filed original file %s to %s as %s" % (original_pdffilename, tgt_dir, tgt_name))

    return filed_dir

def _send_email(self, infilename, outfilename, filing):
    """
    Send a status email about one conversion using smtp (with STARTTLS).

    Reads ``mail_from_addr``, ``mail_to_list``, ``mail_smtp_server`` and
    ``mail_smtp_password`` from ``self.config``.

    :param infilename: Original PDF file
    :type infilename: filename string
    :param outfilename: Converted PDF file
    :type outfilename: filename string
    :param filing: Filing result that is reported in the mail body
    :type filing: string
    :returns: None
    :raises smtplib.SMTPException: if the connection, login or delivery fails
    """
    print("Sending email status")
    from_addr = self.config["mail_from_addr"]
    to_addr_list = self.config["mail_to_list"]
    smtpserver = self.config["mail_smtp_server"]
    password = self.config["mail_smtp_password"]
    # ``login`` used to be referenced without ever being assigned, which
    # raised NameError on every call.
    # NOTE(review): the "mail_smtp_login" key is an assumption -- confirm
    # it against the config file schema.  Falls back to the sender address.
    login = self.config.get("mail_smtp_login", from_addr)

    subject = "PyPDFOCR converted: %s" % (os.path.basename(outfilename))
    header = 'From: %s\n' % login
    header += 'To: %s\n' % ','.join(to_addr_list)
    header += 'Subject: %s\n\n' % subject
    body = (
        "\n"
        "PyPDFOCR Conversion:\n"
        "--------------------\n"
        "Original file: %s\n"
        "Converted file: %s\n"
        "Filing: %s\n"
    ) % (infilename, outfilename, filing)
    message = header + body

    server = smtplib.SMTP(smtpserver)
    try:
        server.starttls()
        server.login(login, password)
        # sendmail returns a dict with one entry per refused recipient
        problems = server.sendmail(from_addr, to_addr_list, message)
    finally:
        # Always release the connection, also when login/delivery failed
        try:
            server.quit()
        except smtplib.SMTPServerDisconnected:
            pass  # connection is already gone; nothing left to close

    if problems:
        logging.warning("Email was refused for some recipients: %s", problems)

def go(self, argv):
    """
    The main entry point into PyPDFOCR

    #. Parses options
    #. If filing is enabled, call :func:`_setup_filing`
    #. If watch is enabled, start the watcher
    #. :func:`run_conversion`
    #. if filing is enabled, call :func:`file_converted_file`

    :param argv: Command line arguments (without the program name)
    :type argv: list
    :returns: None
    """
    # Read the command line options
    self.get_options(argv)

    # Setup tesseract and ghostscript
    self._setup_external_tools()

    # Setup the pdf filing if enabled
    if self.enable_filing:
        self._setup_filing()

    # Do the actual conversion followed by optional filing and email
    if not self.watch:
        self._convert_and_file_email(self.pdf_filename)
        return

    while True:  # Make sure the watcher doesn't terminate
        # Reset on every round so the error handler never stops a watcher
        # left over from a previous iteration (or hits an unbound name
        # when the constructor itself fails).
        py_watcher = None
        try:
            py_watcher = PyPdfWatcher(self.watch_dir, self.config.get('watch'))
            for pdf_filename in py_watcher.start():
                self._convert_and_file_email(pdf_filename)
        except KeyboardInterrupt:
            break
        except Exception:
            # print_exc() reports the exception currently being handled;
            # it takes no exception argument and returns None.
            traceback.print_exc()
            if py_watcher is not None:
                py_watcher.stop()

def _convert_and_file_email(self, pdf_filename):

"""

Helper function to run the conversion, then do the optional filing, and optional emailing.

"""

ocr_pdffilename = self.run_conversion(pdf_filename)

if self.enable_filing:

filing = self.file_converted_file(ocr_pdffilename, pdf_filename)

else:

filing = "None"

if self.enable_email:

self._send_email(pdf_filename, ocr_pdffilename, filing)

def main(): # pragma: no cover
    """Console entry point: run PyPDFOCR on the command line arguments."""
    # Must run before anything else so frozen Windows executables can
    # spawn worker processes (see the multiprocessing documentation).
    multiprocessing.freeze_support()
    PyPDFOCR().go(sys.argv[1:])

# Only start the conversion when executed as a script, not on import
if __name__ == "__main__":
    main()

Coverage for /Users/virantha/dev/ocr/pypdfocr/pypdfocr : 90%

224 statements 202 run 22 missing 4 excluded