Source code for pypdfocr.pypdfocr_pdf

#!/usr/bin/env python2.7
# Copyright 2013 Virantha Ekanayake All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# Following code is adapted and modified from hocr-pdf.py released under
# Apache License, Version 2.0 available at 
# https://code.google.com/p/hocr-tools/source/browse/hocr-pdf
#   - Code was improved to allow multi-page hocr files
"""
    Wrap pdf generation and text addition code
"""

from optparse import OptionParser
import sys, os
import re
import logging
import shutil
import time
import tempfile
import glob

import cStringIO
import base64
import zlib
import math

from cgi import escape
# Pkg to read multiple image tiffs
from PIL import Image
from reportlab.pdfgen.canvas import Canvas
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from xml.etree.ElementTree import ElementTree, ParseError
import xml.etree

# Import Pypdf2
from PyPDF2 import PdfFileMerger, PdfFileReader, PdfFileWriter, utils

from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.enums import TA_LEFT
from reportlab.platypus.paragraph import Paragraph

from pypdfocr_util import Retry
from functools import partial

class RotatedPara(Paragraph):
    """
        Paragraph that is drawn with the canvas rotated by a fixed angle.

        The rotation is applied to the canvas itself, since the low-level
        rotate method on text objects does not seem to do anything.
    """

    def __init__(self, text, style, angle):
        Paragraph.__init__(self, text, style)
        # Angle handed to the canvas' rotate() call in draw()
        self.angle = angle

    def draw(self):
        """Draw the paragraph inside a saved canvas state rotated by self.angle."""
        canvas = self.canv
        canvas.saveState()
        canvas.translate(0, 0)
        canvas.rotate(self.angle)
        Paragraph.draw(self)
        canvas.restoreState()

    def beginText(self, x, y):
        """Return a canvas text object positioned at (x, y) in render mode 3."""
        text_object = self.canv.beginText(x, y)
        # Render mode 3 hides the text; set to zero if you want the text to appear
        text_object.setTextRenderMode(3)
        return text_object

[docs]class PyPdf(object):
    """Class to create pdfs from images"""
    # Some regexes to compile once
    regex_bbox = re.compile('bbox((\s+\d+){4})')
    regex_baseline = re.compile('baseline((\s+[\d\.\-]+){2})')
    regex_fontspec = re.compile('x_font\s+(.+);\s+x_fsize\s+(\d+)')
    regex_textangle = re.compile('textangle\s+(\d+)')

    def __init__(self, gs):
        self.gs = gs # Pointer to ghostscript object


[docs]    def get_transform(self, rotation, tx, ty):
        # Code taken from here:
        # http://stackoverflow.com/questions/6041244/how-to-merge-two-landscape-pdf-pages-using-pypdf/17392824#17392824
        # Unclear why PyPDF2 builtin page rotation functions don't work
        translation = [[1, 0, 0],
                       [0, 1, 0],
                       [-tx,-ty,1]]
        rotation = math.radians(rotation)
        rotating = [[math.cos(rotation), math.sin(rotation),0],
                    [-math.sin(rotation),math.cos(rotation), 0],
                    [0,                  0,                  1]]
        rtranslation = [[1, 0, 0],
                       [0, 1, 0],
                       [tx,ty,1]]
        ctm = utils.matrixMultiply(translation, rotating)
        ctm = utils.matrixMultiply(ctm, rtranslation)

        return ctm[0][0], ctm[0][1], ctm[1][0], ctm[1][1], ctm[2][0], ctm[2][1]

[docs]    def mergeRotateAroundPointPage(self,page, page2, rotation, tx, ty):
        # Code taken from here:
        # http://stackoverflow.com/questions/6041244/how-to-merge-two-landscape-pdf-pages-using-pypdf/17392824#17392824
        # Unclear why PyPDF2 builtin page rotation functions don't work
        translation = [[1, 0, 0],
                       [0, 1, 0],
                       [-tx,-ty,1]]
        rotation = math.radians(rotation)
        rotating = [[math.cos(rotation), math.sin(rotation),0],
                    [-math.sin(rotation),math.cos(rotation), 0],
                    [0,                  0,                  1]]
        rtranslation = [[1, 0, 0],
                       [0, 1, 0],
                       [tx,ty,1]]
        ctm = utils.matrixMultiply(translation, rotating)
        ctm = utils.matrixMultiply(ctm, rtranslation)

        return page.mergeTransformedPage(page2, [ctm[0][0], ctm[0][1],
                                                 ctm[1][0], ctm[1][1],
                                                 ctm[2][0], ctm[2][1]])

[docs]    def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename):
        
        logging.debug("Going to overlay following files onto %s" % orig_pdf_filename)
        # Sort the hocr_filenames into natural keys!
        hocr_filenames.sort(key=lambda x: self.natural_keys(x[0] ))
        logging.debug(hocr_filenames)

        pdf_dir, pdf_basename = os.path.split(orig_pdf_filename)
        basename = os.path.splitext(pdf_basename)[0]
        pdf_filename = os.path.join(pdf_dir, "%s_ocr.pdf" % (basename))

        text_pdf_filenames = []
        for img_filename, hocr_filename in hocr_filenames:
            text_pdf_filename = self.overlay_hocr_page(dpi, hocr_filename, img_filename)
            logging.info("Created temp OCR'ed pdf containing only the text as %s" % (text_pdf_filename))
            text_pdf_filenames.append(text_pdf_filename)

        # Now, concatenate this text_pdfs into one single file.
        # This is a hack to save memory/running time when we have to do the actual merge with a writer

        all_text_filename = os.path.join(pdf_dir, "%s_text.pdf" % (basename))
        merger = PdfFileMerger()
        for text_pdf_filename in text_pdf_filenames:
            merger.append(PdfFileReader(file(text_pdf_filename, 'rb')))
        merger.write(all_text_filename)
        merger.close()
	del merger


        writer = PdfFileWriter()
        orig = open(orig_pdf_filename, 'rb')
        text_file = open(all_text_filename, 'rb')

        for orig_pg, text_pg in zip(self.iter_pdf_page(orig), self.iter_pdf_page(text_file)):
            orig_pg = self._get_merged_single_page(orig_pg, text_pg)
            writer.addPage(orig_pg)

        with open(pdf_filename, 'wb') as f:
            # Flush out this page merge so we can close the text_file
            writer.write(f)

        orig.close()
        text_file.close()

        # Windows sometimes locks the temp text file for no reason, so we need to retry a few times to delete
        for fn in text_pdf_filenames:
            #os.remove(fn)
            Retry(partial(os.remove, fn), tries=10, pause=3).call_with_retry() 

        os.remove(all_text_filename)
        logging.info("Created OCR'ed pdf as %s" % (pdf_filename))

        return pdf_filename

[docs]    def _get_merged_single_page(self, original_page, ocr_text_page):
        """
            Take two page objects, rotate the text page if necessary, and return the merged page
        """
        orig_rotation_angle = int(original_page.get('/Rotate', 0))

        if orig_rotation_angle != 0:
            logging.info("Original Rotation: %s" % orig_rotation_angle)
            self.mergeRotateAroundPointPage(original_page, ocr_text_page, orig_rotation_angle, ocr_text_page.mediaBox.getWidth()/2, ocr_text_page.mediaBox.getWidth()/2)
            # None of these commands worked for me:
            #orig_pg.rotateCounterClockwise(orig_rotation_angle)
            #orig_pg.mergeRotatedPage(text_pg,orig_rotation_angle)
        else:
            original_page.mergePage(ocr_text_page)
        original_page.compressContentStreams()
        return original_page


[docs]    def _get_img_dims(self, img_filename):
        """
            :rval: (width, height, dpi)
        """
        img = Image.open(img_filename)
        w,h = img.size
        dpi = img.info['dpi']
        width = w*72.0/dpi[0]
        height = h*72.0/dpi[1]
        del img
        return (width, height, dpi)

[docs]    def overlay_hocr_page(self, dpi, hocr_filename, img_filename):
        hocr_dir, hocr_basename = os.path.split(hocr_filename)
        img_dir, img_basename = os.path.split(img_filename)
        logging.debug("hocr_filename:%s, hocr_dir:%s, hocr_basename:%s" % (hocr_filename, hocr_dir, hocr_basename))
        assert(img_dir == hocr_dir)

        #basename = hocr_basename.split('.')[0]
        basename = os.path.splitext(hocr_basename)[0]
        pdf_filename = os.path.join("text_%s_ocr.pdf" % (basename))

        # Switch to the hocr directory to make this easier
        cwd = os.getcwd()
        if hocr_dir != "":
            os.chdir(hocr_dir)

        with open(pdf_filename, "wb") as f:
            logging.info("Overlaying hocr and creating text pdf %s" % pdf_filename)
            pdf = Canvas(f, pageCompression=1)
            pdf.setCreator('pypdfocr')
            pdf.setTitle(os.path.basename(hocr_filename))
            pdf.setPageCompression(1)

            width, height, dpi_jpg = self._get_img_dims(img_basename)
            pdf.setPageSize((width,height))
            logging.info("Page width=%f, height=%f" % (width, height))

            pg_num = 1

            logging.info("Adding text to page %s" % pdf_filename)
            self.add_text_layer(pdf,hocr_basename,pg_num,height,dpi)
            pdf.showPage()
            pdf.save()

        os.chdir(cwd)
        return os.path.join(hocr_dir, pdf_filename)

[docs]    def iter_pdf_page(self, f):
        reader = PdfFileReader(f)
        for pgnum in range(reader.getNumPages()):
            pg = reader.getPage(pgnum)
            yield pg

[docs]    def _atoi(self,text):
        return int(text) if text.isdigit() else text

[docs]    def natural_keys(self, text):
        '''
        alist.sort(key=natural_keys) sorts in human order
        http://nedbatchelder.com/blog/200712/human_sorting.html
        (See Toothy's implementation in the comments)
        '''
        return [ self._atoi(c) for c in re.split('(\d+)', text) ]

[docs]    def add_text_layer(self,pdf, hocrfile, page_num,height, dpi):
      """Draw an invisible text layer for OCR data.

        This function really needs to get cleaned up
        
      """
      hocr = ElementTree()
      try: 
        # It's possible tesseract has failed and written garbage to this hocr file, so we need to catch any exceptions
          hocr.parse(hocrfile)
      except Exception:
          logging.info("Error loading hocr, not adding any text")
          return 

      logging.debug(xml.etree.ElementTree.tostring(hocr.getroot()))
      for c in hocr.getroot():  # Find the <body> tag
          if c.tag != 'body':
              continue
      for page in c: # Each child in the body is a page tag
          if (page.attrib['class'] != "ocr_page"):
              assert ("Why is this hocr not paging properly??")
          if page.attrib['id'] == 'page_%d' %(page_num):
              break

      for line in page.findall(".//{http://www.w3.org/1999/xhtml}span"):
      #for line in page.findall(".//span"):
        if line.attrib['class'] != 'ocr_line':
          continue
        linebox = self.regex_bbox.search(line.attrib['title']).group(1).split()
        textangle = self.regex_textangle.search(line.attrib['title'])
        if textangle:
            textangle = self._atoi(textangle.group(1))
        else:
            textangle = 0

        try:
          baseline = self.regex_baseline.search(line.attrib['title']).group(1).split()
        except AttributeError:
          baseline = [ 0, 0 ]

        linebox = [float(i) for i in linebox]
        baseline = [float(i) for i in baseline]

        for word in line:
          if word.attrib['class'] != 'ocrx_word':
            continue
          word_text = []
          for child in word.iter():
              if child.text:
                  word_text.append(child.text)
          word.text = ' '.join(word_text)
          if word.text is None:
            continue
          logging.debug("word: %s, angle: %d" % ( word.text.strip(), textangle))


          box = self.regex_bbox.search(word.attrib['title']).group(1).split()
          #b = self.polyval(baseline, (box[0] + box[2]) / 2 - linebox[0]) + linebox[3]
          box = [float(i) for i in box]

          # Transform angle to x,y co-ords needed for proper text placement
          # We only support 0, 90, 180, 270!.  Anything else, we'll just use the normal orientation for now

          coords = { 0: (box[0], box[1]),
                    90: (box[0], box[3]),  # facing right
                    180: (box[2], box[3]), # upside down
                    270: (box[2], box[1]), # facing left
                    }
          x,y = coords.get(textangle, (box[0], box[1]))

          style = getSampleStyleSheet()
          normal = style["BodyText"]
          normal.alignment = TA_LEFT
          normal.leading = 0
          font_name, font_size = self._get_font_spec(word.attrib['title'])
          normal.fontName = "Helvetica"
          normal.fontSize = font_size

          para = RotatedPara(escape(word.text.strip()), normal, textangle)
          para.wrapOn(pdf, para.minWidth(), 100)  # Not sure what to use as the height  here
          para.drawOn(pdf, x*72/dpi, height - y*72/dpi)



[docs]    def polyval(self,poly, x):
      return x * poly[0] + poly[1]


[docs]    def _get_font_spec(self, tag):
        try:
            fontspec = self.regex_fontspec.search(tag).groups()
            fontname, fontsize = fontspec
        except Exception:
            fontname = ""
            fontsize = "8"
        return (fontname, self._atoi(fontsize))
Navigation

Quick search

Source code for pypdfocr.pypdfocr_pdf

Navigation