# Source code for pypdfocr.pypdfocr_pdf

#!/usr/bin/env python2.7
# Copyright 2013 Virantha Ekanayake All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# Following code is adapted and modified from hocr-pdf.py released under
# Apache License, Version 2.0 available at 
# https://code.google.com/p/hocr-tools/source/browse/hocr-pdf
#   - Code was improved to allow multi-page hocr files
"""
    Wrap pdf generation and text addition code
"""

from optparse import OptionParser
import sys, os
import re
import logging
import shutil
import time
import tempfile
import glob

import cStringIO
import base64
import zlib
import math

from cgi import escape
# Pkg to read multiple image tiffs
from PIL import Image
from reportlab.pdfgen.canvas import Canvas
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from xml.etree.ElementTree import ElementTree, ParseError
import xml.etree

# Import Pypdf2
from PyPDF2 import PdfFileMerger, PdfFileReader, PdfFileWriter, utils

from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.enums import TA_LEFT
from reportlab.platypus.paragraph import Paragraph

from pypdfocr_util import Retry
from functools import partial

class RotatedPara(Paragraph):
    """Paragraph flowable that is drawn rotated by a fixed angle.

    This subclass exists because the low-level rotate method on text
    objects does not seem to have any effect, so the rotation is applied
    to the canvas around the normal paragraph drawing instead.

    :param text: paragraph markup handed straight to ``Paragraph``
    :param style: paragraph style handed straight to ``Paragraph``
    :param angle: rotation passed to the canvas ``rotate`` call in ``draw``
    """

    def __init__(self, text, style, angle):
        Paragraph.__init__(self, text, style)
        self.angle = angle

    def draw(self):
        """Draw the paragraph with the canvas rotated by ``self.angle``."""
        canvas = self.canv
        canvas.saveState()
        canvas.translate(0, 0)
        canvas.rotate(self.angle)
        Paragraph.draw(self)
        canvas.restoreState()

    def beginText(self, x, y):
        """Return a text object at (x, y) that uses text render mode 3.

        Mode 3 keeps the text from appearing on the page; change it to 0
        if you want the text to be visible.
        """
        text_object = self.canv.beginText(x, y)
        text_object.setTextRenderMode(3)
        return text_object
class PyPdf(object):
    """Create PDFs from page images and overlay hOCR text onto them.

    The entry point is :meth:`overlay_hocr_pages`, which builds one
    text-only PDF per (image, hocr) pair, concatenates them and merges the
    result page-by-page onto the original PDF.
    """

    # Regexes compiled once; they are applied to hOCR ``title`` attributes.
    regex_bbox = re.compile(r'bbox((\s+\d+){4})')
    regex_baseline = re.compile(r'baseline((\s+[\d\.\-]+){2})')
    regex_fontspec = re.compile(r'x_font\s+(.+);\s+x_fsize\s+(\d+)')
    regex_textangle = re.compile(r'textangle\s+(\d+)')

    def __init__(self, gs):
        """Store ``gs``, a pointer to the ghostscript object."""
        self.gs = gs

    def get_transform(self, rotation, tx, ty):
        """Return the matrix for a rotation about the point (tx, ty).

        Code taken from here:
        http://stackoverflow.com/questions/6041244/how-to-merge-two-landscape-pdf-pages-using-pypdf/17392824#17392824
        (unclear why the PyPDF2 builtin page rotation functions don't work).

        :param rotation: rotation angle in degrees
        :param tx: x coordinate of the pivot point
        :param ty: y coordinate of the pivot point
        :returns: 6-tuple ``(ctm[0][0], ctm[0][1], ctm[1][0], ctm[1][1],
                  ctm[2][0], ctm[2][1])`` of the combined 3x3 matrix
        """
        angle = math.radians(rotation)
        cos_a = math.cos(angle)
        sin_a = math.sin(angle)

        # translate pivot to origin -> rotate -> translate back
        to_origin = [[1, 0, 0], [0, 1, 0], [-tx, -ty, 1]]
        rotating = [[cos_a, sin_a, 0], [-sin_a, cos_a, 0], [0, 0, 1]]
        from_origin = [[1, 0, 0], [0, 1, 0], [tx, ty, 1]]

        ctm = utils.matrixMultiply(to_origin, rotating)
        ctm = utils.matrixMultiply(ctm, from_origin)
        return ctm[0][0], ctm[0][1], ctm[1][0], ctm[1][1], ctm[2][0], ctm[2][1]

    def mergeRotateAroundPointPage(self, page, page2, rotation, tx, ty):
        """Merge ``page2`` onto ``page`` after rotating it about (tx, ty).

        :param page: page object that receives the merged content
        :param page2: page object that is rotated and merged in
        :param rotation: rotation angle in degrees
        :param tx: x coordinate of the pivot point
        :param ty: y coordinate of the pivot point
        :returns: whatever ``page.mergeTransformedPage`` returns
        """
        # Same matrix as get_transform; passed on as a list as before.
        ctm = list(self.get_transform(rotation, tx, ty))
        return page.mergeTransformedPage(page2, ctm)

    def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename):
        """Overlay the OCR text of every page onto the original PDF.

        :param dpi: resolution used to scale hOCR coordinates to page units
        :param hocr_filenames: list of ``(img_filename, hocr_filename)``
            pairs; NOTE: it is sorted in place into natural page order
        :param orig_pdf_filename: path of the PDF that receives the text
        :returns: path of the result, ``<basename>_ocr.pdf`` next to the
            original PDF
        """
        logging.debug("Going to overlay following files onto %s", orig_pdf_filename)

        # Sort the hocr_filenames into natural keys!
        hocr_filenames.sort(key=lambda pair: self.natural_keys(pair[0]))
        logging.debug(hocr_filenames)

        pdf_dir, pdf_basename = os.path.split(orig_pdf_filename)
        basename = os.path.splitext(pdf_basename)[0]
        pdf_filename = os.path.join(pdf_dir, "%s_ocr.pdf" % basename)

        text_pdf_filenames = []
        for img_filename, hocr_filename in hocr_filenames:
            text_pdf_filename = self.overlay_hocr_page(dpi, hocr_filename, img_filename)
            logging.info("Created temp OCR'ed pdf containing only the text as %s",
                         text_pdf_filename)
            text_pdf_filenames.append(text_pdf_filename)

        # Concatenate the text pdfs into one single file.  This is a hack to
        # save memory/running time when doing the actual merge with a writer.
        all_text_filename = os.path.join(pdf_dir, "%s_text.pdf" % basename)
        self._concatenate_pdfs(text_pdf_filenames, all_text_filename)

        writer = PdfFileWriter()
        # Both inputs must stay open until the writer has flushed the merged
        # pages; the with-blocks also close them if the merge raises.
        with open(orig_pdf_filename, 'rb') as orig:
            with open(all_text_filename, 'rb') as text_file:
                page_pairs = zip(self.iter_pdf_page(orig),
                                 self.iter_pdf_page(text_file))
                for orig_pg, text_pg in page_pairs:
                    writer.addPage(self._get_merged_single_page(orig_pg, text_pg))
                with open(pdf_filename, 'wb') as f:
                    writer.write(f)

        # The per-page handles are closed by now; the retry is kept as a
        # defensive measure for platforms that release file locks lazily.
        for fn in text_pdf_filenames:
            Retry(partial(os.remove, fn), tries=10, pause=3).call_with_retry()

        os.remove(all_text_filename)
        logging.info("Created OCR'ed pdf as %s", pdf_filename)
        return pdf_filename

    def _concatenate_pdfs(self, pdf_filenames, out_filename):
        """Append every pdf in ``pdf_filenames`` into ``out_filename``.

        Every input handle is closed again, even when merging fails.
        """
        merger = PdfFileMerger()
        handles = []
        try:
            for fn in pdf_filenames:
                handle = open(fn, 'rb')
                handles.append(handle)
                merger.append(PdfFileReader(handle))
            merger.write(out_filename)
        finally:
            merger.close()
            for handle in handles:
                handle.close()

    def _get_merged_single_page(self, original_page, ocr_text_page):
        """Merge the text page onto the original page, rotating if needed.

        The text page is rotated by the original page's ``/Rotate`` value
        (0 when absent) before merging.

        :returns: ``original_page`` with the text merged in and its content
            streams compressed
        """
        orig_rotation_angle = int(original_page.get('/Rotate', 0))

        if orig_rotation_angle != 0:
            logging.info("Original Rotation: %s", orig_rotation_angle)
            # NOTE(review): half the *width* is used for both pivot
            # coordinates, exactly as the code always did -- confirm whether
            # the y pivot should be derived from the page height instead.
            pivot = ocr_text_page.mediaBox.getWidth() / 2
            self.mergeRotateAroundPointPage(original_page, ocr_text_page,
                                            orig_rotation_angle, pivot, pivot)
        else:
            original_page.mergePage(ocr_text_page)

        original_page.compressContentStreams()
        return original_page

    def _get_img_dims(self, img_filename):
        """Return the page size of an image, in units of 1/72 inch.

        :param img_filename: path of the image to inspect
        :raises KeyError: if the image carries no ``dpi`` information
        :rval: (width, height, dpi), where dpi is the image's own
            ``info['dpi']`` pair
        """
        img = Image.open(img_filename)
        w, h = img.size
        dpi = img.info['dpi']
        width = w * 72.0 / dpi[0]
        height = h * 72.0 / dpi[1]
        del img
        return (width, height, dpi)

    def overlay_hocr_page(self, dpi, hocr_filename, img_filename):
        """Create a single-page, text-only PDF for one hOCR file.

        The page is sized from the image and the PDF is written as
        ``text_<hocr basename>_ocr.pdf`` in the hOCR file's directory.  The
        working directory is switched to that directory while the file is
        produced and is always restored afterwards.

        :param dpi: resolution used to scale hOCR coordinates
        :param hocr_filename: hOCR file holding the recognised text
        :param img_filename: page image; must be in the same directory
        :returns: path of the text PDF that was created
        """
        hocr_dir, hocr_basename = os.path.split(hocr_filename)
        img_dir, img_basename = os.path.split(img_filename)
        logging.debug("hocr_filename:%s, hocr_dir:%s, hocr_basename:%s",
                      hocr_filename, hocr_dir, hocr_basename)
        assert(img_dir == hocr_dir)

        basename = os.path.splitext(hocr_basename)[0]
        pdf_filename = "text_%s_ocr.pdf" % basename

        # Switch to the hocr directory to make this easier
        cwd = os.getcwd()
        if hocr_dir != "":
            os.chdir(hocr_dir)
        try:
            with open(pdf_filename, "wb") as f:
                logging.info("Overlaying hocr and creating text pdf %s", pdf_filename)
                pdf = Canvas(f, pageCompression=1)
                pdf.setCreator('pypdfocr')
                pdf.setTitle(os.path.basename(hocr_filename))
                pdf.setPageCompression(1)

                width, height, _ = self._get_img_dims(img_basename)
                pdf.setPageSize((width, height))
                logging.info("Page width=%f, height=%f", width, height)

                pg_num = 1
                logging.info("Adding text to page %s", pdf_filename)
                self.add_text_layer(pdf, hocr_basename, pg_num, height, dpi)
                pdf.showPage()
                pdf.save()
        finally:
            # Never leave the process in the hocr directory, even on errors.
            os.chdir(cwd)

        return os.path.join(hocr_dir, pdf_filename)

    def iter_pdf_page(self, f):
        """Yield the pages of the PDF readable from file object ``f``, in order."""
        reader = PdfFileReader(f)
        for pgnum in range(reader.getNumPages()):
            yield reader.getPage(pgnum)

    def _atoi(self, text):
        """Return ``int(text)`` if ``text`` is all digits, else ``text`` itself."""
        return int(text) if text.isdigit() else text

    def natural_keys(self, text):
        """Return a sort key that orders embedded numbers numerically.

        ``alist.sort(key=natural_keys)`` sorts in human order, see
        http://nedbatchelder.com/blog/200712/human_sorting.html
        (Toothy's implementation in the comments).
        """
        return [self._atoi(chunk) for chunk in re.split(r'(\d+)', text)]

    def add_text_layer(self, pdf, hocrfile, page_num, height, dpi):
        """Draw the words of one hOCR page onto the canvas ``pdf``.

        Nothing is drawn (and no error is raised) when the hOCR file cannot
        be parsed or contains no pages.  Words without text or without a
        bounding box are skipped.

        :param pdf: canvas to draw on
        :param hocrfile: path of the hOCR file
        :param page_num: page wanted, matched against the id ``page_<n>``
        :param height: page height, used to flip the y axis
        :param dpi: resolution of the hOCR coordinates; they are scaled by
            ``72 / dpi`` before drawing
        """
        hocr = ElementTree()
        try:
            # It's possible tesseract has failed and written garbage to this
            # hocr file, so we need to catch any exceptions
            hocr.parse(hocrfile)
        except Exception:
            logging.info("Error loading hocr, not adding any text")
            return

        root = hocr.getroot()
        # Serialising the whole document is costly; only do it when needed.
        if logging.getLogger().isEnabledFor(logging.DEBUG):
            logging.debug(xml.etree.ElementTree.tostring(root))

        page = self._find_hocr_page(root, page_num)
        if page is None:
            logging.info("No pages found in hocr, not adding any text")
            return

        # One style object serves every word: each paragraph is drawn right
        # after it is built, so only the font size changes per word.
        normal = getSampleStyleSheet()["BodyText"]
        normal.alignment = TA_LEFT
        normal.leading = 0
        normal.fontName = "Helvetica"

        for line in page.findall(".//{http://www.w3.org/1999/xhtml}span"):
            if line.attrib.get('class') != 'ocr_line':
                continue

            angle_match = self.regex_textangle.search(line.attrib.get('title', ''))
            textangle = self._atoi(angle_match.group(1)) if angle_match else 0

            for word in line:
                if word.attrib.get('class') != 'ocrx_word':
                    continue

                # A word may wrap its text in nested tags; gather all of it.
                pieces = [node.text for node in word.iter() if node.text]
                word_text = ' '.join(pieces).strip()
                if not word_text:
                    continue
                logging.debug("word: %s, angle: %d", word_text, textangle)

                title = word.attrib.get('title', '')
                box_match = self.regex_bbox.search(title)
                if box_match is None:
                    logging.debug("word without bbox skipped: %s", word_text)
                    continue
                box = [float(i) for i in box_match.group(1).split()]

                # Transform angle to x,y co-ords needed for proper text
                # placement.  We only support 0, 90, 180, 270!  Anything
                # else, we'll just use the normal orientation for now
                coords = {
                    0: (box[0], box[1]),
                    90: (box[0], box[3]),   # facing right
                    180: (box[2], box[3]),  # upside down
                    270: (box[2], box[1]),  # facing left
                }
                x, y = coords.get(textangle, (box[0], box[1]))

                # The font name from the hocr is ignored on purpose;
                # Helvetica is always used and only the size is taken over.
                _, font_size = self._get_font_spec(title)
                normal.fontSize = font_size

                para = RotatedPara(escape(word_text), normal, textangle)
                para.wrapOn(pdf, para.minWidth(), 100)  # Not sure what to use as the height here
                para.drawOn(pdf, x * 72 / dpi, height - y * 72 / dpi)

    def _find_hocr_page(self, root, page_num):
        """Return the page element with id ``page_<page_num>`` or a fallback.

        The body is the child of ``root`` whose tag is ``body`` (with or
        without a namespace), or the last child if there is none.  When no
        page id matches, the last page of the body is returned; ``None`` is
        returned when there is no body or it has no children.
        """
        children = list(root)
        if not children:
            return None

        body = children[-1]
        for child in children:
            if child.tag.rsplit('}', 1)[-1] == 'body':
                body = child
                break

        wanted_id = 'page_%d' % page_num
        page = None
        for page in body:  # Each child in the body is a page tag
            if page.attrib.get('class') != "ocr_page":
                logging.warning("Why is this hocr not paging properly??")
            if page.attrib.get('id') == wanted_id:
                break
        return page

    def polyval(self, poly, x):
        """Evaluate the linear polynomial ``poly[0] * x + poly[1]``."""
        return x * poly[0] + poly[1]

    def _get_font_spec(self, tag):
        """Extract ``(font name, font size)`` from an hOCR title string.

        Falls back to ``("", 8)`` when the title carries no
        ``x_font ...; x_fsize ...`` specification.
        """
        try:
            match = self.regex_fontspec.search(tag)
        except TypeError:
            # Not a string at all; treat it like a missing specification.
            match = None

        if match is None:
            fontname, fontsize = "", "8"
        else:
            fontname, fontsize = match.groups()
        return (fontname, self._atoi(fontsize))