Coverage for /Users/virantha/dev/ocr/pypdfocr/pypdfocr

self.mergeRotateAroundPointPage(original_page, ocr_text_page, orig_rotation_angle, ocr_text_page.mediaBox.getWidth()/2, ocr_text_page.mediaBox.getWidth()/2)

# None of these commands worked for me:

#orig_pg.rotateCounterClockwise(orig_rotation_angle)

#orig_pg.mergeRotatedPage(text_pg,orig_rotation_angle)

else:

original_page.mergePage(ocr_text_page)

original_page.compressContentStreams()

return original_page

def _get_img_dims(self, img_filename):
    """Return the size of an image in PDF points plus its resolution.

    The pixel dimensions are scaled by the resolution stored in the image
    metadata, using 72 points per inch.

    :param img_filename: Path of the image file to inspect.
    :returns: ``(width, height, dpi)`` where ``width`` and ``height`` are
        in points and ``dpi`` is the value stored under the image's
        ``info['dpi']`` entry (indexed as horizontal, vertical).
    :raises KeyError: If the image carries no ``dpi`` metadata.
    """
    # Use the image as a context manager so the underlying file is closed
    # deterministically; ``del img`` only dropped the local reference.
    with Image.open(img_filename) as img:
        w, h = img.size
        dpi = img.info['dpi']

    width = w * 72.0 / dpi[0]
    height = h * 72.0 / dpi[1]
    return (width, height, dpi)

def overlay_hocr_page(self, dpi, hocr_filename, img_filename):
    """Render the text of one hOCR file into a single-page PDF.

    The PDF is named ``text_<hocr basename>_ocr.pdf`` and is written into
    the directory that holds the hOCR file. The page size is taken from
    the image via ``_get_img_dims`` and the text is drawn by
    ``add_text_layer``.

    :param dpi: Resolution forwarded to ``add_text_layer``, which uses it
        to convert hOCR pixel coordinates to points.
    :param hocr_filename: Path of the hOCR file.
    :param img_filename: Path of the page image; must live in the same
        directory as ``hocr_filename``.
    :returns: Path of the generated PDF (hOCR directory joined with the
        PDF file name).
    :raises AssertionError: If the image and hOCR directories differ.
    """
    hocr_dir, hocr_basename = os.path.split(hocr_filename)
    img_dir, img_basename = os.path.split(img_filename)
    logging.debug("hocr_filename:%s, hocr_dir:%s, hocr_basename:%s" % (hocr_filename, hocr_dir, hocr_basename))
    assert(img_dir == hocr_dir)

    basename = os.path.splitext(hocr_basename)[0]
    pdf_filename = "text_%s_ocr.pdf" % (basename)

    # Switch to the hocr directory so the files can be addressed by their
    # base names.
    cwd = os.getcwd()
    if hocr_dir != "":
        os.chdir(hocr_dir)

    try:
        with open(pdf_filename, "wb") as f:
            logging.info("Overlaying hocr and creating text pdf %s" % pdf_filename)
            pdf = Canvas(f, pageCompression=1)
            pdf.setCreator('pypdfocr')
            pdf.setTitle(os.path.basename(hocr_filename))
            pdf.setPageCompression(1)

            # The image's own dpi is not needed here; the caller-supplied
            # ``dpi`` is what gets passed on to the text layer.
            width, height, _ = self._get_img_dims(img_basename)
            pdf.setPageSize((width, height))
            logging.info("Page width=%f, height=%f" % (width, height))

            pg_num = 1
            logging.info("Adding text to page %s" % pdf_filename)
            self.add_text_layer(pdf, hocr_basename, pg_num, height, dpi)
            pdf.showPage()
            pdf.save()
    finally:
        # Always restore the working directory, even when PDF generation
        # fails part-way through.
        os.chdir(cwd)

    return os.path.join(hocr_dir, pdf_filename)

def iter_pdf_page(self, f):
    """Yield each page of a PDF in page order.

    :param f: Object handed directly to ``PdfFileReader``.
    :returns: Generator producing the result of ``getPage`` for every
        page index from 0 up to ``getNumPages() - 1``.
    """
    pdf_reader = PdfFileReader(f)
    total_pages = pdf_reader.getNumPages()
    page_index = 0
    while page_index < total_pages:
        yield pdf_reader.getPage(page_index)
        page_index += 1

def _atoi(self,text):

return int(text) if text.isdigit() else text

def natural_keys(self, text):
    '''
    Build a sort key that orders strings in human ("natural") order.

    alist.sort(key=natural_keys) sorts in human order
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)

    :param text: String to derive the key from.
    :returns: List of the pieces of *text* split around digit runs, with
        each piece passed through ``_atoi`` (digit runs become ints).
    '''
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    return [self._atoi(c) for c in re.split(r'(\d+)', text)]

def add_text_layer(self,pdf, hocrfile, page_num,height, dpi):
    """Draw the OCR text of one hOCR page onto a PDF canvas.

    Every ``ocrx_word`` inside every ``ocr_line`` span of the selected
    page is drawn as a ``RotatedPara`` positioned at a corner of the
    word's bounding box, converted from pixels to points with ``dpi``.

    :param pdf: Canvas the paragraphs are wrapped and drawn on.
    :param hocrfile: hOCR file to parse.
    :param page_num: Page number; the element with id ``page_<page_num>``
        is used. If no element has that id, the last page element is used.
    :param height: Page height in points, used to flip the y axis.
    :param dpi: Resolution used to convert hOCR pixel coordinates.
    :returns: ``None``. Nothing is drawn when the hOCR file cannot be
        parsed or contains no page element.
    """
    hocr = ElementTree()
    try:
        # It's possible tesseract has failed and written garbage to this
        # hocr file, so we need to catch any exceptions
        hocr.parse(hocrfile)
    except Exception:
        logging.info("Error loading hocr, not adding any text")
        return

    root = hocr.getroot()
    logging.debug(xml.etree.ElementTree.tostring(root))

    # Find the <body> tag. The tag may carry an XML namespace prefix, so
    # match on the local name and fall back to the last child of the root.
    children = list(root)
    if not children:
        logging.info("Error loading hocr, not adding any text")
        return
    body = children[-1]
    for child in children:
        if child.tag == 'body' or child.tag.endswith('}body'):
            body = child

    # Each child in the body is a page tag; stop at the requested one.
    page = None
    wanted_id = 'page_%d' % (page_num)
    for candidate in body:
        page = candidate
        if candidate.attrib.get('class') != "ocr_page":
            logging.warning("Why is this hocr not paging properly??")
        if candidate.attrib.get('id') == wanted_id:
            break
    if page is None:
        logging.info("Error loading hocr, not adding any text")
        return

    # The paragraph style is identical for every word except the font
    # size, so build it once instead of once per word.
    normal = getSampleStyleSheet()["BodyText"]
    normal.alignment = TA_LEFT
    normal.leading = 0
    normal.fontName = "Helvetica"

    for line in page.findall(".//{http://www.w3.org/1999/xhtml}span"):
        if line.attrib.get('class') != 'ocr_line':
            continue

        angle_match = self.regex_textangle.search(line.attrib.get('title', ''))
        textangle = self._atoi(angle_match.group(1)) if angle_match else 0

        for word in line:
            if word.attrib.get('class') != 'ocrx_word':
                continue

            # Gather the text of the word element and all its descendants.
            text = ' '.join(node.text for node in word.iter() if node.text).strip()
            if not text:
                continue

            # %s rather than %d: _atoi returns the raw string when the
            # captured angle is not a plain digit run.
            logging.debug("word: %s, angle: %s", text, textangle)

            title = word.attrib.get('title', '')
            bbox_match = self.regex_bbox.search(title)
            if bbox_match is None:
                # No bounding box means there is nowhere to place the word.
                continue
            box = [float(i) for i in bbox_match.group(1).split()]

            # Transform angle to x,y co-ords needed for proper text placement
            # We only support 0, 90, 180, 270!. Anything else, we'll just
            # use the normal orientation for now
            coords = {
                0: (box[0], box[1]),
                90: (box[0], box[3]),   # facing right
                180: (box[2], box[3]),  # upside down
                270: (box[2], box[1]),  # facing left
            }
            x, y = coords.get(textangle, (box[0], box[1]))

            # Only the size from the hOCR font spec is applied; the font
            # name is always Helvetica.
            _font_name, font_size = self._get_font_spec(title)
            normal.fontSize = font_size

            para = RotatedPara(escape(text), normal, textangle)
            para.wrapOn(pdf, para.minWidth(), 100)  # Not sure what to use as the height here
            para.drawOn(pdf, x*72/dpi, height - y*72/dpi)

def polyval(self, poly, x):
    """Evaluate the first-degree polynomial ``poly[0] * x + poly[1]``.

    :param poly: Indexable pair ``(slope, intercept)``.
    :param x: Point at which to evaluate.
    :returns: ``x * poly[0] + poly[1]``.
    """
    scaled = x * poly[0]
    return scaled + poly[1]

def _get_font_spec(self, tag):

try:

fontspec = self.regex_fontspec.search(tag).groups()

fontname, fontsize = fontspec

except Exception:

fontname = ""

fontsize = "8"

return (fontname, self._atoi(fontsize))

Coverage for /Users/virantha/dev/ocr/pypdfocr/pypdfocr_pdf : 92%

213 statements 196 run 17 missing 0 excluded