# pixelocr/pixelocr/page.py (370 lines, 12 KiB, Python)

# Copyright (C) 2014 Andrey Golovizin
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import itertools
from collections import defaultdict
import numpy as np
from scipy import ndimage
from scipy.ndimage import filters, grey_closing
from .utils import cached_property, collect_iterable, pairwise, neighbourhood
from .image import Image, combine, is_nonblank
# 3x3 all-True structuring element: with it, pixels that touch only
# diagonally are still treated as connected (8-connectivity) when blobs
# are labelled.
CONNECTIVITY8 = ndimage.generate_binary_structure(rank=2, connectivity=2)
class PageObject(object):
    """Base class for objects that wrap a piece of a page image.

    Geometry and colour accessors are thin delegations to the wrapped
    ``image`` object.
    """

    def __init__(self, document, image, filename=None):
        """Store the image and remember the owning document and its config.

        ``document`` must expose a ``config`` attribute; ``filename`` is
        optional and is only stored.
        """
        self.image = image
        self.filename = filename
        self.document = document
        self.config = document.config

    def _repr_png_(self):
        """Return the PNG representation produced by the wrapped image."""
        return self.image._repr_png_()

    @property
    def shape(self):
        return self.image.shape

    @property
    def height(self):
        return self.image.height

    @property
    def width(self):
        return self.image.width

    @property
    def x(self):
        return self.image.x

    @property
    def y(self):
        return self.image.y

    @property
    def left(self):
        return self.image.left

    @property
    def right(self):
        return self.image.right

    @property
    def top(self):
        return self.image.top

    @property
    def bottom(self):
        return self.image.bottom

    @property
    def xcenter(self):
        # NOTE(review): this evaluates to half of (right - left), not to
        # the midpoint (left + right) / 2 -- confirm which one callers
        # expect before changing it.
        return (self.right - self.left) / 2

    @property
    def ycenter(self):
        # NOTE(review): same remark as xcenter -- half of (bottom - top).
        return (self.bottom - self.top) / 2

    @property
    def color(self):
        return self.image.color

    def fits(self, left, top, right, bottom):
        """Return True if the object fits into the given bounding box.

        The check itself is performed by ``self.image.fits``.
        """
        return self.image.fits(left, top, right, bottom)

    def _optical_correction(self, other, max_correction=1000, T=False):
        """Return the smallest combined blank margin between two objects.

        For every row spanned by either image, the trailing margin of
        ``self`` (distance from its right edge to the first maximal pixel)
        is added to the leading margin of ``other`` (distance from its left
        edge to the first maximal pixel).  The minimum of these sums,
        capped at ``max_correction``, is returned.

        Assumes the bitmaps hold 0/1 (or boolean) values so that
        "maximal pixel" means "non-blank pixel" -- TODO confirm.

        ``T`` selects the transposed images (``image.T``), which lets the
        same code measure margins along the other axis.
        """
        image1 = self.image.T if T else self.image
        image2 = other.image.T if T else other.image

        base = min(image1.top, image2.top)
        height = max(image1.bottom, image2.bottom) - base

        # A sentinel column of ones guarantees argmax() finds a hit in
        # every row; for an all-blank row the hit lands on the sentinel,
        # i.e. the margin equals the full image width.
        bitmap1 = np.hstack([np.ones((image1.height, 1)), image1.bitmap])
        bitmap2 = np.hstack([image2.bitmap, np.ones((image2.height, 1))])

        # Rows not covered by an image keep the full width as margin.
        # The builtin ``int`` is used as dtype: the ``np.int`` alias was
        # removed from NumPy and raises AttributeError there.
        margin1 = np.zeros(height, dtype=int)
        margin1.fill(image1.width)
        margin2 = np.zeros(height, dtype=int)
        margin2.fill(image2.width)

        margin1[image1.top - base:image1.bottom - base] = (
            np.fliplr(bitmap1).argmax(axis=1)
        )
        margin2[image2.top - base:image2.bottom - base] = (
            bitmap2.argmax(axis=1)
        )

        correction = (margin1 + margin2).min()
        return min(correction, max_correction)
class Page(PageObject):
    """A page image that can be iterated as a sequence of text lines."""

    def __iter__(self):
        return iter(self.lines)

    @cached_property
    @collect_iterable
    def lines(self):
        """Lines of the page: raw row runs from ``_iter_lines`` after merging."""
        return self._merge_lines(self._iter_lines())

    @cached_property
    def leftmost_nonblank(self):
        """X coordinate of the leftmost non-blank pixel.

        Computed as the minimum ``left`` of the first glyph of every line.
        """
        return min(line.glyphs[0].left for line in self.lines)

    def _iter_lines(self):
        """Yield a ``Line`` for every maximal run of non-blank bitmap rows."""
        line_start = None
        for i, row in enumerate(self.image.bitmap):
            if is_nonblank(row):
                if line_start is None:
                    line_start = i
            elif line_start is not None:
                # First blank row after a run of non-blank rows closes it.
                yield Line(self, self.image[line_start:i, :])
                line_start = None
        if line_start is not None:
            # The run reaches the last row of the bitmap.
            yield Line(self, self.image[line_start:, :])

    def _merge_lines(self, lines, min_space=1, min_height=10):
        """Yield lines, gluing a line to its successor when they belong together.

        The pending line is merged with the next one when either
        - the two are closer than ``min_space`` (both by bounding box and
          by optical distance), or
        - the pending line is lower than ``min_height``.

        A merged line is re-cut from the page image, from the top of the
        pending line to the bottom of the next one.
        """
        prev_line = None
        for line in lines:
            if prev_line is None:
                prev_line = line
                continue

            too_close = (
                # The cheap bounding-box test comes first so that the
                # possibly expensive optical_distance() is only called
                # when it can matter.
                line.top - prev_line.bottom < min_space
                and prev_line.optical_distance(line) < min_space
            )
            not_high_enough = prev_line.height < min_height
            if too_close or not_high_enough:
                prev_line = Line(self, self.image[prev_line.top:line.bottom])
            else:
                yield prev_line
                prev_line = line
        if prev_line is not None:
            yield prev_line
class Line(PageObject):
    """A horizontal strip of a page, iterable as a sequence of glyphs."""

    def __init__(self, page, image):
        super().__init__(page.document, image)
        self.page = page

    def __iter__(self):
        return iter(self.glyphs)

    @cached_property
    def baseline(self):
        """Detect the baseline row of the line.

        Only the rows from two thirds of the line height downwards are
        examined.  The baseline is the row whose row-sum drops most sharply
        compared with the previous row.  The returned value is ``self.y``
        plus that row index within the line image.
        """
        skip = self.height * 2 // 3
        bitmap = self.image.bitmap[skip:, :]
        # Closing with a (0, 4) window -- presumably meant to bridge small
        # horizontal gaps before rows are summed; TODO confirm.
        bitmap = grey_closing(bitmap, (0, 4), mode='constant')
        histogram = bitmap.sum(axis=1)
        # gradient[i] == histogram[i] - histogram[i - 1].
        # correlate1d is reached through ``ndimage`` directly because the
        # ``scipy.ndimage.filters`` namespace is deprecated.
        gradient = list(
            ndimage.correlate1d(histogram, [-1, 1], axis=0, mode='constant')
        )
        gradient[0] = histogram[0]
        # Extra entry: the drop from the last examined row to "nothing".
        gradient.append(-histogram[-1])
        bottom = np.argmin(gradient) + skip
        return self.y + bottom

    @cached_property
    def indent(self):
        """Offset of the first glyph from the page's leftmost non-blank pixel."""
        return self.glyphs[0].left - self.page.leftmost_nonblank

    @cached_property
    @collect_iterable
    def words(self):
        """Words of the line, built from connected components of the bitmap."""
        labels, max_label = ndimage.label(self.image.bitmap, CONNECTIVITY8)
        # Labels start at 1, find_objects() returns slices in label order.
        blob_slices = enumerate(ndimage.find_objects(labels, max_label), 1)
        glyph_images = (
            self._extract_blob(blob_slice, label, labels)
            for (label, blob_slice) in blob_slices
        )
        glyphs = (
            Glyph(self, image, self.baseline - image.bottom)
            for image in glyph_images
        )
        # Left to right; among glyphs starting at the same column the one
        # with the larger ``bottom`` comes first.
        glyphs = sorted(glyphs, key=lambda glyph: (glyph.left, -glyph.bottom))
        glyphs = self._combine_diacritics(glyphs)
        return self._detect_words(glyphs)

    @property
    @collect_iterable
    def glyphs(self):
        """All glyphs of all words, with each word's trailing space, in order."""
        for word in self.words:
            yield from word.glyphs
            if word.space_after is not None:
                yield word.space_after

    def _combine_diacritics(self, glyphs):
        """Yield glyphs with detected diacritics merged into their bodies.

        ``glyphs`` is iterated twice and indexed by position, so it must be
        a sequence rather than a one-shot iterator.
        """

        def find_correspondence(glyphs):
            # bodies: diacritic -> bodies it was attached to
            # diacritics: body -> diacritics attached to it
            bodies = defaultdict(list)
            diacritics = defaultdict(list)
            for i, glyph in enumerate(glyphs):
                neighbours = neighbourhood(glyphs, i, 5)
                possible_bodies = [
                    body for body in neighbours
                    if body.detect_diacritic(glyph)
                ]
                if possible_bodies:
                    # Only the first candidate body is used.
                    body = possible_bodies[0]
                    diacritics[body].append(glyph)
                    bodies[glyph].append(body)
            return bodies, diacritics

        bodies, diacritics = find_correspondence(glyphs)
        for glyph in glyphs:
            if glyph.is_body():
                yield glyph.add_diacritics(*diacritics[glyph])
            elif glyph not in bodies:
                # Freestanding diacritic-like glyph without a body.
                yield glyph

    def _detect_words(self, glyphs, min_distance=15):
        """Group glyphs into words, splitting where the gap is wide enough.

        A word ends after a glyph whose optical distance to the next glyph
        is at least ``min_distance``; the strip of the line image between
        the two glyphs becomes the word's trailing ``Space``.
        """
        current_word_glyphs = []
        for glyph, next_glyph in pairwise(glyphs):
            current_word_glyphs.append(glyph)
            if next_glyph is not None:
                distance = glyph.optical_distance(next_glyph)
                if distance >= min_distance:
                    space_after = Space(
                        self,
                        self.image[:, glyph.right:next_glyph.left],
                        self.baseline - self.top,
                    )
                    yield Word(self, current_word_glyphs, space_after)
                    current_word_glyphs = []
        if current_word_glyphs:
            yield Word(self, current_word_glyphs)

    def _extract_blob(self, blob_slice, label, labels):
        """Cut one labelled blob out of the line image.

        Pixels inside the bounding slice that belong to other labels are
        passed to ``image.mask`` as the mask.
        """
        image = self.image[blob_slice]
        mask = labels[blob_slice] != label
        return image.mask(mask)

    def optical_distance(self, other):
        """Vertical gap to ``other`` plus the optical correction.

        The correction is computed on the transposed images (``T=True``).
        """
        distance = other.top - self.bottom
        return distance + self._optical_correction(other, T=True)
class Word(PageObject):
    """A run of glyphs of one line, optionally followed by a space."""

    def __init__(self, line, glyphs, space_after=None):
        """Adopt ``glyphs`` and cut the word image out of the line image.

        Every glyph gets its ``word`` attribute set to this word.  The
        image spans from the left edge of the first glyph to the right
        edge of ``space_after`` if given, otherwise of the last glyph.
        ``glyphs`` must not be empty.
        """
        self.glyphs = glyphs
        for glyph in glyphs:
            glyph.word = self
        self.space_after = space_after
        beginning = self.glyphs[0]
        end = space_after if space_after is not None else self.glyphs[-1]
        image = line.image[:, beginning.left:end.right]
        super().__init__(line.document, image)

    def guess_language(self):
        """Return the language most of the word's known glyphs map to.

        Each known glyph votes with ``language_map.get(glyph_info.text)``
        (``None`` for unmapped text).  Returns ``None`` when no glyph of
        the word is known.  On a tie, the language that received its first
        vote earliest wins.
        """
        counts = defaultdict(int)
        language_map = self.document.language_map
        known_glyph_info = self._known_glyph_info()
        if not known_glyph_info:
            return None
        for glyph_info in known_glyph_info:
            counts[language_map.get(glyph_info.text)] += 1
        # Pick by vote count.  Sorting the (language, count) pairs would
        # order them by language name and fail when None is among the keys.
        language, _count = max(counts.items(), key=lambda item: item[1])
        return language

    def guess_bold(self, *args, **kwargs):
        """Guess whether the word is bold; see ``_guess``."""
        return self._guess('bold', *args, **kwargs)

    def guess_italic(self, *args, **kwargs):
        """Guess whether the word is italic; see ``_guess``."""
        return self._guess('italic', *args, **kwargs)

    def _guess(self, attr, default=None):
        """Return True if at least half of the known glyphs have ``attr`` set.

        ``attr`` names an attribute of ``glyph_info.style``.  ``default``
        is returned when none of the word's glyphs is known.
        """
        known_glyph_info = self._known_glyph_info()
        if not known_glyph_info:
            return default
        # Reuse the list built above instead of querying the database again.
        total = sum(
            getattr(glyph_info.style, attr) for glyph_info in known_glyph_info
        )
        avg = total / len(known_glyph_info)
        return avg >= 0.5

    @collect_iterable
    def _known_glyph_info(self):
        """Yield the glyph database entry of every glyph that has one."""
        glyphdb = self.document.glyphdb
        for glyph in self.glyphs:
            try:
                yield glyphdb[glyph]
            except KeyError:
                # Glyph not in the database yet: it simply does not vote.
                pass
class Glyph(PageObject):
    """A single blob of a line image, possibly with diacritics merged in."""

    # Word this glyph belongs to; Word.__init__ assigns it.
    word = None

    def __init__(self, line, image, elevation):
        """Remember the owning line and the glyph's elevation value."""
        super().__init__(line.document, image)
        self.line = line
        self.elevation = elevation

    def is_body(self):
        """Return True if the glyph is definitely not diacritic.

        That is, its height reaches ``config.min_body_height``.
        """
        height = self.height
        threshold = self.config.min_body_height
        return height >= threshold

    def optical_distance(self, other):
        """Horizontal gap to ``other`` plus an optical correction of at most 3."""
        gap = other.left - self.right
        correction = self._optical_correction(other, max_correction=3)
        return gap + correction

    def detect_diacritic(self, glyph):
        """Return True if ``glyph`` can be a diacritic of this glyph.

        Only body glyphs can own diacritics.  Three placements, each
        described by offsets in ``self.config``, are tried in order: above
        the letter, apostrophe to the right, dot below.
        """
        if not self.is_body():
            return False

        cfg = self.config

        # diacritic above the letter
        above_letter = glyph.fits(
            self.left + cfg.diacritic_box_left,
            self.top + cfg.diacritic_box_top,
            self.right + cfg.diacritic_box_right,
            self.top + cfg.diacritic_box_bottom,
        )
        if above_letter:
            return True

        # apostrophe, like in ť
        in_apostrophe_box = glyph.fits(
            self.right + cfg.apostrophe_box_left,
            self.top + cfg.apostrophe_box_top,
            self.right + cfg.apostrophe_box_right,
            self.top + cfg.apostrophe_box_bottom,
        )
        if (
            in_apostrophe_box
            and glyph.height > cfg.apostrophe_min_height
            and self.optical_distance(glyph) < cfg.apostrophe_max_distance
        ):
            return True

        # dot in ? and !
        below_as_dot = glyph.fits(
            self.left + cfg.dot_box_left,
            self.bottom + cfg.dot_box_top,
            self.right + cfg.dot_box_right,
            self.bottom + cfg.dot_box_bottom,
        )
        return bool(below_as_dot)

    def add_diacritics(self, *diacritics):
        """Return a new Glyph combining this image with the diacritics' images.

        With no diacritics the glyph itself is returned unchanged.
        """
        if diacritics:
            extra_images = [mark.image for mark in diacritics]
            merged = combine(self.image, *extra_images)
            return Glyph(self.line, merged, self.elevation)
        return self
class Space(Glyph):
    """Glyph subclass that adds no behaviour of its own.

    Line._detect_words creates one from the strip of the line image
    between two glyphs that are far enough apart, and hands it to Word
    as ``space_after``.
    """