307 lines
9.5 KiB
Python
307 lines
9.5 KiB
Python
# Copyright (C) 2014 Andrey Golovizin
|
|
#
|
|
# This program is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
|
import itertools
|
|
from collections import defaultdict
|
|
|
|
import numpy as np
|
|
from scipy import ndimage
|
|
from scipy.ndimage import filters, grey_closing
|
|
|
|
from .utils import cached_property, collect_iterable, pairwise, neighbourhood
|
|
from .image import Image, combine, is_nonblank
|
|
|
|
|
|
# Rank-2 binary structure with connectivity 2, i.e. a 3x3 element in which
# diagonal neighbours also count as connected (8-connectivity).
# Passed to ndimage.label() when splitting a line into glyph blobs.
CONNECTIVITY8 = ndimage.generate_binary_structure(2, 2)
|
|
|
|
|
|
class PageObject(object):
    """Base class for anything backed by an image (page, line, glyph).

    Most attributes are read-only properties that simply forward to the
    wrapped ``image`` object.
    """

    def __init__(self, image, filename=None):
        """Store the wrapped ``image`` and an optional source ``filename``."""
        self.image = image
        self.filename = filename

    def _repr_png_(self):
        """Delegate to the wrapped image's ``_repr_png_``."""
        return self.image._repr_png_()

    @property
    def shape(self):
        return self.image.shape

    @property
    def height(self):
        return self.image.height

    @property
    def width(self):
        return self.image.width

    @property
    def x(self):
        return self.image.x

    @property
    def y(self):
        return self.image.y

    @property
    def left(self):
        return self.image.left

    @property
    def right(self):
        return self.image.right

    @property
    def top(self):
        return self.image.top

    @property
    def bottom(self):
        return self.image.bottom

    @property
    def xcenter(self):
        # NOTE(review): this is half of the horizontal extent, not the
        # midpoint (left + right) / 2 -- confirm which one callers expect.
        return (self.right - self.left) / 2

    @property
    def ycenter(self):
        # NOTE(review): half of the vertical extent, see xcenter.
        return (self.bottom - self.top) / 2

    @property
    def color(self):
        return self.image.color

    def fits(self, left, top, right, bottom):
        """Return True if the glyph fits into the given bounding box."""
        return self.image.fits(left, top, right, bottom)

    def _optical_correction(self, other, max_correction=1000, T=False):
        """Return the smallest combined per-row margin between two objects.

        For every row covered by either image, the margin of ``self`` is the
        index of the first maximal bitmap value counted from the right edge,
        and the margin of ``other`` is the index of the first maximal value
        counted from the left edge.  Rows an image does not cover (and rows
        without a maximal value before the sentinel column) count as the full
        image width.  The result is the minimum over all rows of the two
        margins added together, capped at ``max_correction``.

        Assumes bitmaps hold 0 for blank and 1 for ink, so that the sentinel
        column of ones terminates the search -- TODO confirm against Image.

        :param other: the object placed after ``self``.
        :param max_correction: upper bound for the returned value.
        :param T: if true, both images are transposed first (``image.T``).
        """
        image1 = self.image.T if T else self.image
        image2 = other.image.T if T else other.image

        # Common row range covering both images.
        base = min(image1.top, image2.top)
        height = max(image1.bottom, image2.bottom) - base

        # Sentinel column of ones: guarantees argmax() finds a hit in every
        # row, yielding the full width for a row with no earlier maximum.
        bitmap1 = np.hstack([np.ones((image1.height, 1)), image1.bitmap])
        bitmap2 = np.hstack([image2.bitmap, np.ones((image2.height, 1))])

        # Builtin int instead of the np.int alias, which was removed from
        # NumPy (AttributeError since 1.24).
        margin1 = np.full(height, image1.width, dtype=int)
        margin2 = np.full(height, image2.width, dtype=int)

        # Flipping image1 makes argmax() count from its right edge.
        margin1[image1.top - base: image1.bottom - base] = np.fliplr(bitmap1).argmax(axis=1)
        margin2[image2.top - base: image2.bottom - base] = bitmap2.argmax(axis=1)

        correction = (margin1 + margin2).min()
        return min(correction, max_correction)
|
|
|
|
|
|
class Page(PageObject):
    """A page image that can be split into ``Line`` objects."""

    def __iter__(self):
        """Iterate over the lines of the page."""
        return iter(self.lines)

    @cached_property
    @collect_iterable
    def lines(self):
        """Lines of the page: raw row runs from ``_iter_lines`` after merging."""
        return self._merge_lines(self._iter_lines())

    @cached_property
    def leftmost_nonblank(self):
        """X coordinate of the leftmost non-blank pixel.

        Computed as the smallest ``left`` among the first glyphs of all lines.
        """
        return min(line.glyphs[0].left for line in self.lines)

    def _iter_lines(self):
        """Yield a ``Line`` for every maximal run of non-blank bitmap rows."""
        line_start = None  # index of the first row of the current run, if any

        for i, row in enumerate(self.image.bitmap):
            if is_nonblank(row):
                if line_start is None:
                    line_start = i
            elif line_start is not None:
                # A blank row closes the current run.
                yield Line(self, self.image[line_start:i, :])
                line_start = None

        # The bitmap ended in the middle of a run.
        if line_start is not None:
            yield Line(self, self.image[line_start:, :])

    def _merge_lines(self, lines, min_space=2, min_height=10):
        """Join consecutive lines that are too close or too short.

        The pending line is merged with the next one when either

        * both the plain gap (``line.top - prev_line.bottom``) and the
          optical distance between them are below ``min_space``, or
        * the pending line is lower than ``min_height``.

        :param lines: iterable of ``Line`` objects in top-to-bottom order.
        :param min_space: minimal distance for two lines to stay separate.
        :param min_height: minimal height for a line to stand on its own.
        """
        prev_line = None
        for line in lines:
            if prev_line is None:
                prev_line = line
            else:
                too_close = (
                    # the first line is to avoid unnecessary calling optical_distance()
                    # which may be expensive
                    line.top - prev_line.bottom < min_space
                    and prev_line.optical_distance(line) < min_space
                )
                not_high_enough = prev_line.height < min_height
                if too_close or not_high_enough:
                    # Replace both with one line spanning from the top of the
                    # pending line to the bottom of the current one.
                    prev_line = Line(self, self.image[prev_line.top:line.bottom])
                else:
                    yield prev_line
                    prev_line = line
        if prev_line is not None:
            yield prev_line
|
|
|
|
|
|
class Line(PageObject):
    """A horizontal strip of a page that is split into glyphs."""

    def __init__(self, page, image):
        """Remember the owning ``page`` and wrap ``image``."""
        super().__init__(image)
        self.page = page

    def __iter__(self):
        """Iterate over the glyphs (and spaces) of the line."""
        return iter(self.glyphs)

    @cached_property
    def baseline(self):
        """Detect the baseline position of the line.

        Only the lower third of the line bitmap is examined.  The baseline
        is put at the row where the per-row sum of the bitmap drops the most
        compared to the previous row.  The returned value is that row index
        within the line bitmap added to ``self.y``.
        """
        # Skip the upper two thirds of the line.
        skip = self.height * 2 // 3
        bitmap = self.image.bitmap[skip:, :]
        # NOTE(review): size (0, 4) presumably closes small horizontal gaps
        # only -- confirm the zero-sized first axis is intended.
        bitmap = grey_closing(bitmap, (0, 4), mode='constant')
        histogram = bitmap.sum(axis=1)

        # Difference between each row and the previous one.  The public
        # ndimage.correlate1d is used instead of the deprecated
        # scipy.ndimage.filters namespace.
        gradient = list(ndimage.correlate1d(histogram, [-1, 1], axis=0, mode='constant'))
        gradient[0] = histogram[0]
        # Extra entry for the drop from the last row to the area below it.
        gradient.append(-histogram[-1])

        bottom = np.argmin(gradient) + skip
        return self.y + bottom

    @cached_property
    def indent(self):
        """Offset of the first glyph from the page's leftmost non-blank pixel."""
        return self.glyphs[0].left - self.page.leftmost_nonblank

    @cached_property
    @collect_iterable
    def glyphs(self):
        """Glyphs of the line, ordered left to right, with spaces inserted.

        Every 8-connected blob of the bitmap becomes a ``Glyph``; blobs are
        then sorted, diacritics are merged into their bodies and ``Space``
        objects are inserted between distant glyphs.
        """
        labels, max_label = ndimage.label(self.image.bitmap, CONNECTIVITY8)
        # Labels start at 1, hence enumerate(..., 1).
        blob_slices = enumerate(ndimage.find_objects(labels, max_label), 1)
        glyph_images = (
            self._extract_blob(blob_slice, label, labels)
            for (label, blob_slice) in blob_slices
        )
        glyphs = (
            Glyph(self, image, self.baseline - image.bottom)
            for image in glyph_images
        )
        # Left to right; among glyphs sharing a left edge, larger bottom first.
        glyphs = sorted(glyphs, key=lambda glyph: (glyph.left, -glyph.bottom))
        glyphs = self._combine_diacritics(glyphs)
        return self._insert_spaces(glyphs)

    def _combine_diacritics(self, glyphs):
        """Yield glyphs with detected diacritics merged into their bodies.

        :param glyphs: sequence of glyphs (iterated more than once).
        """
        def find_correspondence(glyphs):
            """Map each diacritic to its body and each body to its diacritics."""
            bodies = defaultdict(list)
            diacritics = defaultdict(list)
            for i, glyph in enumerate(glyphs):
                neighbours = neighbourhood(glyphs, i, 5)
                possible_bodies = [body for body in neighbours if body.detect_diacritic(glyph)]
                if possible_bodies:
                    # The first matching neighbour wins.
                    body = possible_bodies[0]
                    diacritics[body].append(glyph)
                    bodies[glyph].append(body)
            return bodies, diacritics

        bodies, diacritics = find_correspondence(glyphs)
        for glyph in glyphs:
            if glyph.is_body():
                yield glyph.add_diacritics(*diacritics[glyph])
            elif glyph not in bodies:
                # freestanding diacritic-like character without a body
                yield glyph
            # Otherwise the glyph was merged into its body and is dropped here.

    def _insert_spaces(self, glyphs, min_distance=15):
        """Yield the glyphs, adding a ``Space`` after each wide enough gap.

        :param glyphs: iterable of glyphs in reading order.
        :param min_distance: minimal optical distance that counts as a space.
        """
        for glyph, next_glyph in pairwise(glyphs):
            yield glyph
            if next_glyph is not None:
                distance = glyph.optical_distance(next_glyph)
                if distance >= min_distance:
                    yield Space(self, self.image.space(glyph.right, self.top, distance, self.height), self.baseline - self.top)

    def _extract_blob(self, blob_slice, label, labels):
        """Cut ``blob_slice`` out of the line image, masking foreign pixels."""
        image = self.image[blob_slice]
        # Everything in the bounding box that belongs to another label.
        mask = labels[blob_slice] != label
        return image.mask(mask)

    def optical_distance(self, other):
        """Vertical distance to the line ``other`` below, optically corrected.

        The plain gap ``other.top - self.bottom`` is increased by the optical
        correction computed on the transposed images.
        """
        distance = other.top - self.bottom
        return distance + self._optical_correction(other, T=True)
|
|
|
|
|
|
class Glyph(PageObject):
    """A single shape cut out of a line, optionally merged with diacritics."""

    # Glyphs at least this high are always treated as bodies.
    MIN_BODY_HEIGHT = 10

    def __init__(self, line, image, elevation):
        """Wrap ``image`` and remember the owning ``line`` and ``elevation``."""
        super().__init__(image)
        self.line = line
        self.elevation = elevation

    def is_body(self):
        """Return True if the glyph is definitely not diacritic."""
        return self.height >= self.MIN_BODY_HEIGHT

    def optical_distance(self, other):
        """Horizontal gap to ``other`` plus an optical correction of at most 3."""
        gap = other.left - self.right
        return gap + self._optical_correction(other, max_correction=3)

    def detect_diacritic(self, glyph):
        """Return True if the given glyph can be a diacritic of this one.

        Only bodies (see ``is_body``) can own diacritics.  Three placements
        are recognised: above the body, to the upper right of it (apostrophe),
        and right below it (dot).
        """
        if not self.is_body():
            return False

        #TODO remove hardcoded sizes

        # diacritic above the letter
        sits_above = glyph.fits(
            self.left - 3,
            self.top - 10,
            self.right + 3,
            self.top + 10,
        )
        if sits_above:
            return True

        # apostrophe, like in ť
        # (the optical distance is evaluated last and only when needed)
        is_apostrophe = (
            glyph.fits(
                self.right - 5,
                self.top - 5,
                self.right + 7,
                self.top + 10,
            )
            and glyph.height > 3
            and self.optical_distance(glyph) < 4
        )
        if is_apostrophe:
            return True

        # dot in ? and !
        sits_below = glyph.fits(
            self.left - 3,
            self.bottom + 1,
            self.right + 3,
            self.bottom + 10,
        )
        return bool(sits_below)

    def add_diacritics(self, *diacritics):
        """Return a glyph combining this one with the given diacritics.

        Without diacritics the glyph itself is returned; otherwise a new
        ``Glyph`` is built with the same line and elevation.
        """
        if diacritics:
            extra_images = [mark.image for mark in diacritics]
            merged = combine(self.image, *extra_images)
            return Glyph(self.line, merged, self.elevation)
        return self
|
|
|
|
|
|
class Space(Glyph):
    """Glyph subclass that adds no behaviour of its own.

    Line._insert_spaces() yields instances of it between two glyphs whose
    optical distance reaches its ``min_distance`` threshold.
    """
    pass
|