# pixelocr/pixelocr/page.py

# Copyright (C) 2014  Andrey Golovizin
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


import itertools
from collections import defaultdict

import numpy as np
from scipy import ndimage
from scipy.ndimage import filters, grey_closing

from .utils import cached_property, collect_iterable, pairwise, neighbourhood
from .image import Image, combine, is_nonblank


# Structuring element for connected-component labelling: rank 2 with
# connectivity 2, i.e. a 3x3 all-True array, so diagonally touching pixels
# are treated as connected (8-connectivity).
CONNECTIVITY8 = ndimage.generate_binary_structure(2, 2)


class PageObject:
    """Base class for objects backed by a region of a page image.

    Holds the wrapped ``image``, the owning ``document`` and its ``config``,
    and forwards the geometry properties to ``image``.
    """

    def __init__(self, document, image, filename=None):
        """Store the image and remember the document and its configuration.

        :param document: owning document; must expose a ``config`` attribute.
        :param image: image object this page object wraps.
        :param filename: optional source file name.
        """
        self.image = image
        self.filename = filename
        self.document = document
        self.config = document.config

    def _repr_png_(self):
        """Return the PNG representation produced by the wrapped image."""
        return self.image._repr_png_()

    @property
    def shape(self):
        """Shape of the wrapped image."""
        return self.image.shape

    @property
    def height(self):
        """Height of the wrapped image."""
        return self.image.height

    @property
    def width(self):
        """Width of the wrapped image."""
        return self.image.width

    @property
    def x(self):
        """``x`` attribute of the wrapped image."""
        return self.image.x

    @property
    def y(self):
        """``y`` attribute of the wrapped image."""
        return self.image.y

    @property
    def left(self):
        """Left edge of the wrapped image."""
        return self.image.left

    @property
    def right(self):
        """Right edge of the wrapped image."""
        return self.image.right

    @property
    def top(self):
        """Top edge of the wrapped image."""
        return self.image.top

    @property
    def bottom(self):
        """Bottom edge of the wrapped image."""
        return self.image.bottom

    @property
    def xcenter(self):
        """Half of the horizontal extent, ``(right - left) / 2``."""
        # NOTE(review): this is not offset by ``left``, so it is a half-width
        # rather than an absolute centre coordinate -- confirm that callers
        # expect this before changing it.
        return (self.right - self.left) / 2

    @property
    def ycenter(self):
        """Half of the vertical extent, ``(bottom - top) / 2``."""
        # NOTE(review): not offset by ``top``; see the note on ``xcenter``.
        return (self.bottom - self.top) / 2

    @property
    def color(self):
        """Colour of the wrapped image."""
        return self.image.color

    def fits(self, left, top, right, bottom):
        """Return True if the object fits into the given bounding box.

        The check itself is delegated to ``image.fits``.
        """

        return self.image.fits(left, top, right, bottom)

    def _optical_correction(self, other, max_correction=1000, T=False):
        """Return the smallest combined blank margin between two objects.

        For every row in the union of both row ranges, the number of blank
        pixels at the right edge of ``self`` is added to the number of blank
        pixels at the left edge of ``other``.  Rows that lie outside an
        image's own range count as completely blank (margin = image width).
        The minimum over all rows is returned, capped at ``max_correction``.

        :param other: the object on the right-hand side (or below, with ``T``).
        :param max_correction: upper bound for the returned value.
        :param T: if true, operate on the transposed images (``image.T``), so
            the margins are measured vertically instead of horizontally.
        """
        image1 = self.image.T if T else self.image
        image2 = other.image.T if T else other.image
        base = min(image1.top, image2.top)
        height = max(image1.bottom, image2.bottom) - base

        # A sentinel column of ones guarantees that argmax() finds a match
        # even in an all-blank row, in which case the margin equals the
        # image width.  Assumes bitmap entries are 0/1 (or boolean) --
        # TODO confirm against the Image class.
        bitmap1 = np.hstack([np.ones((image1.height, 1)), image1.bitmap])
        bitmap2 = np.hstack([image2.bitmap, np.ones((image2.height, 1))])

        # Builtin ``int`` instead of ``np.int``: the alias was removed in
        # NumPy 1.24 and always meant the builtin type.
        margin1 = np.zeros(height, int)
        margin1.fill(image1.width)
        margin2 = np.zeros(height, int)
        margin2.fill(image2.width)

        # Trailing blank pixels of image1: scan each row from the right.
        margin1[image1.top - base: image1.bottom - base] = np.fliplr(bitmap1).argmax(axis=1)
        # Leading blank pixels of image2: scan each row from the left.
        margin2[image2.top - base: image2.bottom - base] = bitmap2.argmax(axis=1)
        margins = margin1 + margin2
        correction = margins.min()
        return min(correction, max_correction)


class Page(PageObject):
    """A page image that can be split into text lines.

    Iterating over a page yields its ``Line`` objects.
    """

    def __iter__(self):
        """Iterate over the lines of the page."""
        return iter(self.lines)

    @cached_property
    @collect_iterable
    def lines(self):
        """Lines of the page: raw row runs from ``_iter_lines`` after merging."""
        return self._merge_lines(self._iter_lines())

    @cached_property
    def leftmost_nonblank(self):
        """X coordinate of the leftmost non-blank pixel.

        Computed as the smallest ``left`` among the first glyphs of all
        lines.  ``min()`` raises ``ValueError`` when the page has no lines.
        """
        return min(line.glyphs[0].left for line in self.lines)

    def _iter_lines(self):
        """Yield a ``Line`` for every maximal run of non-blank bitmap rows."""
        line_start = None

        for i, row in enumerate(self.image.bitmap):
            if is_nonblank(row):
                if line_start is None:
                    line_start = i
            elif line_start is not None:
                # First blank row after a run of non-blank rows closes it.
                yield Line(self, self.image[line_start:i, :])
                line_start = None
        if line_start is not None:
            # The last run reaches the bottom edge of the bitmap.
            yield Line(self, self.image[line_start:, :])

    def _merge_lines(self, lines, min_space=1, min_height=10):
        """Merge consecutive lines that are too close or too low.

        The pending line is merged with the next one when either

        * the gap between them (plain and optical) is below ``min_space``, or
        * the pending line is lower than ``min_height``.

        :param lines: iterable of ``Line`` objects in top-to-bottom order.
        :param min_space: minimal vertical gap that keeps two lines apart.
        :param min_height: minimal height of a stand-alone line.
        """
        prev_line = None
        for line in lines:
            if prev_line is None:
                prev_line = line
            else:
                too_close = (
                    # The cheap bounding-box comparison comes first so that
                    # the possibly expensive optical_distance() is evaluated
                    # only when the boxes are already close.
                    line.top - prev_line.bottom < min_space
                    and prev_line.optical_distance(line) < min_space
                )
                not_high_enough = prev_line.height < min_height
                if too_close or not_high_enough:
                    # Replace the pending line by one spanning both lines.
                    prev_line = Line(self, self.image[prev_line.top:line.bottom])
                else:
                    yield prev_line
                    prev_line = line
        if prev_line is not None:
            yield prev_line


class Line(PageObject):
    """A single text line of a page.

    Iterating over a line yields its glyphs, including the ``Space``
    objects that separate words.
    """

    def __init__(self, page, image):
        """Create a line of ``page`` backed by ``image``."""
        super().__init__(page.document, image)
        self.page = page

    def __iter__(self):
        """Iterate over the glyphs of the line."""
        return iter(self.glyphs)

    @cached_property
    def baseline(self):
        """Detect the baseline and return it as ``self.y`` plus a row offset.

        Only the lower third of the line is examined.  The row histogram of
        the (morphologically closed) bitmap is differentiated and the row
        with the most negative difference is taken as the baseline.
        """
        skip = self.height * 2 // 3
        bitmap = self.image.bitmap[skip:, :]
        bitmap = grey_closing(bitmap, (0, 4), mode='constant')
        histogram = bitmap.sum(axis=1)
        # ndimage.correlate1d is the supported spelling; the
        # scipy.ndimage.filters namespace is deprecated.
        gradient = list(ndimage.correlate1d(histogram, [-1, 1], axis=0, mode='constant'))
        # Treat the area outside the examined strip as blank at both ends.
        gradient[0] = histogram[0]
        gradient.append(-histogram[-1])
        bottom = np.argmin(gradient) + skip
        return self.y + bottom

    @cached_property
    def indent(self):
        """Offset of the first glyph from the page's leftmost non-blank pixel."""
        return self.glyphs[0].left - self.page.leftmost_nonblank

    @cached_property
    @collect_iterable
    def words(self):
        """Words of the line.

        Connected components of the bitmap (8-connectivity) become glyphs,
        which are sorted by ``(left, -bottom)``, combined with their
        diacritics and finally grouped into words.
        """
        labels, max_label = ndimage.label(self.image.bitmap, CONNECTIVITY8)
        # Labels start at 1, hence enumerate(..., 1).
        blob_slices = enumerate(ndimage.find_objects(labels, max_label), 1)
        glyph_images = (
            self._extract_blob(blob_slice, label, labels)
            for (label, blob_slice) in blob_slices
        )
        glyphs = (
            Glyph(self, image, self.baseline - image.bottom)
            for image in glyph_images
        )
        glyphs = sorted(glyphs, key=lambda glyph: (glyph.left, -glyph.bottom))
        glyphs = self._combine_diacritics(glyphs)
        return self._detect_words(glyphs)

    @property
    @collect_iterable
    def glyphs(self):
        """All glyphs of the line; each word is followed by its space, if any."""
        for word in self.words:
            yield from word.glyphs
            if word.space_after is not None:
                yield word.space_after

    def _combine_diacritics(self, glyphs):
        """Yield glyphs with diacritics merged into their body glyphs.

        Body glyphs are yielded combined with the diacritics assigned to
        them.  Non-body glyphs are yielded unchanged only when no body was
        found for them; otherwise they are consumed by their body.

        :param glyphs: sequence of glyphs (indexed and iterated twice).
        """
        def find_correspondence(glyphs):
            # bodies: diacritic -> [body]; diacritics: body -> [diacritic, ...]
            bodies = defaultdict(list)
            diacritics = defaultdict(list)
            for i, glyph in enumerate(glyphs):
                neighbours = neighbourhood(glyphs, i, 5)
                possible_bodies = [body for body in neighbours if body.detect_diacritic(glyph)]
                if possible_bodies:
                    # The first matching neighbour wins.
                    body = possible_bodies[0]
                    diacritics[body].append(glyph)
                    bodies[glyph].append(body)
            return bodies, diacritics

        bodies, diacritics = find_correspondence(glyphs)
        for glyph in glyphs:
            if glyph.is_body():
                yield glyph.add_diacritics(*diacritics[glyph])
            else:
                if glyph not in bodies: # freestanding diacritic-like glyph without a body
                    yield glyph

    def _detect_words(self, glyphs, min_distance=15):
        """Group glyphs into ``Word`` objects.

        A word boundary is placed between two neighbouring glyphs whose
        optical distance is at least ``min_distance``; the gap becomes the
        ``Space`` that follows the word.
        """
        current_word_glyphs = []
        # NOTE(review): the ``is not None`` check below suggests that the
        # project's pairwise() pairs the last glyph with None -- confirm
        # in utils.
        for glyph, next_glyph in pairwise(glyphs):
            current_word_glyphs.append(glyph)
            if next_glyph is not None:
                distance = glyph.optical_distance(next_glyph)
                if distance >= min_distance:
                    space_after = Space(self, self.image[:, glyph.right:next_glyph.left], self.baseline - self.top)
                    yield Word(self, current_word_glyphs, space_after)
                    current_word_glyphs = []
        if current_word_glyphs:
            yield Word(self, current_word_glyphs)

    def _extract_blob(self, blob_slice, label, labels):
        """Cut one labelled blob out of the line image.

        Pixels inside the blob's bounding slice that carry a different label
        are passed to ``image.mask`` so that they can be excluded.
        """
        image = self.image[blob_slice]
        mask = labels[blob_slice] != label
        return image.mask(mask)

    def optical_distance(self, other):
        """Return the vertical distance to ``other``, a line below this one.

        The bounding-box gap is extended by the optical correction computed
        on the transposed images.
        """
        distance = other.top - self.bottom
        return distance + self._optical_correction(other, T=True)


class Word(PageObject):
    """A run of glyphs of one line, optionally followed by a space."""

    def __init__(self, line, glyphs, space_after=None):
        """Create a word and register it as ``word`` on each of its glyphs.

        The word image spans from the left edge of the first glyph to the
        right edge of the trailing space, or of the last glyph when there is
        no space.

        :param line: the line that contains the word.
        :param glyphs: non-empty sequence of glyphs.
        :param space_after: the space following the word, or None.
        """
        self.glyphs = glyphs
        for glyph in glyphs:
            glyph.word = self
        self.space_after = space_after
        beginning = self.glyphs[0]
        end = space_after if space_after is not None else self.glyphs[-1]
        image = line.image[:, beginning.left:end.right]
        super().__init__(line.document, image)

    def guess_language(self):
        """Return the most frequent language among the known glyphs.

        Each known glyph votes for ``language_map.get(glyph_info.text)``;
        glyphs whose text is not in the map vote for None.  Ties are resolved
        in favour of the language seen first.  Returns None when no glyph of
        the word is known.
        """
        counts = defaultdict(int)
        language_map = self.document.language_map
        known_glyph_info = self._known_glyph_info()
        if not known_glyph_info:
            return None
        for glyph_info in known_glyph_info:
            counts[language_map.get(glyph_info.text)] += 1
        # Pick by vote count.  Sorting the (language, count) pairs would
        # order them by language name and fail on a None key.
        return max(counts, key=counts.get)

    def guess_bold(self, *args, **kwargs):
        """Guess whether the word is bold; see ``_guess`` for the arguments."""
        return self._guess('bold', *args, **kwargs)

    def guess_italic(self, *args, **kwargs):
        """Guess whether the word is italic; see ``_guess`` for the arguments."""
        return self._guess('italic', *args, **kwargs)

    def _guess(self, attr, default=None):
        """Return True if at least half of the known glyphs have ``attr`` set.

        :param attr: name of the style attribute to average.
        :param default: value returned when no glyph of the word is known.
        """
        known_glyph_info = self._known_glyph_info()
        if not known_glyph_info:
            return default
        # Reuse the list fetched above instead of querying the database again.
        total = sum(getattr(glyph_info.style, attr) for glyph_info in known_glyph_info)
        avg = total / len(known_glyph_info)
        return avg >= 0.5

    @collect_iterable
    def _known_glyph_info(self):
        """Collect database entries for the glyphs found in the glyph database."""
        glyphdb = self.document.glyphdb
        for glyph in self.glyphs:
            try:
                yield glyphdb[glyph]
            except KeyError:
                # Unknown glyphs do not take part in the guessing.
                pass


class Glyph(PageObject):
    """A single glyph image belonging to a line."""

    # Set by Word.__init__ once the glyph has been assigned to a word.
    word = None

    def __init__(self, line, image, elevation):
        """Create a glyph of ``line`` with the given ``elevation``."""
        super().__init__(line.document, image)
        self.line = line
        self.elevation = elevation

    def is_body(self):
        """Return True if the glyph is definitely not diacritic."""
        min_height = self.config.min_body_height
        return self.height >= min_height

    def optical_distance(self, other):
        """Return the horizontal distance to ``other``, a glyph on the right.

        The bounding-box gap is extended by an optical correction of at
        most 3.
        """
        gap = other.left - self.right
        correction = self._optical_correction(other, max_correction=3)
        return gap + correction

    def detect_diacritic(self, glyph):
        """Return True if the given glyph can be a diacritic of this glyph.

        Only body glyphs can own diacritics.  Three placements are tried in
        order: above the letter, an apostrophe at the right edge, and a dot
        below (as in ? and !).  The boxes come from the configuration.
        """
        if not self.is_body():
            return False

        cfg = self.config

        def above_letter():
            # diacritic above the letter
            return glyph.fits(
                self.left + cfg.diacritic_box_left,
                self.top + cfg.diacritic_box_top,
                self.right + cfg.diacritic_box_right,
                self.top + cfg.diacritic_box_bottom,
            )

        def apostrophe():
            # apostrophe, like in ť
            return (
                glyph.fits(
                    self.right + cfg.apostrophe_box_left,
                    self.top + cfg.apostrophe_box_top,
                    self.right + cfg.apostrophe_box_right,
                    self.top + cfg.apostrophe_box_bottom,
                )
                and glyph.height > cfg.apostrophe_min_height
                and self.optical_distance(glyph) < cfg.apostrophe_max_distance
            )

        def trailing_dot():
            # dot in ? and !
            return glyph.fits(
                self.left + cfg.dot_box_left,
                self.bottom + cfg.dot_box_top,
                self.right + cfg.dot_box_right,
                self.bottom + cfg.dot_box_bottom,
            )

        # any() stops at the first placement that matches.
        return any(check() for check in (above_letter, apostrophe, trailing_dot))

    def add_diacritics(self, *diacritics):
        """Return a glyph combined with ``diacritics``, or self if none given."""
        if diacritics:
            images = [mark.image for mark in diacritics]
            merged = combine(self.image, *images)
            return Glyph(self.line, merged, self.elevation)
        return self


class Space(Glyph):
    """Glyph subclass for the blank gap after a word; adds no behaviour."""