diff --git a/pixelocr/page.py b/pixelocr/page.py
index 3e638d5..1bd3911 100644
--- a/pixelocr/page.py
+++ b/pixelocr/page.py
@@ -15,6 +15,7 @@
 
 
 import itertools
+from collections import defaultdict
 
 import numpy as np
 from scipy import ndimage
@@ -129,21 +130,42 @@ class Line(PageObject):
         return margins.min()
 
     def _combine_diacritics(self, glyphs):
-        def is_diacritic(glyph):
-            # XXX
-            return (
-                glyph is not None
-                and glyph is not None
-                and glyph.elevation > 5
-            )
-
-        while glyphs:
-            glyph, *glyphs = glyphs
-            diacritics = list(itertools.takewhile(is_diacritic, iter(glyphs)))
-            for diacritic in diacritics:
-                glyph = Glyph(glyph.image.combine(diacritic.image), elevation=glyph.elevation)
-            glyphs = glyphs[len(diacritics):]
-            yield glyph
+        """Yield glyphs with their diacritics merged in.
+
+        Each body glyph is yielded combined with the neighbouring glyphs
+        detected as its diacritics.  Non-body glyphs are yielded unchanged
+        only when no body claimed them; claimed ones are dropped.
+        """
+        def find_diacritics(glyphs):
+            """Return ``(bodies, diacritics)`` lookup tables.
+
+            ``diacritics[b]`` lists the glyphs detected as diacritics of
+            body ``b``; ``bodies[d]`` lists the bodies that claimed ``d``.
+            """
+            bodies = defaultdict(list)
+            diacritics = defaultdict(list)
+            for i, glyph in enumerate(glyphs):
+                if not glyph.is_body():
+                    continue
+                # Clamp the start: for i < 5 a negative slice start would wrap
+                # around to the end of the list, losing left-hand neighbours.
+                neighbours = glyphs[max(0, i - 5): i] + glyphs[i + 1: i + 6]
+                for neighbour in neighbours:
+                    if glyph.detect_diacritic(neighbour):
+                        diacritics[glyph].append(neighbour)
+                        bodies[neighbour].append(glyph)
+            return bodies, diacritics
+
+        # Materialise once: the input is sliced above and iterated twice.
+        glyphs = list(glyphs)
+        bodies, diacritics = find_diacritics(glyphs)
+
+        for glyph in glyphs:
+            if glyph.is_body():
+                yield glyph.add_diacritics(*diacritics[glyph])
+            elif glyph not in bodies:
+                # Freestanding diacritic-like character without a body.
+                yield glyph
 
     def _insert_spaces(self, glyphs):
         for glyph, next_glyph in pairwise(glyphs):
@@ -161,6 +183,12 @@ class Line(PageObject):
 
 
 class Glyph(PageObject):
+    # Tolerances applied by detect_diacritic(); units are presumably
+    # pixels -- TODO confirm against how left/right/elevation are computed.
+    DIACRITIC_WINDOW_LEFT = 3
+    DIACRITIC_WINDOW_RIGHT = 5
+    DIACRITIC_MIN_ELEVATION = 5
+
     def __init__(self, image, elevation):
         super().__init__(image)
         self.elevation = elevation
@@ -170,6 +198,35 @@ class Glyph(PageObject):
         """Return a dictionary key uniquely representing this glyph."""
         return self.elevation, self.image.tostring()
 
+    def is_body(self):
+        """Return True if the glyph is definitely not diacritic."""
+        return self.elevation <= 0
+
+    def detect_diacritic(self, glyph):
+        """Return True if the given glyph can be our diacritic.
+
+        The candidate must be elevated by at least DIACRITIC_MIN_ELEVATION
+        and lie horizontally within this glyph's extent, widened by
+        DIACRITIC_WINDOW_LEFT and DIACRITIC_WINDOW_RIGHT.
+        """
+        return (
+            glyph.elevation >= self.DIACRITIC_MIN_ELEVATION
+            and glyph.left >= self.left - self.DIACRITIC_WINDOW_LEFT
+            and glyph.right <= self.right + self.DIACRITIC_WINDOW_RIGHT
+        )
+
+    def add_diacritics(self, *diacritics):
+        """Return this glyph combined with the given diacritic glyphs.
+
+        With no diacritics the glyph itself is returned.  Combining more
+        than one diacritic is not supported yet and raises
+        NotImplementedError.
+        """
+        if not diacritics:
+            return self
+        if len(diacritics) > 1:
+            raise NotImplementedError
+        return Glyph(self.image.combine(diacritics[0].image), self.elevation)
 
 
 class Space(Glyph):