diff --git a/pixelocr/page.py b/pixelocr/page.py
index 9da11ce..aa4feb5 100644
--- a/pixelocr/page.py
+++ b/pixelocr/page.py
@@ -71,6 +71,16 @@ class PageObject(object):
     def bottom(self):
         return self.image.bottom
 
+    @property
+    def xcenter(self):
+        """Horizontal coordinate of the midpoint between the left and right edges."""
+        return (self.left + self.right) / 2
+
+    @property
+    def ycenter(self):
+        """Vertical coordinate of the midpoint between the top and bottom edges."""
+        return (self.top + self.bottom) / 2
+
 
 class Page(PageObject):
     def __iter__(self):
@@ -131,21 +141,27 @@ class Line(PageObject):
         return margins.min()
 
     def _combine_diacritics(self, glyphs):
-
-        def find_diacritics(glyphs):
+        def find_correspondence(glyphs):
+            """Pair every non-body glyph with its best-scoring nearby body glyph."""
             bodies = defaultdict(list)
             diacritics = defaultdict(list)
             for i, glyph in enumerate(glyphs):
-                if not glyph.is_body():
+                if glyph.is_body():
                     continue
-                neighbours = glyphs[i - 5: i] + glyphs[i + 1: i + 6]
-                for neighbour in neighbours:
-                    if glyph.detect_diacritic(neighbour):
-                        diacritics[glyph].append(neighbour)
-                        bodies[neighbour].append(glyph)
+                # Clamp the start: a negative index would wrap around to the list's end.
+                nearby = glyphs[max(i - 5, 0): i] + glyphs[i + 1: i + 6]
+                # Only body glyphs can carry a diacritic.
+                neighbours = [other for other in nearby if other.is_body()]
+                if not neighbours:
+                    # max() raises ValueError on an empty sequence.
+                    continue
+                body = max(neighbours, key=lambda neighbour: neighbour.detect_diacritic(glyph))
+                if body.detect_diacritic(glyph):
+                    diacritics[body].append(glyph)
+                    bodies[glyph].append(body)
             return bodies, diacritics
 
-        bodies, diacritics = find_diacritics(glyphs)
+        bodies, diacritics = find_correspondence(glyphs)
         for glyph in glyphs:
             if glyph.is_body():
                 yield glyph.add_diacritics(*diacritics[glyph])
@@ -187,12 +203,20 @@ class Glyph(PageObject):
         return self.elevation <= 0
 
     def detect_diacritic(self, glyph):
-        """Return True if the given glyph can be our diacritic."""
-        return (
-            glyph.elevation >= self.DIACRITIC_MIN_ELEVATION
-            and glyph.left >= self.left - self.DIACRITIC_WINDOW_LEFT
-            and glyph.right <= self.right + self.DIACRITIC_WINDOW_RIGHT
-        )
+        """Check if the given glyph can be our diacritic and return a numeric score.
+
+        Higher score means higher probability. Zero means "absolutely not".
+        """
+
+        if glyph.elevation < self.DIACRITIC_MIN_ELEVATION:
+            return 0
+        if (
+            glyph.left < self.left - self.DIACRITIC_WINDOW_LEFT
+            or glyph.right > self.right + self.DIACRITIC_WINDOW_RIGHT
+        ):
+            return 0
+        # Keep accepted candidates strictly positive so they never tie with a rejection.
+        return max(1, 100 - abs(self.xcenter - glyph.xcenter))
 
     def add_diacritics(self, *diacritics):
         if not diacritics: