Implement proper diacritic detection.

This commit is contained in:
Andrey Golovizin 2014-08-18 15:50:43 +02:00
parent ada771b5c9
commit 47dcb5be7f

View file

@ -15,6 +15,7 @@
import itertools
from collections import defaultdict
import numpy as np
from scipy import ndimage
@ -129,21 +130,27 @@ class Line(PageObject):
return margins.min()
def _combine_diacritics(self, glyphs):
def is_diacritic(glyph):
# XXX
return (
glyph is not None
and glyph is not None
and glyph.elevation > 5
)
while glyphs:
glyph, *glyphs = glyphs
diacritics = list(itertools.takewhile(is_diacritic, iter(glyphs)))
for diacritic in diacritics:
glyph = Glyph(glyph.image.combine(diacritic.image), elevation=glyph.elevation)
glyphs = glyphs[len(diacritics):]
yield glyph
def find_diacritics(glyphs):
bodies = defaultdict(list)
diacritics = defaultdict(list)
for i, glyph in enumerate(glyphs):
if not glyph.is_body():
continue
neighbours = glyphs[i - 5: i] + glyphs[i + 1: i + 6]
for neighbour in neighbours:
if glyph.detect_diacritic(neighbour):
diacritics[glyph].append(neighbour)
bodies[neighbour].append(glyph)
return bodies, diacritics
bodies, diacritics = find_diacritics(glyphs)
for glyph in glyphs:
if glyph.is_body():
yield glyph.add_diacritics(*diacritics[glyph])
else:
if glyph not in bodies: # freestanding diacritic-like glyphacter without a body
yield glyph
def _insert_spaces(self, glyphs):
for glyph, next_glyph in pairwise(glyphs):
@ -161,6 +168,10 @@ class Line(PageObject):
class Glyph(PageObject):
# Tolerances used by detect_diacritic() when deciding whether another glyph
# can be a diacritic of this one. The window values are in the same units as
# the glyph's left/right coordinates (presumably pixels -- confirm).
# How far a diacritic's left edge may lie before this glyph's left edge.
DIACRITIC_WINDOW_LEFT = 3
# How far a diacritic's right edge may extend past this glyph's right edge.
DIACRITIC_WINDOW_RIGHT = 5
# Minimum elevation a glyph must have to be accepted as a diacritic.
DIACRITIC_MIN_ELEVATION = 5
def __init__(self, image, elevation):
super().__init__(image)
self.elevation = elevation
@ -170,6 +181,24 @@ class Glyph(PageObject):
"""Return a dictionary key uniquely representing this glyph."""
return self.elevation, self.image.tostring()
def is_body(self):
    """Tell whether this glyph is a body glyph, i.e. definitely not a diacritic.

    A glyph counts as a body when its elevation is zero or negative.
    """
    not_raised = 0 >= self.elevation
    return not_raised
def detect_diacritic(self, glyph):
    """Check whether ``glyph`` qualifies as a diacritic of this glyph.

    The candidate must be elevated by at least DIACRITIC_MIN_ELEVATION and
    must lie horizontally within this glyph's extent, widened by
    DIACRITIC_WINDOW_LEFT on the left and DIACRITIC_WINDOW_RIGHT on the
    right. The checks run in that order and stop at the first failure.
    """
    elevated_enough = glyph.elevation >= self.DIACRITIC_MIN_ELEVATION
    if not elevated_enough:
        return elevated_enough

    window_start = self.left - self.DIACRITIC_WINDOW_LEFT
    starts_in_window = glyph.left >= window_start
    if not starts_in_window:
        return starts_in_window

    window_end = self.right + self.DIACRITIC_WINDOW_RIGHT
    return glyph.right <= window_end
def add_diacritics(self, *diacritics):
    """Return this glyph merged with the given diacritic glyphs.

    With no diacritics the glyph itself is returned unchanged. Otherwise a
    new Glyph is returned whose image is this glyph's image combined with
    each diacritic's image in the order given, and whose elevation is this
    glyph's elevation. The receiver is not modified.
    """
    if not diacritics:
        return self
    # Fold every diacritic into the image one after another, so a body with
    # several detected diacritics no longer aborts the whole line.
    combined = self.image
    for diacritic in diacritics:
        combined = combined.combine(diacritic.image)
    return Glyph(combined, self.elevation)
class Space(Glyph):