diff --git a/pixelocr/page.py b/pixelocr/page.py index e5cda0e..c6a8aea 100644 --- a/pixelocr/page.py +++ b/pixelocr/page.py @@ -183,9 +183,6 @@ class Line(PageObject): class Glyph(PageObject): - DIACRITIC_WINDOW_LEFT = 3 - DIACRITIC_WINDOW_RIGHT = 3 - DIACRITIC_MIN_ELEVATION = 5 MIN_BODY_HEIGHT = 10 def __init__(self, image, elevation): @@ -202,23 +199,41 @@ class Glyph(PageObject): return self.height >= self.MIN_BODY_HEIGHT def detect_diacritic(self, glyph): - """Check if the given glyph can be our diacritic and return a numeric score. - - Higher score means higher probability. Zero means "absolutely not". - """ + """Return True if the given glyph can be our diacritic, False otherwise.""" if not self.is_body(): - return 0 - if glyph.elevation > 0 and glyph.elevation < self.DIACRITIC_MIN_ELEVATION: - return 0 - if glyph.top >= self.top and glyph.top < self.bottom: - return 0 - if ( - glyph.left < self.left - self.DIACRITIC_WINDOW_LEFT - or glyph.right > self.right + self.DIACRITIC_WINDOW_RIGHT + return False + + # TODO: remove hardcoded sizes + + # diacritic above the letter + if glyph.fits( + self.left - 3, + self.top - 10, + self.right + 3, + self.top + 3, ): - return 0 - return 100 - abs(self.xcenter - glyph.xcenter) + return True + + # apostrophe, like in ť + if glyph.fits( + self.right - 5, + self.top - 5, + self.right + 5, + self.top + 5, + ) and glyph.height > 3: + return True + + # dot in ? and ! + if glyph.fits( + self.left - 3, + self.bottom + 1, + self.right + 3, + self.bottom + 10, + ): + return True + + return False def add_diacritics(self, *diacritics): if not diacritics: