Implement proper diacritic detection.
This commit is contained in:
parent
ada771b5c9
commit
47dcb5be7f
1 changed files with 43 additions and 14 deletions
|
|
@ -15,6 +15,7 @@
|
|||
|
||||
|
||||
import itertools
|
||||
from collections import defaultdict
|
||||
|
||||
import numpy as np
|
||||
from scipy import ndimage
|
||||
|
|
@ -129,21 +130,27 @@ class Line(PageObject):
|
|||
return margins.min()
|
||||
|
||||
def _combine_diacritics(self, glyphs):
    """Yield the given glyphs with diacritics merged into their bodies.

    Every glyph for which ``is_body()`` is true asks, through
    ``detect_diacritic()``, about up to five glyphs on either side of it.
    The glyphs it accepts are handed to ``add_diacritics()`` and the
    result is yielded in the body's place.  A non-body glyph that was
    accepted by at least one body is dropped from the output; a non-body
    glyph that no body accepted is yielded unchanged.

    ``glyphs`` may be any iterable; it is materialised once because it is
    indexed and walked twice.
    """
    glyphs = list(glyphs)
    reach = 5  # how many glyphs to look at on each side of a body

    # Bookkeeping is keyed by position rather than by glyph object, so
    # glyphs need not be hashable and two glyphs that compare equal are
    # never mistaken for one another.
    attached = defaultdict(list)  # body index -> diacritics it accepted
    claimed = set()               # indices of glyphs accepted by a body

    for i, glyph in enumerate(glyphs):
        if not glyph.is_body():
            continue
        # Clamp the window explicitly: a negative slice start would wrap
        # around to the end of the line instead of stopping at index 0.
        first = max(0, i - reach)
        last = min(len(glyphs), i + reach + 1)
        for j in range(first, last):
            if j != i and glyph.detect_diacritic(glyphs[j]):
                attached[i].append(glyphs[j])
                claimed.add(j)

    for i, glyph in enumerate(glyphs):
        if glyph.is_body():
            yield glyph.add_diacritics(*attached.get(i, ()))
        elif i not in claimed:
            # Free-standing diacritic-like glyph without a body.
            yield glyph
|
||||
|
||||
def _insert_spaces(self, glyphs):
|
||||
for glyph, next_glyph in pairwise(glyphs):
|
||||
|
|
@ -161,6 +168,10 @@ class Line(PageObject):
|
|||
|
||||
|
||||
class Glyph(PageObject):
|
||||
DIACRITIC_WINDOW_LEFT = 3
|
||||
DIACRITIC_WINDOW_RIGHT = 5
|
||||
DIACRITIC_MIN_ELEVATION = 5
|
||||
|
||||
def __init__(self, image, elevation):
|
||||
super().__init__(image)
|
||||
self.elevation = elevation
|
||||
|
|
@ -170,6 +181,24 @@ class Glyph(PageObject):
|
|||
"""Return a dictionary key uniquely representing this glyph."""
|
||||
return self.elevation, self.image.tostring()
|
||||
|
||||
def is_body(self):
    """Tell whether this glyph can be ruled out as a diacritic.

    A glyph counts as a body when its elevation is zero or negative.
    """
    elevation = self.elevation
    return elevation <= 0
|
||||
|
||||
def detect_diacritic(self, glyph):
    """Decide whether ``glyph`` may be a diacritic belonging to this glyph.

    Three conditions must all hold, and they are checked in this order:

    * ``glyph.elevation`` is at least ``DIACRITIC_MIN_ELEVATION``;
    * ``glyph.left`` is no further left than ``self.left`` minus
      ``DIACRITIC_WINDOW_LEFT``;
    * ``glyph.right`` is no further right than ``self.right`` plus
      ``DIACRITIC_WINDOW_RIGHT``.

    The result of the first failing comparison (or of the last one, if
    all pass) is returned.
    """
    raised_enough = glyph.elevation >= self.DIACRITIC_MIN_ELEVATION
    if not raised_enough:
        return raised_enough

    left_limit = self.left - self.DIACRITIC_WINDOW_LEFT
    inside_left = glyph.left >= left_limit
    if not inside_left:
        return inside_left

    right_limit = self.right + self.DIACRITIC_WINDOW_RIGHT
    return glyph.right <= right_limit
|
||||
|
||||
def add_diacritics(self, *diacritics):
    """Return this glyph combined with the given diacritic glyphs.

    With no diacritics the glyph itself is returned unchanged.  With
    exactly one, a new ``Glyph`` is built from this glyph's image combined
    with the diacritic's image, keeping this glyph's elevation.

    Raises:
        NotImplementedError: if more than one diacritic is given;
            combining several diacritics is not implemented.
    """
    if not diacritics:
        return self
    if len(diacritics) > 1:
        # Say what was attempted so the failure can be diagnosed from the
        # traceback alone.
        raise NotImplementedError(
            "combining {} diacritics with a single glyph is not "
            "supported".format(len(diacritics))
        )
    (diacritic,) = diacritics
    return Glyph(self.image.combine(diacritic.image), self.elevation)
|
||||
|
||||
|
||||
class Space(Glyph):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue