Implement proper diacritic detection.
This commit is contained in:
parent
ada771b5c9
commit
47dcb5be7f
1 changed files with 43 additions and 14 deletions
|
|
@ -15,6 +15,7 @@
|
||||||
|
|
||||||
|
|
||||||
import itertools
|
import itertools
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from scipy import ndimage
|
from scipy import ndimage
|
||||||
|
|
@ -129,20 +130,26 @@ class Line(PageObject):
|
||||||
return margins.min()
|
return margins.min()
|
||||||
|
|
||||||
def _combine_diacritics(self, glyphs):
|
def _combine_diacritics(self, glyphs):
|
||||||
def is_diacritic(glyph):
|
|
||||||
# XXX
|
|
||||||
return (
|
|
||||||
glyph is not None
|
|
||||||
and glyph is not None
|
|
||||||
and glyph.elevation > 5
|
|
||||||
)
|
|
||||||
|
|
||||||
while glyphs:
|
def find_diacritics(glyphs):
|
||||||
glyph, *glyphs = glyphs
|
bodies = defaultdict(list)
|
||||||
diacritics = list(itertools.takewhile(is_diacritic, iter(glyphs)))
|
diacritics = defaultdict(list)
|
||||||
for diacritic in diacritics:
|
for i, glyph in enumerate(glyphs):
|
||||||
glyph = Glyph(glyph.image.combine(diacritic.image), elevation=glyph.elevation)
|
if not glyph.is_body():
|
||||||
glyphs = glyphs[len(diacritics):]
|
continue
|
||||||
|
neighbours = glyphs[i - 5: i] + glyphs[i + 1: i + 6]
|
||||||
|
for neighbour in neighbours:
|
||||||
|
if glyph.detect_diacritic(neighbour):
|
||||||
|
diacritics[glyph].append(neighbour)
|
||||||
|
bodies[neighbour].append(glyph)
|
||||||
|
return bodies, diacritics
|
||||||
|
|
||||||
|
bodies, diacritics = find_diacritics(glyphs)
|
||||||
|
for glyph in glyphs:
|
||||||
|
if glyph.is_body():
|
||||||
|
yield glyph.add_diacritics(*diacritics[glyph])
|
||||||
|
else:
|
||||||
|
if glyph not in bodies: # freestanding diacritic-like glyphacter without a body
|
||||||
yield glyph
|
yield glyph
|
||||||
|
|
||||||
def _insert_spaces(self, glyphs):
|
def _insert_spaces(self, glyphs):
|
||||||
|
|
@ -161,6 +168,10 @@ class Line(PageObject):
|
||||||
|
|
||||||
|
|
||||||
class Glyph(PageObject):
|
class Glyph(PageObject):
|
||||||
|
DIACRITIC_WINDOW_LEFT = 3
|
||||||
|
DIACRITIC_WINDOW_RIGHT = 5
|
||||||
|
DIACRITIC_MIN_ELEVATION = 5
|
||||||
|
|
||||||
def __init__(self, image, elevation):
    """Create a glyph from its image and its elevation.

    The image is passed on to the parent initialiser; the elevation is
    kept on the instance as ``self.elevation``.
    """
    # Parent set-up first, then the glyph-specific attribute.
    super().__init__(image)
    self.elevation = elevation
|
||||||
|
|
@ -170,6 +181,24 @@ class Glyph(PageObject):
|
||||||
"""Return a dictionary key uniquely representing this glyph."""
|
"""Return a dictionary key uniquely representing this glyph."""
|
||||||
return self.elevation, self.image.tostring()
|
return self.elevation, self.image.tostring()
|
||||||
|
|
||||||
|
def is_body(self):
    """Tell whether this glyph is certainly a body, not a diacritic.

    A glyph counts as a body when its elevation is zero or negative.
    """
    at_or_below_zero = self.elevation <= 0
    return at_or_below_zero
|
||||||
|
|
||||||
|
def detect_diacritic(self, glyph):
    """Tell whether ``glyph`` may be a diacritic of this glyph.

    Three conditions must all hold:

    * its elevation is at least ``DIACRITIC_MIN_ELEVATION``;
    * its left edge is no further left than ``self.left`` minus
      ``DIACRITIC_WINDOW_LEFT``;
    * its right edge is no further right than ``self.right`` plus
      ``DIACRITIC_WINDOW_RIGHT``.
    """
    # Narrow step by step; a later check runs only if the earlier ones held.
    fits = glyph.elevation >= self.DIACRITIC_MIN_ELEVATION
    if fits:
        fits = glyph.left >= self.left - self.DIACRITIC_WINDOW_LEFT
    if fits:
        fits = glyph.right <= self.right + self.DIACRITIC_WINDOW_RIGHT
    return fits
|
||||||
|
|
||||||
|
def add_diacritics(self, *diacritics):
    """Return this glyph with the given diacritic merged in.

    With no diacritics the glyph itself is returned.  With exactly one, a
    new ``Glyph`` is built from the two images combined, keeping this
    glyph's elevation.

    Raises:
        NotImplementedError: if more than one diacritic is given.
    """
    count = len(diacritics)
    if count == 0:
        return self
    if count > 1:
        # Merging several diacritics at once is not supported.
        raise NotImplementedError
    (mark,) = diacritics
    merged_image = self.image.combine(mark.image)
    return Glyph(merged_image, self.elevation)
|
||||||
|
|
||||||
|
|
||||||
class Space(Glyph):
|
class Space(Glyph):
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue