Add Line.words property.

This commit is contained in:
Andrey Golovizin 2014-09-15 22:22:36 +02:00
parent 2f06b66b89
commit 1847bbcd9e

View file

@ -191,7 +191,7 @@ class Line(PageObject):
@cached_property
@collect_iterable
def glyphs(self):
def words(self):
labels, max_label = ndimage.label(self.image.bitmap, CONNECTIVITY8)
blob_slices = enumerate(ndimage.find_objects(labels, max_label), 1)
glyph_images = (
@ -204,7 +204,15 @@ class Line(PageObject):
)
glyphs = sorted(glyphs, key=lambda glyph: (glyph.left, -glyph.bottom))
glyphs = self._combine_diacritics(glyphs)
return self._insert_spaces(glyphs)
return self._detect_words(glyphs)
@property
@collect_iterable
def glyphs(self):
    """Flatten the words of this line into one sequence of page objects.

    Walks ``self.words`` in order; for each word it emits every entry of
    ``word.glyphs`` and then, when ``word.space_after`` is not ``None``,
    that trailing space object as well.
    """
    for current in self.words:
        for member in current.glyphs:
            yield member
        trailing_space = current.space_after
        if trailing_space is None:
            # Nothing follows this word (no gap was recorded after it).
            continue
        yield trailing_space
def _combine_diacritics(self, glyphs):
def find_correspondence(glyphs):
@ -227,13 +235,18 @@ class Line(PageObject):
if glyph not in bodies: # freestanding diacritic-like glyph without a body
yield glyph
def _detect_words(self, glyphs, min_distance=15):
    """Split a glyph sequence into ``Word`` objects at wide gaps.

    Glyphs are accumulated until the optical distance to the following
    glyph is at least ``min_distance``; at that point a ``Space`` is built
    for the gap and a ``Word`` is yielded with that space attached. Any
    glyphs still pending once the input is exhausted form a final ``Word``
    without a trailing space.

    :param glyphs: iterable of glyphs, consumed through ``pairwise``.
    :param min_distance: smallest optical distance treated as a word gap.
    """
    pending = []
    for current, following in pairwise(glyphs):
        pending.append(current)
        if following is None:
            # No successor to measure against; the leftover glyphs are
            # flushed after the loop.
            continue
        gap = current.optical_distance(following)
        if gap >= min_distance:
            gap_image = self.image.space(current.right, self.top, gap, self.height)
            trailing_space = Space(self, gap_image, self.baseline - self.top)
            yield Word(self, pending, trailing_space)
            # Start a fresh list so the yielded word keeps its own glyphs.
            pending = []
    if pending:
        yield Word(self, pending)
def _extract_blob(self, blob_slice, label, labels):
image = self.image[blob_slice]
@ -245,7 +258,21 @@ class Line(PageObject):
return distance + self._optical_correction(other, T=True)
class Word(PageObject):
    """A group of glyphs from one line, optionally followed by a space."""

    def __init__(self, line, glyphs, space_after=None):
        """Attach the glyphs to this word and crop the word's image.

        :param line: the line the word belongs to; its ``image`` and
            ``document`` attributes are used.
        :param glyphs: indexable, non-empty sequence of glyphs; each one
            gets its ``word`` attribute pointed at this instance.
        :param space_after: optional space object following the word; when
            given, the cropped image extends to its ``right`` edge.
        """
        self.glyphs = glyphs
        for member in glyphs:
            member.word = self
        self.space_after = space_after

        first = self.glyphs[0]
        if space_after is None:
            last = self.glyphs[-1]
        else:
            last = space_after
        # Full extent along the first axis, and from the first glyph's left
        # edge to the right edge of the last glyph (or trailing space) along
        # the second axis.
        word_image = line.image[:, first.left:last.right]
        super().__init__(line.document, word_image)
class Glyph(PageObject):
word = None
def __init__(self, line, image, elevation):
super().__init__(line.document, image)
self.elevation = elevation