diff --git a/pixelocr/page.py b/pixelocr/page.py index 0ac82f8..bcef204 100644 --- a/pixelocr/page.py +++ b/pixelocr/page.py @@ -191,7 +191,7 @@ class Line(PageObject): @cached_property @collect_iterable - def glyphs(self): + def words(self): labels, max_label = ndimage.label(self.image.bitmap, CONNECTIVITY8) blob_slices = enumerate(ndimage.find_objects(labels, max_label), 1) glyph_images = ( @@ -204,7 +204,15 @@ class Line(PageObject): ) glyphs = sorted(glyphs, key=lambda glyph: (glyph.left, -glyph.bottom)) glyphs = self._combine_diacritics(glyphs) - return self._insert_spaces(glyphs) + return self._detect_words(glyphs) + + @property + @collect_iterable + def glyphs(self): + for word in self.words: + yield from word.glyphs + if word.space_after is not None: + yield word.space_after def _combine_diacritics(self, glyphs): def find_correspondence(glyphs): @@ -227,13 +235,18 @@ class Line(PageObject): if glyph not in bodies: # freestanding diacritic-like glyph without a body yield glyph - def _insert_spaces(self, glyphs, min_distance=15): + def _detect_words(self, glyphs, min_distance=15): + current_word_glyphs = [] for glyph, next_glyph in pairwise(glyphs): - yield glyph + current_word_glyphs.append(glyph) if next_glyph is not None: distance = glyph.optical_distance(next_glyph) if distance >= min_distance: - yield Space(self, self.image.space(glyph.right, self.top, distance, self.height), self.baseline - self.top) + space_after = Space(self, self.image.space(glyph.right, self.top, distance, self.height), self.baseline - self.top) + yield Word(self, current_word_glyphs, space_after) + current_word_glyphs = [] + if current_word_glyphs: + yield Word(self, current_word_glyphs) def _extract_blob(self, blob_slice, label, labels): image = self.image[blob_slice] @@ -245,7 +258,21 @@ class Line(PageObject): return distance + self._optical_correction(other, T=True) +class Word(PageObject): + def __init__(self, line, glyphs, space_after=None): + self.glyphs = glyphs + for glyph in glyphs: + glyph.word = self + self.space_after = space_after + beginning = self.glyphs[0] + end = space_after if space_after is not None else self.glyphs[-1] + image = line.image[:, beginning.left:end.right] + super().__init__(line.document, image) + + class Glyph(PageObject): + word = None + def __init__(self, line, image, elevation): super().__init__(line.document, image) self.elevation = elevation