diff --git a/pixelocr/page.py b/pixelocr/page.py
index d77dd9d..0a2dedc 100644
--- a/pixelocr/page.py
+++ b/pixelocr/page.py
@@ -80,7 +80,7 @@ class Line(PageObject):
     @collect_iterable
     def words(self):
         for rotated_word_img in self.image.T._iter_lines(min_space=10):
-            yield Word(rotated_word_img.T)
+            yield Word(rotated_word_img.T, baseline=self.baseline)
 
     @property
     def letters(self):
@@ -99,6 +99,10 @@ class Line(PageObject):
 
 
 class Word(PageObject):
+    def __init__(self, image, baseline):
+        super().__init__(image)
+        self.baseline = baseline
+
     def __iter__(self):
         return iter(self.letters)
 
@@ -108,10 +112,15 @@ class Word(PageObject):
         labels, max_label = ndimage.label(self.image.bitmap, CONNECTIVITY8)
         obj_indices = ndimage.find_objects(labels, max_label)
         letter_images = (self.image[obj_index] for obj_index in obj_indices)
-        letters = (Letter(image) for image in letter_images)
+        letters = (
+            Letter(image, self.y1 + self.baseline - image.y2)
+            for image in letter_images
+        )
         sorted_letters = sorted(letters, key=lambda letter: (letter.x1, -letter.y1))
         return iter(sorted_letters)
 
 
 class Letter(PageObject):
-    pass
+    def __init__(self, image, elevation):
+        super().__init__(image)
+        self.elevation = elevation