From b756ea484b72b11c949cd583404356e4273d0d31 Mon Sep 17 00:00:00 2001
From: Andrey Golovizin
Date: Wed, 13 Aug 2014 15:20:35 +0200
Subject: [PATCH] Add Page.lines, Line.words, Line.letters and Word.letters properties.

---
Reviewer notes (free-form text between the "---" marker and the diff is
not part of the commit message):

* The line structure of this patch was restored from a copy whose newlines
  had been collapsed. The hunk bodies match the line counts in the @@
  headers, but the exact placement of blank context lines (in particular
  around the imports in the first hunk) is inferred -- TODO confirm against
  pixelocr/page.py at blob 3d74acb before applying.
* Page.__iter__, Line.__iter__ and Word.__iter__ now delegate to the new
  lines / words / letters properties; the former generator bodies become
  the bodies of those properties.
* cached_property and collect_iterable are imported from .utils and are not
  defined in this patch; presumably they cache the value and materialise
  the generator -- verify in pixelocr/utils.py.
* Line.letters is a plain @property with a generator body, so every access
  returns a new single-pass generator. Whether that is intended next to the
  cached_property-decorated siblings is worth confirming. It also uses
  "yield from", which requires Python 3.3 or newer.

 pixelocr/page.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/pixelocr/page.py b/pixelocr/page.py
index 3d74acb..1d34fdc 100644
--- a/pixelocr/page.py
+++ b/pixelocr/page.py
@@ -16,6 +16,7 @@
 
 from scipy import ndimage
 
+from .utils import cached_property, collect_iterable
 from .image import Image
 
 
@@ -60,18 +61,38 @@ class PageObject(object):
 
 class Page(PageObject):
     def __iter__(self):
+        return iter(self.lines)
+
+    @cached_property
+    @collect_iterable
+    def lines(self):
         for line_img in self.image._iter_lines(min_space=5):
             yield Line(line_img)
 
 
 class Line(PageObject):
     def __iter__(self):
+        return iter(self.words)
+
+    @cached_property
+    @collect_iterable
+    def words(self):
         for rotated_word_img in self.image.T._iter_lines(min_space=10):
             yield Word(rotated_word_img.T)
 
+    @property
+    def letters(self):
+        for word in self.words:
+            yield from word.letters
+
 
 class Word(PageObject):
     def __iter__(self):
+        return iter(self.letters)
+
+    @cached_property
+    @collect_iterable
+    def letters(self):
         labels, max_label = ndimage.label(self.image.bitmap, CONNECTIVITY8)
         obj_indices = ndimage.find_objects(labels, max_label)
         letter_images = (self.image[obj_index] for obj_index in obj_indices)