Remove the Word level from the page model: Line now iterates its letters directly.
Line.letters sorts the glyphs by (left, -bottom), merges following glyphs whose
elevation is > 5 into the preceding letter (_combine_diacritics) and yields a
Space for horizontal gaps wider than 5 (_insert_spaces).
PageScene outlines whitespace letters without the fill brush.

diff --git a/pixelocr/gui/pageview.py b/pixelocr/gui/pageview.py
index ee7be4d..4033b26 100644
--- a/pixelocr/gui/pageview.py
+++ b/pixelocr/gui/pageview.py
@@ -64,12 +64,12 @@ class PageScene(QGraphicsScene):
         letterBrush = QBrush(QColor(255, 255, 0, 80))
         linePen = QPen(QColor(255, 50, 50, 100))
         for line in page:
-            for word in line:
-                for letter in word:
-                    if not letter.image.isspace:
-                        self.addRect(letter.x - 1, letter.y - 1, letter.width + 1, letter.height + 1, letterPen, letterBrush)
+            for letter in line:
+                if not letter.image.isspace:
+                    self.addRect(letter.x - 1, letter.y - 1, letter.width + 1, letter.height + 1, letterPen, letterBrush)
+                else:
+                    self.addRect(letter.x - 1, letter.y - 1, letter.width + 1, letter.height + 1, letterPen)
             self.addLine(line.left, line.baseline, line.right, line.baseline, linePen)
-#           self.addRect(line.x, line.y, line.width, line.height, Qt.red)
 
     def addPage(self, page):
         qimage = ndimage2qimage(page.image.data)
diff --git a/pixelocr/page.py b/pixelocr/page.py
index 5a7b9d1..76ef924 100644
--- a/pixelocr/page.py
+++ b/pixelocr/page.py
@@ -14,11 +14,13 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 
+import itertools
+
 import numpy as np
 from scipy import ndimage
 from scipy.ndimage import filters
 
-from .utils import cached_property, collect_iterable
+from .utils import cached_property, collect_iterable, pairwise
 from .image import Image
 
 
@@ -82,18 +84,7 @@ class Page(PageObject):
 
 class Line(PageObject):
     def __iter__(self):
-        return iter(self.words)
-
-    @cached_property
-    @collect_iterable
-    def words(self):
-        for rotated_word_img in self.image.T._iter_lines(min_space=5):
-            yield Word(rotated_word_img.T, baseline=self.baseline)
-
-    @property
-    def letters(self):
-        for word in self.words:
-            yield from word.letters
+        return iter(self.letters)
 
     @cached_property
     def baseline(self):
@@ -105,17 +96,7 @@ class Line(PageObject):
         bottom = gradient.argmin()
         return self.y + bottom
 
-
-class Word(PageObject):
-    def __init__(self, image, baseline):
-        super().__init__(image)
-        self.baseline = baseline
-
-    def __iter__(self):
-        return iter(self.letters)
-
-    @cached_property
-    @collect_iterable
+    @property
     def letters(self):
         labels, max_label = ndimage.label(self.image.bitmap, CONNECTIVITY8)
         blob_slices = enumerate(ndimage.find_objects(labels, max_label), 1)
@@ -127,8 +108,34 @@ class Word(PageObject):
             Letter(image, self.baseline - image.bottom)
             for image in letter_images
         )
-        sorted_letters = sorted(letters, key=lambda letter: (letter.left, -letter.bottom))
-        return iter(sorted_letters)
+        letters = sorted(letters, key=lambda letter: (letter.left, -letter.bottom))
+        letters = self._combine_diacritics(letters)
+        return self._insert_spaces(letters)
+
+    def _combine_diacritics(self, letters):
+        def is_diacritic(glyph):
+            # XXX
+            return (
+                letter is not None
+                and glyph is not None
+                and glyph.elevation > 5
+            )
+
+        while letters:
+            letter, *letters = letters
+            diacritics = list(itertools.takewhile(is_diacritic, iter(letters)))
+            for diacritic in diacritics:
+                letter = Letter(letter.image.combine(diacritic.image), elevation=letter.elevation)
+            letters = letters[len(diacritics):]
+            yield letter
+
+    def _insert_spaces(self, letters):
+        for letter, next_letter in pairwise(letters):
+            yield letter
+            if next_letter is not None:
+                distance = next_letter.left - letter.right
+                if distance > 5:
+                    yield Space(self.image.space(letter.right, self.top, distance, self.baseline - self.top))
 
     def _extract_blob(self, blob_slice, label, labels):
         image = self.image[blob_slice]
@@ -136,8 +143,12 @@ class Word(PageObject):
         return image.mask(mask)
 
 
-
 class Letter(PageObject):
     def __init__(self, image, elevation):
         super().__init__(image)
         self.elevation = elevation
+
+
+class Space(Letter):
+    def __init__(self, image):
+        super().__init__(image, elevation=0)