Split lines into letters according to letter distances, ditch page.Word.

This commit is contained in:
Andrey Golovizin 2014-08-15 14:25:02 +02:00
parent 50b40458bc
commit ce4252e361
2 changed files with 43 additions and 32 deletions

View file

@ -64,12 +64,12 @@ class PageScene(QGraphicsScene):
letterBrush = QBrush(QColor(255, 255, 0, 80))
linePen = QPen(QColor(255, 50, 50, 100))
for line in page:
for word in line:
for letter in word:
if not letter.image.isspace:
self.addRect(letter.x - 1, letter.y - 1, letter.width + 1, letter.height + 1, letterPen, letterBrush)
for letter in line:
if not letter.image.isspace:
self.addRect(letter.x - 1, letter.y - 1, letter.width + 1, letter.height + 1, letterPen, letterBrush)
else:
self.addRect(letter.x - 1, letter.y - 1, letter.width + 1, letter.height + 1, letterPen)
self.addLine(line.left, line.baseline, line.right, line.baseline, linePen)
# self.addRect(line.x, line.y, line.width, line.height, Qt.red)
def addPage(self, page):
qimage = ndimage2qimage(page.image.data)

View file

@ -14,11 +14,13 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import itertools
import numpy as np
from scipy import ndimage
from scipy.ndimage import filters
from .utils import cached_property, collect_iterable
from .utils import cached_property, collect_iterable, pairwise
from .image import Image
@ -82,18 +84,7 @@ class Page(PageObject):
class Line(PageObject):
def __iter__(self):
return iter(self.words)
@cached_property
@collect_iterable
def words(self):
for rotated_word_img in self.image.T._iter_lines(min_space=5):
yield Word(rotated_word_img.T, baseline=self.baseline)
@property
def letters(self):
for word in self.words:
yield from word.letters
return iter(self.letters)
@cached_property
def baseline(self):
@ -105,17 +96,7 @@ class Line(PageObject):
bottom = gradient.argmin()
return self.y + bottom
class Word(PageObject):
def __init__(self, image, baseline):
super().__init__(image)
self.baseline = baseline
def __iter__(self):
return iter(self.letters)
@cached_property
@collect_iterable
@property
def letters(self):
labels, max_label = ndimage.label(self.image.bitmap, CONNECTIVITY8)
blob_slices = enumerate(ndimage.find_objects(labels, max_label), 1)
@ -127,8 +108,34 @@ class Word(PageObject):
Letter(image, self.baseline - image.bottom)
for image in letter_images
)
sorted_letters = sorted(letters, key=lambda letter: (letter.left, -letter.bottom))
return iter(sorted_letters)
letters = sorted(letters, key=lambda letter: (letter.left, -letter.bottom))
letters = self._combine_diacritics(letters)
return self._insert_spaces(letters)
def _combine_diacritics(self, letters, min_elevation=5):
    """Merge every glyph with the diacritics that immediately follow it.

    The glyphs are walked once, in the order given. Each glyph starts a new
    letter; every directly following glyph whose ``elevation`` is greater
    than ``min_elevation`` is treated as a diacritic and folded into that
    letter via ``image.combine``. The combined letter keeps the elevation
    of the base glyph.

    Args:
        letters: iterable of glyph objects exposing ``image`` and
            ``elevation``. ``None`` entries are passed through untouched
            and never take part in a merge.
        min_elevation: a following glyph is considered a diacritic only
            when its elevation is strictly greater than this value.

    Yields:
        The base glyphs, each replaced by a new ``Letter`` when at least
        one diacritic was merged into it.
    """
    # Materialise once so we can walk by index instead of re-copying the
    # remaining tail of the list on every iteration.
    glyphs = list(letters)
    total = len(glyphs)
    position = 0
    while position < total:
        letter = glyphs[position]
        position += 1
        if letter is None:
            # Nothing can be combined with a missing glyph.
            yield letter
            continue
        while position < total:
            glyph = glyphs[position]
            # XXX: crude heuristic -- anything sitting high enough above
            # the baseline is assumed to be a diacritic.
            if glyph is None or not glyph.elevation > min_elevation:
                break
            letter = Letter(letter.image.combine(glyph.image), elevation=letter.elevation)
            position += 1
        yield letter
def _insert_spaces(self, letters):
    """Yield the given letters, adding a ``Space`` into every wide gap.

    Each letter is yielded unchanged. When the horizontal distance between
    a letter's right edge and the next letter's left edge is greater than
    5, a ``Space`` is yielded between them. Its image is requested from
    ``self.image.space`` starting at the letter's right edge and
    ``self.top``, with the gap as width and ``self.baseline - self.top``
    as height.

    NOTE(review): relies on the project's ``pairwise`` helper pairing the
    final letter with ``None`` (inferred from the ``None`` check below) --
    confirm against ``utils.pairwise``.
    """
    for current, following in pairwise(letters):
        yield current
        if following is None:
            # Last letter: there is no gap left to measure.
            continue
        gap = following.left - current.right
        if gap > 5:
            gap_image = self.image.space(current.right, self.top, gap, self.baseline - self.top)
            yield Space(gap_image)
def _extract_blob(self, blob_slice, label, labels):
image = self.image[blob_slice]
@ -136,8 +143,12 @@ class Word(PageObject):
return image.mask(mask)
class Letter(PageObject):
    """A page object for a single glyph, carrying its elevation.

    ``elevation`` is stored as given; the visible caller in this file
    passes ``baseline - image.bottom``.
    """

    def __init__(self, image, elevation):
        """Initialise the page object from ``image`` and record ``elevation``."""
        super().__init__(image)
        self.elevation = elevation
class Space(Letter):
    """A gap between two letters, modelled as a ``Letter`` with zero elevation."""

    def __init__(self, image):
        """Create the space from ``image``; its elevation is always 0."""
        super().__init__(image, 0)