Split lines into letters according to letter distances, ditch page.Word.
This commit is contained in:
parent
50b40458bc
commit
ce4252e361
2 changed files with 43 additions and 32 deletions
|
|
@ -64,12 +64,12 @@ class PageScene(QGraphicsScene):
|
||||||
letterBrush = QBrush(QColor(255, 255, 0, 80))
|
letterBrush = QBrush(QColor(255, 255, 0, 80))
|
||||||
linePen = QPen(QColor(255, 50, 50, 100))
|
linePen = QPen(QColor(255, 50, 50, 100))
|
||||||
for line in page:
|
for line in page:
|
||||||
for word in line:
|
for letter in line:
|
||||||
for letter in word:
|
if not letter.image.isspace:
|
||||||
if not letter.image.isspace:
|
self.addRect(letter.x - 1, letter.y - 1, letter.width + 1, letter.height + 1, letterPen, letterBrush)
|
||||||
self.addRect(letter.x - 1, letter.y - 1, letter.width + 1, letter.height + 1, letterPen, letterBrush)
|
else:
|
||||||
|
self.addRect(letter.x - 1, letter.y - 1, letter.width + 1, letter.height + 1, letterPen)
|
||||||
self.addLine(line.left, line.baseline, line.right, line.baseline, linePen)
|
self.addLine(line.left, line.baseline, line.right, line.baseline, linePen)
|
||||||
# self.addRect(line.x, line.y, line.width, line.height, Qt.red)
|
|
||||||
|
|
||||||
def addPage(self, page):
|
def addPage(self, page):
|
||||||
qimage = ndimage2qimage(page.image.data)
|
qimage = ndimage2qimage(page.image.data)
|
||||||
|
|
|
||||||
|
|
@ -14,11 +14,13 @@
|
||||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
|
||||||
|
import itertools
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from scipy import ndimage
|
from scipy import ndimage
|
||||||
from scipy.ndimage import filters
|
from scipy.ndimage import filters
|
||||||
|
|
||||||
from .utils import cached_property, collect_iterable
|
from .utils import cached_property, collect_iterable, pairwise
|
||||||
from .image import Image
|
from .image import Image
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -82,18 +84,7 @@ class Page(PageObject):
|
||||||
|
|
||||||
class Line(PageObject):
|
class Line(PageObject):
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
return iter(self.words)
|
return iter(self.letters)
|
||||||
|
|
||||||
@cached_property
|
|
||||||
@collect_iterable
|
|
||||||
def words(self):
|
|
||||||
for rotated_word_img in self.image.T._iter_lines(min_space=5):
|
|
||||||
yield Word(rotated_word_img.T, baseline=self.baseline)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def letters(self):
|
|
||||||
for word in self.words:
|
|
||||||
yield from word.letters
|
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
def baseline(self):
|
def baseline(self):
|
||||||
|
|
@ -105,17 +96,7 @@ class Line(PageObject):
|
||||||
bottom = gradient.argmin()
|
bottom = gradient.argmin()
|
||||||
return self.y + bottom
|
return self.y + bottom
|
||||||
|
|
||||||
|
@property
|
||||||
class Word(PageObject):
|
|
||||||
def __init__(self, image, baseline):
|
|
||||||
super().__init__(image)
|
|
||||||
self.baseline = baseline
|
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
return iter(self.letters)
|
|
||||||
|
|
||||||
@cached_property
|
|
||||||
@collect_iterable
|
|
||||||
def letters(self):
|
def letters(self):
|
||||||
labels, max_label = ndimage.label(self.image.bitmap, CONNECTIVITY8)
|
labels, max_label = ndimage.label(self.image.bitmap, CONNECTIVITY8)
|
||||||
blob_slices = enumerate(ndimage.find_objects(labels, max_label), 1)
|
blob_slices = enumerate(ndimage.find_objects(labels, max_label), 1)
|
||||||
|
|
@ -127,8 +108,34 @@ class Word(PageObject):
|
||||||
Letter(image, self.baseline - image.bottom)
|
Letter(image, self.baseline - image.bottom)
|
||||||
for image in letter_images
|
for image in letter_images
|
||||||
)
|
)
|
||||||
sorted_letters = sorted(letters, key=lambda letter: (letter.left, -letter.bottom))
|
letters = sorted(letters, key=lambda letter: (letter.left, -letter.bottom))
|
||||||
return iter(sorted_letters)
|
letters = self._combine_diacritics(letters)
|
||||||
|
return self._insert_spaces(letters)
|
||||||
|
|
||||||
|
def _combine_diacritics(self, letters):
|
||||||
|
def is_diacritic(glyph):
|
||||||
|
# XXX
|
||||||
|
return (
|
||||||
|
letter is not None
|
||||||
|
and glyph is not None
|
||||||
|
and glyph.elevation > 5
|
||||||
|
)
|
||||||
|
|
||||||
|
while letters:
|
||||||
|
letter, *letters = letters
|
||||||
|
diacritics = list(itertools.takewhile(is_diacritic, iter(letters)))
|
||||||
|
for diacritic in diacritics:
|
||||||
|
letter = Letter(letter.image.combine(diacritic.image), elevation=letter.elevation)
|
||||||
|
letters = letters[len(diacritics):]
|
||||||
|
yield letter
|
||||||
|
|
||||||
|
def _insert_spaces(self, letters):
    """Re-emit ``letters``, adding a ``Space`` wherever neighbours are far apart.

    Each glyph is yielded unchanged. When the horizontal gap between a
    glyph's ``right`` edge and the next glyph's ``left`` edge is greater
    than 5, a ``Space`` is yielded between them. The space image starts
    at the current glyph's right edge and at ``self.top``, is as wide as
    the gap, and reaches from ``self.top`` down to ``self.baseline``.

    NOTE(review): the ``None`` check implies the project's ``pairwise``
    pairs the final item with ``None`` (unlike ``itertools.pairwise``);
    confirm against ``utils.pairwise``.
    """
    for current, following in pairwise(letters):
        yield current
        if following is None:
            # Last glyph of the line: nothing to measure against.
            continue
        gap = following.left - current.right
        if not gap > 5:
            continue
        yield Space(
            self.image.space(current.right, self.top, gap, self.baseline - self.top)
        )
|
||||||
|
|
||||||
def _extract_blob(self, blob_slice, label, labels):
|
def _extract_blob(self, blob_slice, label, labels):
|
||||||
image = self.image[blob_slice]
|
image = self.image[blob_slice]
|
||||||
|
|
@ -136,8 +143,12 @@ class Word(PageObject):
|
||||||
return image.mask(mask)
|
return image.mask(mask)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class Letter(PageObject):
    """A single glyph image together with its elevation.

    Elsewhere in this module letters are built with
    ``baseline - image.bottom`` as the elevation, i.e. the offset between
    the line's baseline and the bottom edge of the glyph image.
    """

    def __init__(self, image, elevation):
        """Store ``image`` via the ``PageObject`` initialiser and record ``elevation``."""
        super().__init__(image)
        self.elevation = elevation
|
||||||
|
|
||||||
|
|
||||||
|
class Space(Letter):
    """A ``Letter`` standing in for a blank gap; its elevation is always 0."""

    def __init__(self, image):
        """Wrap ``image`` as a letter with elevation fixed at 0."""
        super().__init__(image, elevation=0)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue