Split lines into letters according to letter distances, ditch page.Word.
This commit is contained in:
parent
50b40458bc
commit
ce4252e361
2 changed files with 43 additions and 32 deletions
|
|
@ -64,12 +64,12 @@ class PageScene(QGraphicsScene):
|
|||
letterBrush = QBrush(QColor(255, 255, 0, 80))
|
||||
linePen = QPen(QColor(255, 50, 50, 100))
|
||||
for line in page:
|
||||
for word in line:
|
||||
for letter in word:
|
||||
if not letter.image.isspace:
|
||||
self.addRect(letter.x - 1, letter.y - 1, letter.width + 1, letter.height + 1, letterPen, letterBrush)
|
||||
for letter in line:
|
||||
if not letter.image.isspace:
|
||||
self.addRect(letter.x - 1, letter.y - 1, letter.width + 1, letter.height + 1, letterPen, letterBrush)
|
||||
else:
|
||||
self.addRect(letter.x - 1, letter.y - 1, letter.width + 1, letter.height + 1, letterPen)
|
||||
self.addLine(line.left, line.baseline, line.right, line.baseline, linePen)
|
||||
# self.addRect(line.x, line.y, line.width, line.height, Qt.red)
|
||||
|
||||
def addPage(self, page):
|
||||
qimage = ndimage2qimage(page.image.data)
|
||||
|
|
|
|||
|
|
@ -14,11 +14,13 @@
|
|||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
import itertools
|
||||
|
||||
import numpy as np
|
||||
from scipy import ndimage
|
||||
from scipy.ndimage import filters
|
||||
|
||||
from .utils import cached_property, collect_iterable
|
||||
from .utils import cached_property, collect_iterable, pairwise
|
||||
from .image import Image
|
||||
|
||||
|
||||
|
|
@ -82,18 +84,7 @@ class Page(PageObject):
|
|||
|
||||
class Line(PageObject):
|
||||
def __iter__(self):
|
||||
return iter(self.words)
|
||||
|
||||
@cached_property
|
||||
@collect_iterable
|
||||
def words(self):
|
||||
for rotated_word_img in self.image.T._iter_lines(min_space=5):
|
||||
yield Word(rotated_word_img.T, baseline=self.baseline)
|
||||
|
||||
@property
|
||||
def letters(self):
|
||||
for word in self.words:
|
||||
yield from word.letters
|
||||
return iter(self.letters)
|
||||
|
||||
@cached_property
|
||||
def baseline(self):
|
||||
|
|
@ -105,17 +96,7 @@ class Line(PageObject):
|
|||
bottom = gradient.argmin()
|
||||
return self.y + bottom
|
||||
|
||||
|
||||
class Word(PageObject):
|
||||
def __init__(self, image, baseline):
|
||||
super().__init__(image)
|
||||
self.baseline = baseline
|
||||
|
||||
def __iter__(self):
|
||||
return iter(self.letters)
|
||||
|
||||
@cached_property
|
||||
@collect_iterable
|
||||
@property
|
||||
def letters(self):
|
||||
labels, max_label = ndimage.label(self.image.bitmap, CONNECTIVITY8)
|
||||
blob_slices = enumerate(ndimage.find_objects(labels, max_label), 1)
|
||||
|
|
@ -127,8 +108,34 @@ class Word(PageObject):
|
|||
Letter(image, self.baseline - image.bottom)
|
||||
for image in letter_images
|
||||
)
|
||||
sorted_letters = sorted(letters, key=lambda letter: (letter.left, -letter.bottom))
|
||||
return iter(sorted_letters)
|
||||
letters = sorted(letters, key=lambda letter: (letter.left, -letter.bottom))
|
||||
letters = self._combine_diacritics(letters)
|
||||
return self._insert_spaces(letters)
|
||||
|
||||
def _combine_diacritics(self, letters):
|
||||
def is_diacritic(glyph):
|
||||
# XXX
|
||||
return (
|
||||
letter is not None
|
||||
and glyph is not None
|
||||
and glyph.elevation > 5
|
||||
)
|
||||
|
||||
while letters:
|
||||
letter, *letters = letters
|
||||
diacritics = list(itertools.takewhile(is_diacritic, iter(letters)))
|
||||
for diacritic in diacritics:
|
||||
letter = Letter(letter.image.combine(diacritic.image), elevation=letter.elevation)
|
||||
letters = letters[len(diacritics):]
|
||||
yield letter
|
||||
|
||||
def _insert_spaces(self, letters):
    """Yield the letters, adding a ``Space`` wherever the gap is wide.

    Each letter is yielded unchanged. When the pair produced by
    ``pairwise`` has a successor and the horizontal distance from the
    current letter's right edge to the successor's left edge is greater
    than 5, a ``Space`` is yielded right after the current letter. The
    space image starts at the current letter's right edge and ``self.top``,
    is as wide as the gap, and reaches down to ``self.baseline``.

    NOTE(review): this relies on the project's ``pairwise`` helper emitting
    a final ``(last, None)`` pair; otherwise the last letter would be
    dropped. Confirm against ``.utils.pairwise``.
    """
    for current, following in pairwise(letters):
        yield current
        if following is None:
            # Last letter of the line: nothing to measure against.
            continue
        gap = following.left - current.right
        if gap > 5:
            blank = self.image.space(
                current.right, self.top, gap, self.baseline - self.top
            )
            yield Space(blank)
|
||||
|
||||
def _extract_blob(self, blob_slice, label, labels):
|
||||
image = self.image[blob_slice]
|
||||
|
|
@ -136,8 +143,12 @@ class Word(PageObject):
|
|||
return image.mask(mask)
|
||||
|
||||
|
||||
|
||||
class Letter(PageObject):
    """A page object for one glyph, tagged with an ``elevation`` value."""

    def __init__(self, image, elevation):
        """Initialize the underlying page object and store ``elevation``.

        Args:
            image: Image of the glyph, forwarded to ``PageObject``.
            elevation: Vertical offset stored on the instance as-is.
                NOTE(review): the visible caller passes the line baseline
                minus the image bottom; confirm units and sign there.
        """
        super().__init__(image)
        self.elevation = elevation
|
||||
|
||||
|
||||
class Space(Letter):
    """A ``Letter`` standing in for blank space; its elevation is always 0."""

    def __init__(self, image):
        """Create the space from ``image`` with a fixed elevation of zero."""
        super().__init__(image, elevation=0)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue