diff --git a/pixelocr/document.py b/pixelocr/document.py
index 24e6bf9..bc4eff3 100644
--- a/pixelocr/document.py
+++ b/pixelocr/document.py
@@ -71,7 +71,7 @@ class Document(object):
         try:
             glyph_data = self.glyphdb[glyph]
         except KeyError:
-            text, bold, italic = self.ui.ask_for_help(glyph, self.last_style.bold, self.last_style.italic)
+            text, bold, italic = self.ui.ask_for_help(glyph, glyph.word.guess_bold(), glyph.word.guess_italic())
             glyph_data = self.glyphdb.add_glyph(glyph, text, bold, italic)
         self.last_style = glyph_data.style
         return glyph_data
diff --git a/pixelocr/page.py b/pixelocr/page.py
index c621960..7bbf0c8 100644
--- a/pixelocr/page.py
+++ b/pixelocr/page.py
@@ -269,6 +269,41 @@ class Word(PageObject):
         image = line.image[:, beginning.left:end.right]
         super().__init__(line.document, image)
 
+    def guess_bold(self):
+        """Guess whether this word is bold; None if no glyph is known."""
+        return self._guess('bold')
+
+    def guess_italic(self):
+        """Guess whether this word is italic; None if no glyph is known."""
+        return self._guess('italic')
+
+    def _guess(self, attr):
+        """Majority-vote a boolean style attribute over the known glyphs.
+
+        Returns None when none of the word's glyphs is found in the
+        document's glyphdb, otherwise whether more than half of the known
+        glyphs have ``attr`` set on their style.
+        """
+        # Relies on _known_glyph_info() returning a sized collection
+        # (presumably a list built by @collect_iterable -- confirm).
+        known_glyph_info = self._known_glyph_info()
+        if not known_glyph_info:
+            return None
+        total = sum(getattr(info.style, attr) for info in known_glyph_info)
+        # Unknown glyphs carry no vote, so average over the known ones only.
+        return total / len(known_glyph_info) > 0.5
+
+    @collect_iterable
+    def _known_glyph_info(self):
+        """Yield the glyphdb entry of each glyph of this word found there."""
+        glyphdb = self.document.glyphdb
+        for glyph in self.glyphs:
+            try:
+                yield glyphdb[glyph]
+            except KeyError:
+                # Not in glyphdb yet: it cannot contribute to a guess.
+                pass
+
 
 class Glyph(PageObject):
     word = None