Add support for output formats.

2014-09-04 17:46:48 +02:00 · 2014-09-04 17:46:48 +02:00 · 296035c966
commit 296035c966
parent bd2a206940
3 changed files with 97 additions and 10 deletions
--- a/pixelocr/formatting.py
+++ b/pixelocr/formatting.py
@ -0,0 +1,72 @@
 # Copyright (C) 2014  Andrey Golovizin
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 import itertools
 from .utils import pipe
 from .glyphdb import WhitespaceData
 class OutputFormat(object):
    """Base class for turning a sequence of glyph data into formatted output.

    The sequence is processed in three stages: ``group_by_tag`` merges
    consecutive glyphs that share a tag, ``fix_spaces`` moves trailing
    whitespace out of each tagged run, and ``format_tags`` renders every
    ``(tag, text)`` pair with ``format_tag``.

    Subclasses must implement ``assign_glyph_tag`` and ``format_tag``.
    """

    # Tag of the most recent non-whitespace glyph; whitespace inherits it.
    last_tag = None

    def __call__(self, glyph_data_stream):
        """Format ``glyph_data_stream``; equivalent to calling ``format``."""
        return self.format(glyph_data_stream)

    def format(self, glyph_data_seq):
        """Run ``glyph_data_seq`` through the grouping/spacing/formatting stages.

        Returns whatever ``pipe`` (imported from ``.utils``) returns.
        NOTE(review): presumably ``pipe`` feeds the sequence through each
        stage in order and yields the formatted chunks -- confirm against
        ``.utils.pipe``.
        """
        # Reset the state on the instance.  A bare ``last_tag = None`` only
        # binds an unused local, so a reused formatter would hand the tag left
        # over from the previous run to the leading whitespace of this one.
        self.last_tag = None
        return pipe(glyph_data_seq, self.group_by_tag, self.fix_spaces, self.format_tags)

    def group_by_tag(self, styled_glyphs):
        """Yield ``(tag, text)`` for each run of consecutive same-tag glyphs.

        The tag comes from ``assign_tag``; the text is the concatenation of
        the ``text`` attribute of every glyph in the run.
        """
        for tag, group in itertools.groupby(styled_glyphs, key=self.assign_tag):
            text = ''.join(glyph_data.text for glyph_data in group)
            yield tag, text

    def fix_spaces(self, tagged_text):
        """Split trailing whitespace off every ``(tag, text)`` pair.

        A pair whose text ends in whitespace is replaced by two pairs: the
        stripped text under the original tag, followed by the whitespace
        under the tag ``None``.  Other pairs pass through unchanged.
        """
        for tag, text in tagged_text:
            stripped_text = text.rstrip()
            if len(stripped_text) < len(text):
                trailing_whitespace = text[len(stripped_text):]
                yield tag, stripped_text
                yield None, trailing_whitespace
            else:
                yield tag, text

    def format_tags(self, tagged_text):
        """Yield ``format_tag(tag, text)`` for every ``(tag, text)`` pair."""
        for tag, text in tagged_text:
            yield self.format_tag(tag, text)

    def assign_tag(self, glyph_data):
        """Return the tag for ``glyph_data``, remembering it for later whitespace.

        Whitespace gets the tag of the last non-whitespace glyph, so it stays
        in the same group as the glyph before it.
        """
        if isinstance(glyph_data, WhitespaceData):
            return self.last_tag
        else:
            tag = self.assign_glyph_tag(glyph_data)
            self.last_tag = tag
            return tag

    def assign_glyph_tag(self, glyph_data):
        """Return the tag for a non-whitespace glyph; subclasses must override."""
        raise NotImplementedError

    def format_tag(self, tag, text):
        """Render ``text`` according to ``tag``; subclasses must override."""
        raise NotImplementedError
 class TextFormat(OutputFormat):
    """Plain-text output format: no tags, text passed through unchanged."""

    def format_tag(self, tag, text):
        """Return ``text`` as is; ``tag`` is ignored."""
        return text

    def assign_tag(self, glyph_data):
        """Tag every glyph with ``None``, so all glyphs share one tag."""
        return None
--- a/pixelocr/glyphdb.py
+++ b/pixelocr/glyphdb.py
@ -66,6 +66,17 @@ class GlyphData(object):
        return cls(*args)
 class WhitespaceData(GlyphData):
    """Glyph data for whitespace, which has no glyph image of its own."""

    color = (255, 255, 255)  # FIXME support non-white background

    def __init__(self, text):
        """Create whitespace data carrying ``text``, never bold or italic.

        The two leading positional arguments of ``GlyphData.__init__`` are
        passed as ``None``.
        """
        super().__init__(
            None,
            None,
            text,
            bold=False,
            italic=False,
        )
 # Module-level WhitespaceData instances for a single space and a line break.
 SPACE = WhitespaceData(' ')
 NEWLINE = WhitespaceData('\n')
 class GlyphDB(object):
    def __init__(self, filename):
        self.filename = filename
--- a/pixelocr/gui/ocrengine.py
+++ b/pixelocr/gui/ocrengine.py
@ -14,6 +14,7 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 import itertools
 from glob import glob
 from os import path
 from queue import Queue
@ -27,9 +28,10 @@ from PyQt4.QtGui import (
    qApp
 )
 from .. import formatting
 from ..image import Image
 from ..page import Page, Glyph, Space
-from ..glyphdb import GlyphDB
+from ..glyphdb import GlyphDB, SPACE, NEWLINE
 class OCREngine(QThread):
@ -53,7 +55,6 @@ class OCREngine(QThread):
    def run(self):
        """Print the text of every recognized page, then optionally quit.

        Each page's text is preceded by an empty line.  When ``self.quit`` is
        truthy, the application is asked to quit after the last page.
        """
        for recognized_text in self.recognize():
            print()
            print(recognized_text)
        if not self.quit:
            return
        qApp.quit()
@ -62,27 +63,30 @@ class OCREngine(QThread):
        for filename in self.filenames:
            page = self.load_page(filename)
            self.pageChanged.emit(page)
-            yield '\n'.join(self.recognize_page(page))
+            yield self.recognize_page(page)
    # Recognize all lines of ``page`` and return the page text as one string.
    # The ``+`` lines chain the per-line results of ``recognize_line`` into a
    # single glyph-data sequence and format it with ``formatting.TextFormat``;
    # the ``-`` lines are the removed implementation, which yielded one joined
    # string per line.
    def recognize_page(self, page):
-        for line in page.lines:
+        glyph_data_seq = itertools.chain(*(self.recognize_line(line) for line in page.lines))
-            yield ''.join(self.recognize_line(line))
+        output_format = formatting.TextFormat()
        return ''.join(output_format.format(glyph_data_seq))
    # Yield the glyph data for one line: first ``SPACE`` repeated
    # ``int(line.indent / self.SPACE_WIDTH)`` times for the indentation, then
    # the result of ``recognize_glyph`` for every glyph, and finally
    # ``NEWLINE``.  The ``-`` line is the removed version, which yielded plain
    # ``' '`` strings for the indentation.
    def recognize_line(self, line):
-        yield from ' ' * int(line.indent / self.SPACE_WIDTH)
+        yield from [SPACE] * int(line.indent / self.SPACE_WIDTH)
        for glyph in line.glyphs:
            yield self.recognize_glyph(glyph)
        yield NEWLINE
    # Return the glyph data for ``glyph``.
    # ``qApp.processEvents()`` is called first on every invocation.  A
    # ``Space`` maps to ``SPACE``.  Any other glyph is looked up in
    # ``self.glyphdb``; on ``KeyError`` the answer from ``ask_for_help`` is
    # stored with ``add_glyph``, whose return value becomes the glyph data.
    # The ``-`` lines are the removed version, which returned plain text
    # instead of glyph data.
    def recognize_glyph(self, glyph):
        qApp.processEvents()
        if isinstance(glyph, Space):
-            return ' '
+            return SPACE
        try:
-            return self.glyphdb[glyph].text
+            glyph_data = self.glyphdb[glyph]
        except KeyError:
            text, bold, italic = self.ask_for_help(glyph)
-            self.glyphdb.add_glyph(glyph, text, bold, italic)
+            glyph_data = self.glyphdb.add_glyph(glyph, text, bold, italic)
-            return text
+        return glyph_data
    def ask_for_help(self, unknown_glyph):
        self.unknownGlyph.emit(unknown_glyph)