diff --git a/pixelocr/formatting.py b/pixelocr/formatting.py
new file mode 100644
index 0000000..c9a5d4c
--- /dev/null
+++ b/pixelocr/formatting.py
@@ -0,0 +1,94 @@
+# Copyright (C) 2014 Andrey Golovizin
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+import itertools
+
+from .utils import pipe
+from .glyphdb import WhitespaceData
+
+
+class OutputFormat(object):
+    """Base class turning a stream of glyph data into formatted text chunks.
+
+    Subclasses supply ``assign_glyph_tag`` and ``format_tag``.
+    """
+
+    # Tag of the most recent non-whitespace glyph; whitespace inherits it.
+    last_tag = None
+
+    def __call__(self, glyph_data_stream):
+        """Shorthand for ``format(glyph_data_stream)``."""
+        return self.format(glyph_data_stream)
+
+    def format(self, glyph_data_seq):
+        """Run the sequence through grouping, space fixing and tag formatting.
+
+        The result is whatever ``pipe`` returns; callers in this change
+        iterate it and join the pieces.
+        """
+        # Reset the per-run state on the instance.  A bare ``last_tag = None``
+        # would only bind an unused local and leak the tag of a previous run.
+        self.last_tag = None
+        return pipe(glyph_data_seq, self.group_by_tag, self.fix_spaces, self.format_tags)
+
+    def group_by_tag(self, styled_glyphs):
+        """Yield ``(tag, text)`` for each run of consecutive equal tags."""
+        for tag, group in itertools.groupby(styled_glyphs, key=self.assign_tag):
+            text = ''.join(glyph_data.text for glyph_data in group)
+            yield tag, text
+
+    def fix_spaces(self, tagged_text):
+        """Move trailing whitespace of each chunk into an untagged chunk."""
+        for tag, text in tagged_text:
+            stripped_text = text.rstrip()
+            if len(stripped_text) < len(text):
+                trailing_whitespace = text[len(stripped_text):]
+                yield tag, stripped_text
+                yield None, trailing_whitespace
+            else:
+                yield tag, text
+
+    def format_tags(self, tagged_text):
+        """Yield ``format_tag(tag, text)`` for every ``(tag, text)`` pair."""
+        for tag, text in tagged_text:
+            yield self.format_tag(tag, text)
+
+    def assign_tag(self, glyph_data):
+        """Return the tag for a glyph; whitespace reuses the previous tag."""
+        if isinstance(glyph_data, WhitespaceData):
+            return self.last_tag
+        else:
+            tag = self.assign_glyph_tag(glyph_data)
+            self.last_tag = tag
+            return tag
+
+    def assign_glyph_tag(self, glyph_data):
+        """Return the tag of a non-whitespace glyph (subclass hook)."""
+        raise NotImplementedError
+
+    def format_tag(self, tag, text):
+        """Render one tagged chunk (subclass hook)."""
+        raise NotImplementedError
+
+
+class TextFormat(OutputFormat):
+    """Plain-text output: every glyph gets the tag ``None``, text is unchanged."""
+
+    def assign_tag(self, glyph_data):
+        return None
+
+    def format_tag(self, tag, text):
+        return text
diff --git a/pixelocr/glyphdb.py b/pixelocr/glyphdb.py
index bb4853c..d918173 100644
--- a/pixelocr/glyphdb.py
+++ b/pixelocr/glyphdb.py
@@ -66,6 +66,19 @@ class GlyphData(object):
         return cls(*args)
 
 
+class WhitespaceData(GlyphData):
+    """Text-only glyph data used for spaces and newlines; never bold or italic."""
+
+    color = (255, 255, 255)  # FIXME support non-white background
+
+    def __init__(self, text):
+        super().__init__(None, None, text, bold=False, italic=False)
+
+
+SPACE = WhitespaceData(' ')
+NEWLINE = WhitespaceData('\n')
+
+
 class GlyphDB(object):
     def __init__(self, filename):
         self.filename = filename
diff --git a/pixelocr/gui/ocrengine.py b/pixelocr/gui/ocrengine.py
index e4d8b89..e946ca9 100644
--- a/pixelocr/gui/ocrengine.py
+++ b/pixelocr/gui/ocrengine.py
@@ -14,6 +14,7 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 
+import itertools
 from glob import glob
 from os import path
 from queue import Queue
@@ -27,9 +28,10 @@ from PyQt4.QtGui import (
     qApp
 )
 
+from .. import formatting
 from ..image import Image
 from ..page import Page, Glyph, Space
-from ..glyphdb import GlyphDB
+from ..glyphdb import GlyphDB, SPACE, NEWLINE
 
 
 class OCREngine(QThread):
@@ -53,7 +55,6 @@ class OCREngine(QThread):
 
     def run(self):
         for page_text in self.recognize():
-            print()
             print(page_text)
         if self.quit:
             qApp.quit()
@@ -62,27 +63,30 @@ class OCREngine(QThread):
         for filename in self.filenames:
             page = self.load_page(filename)
             self.pageChanged.emit(page)
-            yield '\n'.join(self.recognize_page(page))
+            yield self.recognize_page(page)
 
     def recognize_page(self, page):
-        for line in page.lines:
-            yield ''.join(self.recognize_line(line))
+        glyph_data_seq = itertools.chain.from_iterable(self.recognize_line(line) for line in page.lines)
+        output_format = formatting.TextFormat()
+        return ''.join(output_format.format(glyph_data_seq))
 
     def recognize_line(self, line):
-        yield from ' ' * int(line.indent / self.SPACE_WIDTH)
+        yield from [SPACE] * int(line.indent / self.SPACE_WIDTH)
         for glyph in line.glyphs:
             yield self.recognize_glyph(glyph)
+        yield NEWLINE
 
     def recognize_glyph(self, glyph):
         qApp.processEvents()
+
         if isinstance(glyph, Space):
-            return ' '
+            return SPACE
         try:
-            return self.glyphdb[glyph].text
+            glyph_data = self.glyphdb[glyph]
         except KeyError:
             text, bold, italic = self.ask_for_help(glyph)
-            self.glyphdb.add_glyph(glyph, text, bold, italic)
-            return text
+            glyph_data = self.glyphdb.add_glyph(glyph, text, bold, italic)
+        return glyph_data
 
     def ask_for_help(self, unknown_glyph):
         self.unknownGlyph.emit(unknown_glyph)