diff --git a/pixelocr/formatting.py b/pixelocr/formatting.py
new file mode 100644
index 0000000..c9a5d4c
--- /dev/null
+++ b/pixelocr/formatting.py
@@ -0,0 +1,72 @@
+# Copyright (C) 2014 Andrey Golovizin
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+import itertools
+
+from .utils import pipe
+from .glyphdb import WhitespaceData
+
+
+class OutputFormat(object):
+    """Base class for turning a sequence of glyph data into formatted text.
+
+    Consecutive glyphs that share a tag are merged into one run, trailing
+    whitespace is moved out of each run, and every resulting chunk is
+    rendered with ``format_tag``.  Subclasses must implement
+    ``assign_glyph_tag`` and ``format_tag``.
+    """
+
+    # Tag of the most recent non-whitespace glyph; whitespace reuses it.
+    last_tag = None
+
+    def __call__(self, glyph_data_stream):
+        """Make instances callable; equivalent to ``format``."""
+        return self.format(glyph_data_stream)
+
+    def format(self, glyph_data_seq):
+        """Pass ``glyph_data_seq`` through the formatting stages.
+
+        The stages handed to ``pipe`` are, in order, ``group_by_tag``,
+        ``fix_spaces`` and ``format_tags``.  Returns whatever ``pipe``
+        returns (presumably an iterable of formatted strings; confirm
+        against ``utils.pipe``).
+        """
+        # Reset the instance attribute (not a local variable) so a tag left
+        # over from a previous call cannot leak onto leading whitespace.
+        self.last_tag = None
+        return pipe(
+            glyph_data_seq,
+            self.group_by_tag,
+            self.fix_spaces,
+            self.format_tags,
+        )
+
+    def group_by_tag(self, styled_glyphs):
+        """Yield ``(tag, text)`` for each run of consecutive same-tag glyphs."""
+        for tag, group in itertools.groupby(styled_glyphs, key=self.assign_tag):
+            yield tag, ''.join(glyph_data.text for glyph_data in group)
+
+    def fix_spaces(self, tagged_text):
+        """Move trailing whitespace out of each tagged run.
+
+        For every ``(tag, text)`` pair whose text ends in whitespace, yield
+        the stripped text under its tag followed by the whitespace under
+        the tag ``None``.  Other pairs are passed through unchanged.
+        """
+        for tag, text in tagged_text:
+            stripped_text = text.rstrip()
+            if len(stripped_text) == len(text):
+                yield tag, text
+                continue
+            # A whitespace-only run leaves nothing to tag; skip the empty
+            # chunk so format_tag is never asked to render an empty string.
+            if stripped_text:
+                yield tag, stripped_text
+            trailing_whitespace = text[len(stripped_text):]
+            yield None, trailing_whitespace
+
+    def format_tags(self, tagged_text):
+        """Yield ``format_tag(tag, text)`` for every ``(tag, text)`` pair."""
+        for tag, text in tagged_text:
+            yield self.format_tag(tag, text)
+
+    def assign_tag(self, glyph_data):
+        """Return the grouping tag for ``glyph_data``.
+
+        Whitespace gets the tag of the last non-whitespace glyph, so it
+        does not split a run.  Any other glyph is tagged by
+        ``assign_glyph_tag`` and that tag is remembered in ``last_tag``.
+        """
+        if isinstance(glyph_data, WhitespaceData):
+            return self.last_tag
+        tag = self.assign_glyph_tag(glyph_data)
+        self.last_tag = tag
+        return tag
+
+    def assign_glyph_tag(self, glyph_data):
+        """Return the tag for a non-whitespace glyph (subclass hook)."""
+        raise NotImplementedError
+
+    def format_tag(self, tag, text):
+        """Render ``text`` for the given ``tag`` (subclass hook)."""
+        raise NotImplementedError
+
+
+class TextFormat(OutputFormat):
+    """Plain-text output: nothing is tagged and text passes through as-is."""
+
+    def assign_tag(self, glyph_data):
+        """Give every glyph the same tag, ``None``."""
+        return None
+
+    def format_tag(self, tag, text):
+        """Return ``text`` unchanged; ``tag`` is ignored."""
+        return text
diff --git a/pixelocr/glyphdb.py b/pixelocr/glyphdb.py
index bb4853c..d918173 100644
--- a/pixelocr/glyphdb.py
+++ b/pixelocr/glyphdb.py
@@ -66,6 +66,17 @@ class GlyphData(object):
return cls(*args)
+class WhitespaceData(GlyphData):
+    """Glyph data describing whitespace text.
+
+    The first two positional ``GlyphData`` constructor arguments are passed
+    as ``None`` and the text is marked as neither bold nor italic.
+    NOTE(review): the meaning of those two arguments is defined by
+    ``GlyphData.__init__``; confirm ``None`` is acceptable there.
+    """
+
+    # FIXME support non-white background
+    color = (255, 255, 255)
+
+    def __init__(self, text):
+        super().__init__(None, None, text, bold=False, italic=False)
+
+
+# Module-level WhitespaceData instances for a single space and a line break.
+SPACE = WhitespaceData(' ')
+NEWLINE = WhitespaceData('\n')
+
+
class GlyphDB(object):
def __init__(self, filename):
self.filename = filename
diff --git a/pixelocr/gui/ocrengine.py b/pixelocr/gui/ocrengine.py
index e4d8b89..e946ca9 100644
--- a/pixelocr/gui/ocrengine.py
+++ b/pixelocr/gui/ocrengine.py
@@ -14,6 +14,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
+import itertools
from glob import glob
from os import path
from queue import Queue
@@ -27,9 +28,10 @@ from PyQt4.QtGui import (
qApp
)
+from .. import formatting
from ..image import Image
from ..page import Page, Glyph, Space
-from ..glyphdb import GlyphDB
+from ..glyphdb import GlyphDB, SPACE, NEWLINE
class OCREngine(QThread):
@@ -53,7 +55,6 @@ class OCREngine(QThread):
def run(self):
for page_text in self.recognize():
- print()
print(page_text)
if self.quit:
qApp.quit()
@@ -62,27 +63,30 @@ class OCREngine(QThread):
for filename in self.filenames:
page = self.load_page(filename)
self.pageChanged.emit(page)
- yield '\n'.join(self.recognize_page(page))
+ yield self.recognize_page(page)
def recognize_page(self, page):
-    for line in page.lines:
-        yield ''.join(self.recognize_line(line))
+    """Return the recognized text of ``page`` as a single string.
+
+    The glyph data of all lines is concatenated into one sequence and
+    rendered with ``formatting.TextFormat``.
+    """
+    # chain.from_iterable pulls the per-line generators one at a time
+    # instead of unpacking all of them into an argument list up front.
+    glyph_data_seq = itertools.chain.from_iterable(
+        self.recognize_line(line) for line in page.lines
+    )
+    output_format = formatting.TextFormat()
+    return ''.join(output_format.format(glyph_data_seq))
def recognize_line(self, line):
-    yield from ' ' * int(line.indent / self.SPACE_WIDTH)
+    """Yield glyph data for one line: indent spaces, glyphs, then NEWLINE."""
+    # One SPACE per whole SPACE_WIDTH of indentation.
+    indent_spaces = int(line.indent / self.SPACE_WIDTH)
+    yield from itertools.repeat(SPACE, indent_spaces)
     for glyph in line.glyphs:
         yield self.recognize_glyph(glyph)
+    yield NEWLINE
def recognize_glyph(self, glyph):
+    """Return the glyph data for ``glyph``, asking for help if it is unknown.
+
+    ``Space`` glyphs map to ``SPACE``.  Other glyphs are looked up in
+    ``self.glyphdb``; on ``KeyError`` the answer from ``ask_for_help`` is
+    stored with ``add_glyph`` and that call's result is returned.
+    NOTE(review): relies on ``add_glyph`` returning the new glyph data.
+    """
     qApp.processEvents()
+
     if isinstance(glyph, Space):
-        return ' '
+        return SPACE
     try:
-        return self.glyphdb[glyph].text
+        return self.glyphdb[glyph]
     except KeyError:
         text, bold, italic = self.ask_for_help(glyph)
-        self.glyphdb.add_glyph(glyph, text, bold, italic)
-        return text
+        return self.glyphdb.add_glyph(glyph, text, bold, italic)
def ask_for_help(self, unknown_glyph):
self.unknownGlyph.emit(unknown_glyph)