Add support for output formats.

This commit is contained in:
Andrey Golovizin 2014-09-04 17:46:48 +02:00
parent bd2a206940
commit 296035c966
3 changed files with 97 additions and 10 deletions

72
pixelocr/formatting.py Normal file
View file

@ -0,0 +1,72 @@
# Copyright (C) 2014 Andrey Golovizin
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import itertools
from .utils import pipe
from .glyphdb import WhitespaceData
class OutputFormat(object):
    """Base class for rendering a sequence of glyph data as formatted text.

    Formatting runs in three generator stages:

    1. ``group_by_tag`` merges consecutive glyphs that share a tag into one
       ``(tag, text)`` run.
    2. ``fix_spaces`` moves trailing whitespace out of each run into a
       separate untagged (``None``) run.
    3. ``format_tags`` renders every run through ``format_tag``.

    Subclasses must implement ``assign_glyph_tag`` and ``format_tag``.
    """

    # Tag of the most recently seen non-whitespace glyph.  Whitespace
    # inherits it so a space between two equally tagged glyphs does not
    # split their run.
    last_tag = None

    def __call__(self, glyph_data_stream):
        """Format *glyph_data_stream*; equivalent to calling ``format``."""
        return self.format(glyph_data_stream)

    def format(self, glyph_data_seq):
        """Run *glyph_data_seq* through the formatting stages.

        Returns whatever ``pipe`` returns for the three stages; callers join
        the resulting pieces into a single string.
        """
        # Reset per-run state on the instance.  Assigning to a bare local
        # here would leave ``self.last_tag`` from a previous call in place,
        # and leading whitespace would inherit that stale tag.
        # NOTE: the stages are generators, so tags are assigned while the
        # result is consumed; do not interleave two results of one instance.
        self.last_tag = None
        # NOTE(review): presumably ``pipe`` applies the stages left to right
        # to the sequence -- confirm against ``utils.pipe``.
        return pipe(
            glyph_data_seq,
            self.group_by_tag,
            self.fix_spaces,
            self.format_tags,
        )

    def group_by_tag(self, styled_glyphs):
        """Yield ``(tag, text)`` for each run of consecutive same-tag glyphs."""
        for tag, group in itertools.groupby(styled_glyphs, key=self.assign_tag):
            text = ''.join(glyph_data.text for glyph_data in group)
            yield tag, text

    def fix_spaces(self, tagged_text):
        """Split trailing whitespace off each run into its own untagged run.

        A run without trailing whitespace is passed through unchanged.
        """
        for tag, text in tagged_text:
            stripped_text = text.rstrip()
            if len(stripped_text) < len(text):
                trailing_whitespace = text[len(stripped_text):]
                yield tag, stripped_text
                yield None, trailing_whitespace
            else:
                yield tag, text

    def format_tags(self, tagged_text):
        """Yield ``format_tag(tag, text)`` for every ``(tag, text)`` run."""
        for tag, text in tagged_text:
            yield self.format_tag(tag, text)

    def assign_tag(self, glyph_data):
        """Return the tag for *glyph_data*, remembering it for whitespace.

        Whitespace gets the tag of the last non-whitespace glyph; any other
        glyph gets the tag chosen by ``assign_glyph_tag``.
        """
        if isinstance(glyph_data, WhitespaceData):
            return self.last_tag
        tag = self.assign_glyph_tag(glyph_data)
        self.last_tag = tag
        return tag

    def assign_glyph_tag(self, glyph_data):
        """Return the tag for a non-whitespace glyph (subclass hook)."""
        raise NotImplementedError

    def format_tag(self, tag, text):
        """Return the rendered form of *text* under *tag* (subclass hook)."""
        raise NotImplementedError
class TextFormat(OutputFormat):
    """Plain-text output format: no tagging, text is emitted unchanged."""

    def assign_tag(self, glyph_data):
        """Tag every glyph, whitespace included, with ``None``."""
        return None

    def format_tag(self, tag, text):
        """Return *text* as is; *tag* is ignored."""
        return text

View file

@ -66,6 +66,17 @@ class GlyphData(object):
return cls(*args) return cls(*args)
class WhitespaceData(GlyphData):
    """Glyph data for whitespace: text only, never bold or italic."""

    color = (255, 255, 255)  # FIXME support non-white background

    def __init__(self, text):
        """Create whitespace data whose text is *text*.

        The first two positional ``GlyphData`` arguments are passed as
        ``None``.
        """
        super().__init__(None, None, text, italic=False, bold=False)
# Module-level whitespace instances for a single space and a line break.
SPACE = WhitespaceData(' ')
NEWLINE = WhitespaceData('\n')
class GlyphDB(object): class GlyphDB(object):
def __init__(self, filename): def __init__(self, filename):
self.filename = filename self.filename = filename

View file

@ -14,6 +14,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>. # along with this program. If not, see <http://www.gnu.org/licenses/>.
import itertools
from glob import glob from glob import glob
from os import path from os import path
from queue import Queue from queue import Queue
@ -27,9 +28,10 @@ from PyQt4.QtGui import (
qApp qApp
) )
from .. import formatting
from ..image import Image from ..image import Image
from ..page import Page, Glyph, Space from ..page import Page, Glyph, Space
from ..glyphdb import GlyphDB from ..glyphdb import GlyphDB, SPACE, NEWLINE
class OCREngine(QThread): class OCREngine(QThread):
@ -53,7 +55,6 @@ class OCREngine(QThread):
def run(self): def run(self):
for page_text in self.recognize(): for page_text in self.recognize():
print()
print(page_text) print(page_text)
if self.quit: if self.quit:
qApp.quit() qApp.quit()
@ -62,27 +63,30 @@ class OCREngine(QThread):
for filename in self.filenames: for filename in self.filenames:
page = self.load_page(filename) page = self.load_page(filename)
self.pageChanged.emit(page) self.pageChanged.emit(page)
yield '\n'.join(self.recognize_page(page)) yield self.recognize_page(page)
def recognize_page(self, page): def recognize_page(self, page):
for line in page.lines: glyph_data_seq = itertools.chain(*(self.recognize_line(line) for line in page.lines))
yield ''.join(self.recognize_line(line)) output_format = formatting.TextFormat()
return ''.join(output_format.format(glyph_data_seq))
def recognize_line(self, line): def recognize_line(self, line):
yield from ' ' * int(line.indent / self.SPACE_WIDTH) yield from [SPACE] * int(line.indent / self.SPACE_WIDTH)
for glyph in line.glyphs: for glyph in line.glyphs:
yield self.recognize_glyph(glyph) yield self.recognize_glyph(glyph)
yield NEWLINE
def recognize_glyph(self, glyph): def recognize_glyph(self, glyph):
qApp.processEvents() qApp.processEvents()
if isinstance(glyph, Space): if isinstance(glyph, Space):
return ' ' return SPACE
try: try:
return self.glyphdb[glyph].text glyph_data = self.glyphdb[glyph]
except KeyError: except KeyError:
text, bold, italic = self.ask_for_help(glyph) text, bold, italic = self.ask_for_help(glyph)
self.glyphdb.add_glyph(glyph, text, bold, italic) glyph_data = self.glyphdb.add_glyph(glyph, text, bold, italic)
return text return glyph_data
def ask_for_help(self, unknown_glyph): def ask_for_help(self, unknown_glyph):
self.unknownGlyph.emit(unknown_glyph) self.unknownGlyph.emit(unknown_glyph)