# Copyright (C) 2014 Andrey Golovizin
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
|
import itertools
|
|
import subprocess
|
|
from glob import glob
|
|
from os import path
|
|
|
|
from . import formatting
|
|
from .image import Image
|
|
from .page import Page, Space
|
|
from .glyphdb import GlyphDB, Style, SPACE, NEWLINE
|
|
from .config import Configuration
|
|
from .utils import cached_property
|
|
|
|
|
|
class Document(object):
    """A directory of PNG page images that is recognized glyph by glyph.

    The directory is also expected to hold ``glyphdb.pickle`` (the database
    of already known glyphs) and ``config.yaml`` (per-language options read
    through the ``languages`` key).
    """

    # Width of a single leading space, in the units of ``line.indent``
    # (presumably pixels -- TODO confirm against Page/line implementation).
    SPACE_WIDTH = 15

    def __init__(self, dirname, ui, skip=0, limit=None, output_format='text'):
        """Collect the page files of *dirname* and load its glyph DB and config.

        :param dirname: directory containing the ``*.png`` page images.
        :param ui: user-interface object; must provide ``turn_page``,
            ``process_events`` and ``ask_for_help``.
        :param skip: number of pages (in sorted filename order) to skip.
        :param limit: maximum number of pages to process; a falsy value
            (``None`` or ``0``) means "no limit".
        :param output_format: object providing ``suffix`` and ``format()``.
        """
        super().__init__()
        self.dirname = dirname
        self.ui = ui

        # A falsy limit leaves the slice open-ended.
        end = skip + limit if limit else None
        self.filenames = sorted(glob(path.join(dirname, '*.png')))[skip:end]

        self.glyphdb = GlyphDB(path.join(self.dirname, 'glyphdb.pickle'))
        self.config = Configuration.load_file(path.join(self.dirname, 'config.yaml'))

        # NOTE(review): recognize() reads ``self.output_format.suffix`` and
        # recognize_page() calls ``self.output_format.format(...)``, so the
        # default value 'text' (a plain str) cannot work as is.  Callers
        # presumably pass a format object, or a lookup through the
        # ``formatting`` module is missing here -- TODO confirm.
        self.output_format = output_format

        self.last_style = Style(bold=False, italic=False, color=(255, 255, 255))  # FIXME get rid of hardcoded value
        self.last_language = self.default_language

    def save_glyphdb(self):
        """Persist the glyph database."""
        self.glyphdb.save()

    def load_page(self, filename):
        """Load the image stored in *filename* and wrap it in a ``Page``."""
        return Page(self, Image.fromfile(filename), filename)

    def recognize(self):
        """Recognize every selected page, printing and saving its text.

        The text of each page is written next to the image, to a file named
        after the image with the output format's suffix appended.
        """
        for filename in self.filenames:
            page = self.load_page(filename)
            self.ui.turn_page(page)
            page_text = self.recognize_page(page)
            print(page_text)
            with open(filename + self.output_format.suffix, 'w') as page_text_file:
                page_text_file.write(page_text)

    def recognize_page(self, page):
        """Return the formatted text of *page* as a single string."""
        glyph_data_seq = itertools.chain.from_iterable(
            self.recognize_line(line) for line in page.lines
        )
        return ''.join(self.output_format.format(glyph_data_seq))

    def recognize_line(self, line):
        """Yield glyph data for *line*: indentation, glyphs, then a newline."""
        # The line indent is rendered as a whole number of leading spaces.
        yield from [SPACE] * int(line.indent / self.SPACE_WIDTH)
        for glyph in line.glyphs:
            yield self.recognize_glyph(glyph)
        yield NEWLINE

    def recognize_glyph(self, glyph):
        """Return the glyph data for *glyph*, asking the user if it is unknown.

        Unknown glyphs are added to the glyph database with the text and
        style entered by the user.
        """
        self.ui.process_events()

        if isinstance(glyph, Space):
            return SPACE

        try:
            glyph_data = self.glyphdb[glyph]
        except KeyError:
            # Unknown glyph: pre-fill the style guesses from the previously
            # recognized glyph, run the layout command for the guessed
            # language, and let the user supply the text.
            guessed_bold = glyph.word.guess_bold(default=self.last_style.bold)
            guessed_italic = glyph.word.guess_italic(default=self.last_style.italic)
            self.switch_layout(glyph.word.guess_language(self.last_language))
            text, bold, italic = self.ui.ask_for_help(glyph, guessed_bold, guessed_italic)
            glyph_data = self.glyphdb.add_glyph(glyph, text, bold, italic)

        # Remember style and language of the latest glyph; they seed the
        # guesses made for the next unknown glyph.
        self.last_style = glyph_data.style
        self.last_language = self.language_map.get(glyph_data.text)
        return glyph_data

    @cached_property
    def language_map(self):
        """Mapping from each configured alphabet letter to its language name."""
        languages = self.config.get('languages', {})
        return {
            letter: lang_name
            for lang_name, opts in languages.items()
            for letter in opts.get('alphabet', ())
        }

    @cached_property
    def default_language(self):
        """Name of the first language flagged ``default`` in the config.

        Returns ``None`` when no language is marked as default.
        """
        for lang_name, opts in self.config.get('languages', {}).items():
            if opts.get('default'):
                return lang_name
        return None

    def switch_layout(self, language):
        """Run the ``command`` configured for *language*, if there is one.

        ``None`` selects the default language.  Returns the exit status of
        the command, or ``None`` when no command is configured.
        """
        if language is None:
            language = self.default_language
        cmd = self.config.get('languages', {}).get(language, {}).get('command')
        if cmd:
            return subprocess.call(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        return None
|