diff --git a/pixelocr/formatting.py b/pixelocr/formatting.py
index c9a5d4c..4429a3a 100644
--- a/pixelocr/formatting.py
+++ b/pixelocr/formatting.py
@@ -70,3 +70,34 @@ class TextFormat(OutputFormat):
 
     def format_tag(self, tag, text):
         return text
+
+
+class HTMLFormat(OutputFormat):
+    """Output format that marks up bold and italic text with HTML tags."""
+
+    def assign_glyph_tag(self, glyph_data):
+        """Return the HTML tag name for a glyph, or None if it is unstyled.
+
+        Bold is checked first, so a glyph that is both bold and italic is
+        tagged 'b' only.
+        """
+        style = glyph_data.style
+        if style.bold:
+            return 'b'
+        elif style.italic:
+            return 'i'
+        else:
+            return None
+
+    def format_tag(self, tag, text):
+        """Return text as an HTML fragment, wrapped in tag when one is given.
+
+        The text is escaped in both cases so that any '&', '<' or '>' in it
+        is not interpreted as markup.
+        """
+        from xml.sax.saxutils import escape
+        escaped = escape(text)
+        if tag:
+            return '<{tag}>{text}</{tag}>'.format(tag=tag, text=escaped)
+        else:
+            return escaped
diff --git a/pixelocr/gui/ocrengine.py b/pixelocr/gui/ocrengine.py
index e946ca9..13f3dcd 100644
--- a/pixelocr/gui/ocrengine.py
+++ b/pixelocr/gui/ocrengine.py
@@ -67,7 +67,7 @@ class OCREngine(QThread):
     def recognize_page(self, page):
         glyph_data_seq = itertools.chain(*(self.recognize_line(line)
                                            for line in page.lines))
-        output_format = formatting.TextFormat()
+        output_format = formatting.HTMLFormat()
         return ''.join(output_format.format(glyph_data_seq))
 
     def recognize_line(self, line):