vteam27
committed on
Commit
·
1cfd79c
1
Parent(s):
dc813d0
added searchable pdf
Browse files- app.py +19 -2
- requirements.txt +3 -1
- utils.py +163 -0
app.py
CHANGED
|
@@ -5,6 +5,8 @@ from doctr.io import DocumentFile
|
|
| 5 |
from doctr.models import ocr_predictor
|
| 6 |
import gradio as gr
|
| 7 |
from PIL import Image
|
|
|
|
|
|
|
| 8 |
|
| 9 |
predictor = ocr_predictor(det_arch='db_mobilenet_v3_large', reco_arch='crnn_vgg16_bn',pretrained=True)
|
| 10 |
|
|
@@ -15,6 +17,10 @@ def greet(img):
|
|
| 15 |
img.save("out.jpg")
|
| 16 |
doc = DocumentFile.from_images("out.jpg")
|
| 17 |
output=predictor(doc)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
res=""
|
| 19 |
for obj in output.pages:
|
| 20 |
for obj1 in obj.blocks:
|
|
@@ -23,16 +29,27 @@ def greet(img):
|
|
| 23 |
res=res + " " + obj3.value
|
| 24 |
res=res + "\n"
|
| 25 |
res=res + "\n"
|
|
|
|
| 26 |
_output_name = "RESULT_OCR.txt"
|
|
|
|
|
|
|
| 27 |
open(_output_name, 'w').close() # clear file
|
| 28 |
with open(_output_name, "w", encoding="utf-8", errors="ignore") as f:
|
| 29 |
f.write(res)
|
| 30 |
print("Writing into file")
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
demo = gr.Interface(fn=greet,
|
| 34 |
inputs=gr.Image(type="pil"),
|
| 35 |
-
outputs=["text", "file"],
|
| 36 |
title=title,
|
| 37 |
description=description,
|
| 38 |
examples=[["Examples/Book.png"],["Examples/News.png"],["Examples/Manuscript.jpg"],["Examples/Files.jpg"]]
|
|
|
|
| 5 |
from doctr.models import ocr_predictor
|
| 6 |
import gradio as gr
|
| 7 |
from PIL import Image
|
| 8 |
+
import base64
|
| 9 |
+
from utils import HocrParser
|
| 10 |
|
| 11 |
predictor = ocr_predictor(det_arch='db_mobilenet_v3_large', reco_arch='crnn_vgg16_bn',pretrained=True)
|
| 12 |
|
|
|
|
| 17 |
img.save("out.jpg")
|
| 18 |
doc = DocumentFile.from_images("out.jpg")
|
| 19 |
output=predictor(doc)
|
| 20 |
+
|
| 21 |
+
xml_outputs = output.export_as_xml()
|
| 22 |
+
parser = HocrParser()
|
| 23 |
+
|
| 24 |
res=""
|
| 25 |
for obj in output.pages:
|
| 26 |
for obj1 in obj.blocks:
|
|
|
|
| 29 |
res=res + " " + obj3.value
|
| 30 |
res=res + "\n"
|
| 31 |
res=res + "\n"
|
| 32 |
+
|
| 33 |
_output_name = "RESULT_OCR.txt"
|
| 34 |
+
_output_name_pdf="RESULT_OCR.pdf"
|
| 35 |
+
|
| 36 |
open(_output_name, 'w').close() # clear file
|
| 37 |
with open(_output_name, "w", encoding="utf-8", errors="ignore") as f:
|
| 38 |
f.write(res)
|
| 39 |
print("Writing into file")
|
| 40 |
+
|
| 41 |
+
base64_encoded_pdfs = list()
|
| 42 |
+
for i, (xml, img) in enumerate(zip(xml_outputs, doc)):
|
| 43 |
+
xml_element_tree = xml[1]
|
| 44 |
+
parser.export_pdfa(_output_name_pdf,
|
| 45 |
+
hocr=xml_element_tree, image=img)
|
| 46 |
+
with open(_output_name_pdf, 'rb') as f:
|
| 47 |
+
base64_encoded_pdfs.append(base64.b64encode(f.read()))
|
| 48 |
+
return res, _output_name, _output_name_pdf
|
| 49 |
|
| 50 |
demo = gr.Interface(fn=greet,
|
| 51 |
inputs=gr.Image(type="pil"),
|
| 52 |
+
outputs=["text", "file","file"],
|
| 53 |
title=title,
|
| 54 |
description=description,
|
| 55 |
examples=[["Examples/Book.png"],["Examples/News.png"],["Examples/Manuscript.jpg"],["Examples/Files.jpg"]]
|
requirements.txt
CHANGED
|
@@ -1,3 +1,5 @@
|
|
| 1 |
pycairo
|
| 2 |
python-doctr[torch]@git+https://github.com/mindee/doctr.git
|
| 3 |
-
gradio
|
|
|
|
|
|
|
|
|
| 1 |
pycairo
|
| 2 |
python-doctr[torch]@git+https://github.com/mindee/doctr.git
|
| 3 |
+
gradio
|
| 4 |
+
reportlab>=3.6.2
|
| 5 |
+
PyPDF2==1.26.0
|
utils.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
import re
|
| 3 |
+
from tempfile import TemporaryDirectory
|
| 4 |
+
from math import atan, cos, sin
|
| 5 |
+
from typing import Dict, Optional, Tuple
|
| 6 |
+
from xml.etree import ElementTree as ET
|
| 7 |
+
from xml.etree.ElementTree import Element
|
| 8 |
+
|
| 9 |
+
import numpy as np
|
| 10 |
+
import PyPDF2
|
| 11 |
+
from PyPDF2 import PdfFileMerger
|
| 12 |
+
from doctr.io import DocumentFile
|
| 13 |
+
from doctr.models import ocr_predictor
|
| 14 |
+
from PIL import Image
|
| 15 |
+
from reportlab.lib.colors import black
|
| 16 |
+
from reportlab.lib.units import inch
|
| 17 |
+
from reportlab.lib.utils import ImageReader
|
| 18 |
+
from reportlab.pdfgen.canvas import Canvas
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class HocrParser():
    """Turn an hOCR element tree into a searchable PDF.

    The parser reads ``bbox`` and ``baseline`` entries from the ``title``
    attribute of hOCR elements, converts pixel coordinates to PDF points and
    writes each recognised word as (optionally invisible) text, with the page
    image drawn on the same page.
    """

    # Typographic ligatures (single code points U+FB00..U+FB04) mapped to
    # their plain-letter spelling; these are the "unsupported characters"
    # replaced before text is written. Keys must be single characters:
    # str.maketrans raises ValueError for longer string keys.
    _LIGATURES = str.maketrans({
        '\ufb00': 'ff',
        '\ufb03': 'ffi',
        '\ufb04': 'ffl',
        '\ufb01': 'fi',
        '\ufb02': 'fl',
    })

    def __init__(self):
        # "bbox x1 y1 x2 y2" with non-negative integer pixel coordinates.
        self.box_pattern = re.compile(r'bbox((\s+\d+){4})')
        # "baseline slope intercept" (signed decimals).
        self.baseline_pattern = re.compile(r'baseline((\s+[\d\.\-]+){2})')

    def _element_coordinates(self, element: Element) -> Dict:
        """
        Return the bounding box of an element as a dict with integer pixel
        values under the keys 'x1', 'y1', 'x2' and 'y2'.

        All four values are 0 when the element has no ``title`` attribute or
        the title carries no ``bbox`` entry.
        """
        out = {'x1': 0, 'y1': 0, 'x2': 0, 'y2': 0}
        matches = self.box_pattern.search(element.attrib.get('title', ''))
        if matches:
            x1, y1, x2, y2 = (int(c) for c in matches.group(1).split())
            out = {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}
        return out

    def _get_baseline(self, element: Element) -> Tuple[float, float]:
        """
        Return a tuple containing the baseline slope and intercept.

        Falls back to ``(0.0, 0.0)`` when the element has no ``title``
        attribute or the title carries no ``baseline`` entry.
        """
        matches = self.baseline_pattern.search(element.attrib.get('title', ''))
        if matches is None:
            # search() returns None when there is no baseline entry; calling
            # .group() on it would raise AttributeError.
            return (0.0, 0.0)
        slope, intercept = matches.group(1).split()
        return float(slope), float(intercept)

    def _pt_from_pixel(self, pxl: Dict, dpi: int) -> Dict:
        """
        Return the quantities in PDF units (pt) given quantities in pixels.

        The four values of ``pxl`` are read in insertion order and returned
        under the keys 'x1', 'y1', 'x2', 'y2'.
        """
        x1, y1, x2, y2 = (c / dpi * inch for c in pxl.values())
        return {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}

    def _get_element_text(self, element: Element) -> str:
        """
        Return the textual content of the element and its children,
        including the element's own tail text.
        """
        parts = [element.text or '']
        parts.extend(self._get_element_text(child) for child in element)
        parts.append(element.tail or '')
        return ''.join(parts)

    def _replace_ligatures(self, text: str) -> str:
        """Return ``text`` with ligature code points spelled out in letters."""
        return text.translate(self._LIGATURES)

    def export_pdfa(self,
                    out_filename: str,
                    hocr: ET.ElementTree,
                    image: Optional[np.ndarray] = None,
                    fontname: str = "Times-Roman",
                    fontsize: int = 12,
                    invisible_text: bool = True,
                    add_spaces: bool = True,
                    dpi: int = 300):
        """
        Generate a PDF document from an hOCR document.

        Args:
            out_filename: path of the PDF file to write.
            hocr: parsed hOCR tree; the first ``div`` of class ``ocr_page``
                defines the page size.
            image: optional page image, drawn over the full page.
            fontname: font used for the text layer.
            fontsize: nominal font size; each word is scaled horizontally to
                the width of its bounding box.
            invisible_text: when True the text is written with render mode 3
                (invisible).
            add_spaces: when True a trailing space is appended to every word.
            dpi: resolution used to convert pixel coordinates to points.

        Raises:
            ValueError: if the tree contains no ``ocr_page`` div.
        """
        # The first ocr_page div gives the page dimensions.
        page = hocr.find(".//div[@class='ocr_page']")
        if page is None:
            raise ValueError("Could not determine page size")
        page_box = self._pt_from_pixel(self._element_coordinates(page), dpi)
        width = page_box['x2'] - page_box['x1']
        height = page_box['y2'] - page_box['y1']

        pdf = Canvas(out_filename, pagesize=(width, height), pageCompression=1)

        for line in hocr.iterfind(".//span[@class='ocr_line']"):
            # get information from xml
            line_box = self._pt_from_pixel(
                self._element_coordinates(line), dpi)

            # compute baseline; near-horizontal slopes are snapped to 0
            slope, pxl_intercept = self._get_baseline(line)
            if abs(slope) < 0.005:
                slope = 0.0
            angle = atan(slope)
            cos_a, sin_a = cos(angle), sin(angle)
            intercept = pxl_intercept / dpi * inch
            # Measure the baseline from the opposite page edge
            # (height minus the hOCR y coordinate).
            baseline_y2 = height - (line_box['y2'] + intercept)

            # configure options
            text = pdf.beginText()
            text.setFont(fontname, fontsize)
            pdf.setFillColor(black)
            if invisible_text:
                text.setTextRenderMode(3)  # invisible text

            # transform overlaid text
            text.setTextTransform(
                cos_a, -sin_a, sin_a, cos_a, line_box['x1'], baseline_y2)

            for elem in line.findall(".//span[@class='ocrx_word']"):
                # replace unsupported characters
                elemtxt = self._replace_ligatures(
                    self._get_element_text(elem).strip())
                if not elemtxt:
                    continue

                # compute string width
                box = self._pt_from_pixel(
                    self._element_coordinates(elem), dpi)
                if add_spaces:
                    elemtxt += ' '
                    box_width = (box['x2']
                                 + pdf.stringWidth(elemtxt, fontname, fontsize)
                                 - box['x1'])
                else:
                    box_width = box['x2'] - box['x1']
                font_width = pdf.stringWidth(elemtxt, fontname, fontsize)

                # Adjust relative position of cursor
                cursor = text.getStartOfLine()
                dx = box['x1'] - cursor[0]
                dy = baseline_y2 - cursor[1]
                text.moveCursor(dx, dy)

                # suppress text if it is 0 units wide
                if font_width > 0:
                    text.setHorizScale(100 * box_width / font_width)
                    text.textOut(elemtxt)
            pdf.drawText(text)

        # overlay image if provided
        if image is not None:
            pdf.drawImage(ImageReader(Image.fromarray(image)),
                          0, 0, width=width, height=height)
        pdf.save()
|