DocTRv1

Build error

App Files Files Community

vteam27 committed on Nov 2, 2023

Commit

1cfd79c

1 Parent(s): dc813d0

added searchable pdf

Browse files

Files changed (3) hide show

app.py +19 -2
requirements.txt +3 -1
utils.py +163 -0

app.py CHANGED Viewed

@@ -5,6 +5,8 @@ from doctr.io import DocumentFile
 from doctr.models import ocr_predictor
 import gradio as gr
 from PIL import Image
 predictor = ocr_predictor(det_arch='db_mobilenet_v3_large', reco_arch='crnn_vgg16_bn',pretrained=True)
@@ -15,6 +17,10 @@ def greet(img):
     img.save("out.jpg")
     doc = DocumentFile.from_images("out.jpg")
     output=predictor(doc)
     res=""
     for obj in output.pages:
       for obj1 in obj.blocks:
@@ -23,16 +29,27 @@ def greet(img):
             res=res + " " + obj3.value
         res=res + "\n"
       res=res + "\n"
     _output_name = "RESULT_OCR.txt"
     open(_output_name, 'w').close() # clear file
     with open(_output_name, "w", encoding="utf-8", errors="ignore") as f:
         f.write(res)
         print("Writing into file")
-    return res, _output_name
 demo = gr.Interface(fn=greet,
                     inputs=gr.Image(type="pil"),
-                    outputs=["text", "file"],
                     title=title,
                     description=description,
                     examples=[["Examples/Book.png"],["Examples/News.png"],["Examples/Manuscript.jpg"],["Examples/Files.jpg"]]

 from doctr.models import ocr_predictor
 import gradio as gr
 from PIL import Image
+import base64
+from utils import HocrParser
 predictor = ocr_predictor(det_arch='db_mobilenet_v3_large', reco_arch='crnn_vgg16_bn',pretrained=True)
     img.save("out.jpg")
     doc = DocumentFile.from_images("out.jpg")
     output=predictor(doc)
+    xml_outputs = output.export_as_xml()
+    parser = HocrParser()
     res=""
     for obj in output.pages:
       for obj1 in obj.blocks:
             res=res + " " + obj3.value
         res=res + "\n"
       res=res + "\n"
     _output_name = "RESULT_OCR.txt"
+    _output_name_pdf="RESULT_OCR.pdf"
     open(_output_name, 'w').close() # clear file
     with open(_output_name, "w", encoding="utf-8", errors="ignore") as f:
         f.write(res)
         print("Writing into file")
+    base64_encoded_pdfs = list()
+    for i, (xml, img) in enumerate(zip(xml_outputs, doc)):
+      xml_element_tree = xml[1]
+      parser.export_pdfa(_output_name_pdf,
+            hocr=xml_element_tree, image=img)
+      with open(_output_name_pdf, 'rb') as f:
+            base64_encoded_pdfs.append(base64.b64encode(f.read()))
+    return res, _output_name, _output_name_pdf
 demo = gr.Interface(fn=greet,
                     inputs=gr.Image(type="pil"),
+                    outputs=["text", "file","file"],
                     title=title,
                     description=description,
                     examples=[["Examples/Book.png"],["Examples/News.png"],["Examples/Manuscript.jpg"],["Examples/Files.jpg"]]

requirements.txt CHANGED Viewed

@@ -1,3 +1,5 @@
 pycairo
 python-doctr[torch]@git+https://github.com/mindee/doctr.git
-gradio

 pycairo
 python-doctr[torch]@git+https://github.com/mindee/doctr.git
+gradio
+reportlab>=3.6.2
+PyPDF2==1.26.0

utils.py ADDED Viewed

	@@ -0,0 +1,163 @@

+import base64
+import re
+from tempfile import TemporaryDirectory
+from math import atan, cos, sin
+from typing import Dict, Optional, Tuple
+from xml.etree import ElementTree as ET
+from xml.etree.ElementTree import Element
+import numpy as np
+import PyPDF2
+from PyPDF2 import PdfFileMerger
+from doctr.io import DocumentFile
+from doctr.models import ocr_predictor
+from PIL import Image
+from reportlab.lib.colors import black
+from reportlab.lib.units import inch
+from reportlab.lib.utils import ImageReader
+from reportlab.pdfgen.canvas import Canvas
class HocrParser():
    """
    Reads layout information out of a hOCR element tree and renders it as a
    PDF text layer (optionally combined with the page image) using reportlab.

    Bounding boxes and baselines are read from the ``title`` attribute of the
    hOCR elements, e.g. ``title="bbox 10 20 110 40; baseline 0 0"``.
    """

    def __init__(self):
        # "bbox x1 y1 x2 y2" with non-negative integer pixel coordinates
        self.box_pattern = re.compile(r'bbox((\s+\d+){4})')
        # "baseline <slope> <intercept>" with (possibly negative) decimals
        self.baseline_pattern = re.compile(r'baseline((\s+[\d\.\-]+){2})')

    def _element_coordinates(self, element: Element) -> Dict:
        """
        Returns a dict with the keys 'x1', 'y1', 'x2', 'y2' holding the
        integer bounding box found in the element's ``title`` attribute.

        All four values are 0 when the element has no ``title`` or the title
        contains no ``bbox`` entry.
        """
        out = {'x1': 0, 'y1': 0, 'x2': 0, 'y2': 0}
        title = element.attrib.get('title')
        if title is None:
            return out
        matches = self.box_pattern.search(title)
        if matches is None:
            return out
        x1, y1, x2, y2 = (int(value) for value in matches.group(1).split())
        return {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}

    def _get_baseline(self, element: Element) -> Tuple[float, float]:
        """
        Returns a tuple containing the baseline slope and intercept.

        Falls back to (0.0, 0.0) when the element has no ``title`` or the
        title carries no ``baseline`` entry.
        """
        title = element.attrib.get('title')
        if title is None:
            return (0.0, 0.0)
        matches = self.baseline_pattern.search(title)
        # search() returns None when there is no baseline entry; calling
        # .group() on it unguarded would raise AttributeError.
        if matches is None:
            return (0.0, 0.0)
        slope, intercept = matches.group(1).split()
        return float(slope), float(intercept)

    def _pt_from_pixel(self, pxl: Dict, dpi: int) -> Dict:
        """
        Returns the quantity in PDF units (pt) given quantity in pixels.

        ``pxl`` is a box dict with the keys 'x1', 'y1', 'x2', 'y2'; the
        result has the same keys. Keys are read explicitly so the result does
        not depend on the insertion order of ``pxl``.
        """
        return {key: pxl[key] / dpi * inch for key in ('x1', 'y1', 'x2', 'y2')}

    def _get_element_text(self, element: Element) -> str:
        """
        Return the textual content of the element and its children,
        including the tail text that follows each element.
        """
        parts = []
        if element.text is not None:
            parts.append(element.text)
        parts.extend(self._get_element_text(child) for child in element)
        if element.tail is not None:
            parts.append(element.tail)
        return ''.join(parts)

    def export_pdfa(self,
                    out_filename: str,
                    hocr: ET.ElementTree,
                    image: Optional[np.ndarray] = None,
                    fontname: str = "Times-Roman",
                    fontsize: int = 12,
                    invisible_text: bool = True,
                    add_spaces: bool = True,
                    dpi: int = 300):
        """
        Generates a single-page PDF from a hOCR document and saves it to
        ``out_filename``.

        NOTE(review): the method name says PDF/A, but nothing here beyond a
        plain reportlab Canvas is used - confirm whether real PDF/A
        conformance is required by callers.

        Args:
            out_filename: path of the PDF file to write.
            hocr: parsed hOCR tree; the page size comes from the bbox of the
                first ``div`` with class ``ocr_page``.
            image: optional page image array, drawn over the full page after
                the text has been placed.
            fontname: reportlab font used for the text layer.
            fontsize: font size used for the text layer.
            invisible_text: when True the text is written with render mode 3
                (invisible).
            add_spaces: when True a trailing space is appended to every word.
            dpi: resolution used to convert hOCR pixel coordinates to points.

        Raises:
            ValueError: if the tree contains no ``ocr_page`` div.
        """
        # Get the page dimensions from the first ocr_page element
        page = hocr.find(".//div[@class='ocr_page']")
        if page is None:
            raise ValueError("Could not determine page size")
        page_box = self._pt_from_pixel(self._element_coordinates(page), dpi)
        width = page_box['x2'] - page_box['x1']
        height = page_box['y2'] - page_box['y1']

        # Ligatures are expanded to plain letters; built once, not per word.
        # The 'ffi'/'ffl' replacement strings are kept exactly as they were
        # (including the invisible characters between the letters).
        ligature_table = str.maketrans(
            {'ﬀ': 'ff', 'ﬃ': 'f‌f‌i', 'ﬄ': 'f‌f‌l', 'ﬁ': 'fi', 'ﬂ': 'fl'})

        pdf = Canvas(out_filename, pagesize=(width, height), pageCompression=1)
        for line in hocr.iterfind(".//span"):
            if line.attrib.get('class') != 'ocr_line':
                continue
            # get information from xml
            pxl_line_coords = self._element_coordinates(line)
            line_box = self._pt_from_pixel(pxl_line_coords, dpi)
            # compute baseline; near-zero slopes are treated as horizontal
            slope, pxl_intercept = self._get_baseline(line)
            if abs(slope) < 0.005:
                slope = 0.0
            angle = atan(slope)
            cos_a, sin_a = cos(angle), sin(angle)
            intercept = pxl_intercept / dpi * inch
            # hOCR y grows downwards, PDF y grows upwards: flip against height
            baseline_y2 = height - (line_box['y2'] + intercept)
            # configure options
            text = pdf.beginText()
            text.setFont(fontname, fontsize)
            pdf.setFillColor(black)
            if invisible_text:
                text.setTextRenderMode(3)  # invisible text
            # transform overlayed text
            text.setTextTransform(
                cos_a, -sin_a, sin_a, cos_a, line_box['x1'], baseline_y2)
            for elem in line.findall(".//span[@class='ocrx_word']"):
                # replace unsupported characters
                elemtxt = self._get_element_text(elem).strip()
                elemtxt = elemtxt.translate(ligature_table)
                if not elemtxt:
                    continue
                # compute string width
                pxl_coords = self._element_coordinates(elem)
                box = self._pt_from_pixel(pxl_coords, dpi)
                if add_spaces:
                    elemtxt += ' '
                    box_width = box['x2'] + pdf.stringWidth(elemtxt, fontname, fontsize) - box['x1']
                else:
                    box_width = box['x2'] - box['x1']
                font_width = pdf.stringWidth(elemtxt, fontname, fontsize)
                # Adjust relative position of cursor
                cursor = text.getStartOfLine()
                dx = box['x1'] - cursor[0]
                dy = baseline_y2 - cursor[1]
                text.moveCursor(dx, dy)
                # suppress text if it is 0 units wide
                if font_width > 0:
                    # stretch/squeeze the word so it spans its hOCR box
                    text.setHorizScale(100 * box_width / font_width)
                    text.textOut(elemtxt)
            pdf.drawText(text)
        # overlay image if provided
        if image is not None:
            pdf.drawImage(ImageReader(Image.fromarray(image)),
                          0, 0, width=width, height=height)
        pdf.save()