vteam27 committed on
Commit
b8b3256
1 Parent(s): 41be449

base doctr

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.jpg filter=lfs diff=lfs merge=lfs -text
37
+ *.png filter=lfs diff=lfs merge=lfs -text
Examples/Book.png ADDED

Git LFS Details

  • SHA256: 45bf8d8c824d48de2013e572bffcedadcbdc84cda21fb73f5f83ecb809aec803
  • Pointer size: 133 Bytes
  • Size of remote file: 16 MB
Examples/Files.jpg ADDED

Git LFS Details

  • SHA256: bc1979e548161bb556a037594b3945749419b2367f93acac00e53c6d621ee009
  • Pointer size: 132 Bytes
  • Size of remote file: 4.37 MB
Examples/Manuscript.jpg ADDED

Git LFS Details

  • SHA256: 4a717cd9c625b7b59ebb80b52b0b3fba47c69e61f881ecd4e4f8ea1bb8883ddf
  • Pointer size: 132 Bytes
  • Size of remote file: 4.54 MB
Examples/News.png ADDED

Git LFS Details

  • SHA256: 5384175e709017ad917f56ff758bce9164444992be3bcad8fe52f7f83343744d
  • Pointer size: 131 Bytes
  • Size of remote file: 388 kB
app.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ os.environ['USE_TORCH'] = '1'
3
+
4
+ from doctr.io import DocumentFile
5
+ from doctr.models import ocr_predictor
6
+ import gradio as gr
7
+ from PIL import Image
8
+ import base64
9
+ from utils import HocrParser
10
+
11
# OCR pipeline shared by every request: built once at import time with
# pretrained detection and recognition weights.
predictor = ocr_predictor(
    det_arch="db_mobilenet_v3_large",
    reco_arch="crnn_vgg16_bn",
    pretrained=True,
)

# Texts shown at the top of the Gradio page.
title = "DocTR OCR (PDL Demo)"
description = "Upload an image to get the OCR results !"
15
+
16
def greet(img):
    """Run OCR on an uploaded image and return the text plus two result files.

    Args:
        img: Image object exposing ``save(path)``; the Gradio input is
            declared with ``type="pil"``.

    Returns:
        A tuple ``(text, txt_path, pdf_path)``: the recognised text, the path
        of the text file it was written to, and the path of the PDF produced
        by ``HocrParser.export_pdfa``.
    """
    # The predictor is fed through DocumentFile, so the upload is written to
    # disk first and read back from there.
    img.save("out.jpg")
    doc = DocumentFile.from_images("out.jpg")
    output = predictor(doc)

    # Every word is prefixed by a space, every OCR line ends with a newline
    # and every block is followed by one extra newline.
    # NOTE(review): the nesting of the two newline appends was reconstructed
    # from a flattened source -- confirm against the deployed app.py.
    chunks = []
    for page in output.pages:
        for block in page.blocks:
            for line in block.lines:
                chunks.append("".join(" " + word.value for word in line.words))
                chunks.append("\n")
            chunks.append("\n")
    res = "".join(chunks)

    _output_name = "RESULT_OCR.txt"
    _output_name_pdf = "RESULT_OCR.pdf"

    # Mode "w" already truncates, so no separate "clear file" step is needed.
    with open(_output_name, "w", encoding="utf-8", errors="ignore") as f:
        f.write(res)
    print("Writing into file")

    # Each exported item is indexed with [1] to get the tree handed to the
    # parser. Every page is written to the same PDF path, so with several
    # pages only the last one remains on disk.
    parser = HocrParser()
    for xml, page_image in zip(output.export_as_xml(), doc):
        parser.export_pdfa(_output_name_pdf, hocr=xml[1], image=page_image)

    return res, _output_name, _output_name_pdf
49
+
50
# Sample images offered below the upload widget; each inner list is the
# argument list for one example run.
_example_inputs = [
    ["Examples/Book.png"],
    ["Examples/News.png"],
    ["Examples/Manuscript.jpg"],
    ["Examples/Files.jpg"],
]

# One image in; recognised text plus the .txt and .pdf result files out.
demo = gr.Interface(
    fn=greet,
    inputs=gr.Image(type="pil"),
    outputs=["text", "file", "file"],
    title=title,
    description=description,
    examples=_example_inputs,
)

demo.launch(debug=True)
packages.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ libcairo2-dev
2
+ pkg-config
3
+ fonts-freefont-ttf -y
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ pycairo
2
+ python-doctr[torch]@git+https://github.com/mindee/doctr.git
3
+ gradio
4
+ reportlab>=3.6.2
5
+ PyPDF2==1.26.0
utils.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import re
3
+ from tempfile import TemporaryDirectory
4
+ from math import atan, cos, sin
5
+ from typing import Dict, Optional, Tuple
6
+ from xml.etree import ElementTree as ET
7
+ from xml.etree.ElementTree import Element
8
+
9
+ import numpy as np
10
+ import PyPDF2
11
+ from PyPDF2 import PdfFileMerger
12
+ from doctr.io import DocumentFile
13
+ from doctr.models import ocr_predictor
14
+ from PIL import Image
15
+ from reportlab.lib.colors import black
16
+ from reportlab.lib.units import inch
17
+ from reportlab.lib.utils import ImageReader
18
+ from reportlab.pdfgen.canvas import Canvas
19
+
20
+
21
+
22
+
23
class HocrParser():
    """Turn an hOCR element tree into a PDF carrying a positioned text layer.

    Bounding boxes and baselines are read from the ``title`` attribute of the
    hOCR elements; pixel quantities are converted to PDF points using the
    ``dpi`` passed to :meth:`export_pdfa`.
    """

    # Single-code-point Latin ligatures (U+FB00..U+FB04) mapped to their
    # plain-letter spelling. str.maketrans only accepts one-character keys,
    # so the ligatures are written as escapes to keep them unambiguous.
    _LIGATURES = str.maketrans({
        '\ufb00': 'ff',
        '\ufb03': 'ffi',
        '\ufb04': 'ffl',
        '\ufb01': 'fi',
        '\ufb02': 'fl',
    })

    def __init__(self):
        # "bbox x1 y1 x2 y2" and "baseline slope intercept" inside a title.
        self.box_pattern = re.compile(r'bbox((\s+\d+){4})')
        self.baseline_pattern = re.compile(r'baseline((\s+[\d\.\-]+){2})')

    def _element_coordinates(self, element: Element) -> Dict:
        """
        Returns a dict with keys ``x1``, ``y1``, ``x2``, ``y2`` holding the
        pixel bounding box found in the element's ``title`` attribute.
        All four values are 0 when there is no title or no ``bbox`` entry.
        """
        out = {'x1': 0, 'y1': 0, 'x2': 0, 'y2': 0}
        title = element.attrib.get('title')
        if title is not None:
            match = self.box_pattern.search(title)
            if match:
                x1, y1, x2, y2 = (int(c) for c in match.group(1).split())
                out = {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}
        return out

    def _get_baseline(self, element: Element) -> Tuple[float, float]:
        """
        Returns a tuple containing the baseline slope and intercept.
        Falls back to ``(0.0, 0.0)`` when the element has no title or the
        title carries no ``baseline`` entry.
        """
        title = element.attrib.get('title')
        if title is not None:
            # search() returns None when there is no baseline entry, so the
            # match must be checked before reading its group.
            match = self.baseline_pattern.search(title)
            if match:
                slope, intercept = match.group(1).split()
                return float(slope), float(intercept)
        return (0.0, 0.0)

    def _pt_from_pixel(self, pxl: Dict, dpi: int) -> Dict:
        """
        Returns the quantity in PDF units (pt) given quantity in pixels.
        The first four values of ``pxl`` are taken, in order, as
        x1, y1, x2, y2.
        """
        pt = [(c / dpi * inch) for c in pxl.values()]
        return {'x1': pt[0], 'y1': pt[1], 'x2': pt[2], 'y2': pt[3]}

    def _get_element_text(self, element: Element) -> str:
        """
        Return the textual content of the element and its children
        (element text, then each child recursively, then the tail).
        """
        text = ''
        if element.text is not None:
            text += element.text
        for child in element:
            text += self._get_element_text(child)
        if element.tail is not None:
            text += element.tail
        return text

    def _replace_ligatures(self, text: str) -> str:
        """
        Return ``text`` with ligature characters expanded to plain letters.
        """
        return text.translate(self._LIGATURES)

    def export_pdfa(self,
                    out_filename: str,
                    hocr: ET.ElementTree,
                    image: Optional[np.ndarray] = None,
                    fontname: str = "Times-Roman",
                    fontsize: int = 12,
                    invisible_text: bool = True,
                    add_spaces: bool = True,
                    dpi: int = 300):
        """
        Generates a PDF document from a hOCR document and saves it.

        Args:
            out_filename: path of the PDF file to write
            hocr: parsed hOCR tree; the first ``div`` of class ``ocr_page``
                defines the page size
            image: optional array drawn over the full page
            fontname: font used for the text layer
            fontsize: font size used for the text layer
            invisible_text: if True, text is emitted with render mode 3
                (invisible)
            add_spaces: if True, a space is appended to every word
            dpi: resolution used to convert pixel coordinates to points

        Raises:
            ValueError: if the tree contains no ``ocr_page`` element
        """
        # Page size comes from the first ocr_page element only.
        page = hocr.find(".//div[@class='ocr_page']")
        if page is None:
            raise ValueError("Could not determine page size")
        page_box = self._pt_from_pixel(self._element_coordinates(page), dpi)
        width = page_box['x2'] - page_box['x1']
        height = page_box['y2'] - page_box['y1']

        pdf = Canvas(out_filename, pagesize=(width, height), pageCompression=1)

        for line in hocr.iterfind(".//span[@class='ocr_line']"):
            # get information from xml
            pxl_line_coords = self._element_coordinates(line)
            line_box = self._pt_from_pixel(pxl_line_coords, dpi)

            # compute baseline; slopes below the threshold count as flat
            slope, pxl_intercept = self._get_baseline(line)
            if abs(slope) < 0.005:
                slope = 0.0
            angle = atan(slope)
            cos_a, sin_a = cos(angle), sin(angle)
            intercept = pxl_intercept / dpi * inch
            # hOCR y grows downwards while PDF y grows upwards
            baseline_y2 = height - (line_box['y2'] + intercept)

            # configure options
            text = pdf.beginText()
            text.setFont(fontname, fontsize)
            pdf.setFillColor(black)
            if invisible_text:
                text.setTextRenderMode(3)  # invisible text

            # transform overlayed text
            text.setTextTransform(
                cos_a, -sin_a, sin_a, cos_a, line_box['x1'], baseline_y2)

            for elem in line.findall(".//span[@class='ocrx_word']"):
                # replace unsupported characters
                elemtxt = self._replace_ligatures(
                    self._get_element_text(elem).strip())
                if not elemtxt:
                    continue

                # compute string width
                pxl_coords = self._element_coordinates(elem)
                box = self._pt_from_pixel(pxl_coords, dpi)
                if add_spaces:
                    elemtxt += ' '
                    box_width = box['x2'] + pdf.stringWidth(elemtxt, fontname, fontsize) - box['x1']
                else:
                    box_width = box['x2'] - box['x1']
                font_width = pdf.stringWidth(elemtxt, fontname, fontsize)

                # Adjust relative position of cursor
                cursor = text.getStartOfLine()
                dx = box['x1'] - cursor[0]
                dy = baseline_y2 - cursor[1]
                text.moveCursor(dx, dy)

                # suppress text if it is 0 units wide
                if font_width > 0:
                    # stretch the word so it spans its bounding box
                    text.setHorizScale(100 * box_width / font_width)
                    text.textOut(elemtxt)
            pdf.drawText(text)

        # overlay image if provided
        if image is not None:
            pdf.drawImage(ImageReader(Image.fromarray(image)),
                          0, 0, width=width, height=height)
        pdf.save()