vteam27 committed
Commit 4e9395b
Parent: 4d734fe

Added merged base

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
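
The two new patterns route every PNG and JPEG in the repository through Git LFS, which is what turns the image additions below into pointer files. A minimal sketch (not part of the commit) of how those globs map onto the added files, using Python's fnmatch as a rough stand-in for git's own pattern matching:

from fnmatch import fnmatch

lfs_patterns = ["*.png", "*.jpg"]  # the two patterns added above

for path in ["Examples/Book.png", "Examples/Files.jpg", "app.py"]:
    tracked = any(fnmatch(path, pattern) for pattern in lfs_patterns)
    print(path, "->", "Git LFS" if tracked else "regular git")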
Examples/Book.png ADDED

Git LFS Details

  • SHA256: 45bf8d8c824d48de2013e572bffcedadcbdc84cda21fb73f5f83ecb809aec803
  • Pointer size: 133 Bytes
  • Size of remote file: 16 MB
Examples/Files.jpg ADDED

Git LFS Details

  • SHA256: bc1979e548161bb556a037594b3945749419b2367f93acac00e53c6d621ee009
  • Pointer size: 132 Bytes
  • Size of remote file: 4.37 MB
Examples/Manuscript.jpg ADDED

Git LFS Details

  • SHA256: 4a717cd9c625b7b59ebb80b52b0b3fba47c69e61f881ecd4e4f8ea1bb8883ddf
  • Pointer size: 132 Bytes
  • Size of remote file: 4.54 MB
Examples/News.png ADDED

Git LFS Details

  • SHA256: 5384175e709017ad917f56ff758bce9164444992be3bcad8fe52f7f83343744d
  • Pointer size: 131 Bytes
  • Size of remote file: 388 kB
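
The "Pointer size" reported for each file is the byte length of the small Git LFS pointer committed in place of the image. A sketch (not part of the commit) reconstructing the Book.png pointer; the exact byte count is hypothetical, since the page rounds the size to 16 MB:

# Rebuild the Git LFS pointer for Examples/Book.png and measure it.
oid = "45bf8d8c824d48de2013e572bffcedadcbdc84cda21fb73f5f83ecb809aec803"
size = 16_250_000  # hypothetical exact byte count; the page shows only "16 MB"
pointer = (
    "version https://git-lfs.github.com/spec/v1\n"
    f"oid sha256:{oid}\n"
    f"size {size}\n"
)
print(len(pointer.encode("ascii")))  # 133, matching "Pointer size: 133 Bytes"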
app.py ADDED
@@ -0,0 +1,189 @@
+import os
+import re
+
+import gradio as gr
+from PIL import Image
+from doctr.io import DocumentFile
+from doctr.models import ocr_predictor
+from happytransformer import HappyTextToText, TTSettings
+from transformers import (
+    AutoModelForSeq2SeqLM,
+    AutoProcessor,
+    AutoTokenizer,
+    SeamlessM4TForTextToText,
+)
+
+from lang_list import (
+    LANGUAGE_NAME_TO_CODE,
+    T2TT_TARGET_LANGUAGE_NAMES,
+    TEXT_SOURCE_LANGUAGE_NAMES,
+)
+
+DEFAULT_TARGET_LANGUAGE = "English"
+
+# Translation model (SeamlessM4T) initialization
+model = SeamlessM4TForTextToText.from_pretrained("facebook/hf-seamless-m4t-medium")
+processor = AutoProcessor.from_pretrained("facebook/hf-seamless-m4t-medium")
+
+# OCR predictor initialization
+OCRpredictor = ocr_predictor(det_arch='db_mobilenet_v3_large', reco_arch='crnn_vgg16_bn', pretrained=True)
+
+# Grammar-correction model initialization
+happy_tt = HappyTextToText("T5", "vennify/t5-base-grammar-correction")
+grammar_args = TTSettings(num_beams=5, min_length=1)
+
+# Spell-check model initialization
+OCRtokenizer = AutoTokenizer.from_pretrained("Bhuvana/t5-base-spellchecker", use_fast=False)
+OCRmodel = AutoModelForSeq2SeqLM.from_pretrained("Bhuvana/t5-base-spellchecker")
+
+
+def correct_spell(inputs):
+    """Run the T5 spell-checker over a single chunk of text."""
+    input_ids = OCRtokenizer.encode(inputs, return_tensors='pt')
+    sample_output = OCRmodel.generate(
+        input_ids,
+        do_sample=True,
+        max_length=512,
+        top_p=0.99,
+        num_return_sequences=1,
+    )
+    return OCRtokenizer.decode(sample_output[0], skip_special_tokens=True)
+
+
+def process_text_in_chunks(text, process_function, max_chunk_size=256):
+    """Apply process_function sentence by sentence, splitting oversized sentences."""
+    # Split text into sentences
+    sentences = re.split(r'(?<=[.!?])\s+', text)
+    processed_text = ""
+
+    for sentence in sentences:
+        # Further split long sentences into smaller chunks
+        chunks = [sentence[i:i + max_chunk_size] for i in range(0, len(sentence), max_chunk_size)]
+        for chunk in chunks:
+            processed_text += process_function(chunk)
+        processed_text += " "  # Add a space after each processed sentence
+
+    return processed_text.strip()
+
+
+def greet(img, apply_grammar_correction, apply_spell_check):
+    img.save("out.jpg")
+    doc = DocumentFile.from_images("out.jpg")
+    output = OCRpredictor(doc)
+
+    # Reassemble the recognized words page by page, block by block
+    res = ""
+    for page in output.pages:
+        for block in page.blocks:
+            for line in block.lines:
+                for word in line.words:
+                    res += " " + word.value
+                res += "\n"
+            res += "\n"
+
+    # Process in chunks for grammar correction
+    if apply_grammar_correction:
+        res = process_text_in_chunks(res, lambda x: happy_tt.generate_text("grammar: " + x, args=grammar_args).text)
+
+    # Process in chunks for spell check
+    if apply_spell_check:
+        res = process_text_in_chunks(res, correct_spell)
+
+    _output_name = "RESULT_OCR.txt"
+    with open(_output_name, 'w') as f:
+        f.write(res)
+    return res, _output_name
+
+
+# Gradio interface for OCR
+demo_ocr = gr.Interface(
+    fn=greet,
+    inputs=[
+        gr.Image(type="pil"),
+        gr.Checkbox(label="Apply Grammar Correction"),
+        gr.Checkbox(label="Apply Spell Check"),
+    ],
+    outputs=["text", "file"],
+    title="DocTR OCR with Grammar and Spell Check",
+    description="Upload an image to get the OCR results. Optionally, apply grammar and spell check.",
+    # Each example row must supply a value for all three inputs
+    examples=[
+        ["Examples/Book.png", False, False],
+        ["Examples/News.png", False, False],
+        ["Examples/Manuscript.jpg", False, False],
+        ["Examples/Files.jpg", False, False],
+    ],
+)
+
+# demo_ocr.launch(debug=True)
+
+
+def run_t2tt(file_uploader, input_text: str, source_language: str, target_language: str) -> tuple[str, str]:
+    # A text file, if uploaded, overrides the textbox contents
+    if file_uploader is not None:
+        with open(file_uploader, 'r') as file:
+            input_text = file.read()
+    source_language_code = LANGUAGE_NAME_TO_CODE[source_language]
+    target_language_code = LANGUAGE_NAME_TO_CODE[target_language]
+    text_inputs = processor(text=input_text, src_lang=source_language_code, return_tensors="pt")
+    output_tokens = model.generate(**text_inputs, tgt_lang=target_language_code)
+    output = processor.decode(output_tokens[0].tolist(), skip_special_tokens=True)
+    _output_name = "result.txt"
+    with open(_output_name, 'w') as f:
+        f.write(output)
+    return str(output), _output_name
+
+
+with gr.Blocks() as demo_t2tt:
+    with gr.Row():
+        with gr.Column():
+            with gr.Group():
+                file_uploader = gr.File(label="Upload a text file (Optional)")
+                input_text = gr.Textbox(label="Input text")
+                with gr.Row():
+                    source_language = gr.Dropdown(
+                        label="Source language",
+                        choices=TEXT_SOURCE_LANGUAGE_NAMES,
+                        value="Punjabi",
+                    )
+                    target_language = gr.Dropdown(
+                        label="Target language",
+                        choices=T2TT_TARGET_LANGUAGE_NAMES,
+                        value=DEFAULT_TARGET_LANGUAGE,
+                    )
+                btn = gr.Button("Translate")
+        with gr.Column():
+            output_text = gr.Textbox(label="Translated text")
+            output_file = gr.File(label="Translated text file")
+
+    gr.Examples(
+        examples=[
+            [
+                None,
+                "The sinister destruction of the holy Akal Takht and the ruthless massacre of thousands of innocent pilgrims had unmasked the deep-seated hatred and animosity that the Indian Government had been nurturing against Sikhs ever since independence",
+                "English",
+                "Punjabi",
+            ],
+            [
+                None,
+                "It contains. much useful information about administrative, revenue, judicial and ecclesiastical activities in various areas which, it is hoped, would supplement the information available in official records.",
+                "English",
+                "Hindi",
+            ],
+            [
+                None,
+                "दुनिया में बहुत सी अलग-अलग भाषाएं हैं और उनमें अपने वर्ण और शब्दों का भंडार होता है. इसमें में कुछ उनके अपने शब्द होते हैं तो कुछ ऐसे भी हैं, जो दूसरी भाषाओं से लिए जाते हैं.",
+                "Hindi",
+                "Punjabi",
+            ],
+            [
+                None,
+                "ਸੂੂਬੇ ਦੇ ਕਈ ਜ਼ਿਲ੍ਹਿਆਂ ’ਚ ਬੁੱਧਵਾਰ ਸਵੇਰੇ ਸੰਘਣੀ ਧੁੰਦ ਛਾਈ ਰਹੀ ਤੇ ਤੇਜ਼ ਹਵਾਵਾਂ ਨੇ ਕਾਂਬਾ ਹੋਰ ਵਧਾ ਦਿੱਤਾ। ਸੱਤ ਸ਼ਹਿਰਾਂ ’ਚ ਦਿਨ ਦਾ ਤਾਪਮਾਨ ਦਸ ਡਿਗਰੀ ਸੈਲਸੀਅਸ ਦੇ ਆਸਪਾਸ ਰਿਹਾ। ਸੂਬੇ ’ਚ ਵੱਧ ਤੋਂ ਵੱਧ ਤਾਪਮਾਨ ’ਚ ਵੀ ਦਸ ਡਿਗਰੀ ਸੈਲਸੀਅਸ ਦੀ ਗਿਰਾਵਟ ਦਰਜ ਕੀਤੀ ਗਈ",
+                "Punjabi",
+                "English",
+            ],
+        ],
+        inputs=[file_uploader, input_text, source_language, target_language],
+        outputs=[output_text, output_file],
+        fn=run_t2tt,
+        cache_examples=False,
+        api_name=False,
+    )
+
+    gr.on(
+        triggers=[input_text.submit, btn.click],
+        fn=run_t2tt,
+        inputs=[file_uploader, input_text, source_language, target_language],
+        outputs=[output_text, output_file],
+        api_name="t2tt",
+    )
+
+with gr.Blocks() as demo:
+    with gr.Tabs():
+        with gr.Tab(label="OCR"):
+            demo_ocr.render()
+        with gr.Tab(label="Translate"):
+            demo_t2tt.render()
+
+if __name__ == "__main__":
+    demo.launch()
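
A quick local smoke test of the two entry points defined above (a sketch, not part of the commit; it assumes the model checkpoints download on first run, and that importing app builds, but does not launch, the Gradio UI):

from PIL import Image
from app import greet, run_t2tt

# OCR an example image with both post-processing passes disabled
text, txt_file = greet(Image.open("Examples/News.png"), False, False)
print(text[:200], "->", txt_file)

# Translate a short string from English to Punjabi
translated, out_file = run_t2tt(None, "Good morning.", "English", "Punjabi")
print(translated, "->", out_file)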
lang_list.py ADDED
@@ -0,0 +1,163 @@
+# Language dict
+language_code_to_name = {
+    "afr": "Afrikaans",
+    "amh": "Amharic",
+    "arb": "Modern Standard Arabic",
+    "ary": "Moroccan Arabic",
+    "arz": "Egyptian Arabic",
+    "asm": "Assamese",
+    "ast": "Asturian",
+    "azj": "North Azerbaijani",
+    "bel": "Belarusian",
+    "ben": "Bengali",
+    "bos": "Bosnian",
+    "bul": "Bulgarian",
+    "cat": "Catalan",
+    "ceb": "Cebuano",
+    "ces": "Czech",
+    "ckb": "Central Kurdish",
+    "cmn": "Mandarin Chinese",
+    "cym": "Welsh",
+    "dan": "Danish",
+    "deu": "German",
+    "ell": "Greek",
+    "eng": "English",
+    "est": "Estonian",
+    "eus": "Basque",
+    "fin": "Finnish",
+    "fra": "French",
+    "gaz": "West Central Oromo",
+    "gle": "Irish",
+    "glg": "Galician",
+    "guj": "Gujarati",
+    "heb": "Hebrew",
+    "hin": "Hindi",
+    "hrv": "Croatian",
+    "hun": "Hungarian",
+    "hye": "Armenian",
+    "ibo": "Igbo",
+    "ind": "Indonesian",
+    "isl": "Icelandic",
+    "ita": "Italian",
+    "jav": "Javanese",
+    "jpn": "Japanese",
+    "kam": "Kamba",
+    "kan": "Kannada",
+    "kat": "Georgian",
+    "kaz": "Kazakh",
+    "kea": "Kabuverdianu",
+    "khk": "Halh Mongolian",
+    "khm": "Khmer",
+    "kir": "Kyrgyz",
+    "kor": "Korean",
+    "lao": "Lao",
+    "lit": "Lithuanian",
+    "ltz": "Luxembourgish",
+    "lug": "Ganda",
+    "luo": "Luo",
+    "lvs": "Standard Latvian",
+    "mai": "Maithili",
+    "mal": "Malayalam",
+    "mar": "Marathi",
+    "mkd": "Macedonian",
+    "mlt": "Maltese",
+    "mni": "Meitei",
+    "mya": "Burmese",
+    "nld": "Dutch",
+    "nno": "Norwegian Nynorsk",
+    "nob": "Norwegian Bokm\u00e5l",
+    "npi": "Nepali",
+    "nya": "Nyanja",
+    "oci": "Occitan",
+    "ory": "Odia",
+    "pan": "Punjabi",
+    "pbt": "Southern Pashto",
+    "pes": "Western Persian",
+    "pol": "Polish",
+    "por": "Portuguese",
+    "ron": "Romanian",
+    "rus": "Russian",
+    "slk": "Slovak",
+    "slv": "Slovenian",
+    "sna": "Shona",
+    "snd": "Sindhi",
+    "som": "Somali",
+    "spa": "Spanish",
+    "srp": "Serbian",
+    "swe": "Swedish",
+    "swh": "Swahili",
+    "tam": "Tamil",
+    "tel": "Telugu",
+    "tgk": "Tajik",
+    "tgl": "Tagalog",
+    "tha": "Thai",
+    "tur": "Turkish",
+    "ukr": "Ukrainian",
+    "urd": "Urdu",
+    "uzn": "Northern Uzbek",
+    "vie": "Vietnamese",
+    "xho": "Xhosa",
+    "yor": "Yoruba",
+    "yue": "Cantonese",
+    "zlm": "Colloquial Malay",
+    "zsm": "Standard Malay",
+    "zul": "Zulu",
+}
+LANGUAGE_NAME_TO_CODE = {v: k for k, v in language_code_to_name.items()}
+
+# Source langs: S2ST / S2TT / ASR don't need source lang
+# T2TT / T2ST use this
+text_source_language_codes = [
+    "hin",
+    "pan",
+    "eng",
+]
+TEXT_SOURCE_LANGUAGE_NAMES = sorted([language_code_to_name[code] for code in text_source_language_codes])
+
+# Target langs:
+# S2ST / T2ST
+s2st_target_language_codes = [
+    "eng",
+    "arb",
+    "ben",
+    "cat",
+    "ces",
+    "cmn",
+    "cym",
+    "dan",
+    "deu",
+    "est",
+    "fin",
+    "fra",
+    "hin",
+    "ind",
+    "ita",
+    "jpn",
+    "kor",
+    "mlt",
+    "nld",
+    "pes",
+    "pol",
+    "por",
+    "ron",
+    "rus",
+    "slk",
+    "spa",
+    "swe",
+    "swh",
+    "tel",
+    "tgl",
+    "tha",
+    "tur",
+    "ukr",
+    "urd",
+    "uzn",
+    "vie",
+]
+S2ST_TARGET_LANGUAGE_NAMES = sorted([language_code_to_name[code] for code in s2st_target_language_codes])
+T2ST_TARGET_LANGUAGE_NAMES = S2ST_TARGET_LANGUAGE_NAMES
+
+# S2TT / T2TT / ASR
+S2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
+T2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
+ASR_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
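
A sketch of how app.py consumes these tables: it looks up SeamlessM4T language codes by display name and feeds the sorted name lists to the dropdowns.

from lang_list import LANGUAGE_NAME_TO_CODE, T2TT_TARGET_LANGUAGE_NAMES

print(LANGUAGE_NAME_TO_CODE["Punjabi"])  # -> pan
print(T2TT_TARGET_LANGUAGE_NAMES)        # -> ['English', 'Hindi', 'Punjabi']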
packages.txt ADDED
@@ -0,0 +1,3 @@
+libcairo2-dev
+pkg-config
+fonts-freefont-ttf
requirements.txt ADDED
@@ -0,0 +1,11 @@
+pycairo
+gradio
+reportlab>=3.6.2
+PyPDF2==1.26.0
+happytransformer
+python-doctr[torch]@git+https://github.com/mindee/doctr.git
+transformers
+fairseq2==0.1
+pydub
+yt-dlp
+sentencepiece
utils.py ADDED
@@ -0,0 +1,163 @@
+import base64
+import re
+from tempfile import TemporaryDirectory
+from math import atan, cos, sin
+from typing import Dict, Optional, Tuple
+from xml.etree import ElementTree as ET
+from xml.etree.ElementTree import Element
+
+import numpy as np
+import PyPDF2
+from PyPDF2 import PdfFileMerger
+from doctr.io import DocumentFile
+from doctr.models import ocr_predictor
+from PIL import Image
+from reportlab.lib.colors import black
+from reportlab.lib.units import inch
+from reportlab.lib.utils import ImageReader
+from reportlab.pdfgen.canvas import Canvas
+
+
+class HocrParser():
+
+    def __init__(self):
+        self.box_pattern = re.compile(r'bbox((\s+\d+){4})')
+        self.baseline_pattern = re.compile(r'baseline((\s+[\d\.\-]+){2})')
+
+    def _element_coordinates(self, element: Element) -> Dict:
+        """
+        Returns a dict with the coordinates of the bounding box around
+        an element.
+        """
+        out = {'x1': 0, 'y1': 0, 'x2': 0, 'y2': 0}
+        if 'title' in element.attrib:
+            matches = self.box_pattern.search(element.attrib['title'])
+            if matches:
+                coords = matches.group(1).split()
+                out = {'x1': int(coords[0]), 'y1': int(coords[1]),
+                       'x2': int(coords[2]), 'y2': int(coords[3])}
+        return out
+
+    def _get_baseline(self, element: Element) -> Tuple[float, float]:
+        """
+        Returns a tuple containing the baseline slope and intercept.
+        """
+        if 'title' in element.attrib:
+            matches = self.baseline_pattern.search(element.attrib['title'])
+            if matches:
+                slope, intercept = matches.group(1).split()
+                return float(slope), float(intercept)
+        return (0.0, 0.0)
+
+    def _pt_from_pixel(self, pxl: Dict, dpi: int) -> Dict:
+        """
+        Converts pixel quantities to PDF units (pt) at the given dpi.
+        """
+        pt = [(c / dpi * inch) for c in pxl.values()]
+        return {'x1': pt[0], 'y1': pt[1], 'x2': pt[2], 'y2': pt[3]}
+
+    def _get_element_text(self, element: Element) -> str:
+        """
+        Returns the textual content of the element and its children.
+        """
+        text = ''
+        if element.text is not None:
+            text += element.text
+        for child in element:
+            text += self._get_element_text(child)
+        if element.tail is not None:
+            text += element.tail
+        return text
+
+    def export_pdfa(self,
+                    out_filename: str,
+                    hocr: ET.ElementTree,
+                    image: Optional[np.ndarray] = None,
+                    fontname: str = "Times-Roman",
+                    fontsize: int = 12,
+                    invisible_text: bool = True,
+                    add_spaces: bool = True,
+                    dpi: int = 300):
+        """
+        Generates a PDF/A document from a hOCR document.
+        """
+        width, height = None, None
+        # Get the page dimensions from the first ocr_page div
+        for div in hocr.findall(".//div[@class='ocr_page']"):
+            coords = self._element_coordinates(div)
+            pt_coords = self._pt_from_pixel(coords, dpi)
+            width = pt_coords['x2'] - pt_coords['x1']
+            height = pt_coords['y2'] - pt_coords['y1']
+            break  # only the first page div is needed
+        if width is None or height is None:
+            raise ValueError("Could not determine page size")
+
+        pdf = Canvas(out_filename, pagesize=(width, height), pageCompression=1)
+
+        span_elements = [element for element in hocr.iterfind(".//span")]
+        for line in span_elements:
+            if line.attrib.get('class') == 'ocr_line':
+                # get line information from the hOCR markup
+                pxl_line_coords = self._element_coordinates(line)
+                line_box = self._pt_from_pixel(pxl_line_coords, dpi)
+
+                # compute the baseline of the line
+                slope, pxl_intercept = self._get_baseline(line)
+                if abs(slope) < 0.005:
+                    slope = 0.0
+                angle = atan(slope)
+                cos_a, sin_a = cos(angle), sin(angle)
+                intercept = pxl_intercept / dpi * inch
+                baseline_y2 = height - (line_box['y2'] + intercept)
+
+                # configure text options
+                text = pdf.beginText()
+                text.setFont(fontname, fontsize)
+                pdf.setFillColor(black)
+                if invisible_text:
+                    text.setTextRenderMode(3)  # invisible text
+
+                # transform the overlaid text to follow the baseline
+                text.setTextTransform(
+                    cos_a, -sin_a, sin_a, cos_a, line_box['x1'], baseline_y2)
+
+                elements = line.findall(".//span[@class='ocrx_word']")
+                for elem in elements:
+                    elemtxt = self._get_element_text(elem).strip()
+                    # replace unsupported ligature characters
+                    elemtxt = elemtxt.translate(str.maketrans(
+                        {'ﬀ': 'ff', 'ﬃ': 'f\u200cf\u200ci', 'ﬄ': 'f\u200cf\u200cl', 'ﬁ': 'fi', 'ﬂ': 'fl'}))
+                    if not elemtxt:
+                        continue
+
+                    # compute the word box width in PDF units
+                    pxl_coords = self._element_coordinates(elem)
+                    box = self._pt_from_pixel(pxl_coords, dpi)
+                    if add_spaces:
+                        elemtxt += ' '
+                        box_width = box['x2'] + pdf.stringWidth(elemtxt, fontname, fontsize) - box['x1']
+                    else:
+                        box_width = box['x2'] - box['x1']
+                    font_width = pdf.stringWidth(elemtxt, fontname, fontsize)
+
+                    # adjust the cursor position relative to the start of the line
+                    cursor = text.getStartOfLine()
+                    dx = box['x1'] - cursor[0]
+                    dy = baseline_y2 - cursor[1]
+                    text.moveCursor(dx, dy)
+
+                    # suppress text if it is 0 units wide
+                    if font_width > 0:
+                        text.setHorizScale(100 * box_width / font_width)
+                        text.textOut(elemtxt)
+                pdf.drawText(text)
+
+        # overlay the source image if provided
+        if image is not None:
+            pdf.drawImage(ImageReader(Image.fromarray(image)),
+                          0, 0, width=width, height=height)
+        pdf.save()
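
HocrParser is not wired into app.py in this commit; a sketch of how it might be driven (the input hOCR file here is hypothetical):

from xml.etree import ElementTree as ET
from utils import HocrParser

hocr = ET.parse("page.hocr")  # hOCR XML from an OCR engine (hypothetical file)
HocrParser().export_pdfa("searchable.pdf", hocr, image=None, dpi=300)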