Spaces: Running on Zero
vteam27 committed
Commit • 4e9395b
1 Parent(s): 4d734fe

Added merged base
Browse files
- .gitattributes +2 -0
- Examples/Book.png +3 -0
- Examples/Files.jpg +3 -0
- Examples/Manuscript.jpg +3 -0
- Examples/News.png +3 -0
- app.py +189 -0
- lang_list.py +163 -0
- packages.txt +3 -0
- requirements.txt +11 -0
- utils.py +163 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
Examples/Book.png ADDED (Git LFS Details)
Examples/Files.jpg ADDED (Git LFS Details)
Examples/Manuscript.jpg ADDED (Git LFS Details)
Examples/News.png ADDED (Git LFS Details)
app.py ADDED
@@ -0,0 +1,189 @@
import os
import re

import gradio as gr
from PIL import Image
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from happytransformer import HappyTextToText, TTSettings
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoProcessor,
    AutoTokenizer,
    SeamlessM4TForTextToText,
)

from lang_list import (
    LANGUAGE_NAME_TO_CODE,
    T2TT_TARGET_LANGUAGE_NAMES,
    TEXT_SOURCE_LANGUAGE_NAMES,
)

DEFAULT_TARGET_LANGUAGE = "English"

# Translation model initialization (SeamlessM4T, text-to-text only)
model = SeamlessM4TForTextToText.from_pretrained("facebook/hf-seamless-m4t-medium")
processor = AutoProcessor.from_pretrained("facebook/hf-seamless-m4t-medium")

# OCR predictor initialization
OCRpredictor = ocr_predictor(det_arch='db_mobilenet_v3_large', reco_arch='crnn_vgg16_bn', pretrained=True)

# Grammar correction model initialization
happy_tt = HappyTextToText("T5", "vennify/t5-base-grammar-correction")
grammar_args = TTSettings(num_beams=5, min_length=1)

# Spell check model initialization
OCRtokenizer = AutoTokenizer.from_pretrained("Bhuvana/t5-base-spellchecker", use_fast=False)
OCRmodel = AutoModelForSeq2SeqLM.from_pretrained("Bhuvana/t5-base-spellchecker")


def correct_spell(inputs):
    """Run one chunk of text through the T5 spell-checker."""
    input_ids = OCRtokenizer.encode(inputs, return_tensors='pt')
    sample_output = OCRmodel.generate(
        input_ids,
        do_sample=True,
        max_length=512,
        top_p=0.99,
        num_return_sequences=1
    )
    return OCRtokenizer.decode(sample_output[0], skip_special_tokens=True)


def process_text_in_chunks(text, process_function, max_chunk_size=256):
    """Apply process_function sentence by sentence, cutting long sentences
    into max_chunk_size pieces so model inputs stay bounded."""
    # Split text into sentences
    sentences = re.split(r'(?<=[.!?])\s+', text)
    processed_text = ""

    for sentence in sentences:
        # Further split long sentences into smaller chunks
        chunks = [sentence[i:i + max_chunk_size] for i in range(0, len(sentence), max_chunk_size)]
        for chunk in chunks:
            processed_text += process_function(chunk)
        processed_text += " "  # add a space after each processed sentence

    return processed_text.strip()


def greet(img, apply_grammar_correction, apply_spell_check):
    img.save("out.jpg")
    doc = DocumentFile.from_images("out.jpg")
    output = OCRpredictor(doc)

    # Flatten the OCR result: page -> block -> line -> word
    res = ""
    for page in output.pages:
        for block in page.blocks:
            for line in block.lines:
                for word in line.words:
                    res += " " + word.value
                res += "\n"
            res += "\n"

    # Process in chunks for grammar correction
    if apply_grammar_correction:
        res = process_text_in_chunks(res, lambda x: happy_tt.generate_text("grammar: " + x, args=grammar_args).text)

    # Process in chunks for spell check
    if apply_spell_check:
        res = process_text_in_chunks(res, correct_spell)

    _output_name = "RESULT_OCR.txt"
    with open(_output_name, 'w') as f:
        f.write(res)
    return res, _output_name


# Gradio Interface for OCR
demo_ocr = gr.Interface(
    fn=greet,
    inputs=[
        gr.Image(type="pil"),
        gr.Checkbox(label="Apply Grammar Correction"),
        gr.Checkbox(label="Apply Spell Check")
    ],
    outputs=["text", "file"],
    title="DocTR OCR with Grammar and Spell Check",
    description="Upload an image to get the OCR results. Optionally, apply grammar and spell check.",
    examples=[["Examples/Book.png"], ["Examples/News.png"], ["Examples/Manuscript.jpg"], ["Examples/Files.jpg"]]
)

# demo_ocr.launch(debug=True)


def run_t2tt(file_uploader, input_text: str, source_language: str, target_language: str) -> tuple[str, str]:
    # An uploaded text file, if any, overrides the text box
    if file_uploader is not None:
        with open(file_uploader, 'r') as file:
            input_text = file.read()
    source_language_code = LANGUAGE_NAME_TO_CODE[source_language]
    target_language_code = LANGUAGE_NAME_TO_CODE[target_language]
    text_inputs = processor(text=input_text, src_lang=source_language_code, return_tensors="pt")
    output_tokens = model.generate(**text_inputs, tgt_lang=target_language_code)
    output = processor.decode(output_tokens[0].tolist(), skip_special_tokens=True)
    _output_name = "result.txt"
    with open(_output_name, 'w') as f:
        f.write(output)
    return str(output), _output_name


with gr.Blocks() as demo_t2tt:
    with gr.Row():
        with gr.Column():
            with gr.Group():
                file_uploader = gr.File(label="Upload a text file (Optional)")
                input_text = gr.Textbox(label="Input text")
                with gr.Row():
                    source_language = gr.Dropdown(
                        label="Source language",
                        choices=TEXT_SOURCE_LANGUAGE_NAMES,
                        value="Punjabi",
                    )
                    target_language = gr.Dropdown(
                        label="Target language",
                        choices=T2TT_TARGET_LANGUAGE_NAMES,
                        value=DEFAULT_TARGET_LANGUAGE,
                    )
                btn = gr.Button("Translate")
        with gr.Column():
            output_text = gr.Textbox(label="Translated text")
            output_file = gr.File(label="Translated text file")

    gr.Examples(
        examples=[
            [
                None,
                "The sinister destruction of the holy Akal Takht and the ruthless massacre of thousands of innocent pilgrims had unmasked the deep-seated hatred and animosity that the Indian Government had been nurturing against Sikhs ever since independence",
                "English",
                "Punjabi",
            ],
            [
                None,
                "It contains. much useful information about administrative, revenue, judicial and ecclesiastical activities in various areas which, it is hoped, would supplement the information available in official records.",
                "English",
                "Hindi",
            ],
            [
                None,
                "दुनिया में बहुत सी अलग-अलग भाषाएं हैं और उनमें अपने वर्ण और शब्दों का भंडार होता है. इसमें में कुछ उनके अपने शब्द होते हैं तो कुछ ऐसे भी हैं, जो दूसरी भाषाओं से लिए जाते हैं.",
                "Hindi",
                "Punjabi",
            ],
            [
                None,
                "ਸੂੂਬੇ ਦੇ ਕਈ ਜ਼ਿਲ੍ਹਿਆਂ ’ਚ ਬੁੱਧਵਾਰ ਸਵੇਰੇ ਸੰਘਣੀ ਧੁੰਦ ਛਾਈ ਰਹੀ ਤੇ ਤੇਜ਼ ਹਵਾਵਾਂ ਨੇ ਕਾਂਬਾ ਹੋਰ ਵਧਾ ਦਿੱਤਾ। ਸੱਤ ਸ਼ਹਿਰਾਂ ’ਚ ਦਿਨ ਦਾ ਤਾਪਮਾਨ ਦਸ ਡਿਗਰੀ ਸੈਲਸੀਅਸ ਦੇ ਆਸਪਾਸ ਰਿਹਾ। ਸੂਬੇ ’ਚ ਵੱਧ ਤੋਂ ਵੱਧ ਤਾਪਮਾਨ ’ਚ ਵੀ ਦਸ ਡਿਗਰੀ ਸੈਲਸੀਅਸ ਦੀ ਗਿਰਾਵਟ ਦਰਜ ਕੀਤੀ ਗਈ",
                "Punjabi",
                "English",
            ],
        ],
        inputs=[file_uploader, input_text, source_language, target_language],
        outputs=[output_text, output_file],
        fn=run_t2tt,
        cache_examples=False,
        api_name=False,
    )

    gr.on(
        triggers=[input_text.submit, btn.click],
        fn=run_t2tt,
        inputs=[file_uploader, input_text, source_language, target_language],
        outputs=[output_text, output_file],
        api_name="t2tt",
    )

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.Tab(label="OCR"):
            demo_ocr.render()
        with gr.Tab(label="Translate"):
            demo_t2tt.render()

if __name__ == "__main__":
    demo.launch()
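Because the translate handler is registered with api_name="t2tt", the endpoint can also be called programmatically. Below is a minimal sketch using gradio_client; the Space id is a placeholder, not something defined in this commit, and the file argument is simply left as None to use the text box path:

    from gradio_client import Client

    # Placeholder Space id; substitute the actual deployment.
    client = Client("vteam27/doctr-ocr-translate")
    text, result_file = client.predict(
        None,             # file_uploader: no file uploaded
        "Good morning.",  # input_text
        "English",        # source_language
        "Punjabi",        # target_language
        api_name="/t2tt",
    )
    print(text)         # translated string
    print(result_file)  # local path to the downloaded result.txt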
lang_list.py ADDED
@@ -0,0 +1,163 @@
# Language dict
language_code_to_name = {
    "afr": "Afrikaans",
    "amh": "Amharic",
    "arb": "Modern Standard Arabic",
    "ary": "Moroccan Arabic",
    "arz": "Egyptian Arabic",
    "asm": "Assamese",
    "ast": "Asturian",
    "azj": "North Azerbaijani",
    "bel": "Belarusian",
    "ben": "Bengali",
    "bos": "Bosnian",
    "bul": "Bulgarian",
    "cat": "Catalan",
    "ceb": "Cebuano",
    "ces": "Czech",
    "ckb": "Central Kurdish",
    "cmn": "Mandarin Chinese",
    "cym": "Welsh",
    "dan": "Danish",
    "deu": "German",
    "ell": "Greek",
    "eng": "English",
    "est": "Estonian",
    "eus": "Basque",
    "fin": "Finnish",
    "fra": "French",
    "gaz": "West Central Oromo",
    "gle": "Irish",
    "glg": "Galician",
    "guj": "Gujarati",
    "heb": "Hebrew",
    "hin": "Hindi",
    "hrv": "Croatian",
    "hun": "Hungarian",
    "hye": "Armenian",
    "ibo": "Igbo",
    "ind": "Indonesian",
    "isl": "Icelandic",
    "ita": "Italian",
    "jav": "Javanese",
    "jpn": "Japanese",
    "kam": "Kamba",
    "kan": "Kannada",
    "kat": "Georgian",
    "kaz": "Kazakh",
    "kea": "Kabuverdianu",
    "khk": "Halh Mongolian",
    "khm": "Khmer",
    "kir": "Kyrgyz",
    "kor": "Korean",
    "lao": "Lao",
    "lit": "Lithuanian",
    "ltz": "Luxembourgish",
    "lug": "Ganda",
    "luo": "Luo",
    "lvs": "Standard Latvian",
    "mai": "Maithili",
    "mal": "Malayalam",
    "mar": "Marathi",
    "mkd": "Macedonian",
    "mlt": "Maltese",
    "mni": "Meitei",
    "mya": "Burmese",
    "nld": "Dutch",
    "nno": "Norwegian Nynorsk",
    "nob": "Norwegian Bokm\u00e5l",
    "npi": "Nepali",
    "nya": "Nyanja",
    "oci": "Occitan",
    "ory": "Odia",
    "pan": "Punjabi",
    "pbt": "Southern Pashto",
    "pes": "Western Persian",
    "pol": "Polish",
    "por": "Portuguese",
    "ron": "Romanian",
    "rus": "Russian",
    "slk": "Slovak",
    "slv": "Slovenian",
    "sna": "Shona",
    "snd": "Sindhi",
    "som": "Somali",
    "spa": "Spanish",
    "srp": "Serbian",
    "swe": "Swedish",
    "swh": "Swahili",
    "tam": "Tamil",
    "tel": "Telugu",
    "tgk": "Tajik",
    "tgl": "Tagalog",
    "tha": "Thai",
    "tur": "Turkish",
    "ukr": "Ukrainian",
    "urd": "Urdu",
    "uzn": "Northern Uzbek",
    "vie": "Vietnamese",
    "xho": "Xhosa",
    "yor": "Yoruba",
    "yue": "Cantonese",
    "zlm": "Colloquial Malay",
    "zsm": "Standard Malay",
    "zul": "Zulu",
}
LANGUAGE_NAME_TO_CODE = {v: k for k, v in language_code_to_name.items()}

# Source langs: S2ST / S2TT / ASR don't need source lang
# T2TT / T2ST use this
text_source_language_codes = [
    "hin",
    "pan",
    "eng",
]
TEXT_SOURCE_LANGUAGE_NAMES = sorted([language_code_to_name[code] for code in text_source_language_codes])

# Target langs:
# S2ST / T2ST
s2st_target_language_codes = [
    "eng",
    "arb",
    "ben",
    "cat",
    "ces",
    "cmn",
    "cym",
    "dan",
    "deu",
    "est",
    "fin",
    "fra",
    "hin",
    "ind",
    "ita",
    "jpn",
    "kor",
    "mlt",
    "nld",
    "pes",
    "pol",
    "por",
    "ron",
    "rus",
    "slk",
    "spa",
    "swe",
    "swh",
    "tel",
    "tgl",
    "tha",
    "tur",
    "ukr",
    "urd",
    "uzn",
    "vie",
]
S2ST_TARGET_LANGUAGE_NAMES = sorted([language_code_to_name[code] for code in s2st_target_language_codes])
T2ST_TARGET_LANGUAGE_NAMES = S2ST_TARGET_LANGUAGE_NAMES

# S2TT / T2TT / ASR
S2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
T2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
ASR_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
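Only Hindi, Punjabi, and English are wired up as text source languages, so the derived name lists are small. A quick sanity check of the mapping (a sketch, not part of the commit):

    from lang_list import (
        LANGUAGE_NAME_TO_CODE,
        TEXT_SOURCE_LANGUAGE_NAMES,
        T2TT_TARGET_LANGUAGE_NAMES,
    )

    print(TEXT_SOURCE_LANGUAGE_NAMES)        # ['English', 'Hindi', 'Punjabi']
    print(T2TT_TARGET_LANGUAGE_NAMES)        # same list; T2TT targets mirror the sources
    print(LANGUAGE_NAME_TO_CODE["Punjabi"])  # 'pan', the code handed to the SeamlessM4T processor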
packages.txt ADDED
@@ -0,0 +1,3 @@
libcairo2-dev
pkg-config
fonts-freefont-ttf -y
requirements.txt ADDED
@@ -0,0 +1,11 @@
pycairo
gradio
reportlab>=3.6.2
PyPDF2==1.26.0
happytransformer
python-doctr[torch]@git+https://github.com/mindee/doctr.git
transformers
fairseq2==0.1
pydub
yt-dlp
sentencepiece
utils.py ADDED
@@ -0,0 +1,163 @@
import base64
import re
from tempfile import TemporaryDirectory
from math import atan, cos, sin
from typing import Dict, Optional, Tuple
from xml.etree import ElementTree as ET
from xml.etree.ElementTree import Element

import numpy as np
import PyPDF2
from PyPDF2 import PdfFileMerger
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from PIL import Image
from reportlab.lib.colors import black
from reportlab.lib.units import inch
from reportlab.lib.utils import ImageReader
from reportlab.pdfgen.canvas import Canvas


class HocrParser():

    def __init__(self):
        self.box_pattern = re.compile(r'bbox((\s+\d+){4})')
        self.baseline_pattern = re.compile(r'baseline((\s+[\d\.\-]+){2})')

    def _element_coordinates(self, element: Element) -> Dict:
        """
        Returns a dict containing the coordinates of the bounding box around
        an element
        """
        out = {'x1': 0, 'y1': 0, 'x2': 0, 'y2': 0}
        if 'title' in element.attrib:
            matches = self.box_pattern.search(element.attrib['title'])
            if matches:
                coords = matches.group(1).split()
                out = {'x1': int(coords[0]), 'y1': int(coords[1]),
                       'x2': int(coords[2]), 'y2': int(coords[3])}
        return out

    def _get_baseline(self, element: Element) -> Tuple[float, float]:
        """
        Returns a tuple containing the baseline slope and intercept.
        """
        if 'title' in element.attrib:
            # search can return None when no baseline is present
            matches = self.baseline_pattern.search(element.attrib['title'])
            if matches:
                parts = matches.group(1).split()
                return float(parts[0]), float(parts[1])
        return (0.0, 0.0)

    def _pt_from_pixel(self, pxl: Dict, dpi: int) -> Dict:
        """
        Returns the quantity in PDF units (pt) given quantity in pixels
        """
        pt = [(c / dpi * inch) for c in pxl.values()]
        return {'x1': pt[0], 'y1': pt[1], 'x2': pt[2], 'y2': pt[3]}

    def _get_element_text(self, element: Element) -> str:
        """
        Return the textual content of the element and its children
        """
        text = ''
        if element.text is not None:
            text += element.text
        for child in element:
            text += self._get_element_text(child)
        if element.tail is not None:
            text += element.tail
        return text

    def export_pdfa(self,
                    out_filename: str,
                    hocr: ET.ElementTree,
                    image: Optional[np.ndarray] = None,
                    fontname: str = "Times-Roman",
                    fontsize: int = 12,
                    invisible_text: bool = True,
                    add_spaces: bool = True,
                    dpi: int = 300):
        """
        Generates a PDF/A document from a hOCR document.
        """

        width, height = None, None
        # Get the page dimensions from the first ocr_page div
        for div in hocr.findall(".//div[@class='ocr_page']"):
            coords = self._element_coordinates(div)
            pt_coords = self._pt_from_pixel(coords, dpi)
            width = pt_coords['x2'] - pt_coords['x1']
            height = pt_coords['y2'] - pt_coords['y1']
            # only the first page div is needed
            break
        if width is None or height is None:
            raise ValueError("Could not determine page size")

        pdf = Canvas(out_filename, pagesize=(width, height), pageCompression=1)

        span_elements = [element for element in hocr.iterfind(".//span")]
        for line in span_elements:
            if 'class' in line.attrib and line.attrib['class'] == 'ocr_line':
                # get information from xml
                pxl_line_coords = self._element_coordinates(line)
                line_box = self._pt_from_pixel(pxl_line_coords, dpi)

                # compute baseline
                slope, pxl_intercept = self._get_baseline(line)
                if abs(slope) < 0.005:
                    slope = 0.0
                angle = atan(slope)
                cos_a, sin_a = cos(angle), sin(angle)
                intercept = pxl_intercept / dpi * inch
                baseline_y2 = height - (line_box['y2'] + intercept)

                # configure options
                text = pdf.beginText()
                text.setFont(fontname, fontsize)
                pdf.setFillColor(black)
                if invisible_text:
                    text.setTextRenderMode(3)  # invisible text

                # transform overlaid text
                text.setTextTransform(
                    cos_a, -sin_a, sin_a, cos_a, line_box['x1'], baseline_y2)

                elements = line.findall(".//span[@class='ocrx_word']")
                for elem in elements:
                    elemtxt = self._get_element_text(elem).strip()
                    # replace unsupported ligature characters with ASCII equivalents
                    elemtxt = elemtxt.translate(str.maketrans(
                        {'ﬀ': 'ff', 'ﬃ': 'ffi', 'ﬄ': 'ffl', 'ﬁ': 'fi', 'ﬂ': 'fl'}))
                    if not elemtxt:
                        continue

                    # compute string width
                    pxl_coords = self._element_coordinates(elem)
                    box = self._pt_from_pixel(pxl_coords, dpi)
                    if add_spaces:
                        elemtxt += ' '
                        box_width = box['x2'] + pdf.stringWidth(elemtxt, fontname, fontsize) - box['x1']
                    else:
                        box_width = box['x2'] - box['x1']
                    font_width = pdf.stringWidth(elemtxt, fontname, fontsize)

                    # Adjust relative position of cursor
                    cursor = text.getStartOfLine()
                    dx = box['x1'] - cursor[0]
                    dy = baseline_y2 - cursor[1]
                    text.moveCursor(dx, dy)

                    # suppress text if it is 0 units wide
                    if font_width > 0:
                        text.setHorizScale(100 * box_width / font_width)
                        text.textOut(elemtxt)
                pdf.drawText(text)

        # overlay image if provided
        if image is not None:
            pdf.drawImage(ImageReader(Image.fromarray(image)),
                          0, 0, width=width, height=height)
        pdf.save()
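app.py never imports HocrParser, so utils.py ships as a standalone helper for producing searchable PDFs from OCR output. A minimal usage sketch, assuming doctr's Document.export_as_xml() API (which returns one (xml_bytes, ElementTree) pair per page); the file names are illustrative only:

    from doctr.io import DocumentFile
    from doctr.models import ocr_predictor
    from utils import HocrParser

    doc = DocumentFile.from_images("Examples/Files.jpg")  # list of numpy arrays, one per page
    result = ocr_predictor(pretrained=True)(doc)

    xml_outputs = result.export_as_xml()       # assumed doctr API: hOCR per page
    parser = HocrParser()
    parser.export_pdfa("RESULT.pdf",
                       hocr=xml_outputs[0][1],  # hOCR ElementTree of page 1
                       image=doc[0])            # overlay the page image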