AlhitawiMohammed22 committed on
Commit
3523bb0
1 Parent(s): f970a37

Create det_rec.py

Browse files
Files changed (1) hide show
  1. det_rec.py +390 -0
det_rec.py ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ det_rec.py - A wrapper for docTR to convert PDFs to images to text
4
+ """
5
+
6
+ import logging
7
+ from pathlib import Path
8
+
9
+ logging.basicConfig(
10
+ level=logging.INFO,
11
+ format="%(asctime)s %(levelname)s %(message)s",
12
+ datefmt="%m/%d/%Y %I:%M:%S",
13
+ )
14
+
15
+
16
+ import os
17
+ import pprint as pp
18
+ import re
19
+ import shutil
20
+ import time
21
+ from datetime import date, datetime
22
+ from os.path import basename, dirname, join
23
+ from pathlib import Path
24
+
25
+ from cleantext import clean
26
+ from doctr.io import DocumentFile
27
+ from doctr.models import ocr_predictor
28
+ from libretranslatepy import LibreTranslateAPI
29
+ from natsort import natsorted
30
+ from spellchecker import SpellChecker
31
+ from tqdm.auto import tqdm
32
+
33
+
34
def simple_rename(filepath, target_ext=".txt"):
    """
    simple_rename - build an output file name of the form OCR_<stem>_<target_ext>
    Args:
        filepath (str or Path): path whose stem (file name without its last suffix) is reused.
        target_ext (str, optional): text placed after the trailing underscore. Defaults to ".txt".
    Returns:
        str: the new file name, without any directory component
    """
    stem = Path(filepath).stem
    return "OCR_{}_{}".format(stem, target_ext)
38
+
39
+
40
def rm_local_text_files(name_contains="RESULT_"):
    """
    rm_local_text_files - delete matching .txt files from the current working directory
    Args:
        name_contains (str, optional): substring a file name must contain to be deleted. Defaults to "RESULT_".
    """
    doomed = []
    for entry in Path.cwd().iterdir():
        if not entry.is_file():
            continue
        if entry.suffix != ".txt" or name_contains not in entry.name:
            continue
        doomed.append(entry)

    logging.info(f"removing {len(doomed)} text files")
    for entry in doomed:
        os.remove(entry)
    logging.info("done")
55
+
56
+
57
def corr(
    s: str,
    add_space_when_numerics=False,
    exceptions=("e.g.", "i.e.", "etc.", "cf.", "vs.", "p."),
) -> str:
    """corrects spacing in a string
    Args:
        s (str): the string to correct
        add_space_when_numerics (bool, optional): [add a space when a period is between two numbers, example 5.73]. Defaults to False.
        exceptions (iterable of str, optional): [substrings to restore after the spacing fixes; each entry's whitespace-free form is replaced by the entry itself]. Defaults to ('e.g.', 'i.e.', 'etc.', 'cf.', 'vs.', 'p.'). None is treated as "no exceptions".
    Returns:
        str: the corrected string
    """
    if add_space_when_numerics:
        s = re.sub(r"(\d)\.(\d)", r"\1. \2", s)

    # collapse every whitespace run to one space
    s = re.sub(r"\s+", " ", s)
    # drop the space before ? . ! " when that mark is followed by whitespace or end of string
    s = re.sub(r'\s([?.!"](?:\s|$))', r"\1", s)

    # fix space before apostrophe
    s = re.sub(r"\s\'", r"'", s)
    # fix space after apostrophe
    s = re.sub(r"'\s", r"'", s)
    # fix space before comma
    s = re.sub(r"\s,", r",", s)

    # the default is an immutable tuple so it cannot be mutated across calls
    for e in exceptions or ():
        expected_sub = re.sub(r"\s", "", e)
        if expected_sub != e:
            # only entries that contain whitespace can change anything
            s = s.replace(expected_sub, e)

    return s
88
+
89
+
90
def fix_punct_spaces(string):
    """
    fix_punct_spaces - tighten the spacing around punctuation. For example, "hello , there" -> "hello, there"
    Parameters
    ----------
    string : str, required, input string to be corrected
    Returns
    -------
    str, corrected string with leading/trailing whitespace stripped
    """

    def _tighten(match):
        # remove the spaces inside a run of ? ! . , marks and follow the run with one space
        return match.group(1).replace(" ", "") + " "

    string = re.sub(r"\s*([?!.,]+(?:\s+[?!.,]+)*)\s*", _tighten, string)
    # quotes/apostrophes that are surrounded by single spaces lose both spaces
    for spaced, bare in ((" ' ", "'"), (' " ', '"')):
        string = string.replace(spaced, bare)
    return string.strip()
106
+
107
+
108
def clean_OCR(ugly_text: str):
    """
    clean_OCR - clean the OCR text files.
    Parameters
    ----------
    ugly_text : str, required, input string to be cleaned
    Returns
    -------
    str, cleaned string (newlines/tabs turned into spaces, space runs collapsed,
    "- " and " -" removed, punctuation spacing fixed by fix_punct_spaces)
    """
    # Turn newlines and tabs into plain spaces.
    cleaned_text = ugly_text.replace("\n", " ").replace("\t", " ")
    # Collapse every run of two or more spaces into one. A single str.replace
    # pass would leave leftovers for runs of three or more spaces.
    cleaned_text = re.sub(r" {2,}", " ", cleaned_text)
    # Remove all the spaces at the beginning of the text.
    cleaned_text = cleaned_text.lstrip()
    # Remove all instances of "- " and then of " -".
    cleaned_text = cleaned_text.replace("- ", "")
    cleaned_text = cleaned_text.replace(" -", "")
    return fix_punct_spaces(cleaned_text)
130
+
131
+
132
def move2completed(from_dir, filename, new_folder="completed", verbose=False):
    """
    move2completed - move a file into a sub-folder of the directory it lives in
    Args:
        from_dir (str): directory that currently contains the file
        filename (str): name of the file inside from_dir
        new_folder (str, optional): name of the sub-folder of from_dir to move into; created if missing. Defaults to "completed".
        verbose (bool, optional): print a note when the sub-folder is created. Defaults to False.
    Returns:
        None. A failed move is logged (with traceback) and not re-raised.
    """
    old_filepath = join(from_dir, filename)
    new_filedirectory = join(from_dir, new_folder)

    if not os.path.isdir(new_filedirectory):
        os.mkdir(new_filedirectory)
        if verbose:
            print("created new directory for files at: \n", new_filedirectory)
    new_filepath = join(new_filedirectory, filename)

    try:
        shutil.move(old_filepath, new_filepath)
    except OSError:
        # best effort: keep going, but record the failure at error level with its cause
        logging.exception(
            "ERROR! unable to move file to \n{}. Please investigate".format(
                new_filepath
            )
        )
    else:
        logging.info(
            "successfully moved the file {} to */{}.".format(filename, new_folder)
        )
154
+
155
+
156
+ """## pdf2text functions
157
+ """
158
+
159
+
160
# Literal substitutions that postprocess() applies, in this order, after its
# first corr() pass.
custom_replace_list = {
    "t0": "to",
    "'$": "'s",
    ",,": ", ",
    "_ ": " ",
    " '": "'",
}

# Literal substitutions that postprocess() applies, in this order, after its
# second corr() pass.
replace_corr_exceptions = {
    "i. e.": "i.e.",
    "e. g.": "e.g.",
    "e. g": "e.g.",
    " ,": ",",
}


# Shared SpellChecker instance queried by check_word_spelling().
spell = SpellChecker()
177
+
178
+
179
def check_word_spelling(word: str) -> bool:
    """
    check_word_spelling - report whether the module-level spell checker knows a word
    Args:
        word (str): word to check
    Returns:
        bool: True if spell.unknown() returns nothing for the word, False otherwise
    """
    unknown_words = spell.unknown([word])
    return not unknown_words
191
+
192
+
193
def _alpha_only(token: str) -> str:
    """Return only the alphabetic characters of token."""
    return "".join(ch for ch in token if ch.isalpha())


def eval_and_replace(text: str, match_token: str = "- ") -> str:
    """
    eval_and_replace - conditionally remove each occurrence of match_token: if the word before
    it and the word after it join into a correctly spelled word the token is dropped,
    otherwise it is replaced by a single space
    Args:
        text (str): text to evaluate
        match_token (str, optional): token to replace. Defaults to "- ".
    Returns:
        str: text with replaced tokens. A token with no word on one side (for example at the
        very start or end of the text) is left in place. If spell checking raises, the error
        is logged and the text processed so far is returned.
    """
    if not match_token or match_token not in text:
        return text

    settled = []  # finished prefix pieces, each ending in a token that was kept as-is
    pending = text  # part of the text that may still contain tokens to evaluate
    try:
        while match_token in pending:
            head, _, tail = pending.partition(match_token)
            head_words = head.split()
            tail_words = tail.split()
            if not head_words or not tail_words:
                # no word on one side, so there is nothing to join: keep the token, move on
                settled.append(head + match_token)
                pending = tail
                continue

            candidate = _alpha_only(head_words[-1]) + _alpha_only(tail_words[0])
            glue = "" if check_word_spelling(candidate) else " "
            rebuilt = head + glue + tail
            if rebuilt == pending:
                # replacing changed nothing (e.g. match_token == " "); step past the
                # token instead of looping forever
                settled.append(head + match_token)
                pending = tail
            else:
                # rescan from the start: joining can form a new token across the seam
                pending = rebuilt
    except Exception as e:
        logging.error(f"Error spell-checking OCR output, returning default text:\t{e}")
    return "".join(settled) + pending
226
+
227
+
228
def cleantxt_ocr(ugly_text, lower=False, lang: str = "en") -> str:
    """
    cleantxt_ocr - clean text from OCR by calling clean() with non-default options
    Args:
        ugly_text (str): text to clean
        lower (bool, optional): forwarded to clean() as its `lower` option. Defaults to False.
        lang (str, optional): forwarded to clean() as its `lang` option. Defaults to "en".
    Returns:
        str: cleaned text
    """
    # option reference: https://pypi.org/project/clean-text/
    options = {
        "fix_unicode": True,  # fix various unicode errors
        "to_ascii": True,  # transliterate to closest ASCII representation
        "lower": lower,  # lowercase text
        "no_line_breaks": True,  # fully strip line breaks as opposed to only normalizing them
        "no_urls": True,  # replace all URLs with a special token
        "no_emails": True,  # replace all email addresses with a special token
        "no_phone_numbers": False,  # replace all phone numbers with a special token
        "no_numbers": False,  # replace all numbers with a special token
        "no_digits": False,  # replace all digits with a special token
        "no_currency_symbols": False,  # replace all currency symbols with a special token
        "no_punct": False,  # remove punctuations
        "replace_with_punct": "",  # instead of removing punctuations you may replace them
        "replace_with_url": "<URL>",
        "replace_with_email": "<EMAIL>",
        "replace_with_phone_number": "<PHONE>",
        "replace_with_number": "<NUM>",
        "replace_with_digit": "0",
        "replace_with_currency_symbol": "<CUR>",
        "lang": lang,  # set to 'de' for German special handling
    }
    return clean(ugly_text, **options)
265
+
266
+
267
def format_ocr_out(OCR_data):
    """
    format_ocr_out - turn raw OCR output into one cleaned, spacing-corrected string
    Args:
        OCR_data (list or any): a list is joined with single spaces; anything else is passed through str()
    Returns:
        str: the text after cleantxt_ocr() followed by corr()
    """
    text = " ".join(OCR_data) if isinstance(OCR_data, list) else str(OCR_data)
    return corr(cleantxt_ocr(text))
275
+
276
+
277
def postprocess(text: str) -> str:
    """to be used after recombining the lines

    Pipeline: cleantxt_ocr -> corr -> custom_replace_list substitutions -> corr ->
    replace_corr_exceptions substitutions -> eval_and_replace.
    """

    def _swap_all(source, table):
        # apply every key -> value substitution of `table`, in the table's order
        for old, new in table.items():
            source = source.replace(str(old), str(new))
        return source

    cleaned = corr(cleantxt_ocr(text))
    cleaned = corr(_swap_all(cleaned, custom_replace_list))
    cleaned = _swap_all(cleaned, replace_corr_exceptions)
    return eval_and_replace(cleaned)
291
+
292
+
293
def result2text(result, as_text=False) -> "str | list":
    """Convert OCR result to text

    Args:
        result: object exposing `pages`; each page exposes `blocks`, each block `lines`,
            each line `words`, and each word a string `value`.
        as_text (bool, optional): return one string (pages joined by a newline) instead of
            a list with one string per page. Defaults to False.
    Returns:
        str or list: per-page text in which every block starts with "\\n\\t" and every word
        is followed by a single space.
    """
    full_doc = []
    for page in result.pages:
        pieces = []  # collected and joined once per page instead of repeated string +=
        for block in page.blocks:
            pieces.append("\n\t")
            pieces.extend(
                word.value + " " for line in block.lines for word in line.words
            )
        full_doc.append("".join(pieces))

    return "\n".join(full_doc) if as_text else full_doc
308
+
309
+
310
def convert_PDF_to_Text(
    PDF_file,
    ocr_model=None,
    max_pages: int = 20,
):
    """
    convert_PDF_to_Text - run OCR on a PDF and return the post-processed text with metadata
    Args:
        PDF_file (str or Path): path to the PDF file
        ocr_model (optional): callable OCR predictor; when None, ocr_predictor(pretrained=True) is created. Defaults to None.
        max_pages (int, optional): only the first max_pages pages are processed. Defaults to 20.
    Returns:
        dict: keys "num_pages" (pages actually processed), "runtime" (seconds, rounded to 2 places),
        "date", "converted_text" (pages joined by a blank line), "truncated" and "length"
    """
    started = time.perf_counter()
    PDF_file = Path(PDF_file)
    if ocr_model is None:
        ocr_model = ocr_predictor(pretrained=True)
    logging.info(f"starting OCR on {PDF_file.name}")
    doc = DocumentFile.from_pdf(PDF_file)

    truncated = len(doc) > max_pages
    if truncated:
        logging.warning(
            f"PDF has {len(doc)} pages, which is more than {max_pages}.. truncating"
        )
        doc = doc[:max_pages]

    # Analyze
    logging.info(f"running OCR on {len(doc)} pages")
    pages = result2text(ocr_model(doc))
    formatted_pages = [format_ocr_out(page) for page in pages]
    ocr_results = "\n\n".join([postprocess(page) for page in formatted_pages])

    elapsed = time.perf_counter() - started
    logging.info("OCR complete")

    return {
        "num_pages": len(doc),
        "runtime": round(elapsed, 2),
        "date": str(date.today()),
        "converted_text": ocr_results,
        "truncated": truncated,
        "length": len(ocr_results),
    }
352
+
353
+
354
# @title translation functions

# Module-level LibreTranslate client, created at import time; translate_text()
# sends its requests through it.
lt = LibreTranslateAPI("https://translate.astian.org/")
357
+
358
+
359
def translate_text(text, source_l, target_l="en"):
    """
    translate_text - translate a piece of text with the module-level `lt` client
    Args:
        text (str): text to translate
        source_l (str): language code of the input text
        target_l (str, optional): language code to translate into. Defaults to "en".
    Returns:
        str: the result of lt.translate(), converted with str()
    """
    translated = lt.translate(text, source_l, target_l)
    return str(translated)
362
+
363
+
364
def translate_doc(filepath, lang_start, lang_end="en", verbose=False):
    """translate a document from lang_start to lang_end
    {'code': 'en', 'name': 'English'},
    {'code': 'fr', 'name': 'French'},
    {'code': 'de', 'name': 'German'},
    {'code': 'it', 'name': 'Italian'},

    Args:
        filepath (str or Path): text file to translate; it is read line by line
        lang_start (str): language code of the input file
        lang_end (str, optional): language code to translate into. Defaults to "en".
        verbose (bool, optional): print a completion message with a timestamp. Defaults to False.
    Returns:
        str: path of the translated file, written to a "translated_<lang_end>" folder
        next to the input file
    """

    src_folder = Path(dirname(filepath))
    trgt_folder = src_folder / f"translated_{lang_end}"
    trgt_folder.mkdir(exist_ok=True)
    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
        foreign_t = f.readlines()
    in_name = basename(filepath)
    translated_doc = [
        translate_text(line, lang_start, lang_end)
        for line in tqdm(
            foreign_t,
            total=len(foreign_t),
            desc="translating {}...".format(in_name[:10]),
        )
    ]
    # simple_rename() already ends the name with its target extension (".txt" by
    # default), so appending ".txt" again would give a doubled ".txt.txt" suffix
    t_out_name = "[To {}]".format(lang_end) + simple_rename(in_name)
    out_path = join(trgt_folder, t_out_name)
    with open(out_path, "w", encoding="utf-8", errors="ignore") as f_o:
        f_o.writelines(translated_doc)
    if verbose:
        print("finished translating the document! - ", datetime.now())
    return out_path