jaimin committed on
Commit
8f5e699
1 Parent(s): 4471721

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +182 -0
  2. pdf2text.py +406 -0
  3. requirements.txt +10 -0
app.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import time
3
+ from pathlib import Path
4
+ import contextlib
5
+
6
+ logging.basicConfig(
7
+ level=logging.INFO,
8
+ format="%(asctime)s - %(levelname)s - %(message)s",
9
+ )
10
+
11
+
12
+ import gradio as gr
13
+ import nltk
14
+ import torch
15
+
16
+ from pdf2text import *
17
+
18
+ _here = Path(__file__).parent
19
+
20
+ nltk.download("stopwords") # TODO=find where this requirement originates from
21
+
22
+
23
def load_uploaded_file(file_obj, temp_dir: Path = None):
    """
    load_uploaded_file - copy an uploaded file into a working directory

    Args:
        file_obj (POTENTIALLY list): Gradio file object, or a list whose first
            item is one. Only its `.name` attribute (a file path) is read.
        temp_dir (Path, optional): directory to copy the file into. Defaults to
            None, in which case a "temp" folder next to this script is used.

    Returns:
        str, the absolute path of the saved copy, or None if the copy failed
    """

    # the upload widget may hand over a list; only the first item is used
    if isinstance(file_obj, list):
        file_obj = file_obj[0]
    file_path = Path(file_obj.name)

    if temp_dir is None:
        # actually use the default directory that gets created here
        temp_dir = _here / "temp"
        temp_dir.mkdir(exist_ok=True)

    try:
        temp_path = Path(temp_dir) / file_path.name
        # read_bytes/write_bytes open and close their own handles, so nothing leaks
        temp_path.write_bytes(file_path.read_bytes())
        logging.info(f"Saved uploaded file to {temp_path}")
        return str(temp_path.resolve())

    except Exception as e:
        # best-effort boundary: report the failure and signal it with None
        logging.error(f"Trying to load file with path {file_path}, error: {e}")
        print(f"Trying to load file with path {file_path}, error: {e}")
        return None
56
+
57
+
58
def convert_PDF(
    pdf_obj,
    language: str = "en",
    max_pages=20,
):
    """
    convert_PDF - run OCR on an uploaded PDF and return its text

    Args:
        pdf_obj: Gradio file object, or a list whose first item is one. Only
            its `.name` attribute (a file path) is read.
        language (str, optional): currently unused by this function. Defaults to "en".
        max_pages (int, optional): page limit passed on to convert_PDF_to_Text. Defaults to 20.

    Returns:
        tuple of (converted text, HTML status message, path of the written
        text file or None when the upload is not a PDF)
    """
    from html import escape  # local import; the file name below comes from the upload

    # clear local text cache
    rm_local_text_files()
    # the model is created by the __main__ block; fall back to None (which makes
    # convert_PDF_to_Text build its own predictor) when this module is only imported
    model = globals().get("ocr_model")
    st = time.perf_counter()
    if isinstance(pdf_obj, list):
        pdf_obj = pdf_obj[0]
    file_path = Path(pdf_obj.name)
    # compare case-insensitively so "scan.PDF" is accepted as well
    if file_path.suffix.lower() != ".pdf":
        logging.error(f"File {file_path} is not a PDF file")

        html_error = f"""
        <div style="color: red; font-size: 20px; font-weight: bold;">
        File {escape(str(file_path))} is not a PDF file. Please upload a PDF file.
        </div>
        """
        return "File is not a PDF file", html_error, None

    conversion_stats = convert_PDF_to_Text(
        file_path,
        ocr_model=model,
        max_pages=max_pages,
    )
    converted_txt = conversion_stats["converted_text"]
    num_pages = conversion_stats["num_pages"]
    was_truncated = conversion_stats["truncated"]
    # if alt_lang: # TODO: fix this

    rt = round((time.perf_counter() - st) / 60, 2)
    print(f"Runtime: {rt} minutes")
    html = ""
    if was_truncated:
        html += f"<p>WARNING - PDF was truncated to {max_pages} pages</p>"
    html += f"<p>Runtime: {rt} minutes on CPU for {num_pages} pages</p>"

    # written to the current working directory; rm_local_text_files() removes
    # these "RESULT_" files at the start of the next conversion
    _output_name = f"RESULT_{file_path.stem}_OCR.txt"
    with open(_output_name, "w", encoding="utf-8", errors="ignore") as f:
        f.write(converted_txt)

    return converted_txt, html, _output_name
112
+
113
+
114
if __name__ == "__main__":
    # Script entry point: load the OCR model once, then build and launch the Gradio UI.
    logging.info("Starting app")

    use_GPU = torch.cuda.is_available()
    logging.info(f"Using GPU status: {use_GPU}")
    logging.info("Loading OCR model")
    # silence anything the model loader prints to stdout
    with contextlib.redirect_stdout(None):
        # module-level name; convert_PDF reads it through `global ocr_model`
        ocr_model = ocr_predictor(
            "db_resnet50",
            "crnn_mobilenet_v3_large",
            pretrained=True,
            assume_straight_pages=True,
        )

    # absolute path of the bundled sample PDF, and the working "temp" folder
    # NOTE(review): pdf_obj is not referenced again in this block
    pdf_obj = _here / "example_file.pdf"
    pdf_obj = str(pdf_obj.resolve())
    _temp_dir = _here / "temp"
    _temp_dir.mkdir(exist_ok=True)

    logging.info("starting demo")
    demo = gr.Blocks()

    with demo:

        gr.Markdown("# PDF to Text")
        gr.Markdown(
            "A basic demo of pdf-to-text conversion using OCR from the [doctr](https://mindee.github.io/doctr/index.html) package"
        )
        gr.Markdown("---")

        with gr.Column():

            gr.Markdown("## Load Inputs")
            gr.Markdown("Upload your own file & replace the default. Files should be < 10MB to avoid upload issues - search for a PDF compressor online as needed.")
            gr.Markdown(
                "_If no file is uploaded, a sample PDF will be used. PDFs are truncated to 20 pages._"
            )

            # pre-populated with the sample PDF so the demo works without an upload
            uploaded_file = gr.File(
                label="Upload a PDF file",
                file_count="single",
                type="file",
                value=_here / "example_file.pdf",
            )

        gr.Markdown("---")

        with gr.Column():
            gr.Markdown("## Convert PDF to Text")
            convert_button = gr.Button("Convert PDF!", variant="primary")
            out_placeholder = gr.HTML("<p><em>Output will appear below:</em></p>")
            gr.Markdown("### Output")
            OCR_text = gr.Textbox(
                label="OCR Result", placeholder="The OCR text will appear here"
            )
            text_file = gr.File(
                label="Download Text File",
                file_count="single",
                type="file",
                interactive=False,
            )

        # the three outputs line up with convert_PDF's (text, html, file path) return value
        convert_button.click(
            fn=convert_PDF,
            inputs=[uploaded_file],
            outputs=[OCR_text, out_placeholder, text_file],
        )
    demo.launch(enable_queue=True)
pdf2text.py ADDED
@@ -0,0 +1,406 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+
4
+ pdf2text.py - A wrapper around doctr OCR to convert PDF pages to text
5
+ """
6
+
7
+ import logging
8
+ from pathlib import Path
9
+
10
+ logging.basicConfig(
11
+ level=logging.INFO,
12
+ format="%(asctime)s %(levelname)s %(message)s",
13
+ datefmt="%m/%d/%Y %I:%M:%S",
14
+ )
15
+
16
+
17
+ import os
18
+ import pprint as pp
19
+ import re
20
+ import shutil
21
+ import time
22
+ from datetime import date, datetime
23
+ from os.path import basename, dirname, join
24
+ from pathlib import Path
25
+
26
+ from cleantext import clean
27
+ from doctr.io import DocumentFile
28
+ from doctr.models import ocr_predictor
29
+ from libretranslatepy import LibreTranslateAPI
30
+ from natsort import natsorted
31
+ from spellchecker import SpellChecker
32
+ from tqdm.auto import tqdm
33
+
34
+
35
def simple_rename(filepath, target_ext=".txt"):
    """
    simple_rename - build an "OCR_" output name from a file path

    Args:
        filepath (str or Path): path whose stem (file name without its last suffix) is used
        target_ext (str, optional): text appended after "OCR_<stem>_". Defaults to ".txt".

    Returns:
        str, a name of the form "OCR_<stem>_<target_ext>"
    """
    stem = Path(filepath).stem
    return "OCR_" + stem + "_" + f"{target_ext}"
39
+
40
+
41
def rm_local_text_files(name_contains="RESULT_"):
    """
    rm_local_text_files - remove matching .txt files from the current working directory

    Args:
        name_contains (str, optional): substring a file name must contain to be
            removed. Defaults to "RESULT_".
    """
    files = [
        f
        for f in Path.cwd().iterdir()
        if f.is_file() and f.suffix == ".txt" and name_contains in f.name
    ]
    logging.info(f"removing {len(files)} text files")
    for f in files:
        try:
            os.remove(f)
        except FileNotFoundError:
            # already gone (e.g. removed by another request between listing and deleting)
            pass
    logging.info("done")
57
+
58
+
59
def corr(
    s: str,
    add_space_when_numerics=False,
    exceptions=("e.g.", "i.e.", "etc.", "cf.", "vs.", "p."),
) -> str:
    """corrects spacing in a string

    Args:
        s (str): the string to correct
        add_space_when_numerics (bool, optional): [add a space when a period is between two numbers, example 5.73]. Defaults to False.
        exceptions (iterable of str, optional): [do not change these substrings]. Defaults to ('e.g.', 'i.e.', 'etc.', 'cf.', 'vs.', 'p.').

    Returns:
        str: the corrected string
    """
    if add_space_when_numerics:
        s = re.sub(r"(\d)\.(\d)", r"\1. \2", s)

    # collapse every whitespace run to a single space
    s = re.sub(r"\s+", " ", s)
    # drop the space before closing punctuation that is followed by whitespace or end of string
    s = re.sub(r'\s([?.!"](?:\s|$))', r"\1", s)

    # fix space before apostrophe
    s = re.sub(r"\s\'", r"'", s)
    # fix space after apostrophe
    s = re.sub(r"'\s", r"'", s)
    # fix space before comma
    s = re.sub(r"\s,", r",", s)

    for e in exceptions:
        # the whitespace-free form of the exception is replaced by the exception
        # itself; for entries without whitespace this leaves the string unchanged
        expected_sub = re.sub(r"\s", "", e)
        s = s.replace(expected_sub, e)

    return s
92
+
93
+
94
def fix_punct_spaces(string):
    """
    fix_punct_spaces - tighten whitespace around punctuation. For example, "hello , there" -> "hello, there"

    Parameters
    ----------
    string : str, required, input string to be corrected

    Returns
    -------
    str, corrected string
    """

    def _tighten(match):
        # keep the punctuation run (minus inner spaces) and follow it with one space
        return match.group(1).replace(" ", "") + " "

    tightened = re.sub(r"\s*([?!.,]+(?:\s+[?!.,]+)*)\s*", _tighten, string)
    # quotes and apostrophes that are surrounded by spaces lose both spaces
    for spaced, bare in ((" ' ", "'"), (' " ', '"')):
        tightened = tightened.replace(spaced, bare)
    return tightened.strip()
112
+
113
+
114
def clean_OCR(ugly_text: str):
    """
    clean_OCR - clean the OCR text files.

    Parameters
    ----------
    ugly_text : str, required, input string to be cleaned

    Returns
    -------
    str, cleaned string
    """
    # Replace newlines and tabs with spaces.
    cleaned_text = ugly_text.replace("\n", " ").replace("\t", " ")
    # Collapse every run of spaces to one space (a single replace pass would
    # leave doubled spaces behind whenever a run is longer than two).
    cleaned_text = re.sub(r" {2,}", " ", cleaned_text)
    # Remove all the spaces at the beginning of the text.
    cleaned_text = cleaned_text.lstrip()
    # remove all instances of "- " and " -"
    cleaned_text = cleaned_text.replace("- ", "").replace(" -", "")
    return fix_punct_spaces(cleaned_text)
138
+
139
+
140
def move2completed(from_dir, filename, new_folder="completed", verbose=False):
    """
    move2completed - move a file into a sub-folder of its directory

    Args:
        from_dir (str): directory that currently contains the file
        filename (str): name of the file to move
        new_folder (str, optional): name of the sub-folder of from_dir to move into. Defaults to "completed".
        verbose (bool, optional): print a note when the sub-folder is created. Defaults to False.

    A failed move is logged and not raised.
    """
    old_filepath = join(from_dir, filename)

    new_filedirectory = join(from_dir, new_folder)

    if not os.path.isdir(new_filedirectory):
        # exist_ok avoids failing if the folder appears between the check and the call
        os.makedirs(new_filedirectory, exist_ok=True)
        if verbose:
            print("created new directory for files at: \n", new_filedirectory)
    new_filepath = join(new_filedirectory, filename)

    try:
        shutil.move(old_filepath, new_filepath)
        logging.info("successfully moved the file {} to */completed.".format(filename))
    except (OSError, shutil.Error) as e:
        # only file-system failures are expected here; anything else should surface
        logging.error(
            "ERROR! unable to move file to \n{}. Please investigate".format(
                new_filepath
            )
        )
        logging.error(f"move failed with: {e}")
162
+
163
+
164
+ """## pdf2text functions
165
+
166
+ """
167
+
168
+
169
# Literal substring replacements; postprocess() applies them, in this order,
# right after its first corr() pass. They are plain str.replace calls, so a
# key also matches inside longer words.
custom_replace_list = {
    "t0": "to",
    "'$": "'s",
    ",,": ", ",
    "_ ": " ",
    " '": "'",
}

# Literal substring replacements; postprocess() applies them after its second
# corr() pass.
replace_corr_exceptions = {
    "i. e.": "i.e.",
    "e. g.": "e.g.",
    "e. g": "e.g.",
    " ,": ",",
}


# Shared spell checker instance used by check_word_spelling().
spell = SpellChecker()
186
+
187
+
188
def check_word_spelling(word: str) -> bool:
    """
    check_word_spelling - report whether the spell checker knows a word

    Args:
        word (str): word to check

    Returns:
        bool: True if the spell checker reports no unknown words for it, False otherwise
    """
    unknown_words = spell.unknown([word])
    return not unknown_words
202
+
203
+
204
def eval_and_replace(text: str, match_token: str = "- ") -> str:
    """
    eval_and_replace - remove each occurrence of a token, joining the words around it only when the joined result is a valid word

    Args:
        text (str): text to evaluate
        match_token (str, optional): token to replace. Defaults to "- ".

    Returns:
        str: text with replaced tokens
    """
    try:
        while match_token in text:
            head, _, tail = text.partition(match_token)
            # letters of the word fragment on each side of the token
            # NOTE: indexing raises when a side has no whitespace-separated
            # piece (token at the very start or end); the handler below then
            # returns the text as processed so far
            left = "".join(ch for ch in head.split()[-1] if ch.isalpha())
            right = "".join(ch for ch in tail.split()[0] if ch.isalpha())
            # merge directly when the combined fragments spell a word,
            # otherwise keep the two sides apart with a single space
            glue = "" if check_word_spelling(left + right) else " "
            text = head + glue + tail
    except Exception as e:
        logging.error(f"Error spell-checking OCR output, returning default text:\t{e}")
    return text
239
+
240
+
241
def cleantxt_ocr(ugly_text, lower=False, lang: str = "en") -> str:
    """
    cleantxt_ocr - clean text from OCR

    Args:
        ugly_text (str): text to clean
        lower (bool, optional): forwarded as the `lower` option of clean(). Defaults to False.
        lang (str, optional): forwarded as the `lang` option of clean(). Defaults to "en".

    Returns:
        str: cleaned text
    """
    # non-default options for clean-text, see https://pypi.org/project/clean-text/
    options = dict(
        fix_unicode=True,
        to_ascii=True,
        lower=lower,
        no_line_breaks=True,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=False,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=False,
        no_punct=False,
        replace_with_punct="",
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_phone_number="<PHONE>",
        replace_with_number="<NUM>",
        replace_with_digit="0",
        replace_with_currency_symbol="<CUR>",
        lang=lang,
    )
    return clean(ugly_text, **options)
280
+
281
+
282
def format_ocr_out(OCR_data):
    """
    format_ocr_out - turn raw OCR output into one cleaned, spacing-corrected string

    Args:
        OCR_data (list or any): a list is joined with single spaces; anything else is converted with str()

    Returns:
        str, the result of corr(cleantxt_ocr(text))
    """
    text = " ".join(OCR_data) if isinstance(OCR_data, list) else str(OCR_data)
    return corr(cleantxt_ocr(text))
290
+
291
+
292
def postprocess(text: str) -> str:
    """to be used after recombining the lines

    Cleans and spacing-corrects the text, applies the custom replacement table,
    corrects spacing again, applies the exceptions table, and finally runs
    eval_and_replace on the result.
    """
    cleaned = corr(cleantxt_ocr(text))

    for old, new in custom_replace_list.items():
        cleaned = cleaned.replace(str(old), str(new))

    recorrected = corr(cleaned)

    for old, new in replace_corr_exceptions.items():
        recorrected = recorrected.replace(str(old), str(new))

    return eval_and_replace(recorrected)
306
+
307
+
308
def result2text(result, as_text=False) -> "str | list":
    """Convert OCR result to text

    Args:
        result: object with a `pages` iterable; every page has `blocks`, every
            block has `lines`, every line has `words`, and every word has a `value` string
        as_text (bool, optional): join the pages with newlines into one string. Defaults to False.

    Returns:
        list of per-page strings, or a single string when as_text is True.
        Inside a page every block starts with "\\n\\t" and every word is followed by a space.
    """
    full_doc = []
    for page in result.pages:
        parts = []
        for block in page.blocks:
            parts.append("\n\t")
            parts.extend(
                word.value + " " for line in block.lines for word in line.words
            )
        full_doc.append("".join(parts))

    return "\n".join(full_doc) if as_text else full_doc
323
+
324
+
325
def convert_PDF_to_Text(
    PDF_file,
    ocr_model=None,
    max_pages: int = 20,
):
    """
    convert_PDF_to_Text - run OCR on a PDF and return the text with run statistics

    Args:
        PDF_file (str or Path): path of the PDF to read
        ocr_model (optional): OCR predictor to call on the loaded pages. Defaults to
            None, in which case ocr_predictor(pretrained=True) is created.
        max_pages (int, optional): pages beyond this count are dropped. Defaults to 20.

    Returns:
        dict with keys "num_pages", "runtime", "date", "converted_text",
        "truncated" and "length"
    """
    started = time.perf_counter()
    PDF_file = Path(PDF_file)
    if ocr_model is None:
        ocr_model = ocr_predictor(pretrained=True)
    logging.info(f"starting OCR on {PDF_file.name}")
    doc = DocumentFile.from_pdf(PDF_file)

    truncated = len(doc) > max_pages
    if truncated:
        logging.warning(
            f"PDF has {len(doc)} pages, which is more than {max_pages}.. truncating"
        )
        doc = doc[:max_pages]

    # Analyze
    logging.info(f"running OCR on {len(doc)} pages")
    page_texts = result2text(ocr_model(doc))
    formatted_pages = [format_ocr_out(page) for page in page_texts]
    final_pages = [postprocess(page) for page in formatted_pages]

    # one blank line between pages
    ocr_results = "\n\n".join(final_pages)

    elapsed = time.perf_counter() - started

    logging.info("OCR complete")

    return {
        "num_pages": len(doc),
        "runtime": round(elapsed, 2),
        "date": str(date.today()),
        "converted_text": ocr_results,
        "truncated": truncated,
        "length": len(ocr_results),
    }
367
+
368
+
369
+ # @title translation functions
370
+
371
# shared LibreTranslate client used by translate_text()
lt = LibreTranslateAPI("https://translate.astian.org/")


def translate_text(text, source_l, target_l="en"):
    """
    translate_text - translate a string with the module-level LibreTranslate client

    Args:
        text (str): text to translate
        source_l (str): source language code
        target_l (str, optional): target language code. Defaults to "en".

    Returns:
        str, the client's translation result converted with str()
    """
    translated = lt.translate(text, source_l, target_l)
    return str(translated)
377
+
378
+
379
def translate_doc(filepath, lang_start, lang_end="en", verbose=False):
    """translate a document from lang_start to lang_end

    Language codes listed by the original author:
    {'code': 'en', 'name': 'English'},
    {'code': 'fr', 'name': 'French'},
    {'code': 'de', 'name': 'German'},
    {'code': 'it', 'name': 'Italian'},

    Args:
        filepath (str): path of the text file to translate
        lang_start (str): language code of the source text
        lang_end (str, optional): language code to translate into. Defaults to "en".
        verbose (bool, optional): print a completion message. Defaults to False.

    Returns:
        str, path of the translated file, written to a "translated_<lang_end>"
        folder next to the source file
    """
    src_folder = Path(dirname(filepath))
    trgt_folder = src_folder / f"translated_{lang_end}"
    trgt_folder.mkdir(exist_ok=True)
    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
        foreign_t = f.readlines()
    in_name = basename(filepath)
    # one translation request per line of the source file
    translated_doc = [
        translate_text(line, lang_start, lang_end)
        for line in tqdm(
            foreign_t,
            total=len(foreign_t),
            desc="translating {}...".format(in_name[:10]),
        )
    ]
    # simple_rename() already appends the ".txt" extension, so it is not added again
    t_out_name = "[To {}]".format(lang_end) + simple_rename(in_name)
    out_path = join(trgt_folder, t_out_name)
    with open(out_path, "w", encoding="utf-8", errors="ignore") as f_o:
        # NOTE(review): lines are written back to back; whether line breaks
        # survive depends on what translate_text returns - confirm
        f_o.writelines(translated_doc)
    if verbose:
        print("finished translating the document! - ", datetime.now())
    return out_path
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ clean-text[gpl]
2
+ python-doctr[torch]
3
+ gradio
4
+ libretranslatepy
5
+ natsort
6
+ nltk
7
+ pyspellchecker
8
+ torch
9
+ tqdm
10
+ rapidfuzz==2.13.7