pszemraj committed on
Commit
ccaf8ca
·
1 Parent(s): 8131022

🎉 add mvp files

Browse files

Signed-off-by: peter szemraj <peterszemraj@gmail.com>

Files changed (4) hide show
  1. .gitignore +28 -0
  2. app.py +151 -0
  3. example_file.pdf +0 -0
  4. pdf2text.py +680 -0
.gitignore ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # logs
2
+ *.log
3
+ *LOGFILE*
4
+
5
+ # output files need to be force-added
6
+ *.csv
7
+ *.png
8
+ *.jpg
9
+ *.jpeg
10
+ *.pkl
11
+ *.xlsx
12
+
13
+ # cache
14
+ *__pycache__/
15
+ *.pyc
16
+
17
+ # reports folder - need to be force-added
18
+ *reports/
19
+
20
+ # scratch files and folders
21
+
22
+ *scratch*
23
+ *scratch/
24
+
25
+ # notebooks
26
+
27
+ *notebooks/
28
+ *.ipynb
app.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import time
3
+ from pathlib import Path
4
+ import contextlib
5
+
6
+ logging.basicConfig(
7
+ level=logging.INFO,
8
+ format="%(asctime)s - %(levelname)s - %(message)s",
9
+ )
10
+
11
+
12
+ import gradio as gr
13
+ import nltk
14
+ import torch
15
+
16
+ from pdf2text import *
17
+
18
+ _here = Path(__file__).parent
19
+
20
+ nltk.download("stopwords") # TODO=find where this requirement originates from
21
+
22
+
23
def load_uploaded_file(file_obj, temp_dir: Path = None):
    """
    load_uploaded_file - copy an uploaded file into a working directory

    Args:
        file_obj (POTENTIALLY list): Gradio file object (any object exposing the
            source path as ``.name``), possibly wrapped in a list
        temp_dir (Path, optional): directory the copy is written to.
            Defaults to a "temp" folder next to this script.

    Returns:
        str, absolute path of the saved copy, or None if the file could not be loaded
    """
    # the upload may arrive wrapped in a list; use its first entry
    if isinstance(file_obj, list):
        file_obj = file_obj[0] if file_obj else None
    if file_obj is None:
        logging.error("No uploaded file to load")
        return None
    file_path = Path(file_obj.name)

    # default to the script-local temp folder so the copy is never written
    # back onto the uploaded file itself
    temp_dir = _here / "temp" if temp_dir is None else Path(temp_dir)

    try:
        temp_dir.mkdir(parents=True, exist_ok=True)
        temp_path = temp_dir / file_path.name
        # read_bytes / write_bytes open and close the files themselves
        temp_path.write_bytes(file_path.read_bytes())
        logging.info(f"Saved uploaded file to {temp_path}")
        return str(temp_path.resolve())
    except Exception as e:
        # best effort: report the failure and let the UI carry on
        logging.error(f"Trying to load file with path {file_path}, error: {e}")
        return None
57
+
58
+
59
def convert_PDF(pdf_obj, language: str = "en"):
    """
    convert_PDF - run OCR on a PDF file and report how long it took

    Args:
        pdf_obj: the PDF to convert; handed to convert_PDF_to_Text unchanged
        language (str, optional): not used by this function yet. Defaults to "en".

    Returns:
        tuple: the converted text and an HTML snippet with the runtime summary
    """
    global ocr_model
    start = time.perf_counter()

    stats = convert_PDF_to_Text(pdf_obj, ocr_model=ocr_model, max_pages=20)
    converted_txt = stats["converted_text"]
    num_pages = stats["num_pages"]
    # if alt_lang: # TODO: fix this

    # runtime is reported in minutes
    rt = round((time.perf_counter() - start) / 60, 2)
    print(f"Runtime: {rt} minutes")
    html = f"<p>Runtime: {rt} minutes on CPU for {num_pages} pages</p>"

    return converted_txt, html
89
+
90
+
91
if __name__ == "__main__":
    logging.info("Starting app")

    use_GPU = torch.cuda.is_available()
    logging.info(f"Using GPU status: {use_GPU}")
    logging.info("Loading OCR model")
    # silence anything the model loader prints to stdout
    with contextlib.redirect_stdout(None):
        ocr_model = ocr_predictor(
            "db_resnet50",
            "crnn_mobilenet_v3_large",
            pretrained=True,
            assume_straight_pages=True,
        )

    # default input: path of the example PDF shipped next to this script
    pdf_obj = _here / "example_file.pdf"
    pdf_obj = str(pdf_obj.resolve())
    _temp_dir = _here / "temp"
    _temp_dir.mkdir(exist_ok=True)

    logging.info("starting demo")
    demo = gr.Blocks()

    with demo:

        gr.Markdown("# PDF to Text")
        gr.Markdown("**Upload a PDF file to convert to text**")
        gr.Markdown("If no file is uploaded, a sample PDF will be used")

        with gr.Column():

            gr.Markdown("## Load Inputs")
            gr.Markdown("Upload your own file:")
            # the textbox holds the path of the file that will be converted;
            # it starts out pointing at the example PDF
            pdf_obj = gr.Textbox(
                lines=1,
                label="VM file path",
                placeholder="When the file is uploaded, the path will appear here",
                value=pdf_obj,
            )
            with gr.Row():
                uploaded_file = gr.File(
                    label="Upload a PDF file",
                    file_count="single",
                    type="file",
                )
                load_file_button = gr.Button("Load Uploaded File")

        gr.Markdown("---")

        with gr.Column():
            gr.Markdown("## Convert PDF to Text")
            convert_button = gr.Button("Convert PDF!")
            out_placeholder = gr.HTML("<p><em>Output will appear below:</em></p>")
            gr.Markdown("### Output")
            OCR_text = gr.Textbox(
                label="OCR Result", placeholder="The OCR text will appear here"
            )

        # uploading writes the saved file path into the textbox ...
        load_file_button.click(
            fn=load_uploaded_file, inputs=uploaded_file, outputs=[pdf_obj]
        )

        # ... and converting reads whatever path the textbox currently holds
        convert_button.click(
            fn=convert_PDF, inputs=[pdf_obj], outputs=[OCR_text, out_placeholder]
        )
    demo.launch(enable_queue=True)
example_file.pdf ADDED
Binary file (290 kB). View file
 
pdf2text.py ADDED
@@ -0,0 +1,680 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+
4
+ pdf2text.py - A wrapper around docTR OCR to convert PDF files to text
5
+ """
6
+
7
+ import logging
8
+ from pathlib import Path
9
+
10
+ logging.basicConfig(
11
+ level=logging.INFO,
12
+ format="%(asctime)s %(levelname)s %(message)s",
13
+ datefmt="%m/%d/%Y %I:%M:%S",
14
+ )
15
+
16
+
17
+ import gc
18
+ import os
19
+ import pprint as pp
20
+ import re
21
+ import shutil
22
+ import time
23
+ from datetime import datetime
24
+ from os.path import basename, isfile, join
25
+ from pathlib import Path
26
+ import re
27
+ import pandas as pd
28
+ import wordninja
29
+ from cleantext import clean
30
+ from natsort import natsorted
31
+ from tqdm.auto import tqdm
32
+
33
+ from doctr.io import DocumentFile
34
+ from doctr.models import ocr_predictor
35
def fast_scandir(dirname):
    """Return the paths of all subfolders below ``dirname``, recursively.

    The direct subfolders come first, followed by the nested subfolders of
    each of them in turn.
    """
    children = [entry.path for entry in os.scandir(dirname) if entry.is_dir()]
    nested = []
    for child in children:
        nested += fast_scandir(child)
    return children + nested  # list
42
+
43
+
44
def create_folder(directory):
    """Create ``directory`` (and missing parents); do nothing if it already exists."""
    os.makedirs(name=directory, exist_ok=True)
46
+
47
+
48
def simple_rename(filepath, target_ext=".txt"):
    """Build an output name of the form ``OCR_<stem>_<target_ext>`` for ``filepath``.

    Only the stem (file name without directory and last suffix) is used.
    """
    stem = Path(filepath).stem
    return "OCR_" + f"{stem}" + "_" + f"{target_ext}"
52
+
53
+
54
def load_dir_files(directory, req_extension=".txt", return_type="list", verbose=False):
    """Collect the files below ``directory`` whose name ends with ``req_extension``.

    Args:
        directory: folder that is walked recursively
        req_extension (str, optional): required file name ending. Defaults to ".txt".
        return_type (str, optional): "list" (case-insensitive) returns the sorted
            paths; anything else returns a dict. Defaults to "list".
        verbose (bool, optional): print a preview of what was found. Defaults to False.

    Returns:
        list of full paths in natural sort order, or a dict mapping each
        file's basename to its full path
    """
    matches = natsorted(
        [
            os.path.join(root, fname)
            for root, _dirs, fnames in os.walk(directory)
            for fname in fnames
            if fname.endswith(req_extension)
        ]
    )

    if verbose:
        print("A list of files in the {} directory are: \n".format(directory))
        if len(matches) >= 10:
            # long listings are cut off after the first ten entries
            pp.pprint(matches[:10])
            print("\n and more. There are a total of {} files".format(len(matches)))
        else:
            pp.pprint(matches)

    if return_type.lower() == "list":
        return matches

    if verbose:
        print("returning dictionary")

    # files sharing a basename overwrite each other; the last one in sort order wins
    return {basename(path): path for path in matches}
84
+
85
+
86
def corr(
    s: str,
    add_space_when_numerics=False,
    exceptions=("e.g.", "i.e.", "etc.", "cf.", "vs.", "p."),
) -> str:
    """corrects spacing in a string

    Args:
        s (str): the string to correct
        add_space_when_numerics (bool, optional): [add a space when a period is between two numbers, example 5.73]. Defaults to False.
        exceptions (iterable of str, optional): [do not change these substrings]. Defaults to ('e.g.', 'i.e.', 'etc.', 'cf.', 'vs.', 'p.').

    Returns:
        str: the corrected string
    """
    if add_space_when_numerics:
        s = re.sub(r"(\d)\.(\d)", r"\1. \2", s)

    # collapse every run of whitespace into a single space
    s = re.sub(r"\s+", " ", s)
    # drop the whitespace in front of ? . ! " when followed by whitespace or end
    s = re.sub(r'\s([?.!"](?:\s|$))', r"\1", s)

    # fix space before apostrophe
    s = re.sub(r"\s\'", r"'", s)
    # fix space after apostrophe
    s = re.sub(r"'\s", r"'", s)
    # fix space before comma
    s = re.sub(r"\s,", r",", s)

    # put back exceptions whose whitespace-free form appears in the text
    # NOTE(review): for entries that contain no whitespace (all the defaults)
    # the stripped form equals the entry, so this replacement changes nothing
    for e in exceptions:
        expected_sub = re.sub(r"\s", "", e)
        s = s.replace(expected_sub, e)

    return s
119
+
120
+
121
def is_this_needed_in_output(in_string):
    """Tell whether ``in_string`` should be kept in the output.

    Alphanumeric strings are kept, as are the exact strings ".", " ",
    newline and "-"; everything else is rejected.
    """
    keep_as_is = (".", " ", "\n", "-")
    return in_string.isalnum() or in_string in keep_as_is
134
+
135
+
136
+ # @title clean filenames
137
def cleantxt_wrap(ugly_text, txt_lan="en"):
    """Run clean-text's ``clean`` on ``ugly_text`` with non-default options.

    The text is lowercased and punctuation is removed; URLs, emails, phone
    numbers and currency symbols are replaced by placeholder tokens.
    See https://pypi.org/project/clean-text/

    Args:
        ugly_text (str): the text to clean
        txt_lan (str, optional): language passed to ``clean`` as ``lang``. Defaults to "en".

    Returns:
        the cleaned text returned by ``clean``
    """
    options = {
        "fix_unicode": True,  # fix various unicode errors
        "to_ascii": True,  # transliterate to closest ASCII representation
        "lower": True,  # lowercase text
        "no_line_breaks": True,  # fully strip line breaks
        "no_urls": True,  # replace all URLs with a special token
        "no_emails": True,  # replace all email addresses with a special token
        "no_phone_numbers": True,  # replace all phone numbers with a special token
        "no_numbers": False,  # keep numbers
        "no_digits": False,  # keep digits
        "no_currency_symbols": True,  # replace all currency symbols with a special token
        "no_punct": True,  # remove punctuation
        "replace_with_punct": "",
        "replace_with_url": "<URL>",
        "replace_with_email": "<EMAIL>",
        "replace_with_phone_number": "<PHONE>",
        "replace_with_number": "<NUM>",
        "replace_with_digit": "0",
        "replace_with_currency_symbol": "<CUR>",
        "lang": txt_lan,  # set to 'de' for German special handling
    }
    return clean(ugly_text, **options)
165
+
166
+
167
def beautify_filename(
    filename, num_words=25, start_reverse=False, word_separator="_"
) -> str:
    """
    beautify_filename takes a filename and returns a beautified version of it

    The extension is dropped, the name is cleaned with cleantxt_wrap, split
    into words with wordninja and re-joined with ``word_separator``.

    Args:
        filename (str): the filename to beautify
        num_words (int, optional): maximum number of words to keep. Defaults to 25.
        start_reverse (bool, optional): keep the last ``num_words`` words instead of the first. Defaults to False.
        word_separator (str, optional): string placed between the words. Defaults to "_".

    Returns:
        str: the beautified filename (without extension)
    """

    filename = str(filename)
    index_file_Ext = filename.rfind(".")
    # rfind returns -1 when there is no "."; slicing with -1 would silently
    # drop the last character of the name, so keep the whole name instead
    current_name = filename if index_file_Ext == -1 else filename[:index_file_Ext]
    # guard against an empty name (e.g. ".bashrc") before looking at its last char
    if current_name and current_name[-1].isnumeric():
        current_name = current_name + "s"
    clean_name = cleantxt_wrap(current_name)
    # splits concatenated text into a list of words based on common word freq
    file_words = wordninja.split(clean_name)
    num_words = min(num_words, len(file_words))

    if start_reverse:
        t_file_words = file_words[-num_words:]
    else:
        t_file_words = file_words[:num_words]

    pretty_name = word_separator.join(t_file_words)  # see function argument

    # NOTE IT DOES NOT RETURN THE EXTENSION
    # NOTE(review): the last character is dropped on the assumption that the
    # name always ends in a space; str.join adds no trailing separator, so this
    # may be cutting a real character - confirm before relying on the output
    return pretty_name[: (len(pretty_name) - 1)]
205
+
206
+
207
def fix_punct_spaces(string):
    """
    fix_punct_spaces - remove the whitespace in front of punctuation and leave exactly one space after it. For example, "hello , there" -> "hello, there"

    Parameters
    ----------
    string : str, required, input string to be corrected

    Returns
    -------
    str, corrected string
    """

    def _tighten(match):
        # squeeze the spaces out of the punctuation run, then add one after it
        return "{} ".format(match.group(1).replace(" ", ""))

    punct_run = re.compile(r"\s*([?!.,]+(?:\s+[?!.,]+)*)\s*")
    fixed = punct_run.sub(_tighten, string)
    # quotes surrounded by spaces are glued to their neighbours
    fixed = fixed.replace(" ' ", "'").replace(' " ', '"')
    return fixed.strip()
225
+
226
+
227
def clean_OCR(ugly_text: str):
    """
    clean_OCR - clean the OCR text files.

    Newlines and tabs become spaces, runs of spaces are collapsed, leading
    whitespace is removed, the hyphenation remnants "- " and " -" are deleted
    and the result is passed through fix_punct_spaces.

    Parameters
    ----------
    ugly_text : str, required, input string to be cleaned

    Returns
    -------
    str, cleaned string
    """
    # Replace newlines and tabs with spaces.
    cleaned_text = ugly_text.replace("\n", " ").replace("\t", " ")
    # Collapse every run of two or more spaces into one; a single
    # str.replace pass would leave doubles behind in longer runs.
    cleaned_text = re.sub(r" {2,}", " ", cleaned_text)
    # Remove all the spaces at the beginning of the text.
    cleaned_text = cleaned_text.lstrip()
    # remove all instances of "- " and " -"
    cleaned_text = cleaned_text.replace("- ", "")
    cleaned_text = cleaned_text.replace(" -", "")
    return fix_punct_spaces(cleaned_text)
251
+
252
+
253
+ import os
254
+ import shutil
255
+ from os.path import join
256
+
257
+ # @markdown move2completed
258
+
259
+
260
def move2completed(from_dir, filename, new_folder="completed", verbose=False):
    """Move ``filename`` from ``from_dir`` into the subfolder ``new_folder``.

    The subfolder is created when missing. A failed move is logged as an
    error instead of being raised.

    Args:
        from_dir: directory that currently holds the file
        filename: name of the file to move
        new_folder (str, optional): subfolder of ``from_dir`` to move into. Defaults to "completed".
        verbose (bool, optional): print a message when the subfolder is created. Defaults to False.
    """
    old_filepath = join(from_dir, filename)
    new_filedirectory = join(from_dir, new_folder)

    if not os.path.isdir(new_filedirectory):
        # makedirs also creates missing parents and tolerates a concurrent create
        os.makedirs(new_filedirectory, exist_ok=True)
        if verbose:
            print("created new directory for files at: \n", new_filedirectory)
    new_filepath = join(new_filedirectory, filename)

    try:
        shutil.move(old_filepath, new_filepath)
    except (OSError, shutil.Error) as e:
        # only filesystem failures are swallowed; the reason is kept in the log
        logging.error(
            "ERROR! unable to move file to \n{}. Please investigate ({})".format(
                new_filepath, e
            )
        )
    else:
        logging.info("successfully moved the file {} to */completed.".format(filename))
282
+
283
+
284
+ """### download files
285
+
286
+ **old versions**
287
+ """
288
+
289
+ import re
290
+
291
+
292
def URL_string_filter(text):
    """Keep only ASCII letters, digits, "." and "_" from ``text``."""
    custom_printable = frozenset(
        "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ._"
    )
    return "".join([char for char in text if char in custom_printable])
300
+
301
+
302
+ import shutil # zipfile formats
303
+ from datetime import datetime
304
+ from os.path import getsize
305
+
306
+ import requests
307
+
308
+ # @markdown old download MAIN
309
+
310
+
311
def get_zip_URL(
    URLtoget, extract_loc=None, file_header="dropboxexport_", verbose=False
):
    """Download an archive from ``URLtoget`` and unpack it below the working directory.

    The archive is saved in the current working directory, unpacked into
    ``extract_loc`` and then deleted again.

    Args:
        URLtoget (str): URL of the archive
        extract_loc (str, optional): folder (relative to the working directory)
            to extract into. Defaults to "dropbox_dl".
        file_header (str, optional): prefix of the temporary archive file name.
            Defaults to "dropboxexport_".
        verbose (bool, optional): print progress details. Defaults to False.

    Returns:
        Path of the folder the archive was extracted to
    """
    r = requests.get(URLtoget, allow_redirects=True)
    # the archive name is fixed; it is still passed through the filter helper
    this_filename = file_header + URL_string_filter("my_file.zip")

    # define paths and save the zip file
    if extract_loc is None:
        extract_loc = "dropbox_dl"
    dl_place = Path.cwd() / extract_loc
    create_folder(dl_place)
    save_loc = Path.cwd() / this_filename
    with open(save_loc, "wb") as f:  # close the handle before unpacking
        f.write(r.content)
    if verbose:
        print("downloaded file size was {} MB".format(getsize(save_loc) / 1000000))

    # unpack the archive
    shutil.unpack_archive(save_loc, extract_dir=dl_place)
    if verbose:
        print("extracted zip file - ", datetime.now())
        # the listing is only printed, its return value is not needed
        load_dir_files(dl_place, req_extension="", verbose=verbose)
    # remove original
    try:
        os.remove(save_loc)
    except OSError as e:
        # %-style argument: a bare extra positional argument makes logging
        # fail to format the record
        logging.info("unable to delete original zipfile - check if exists: %s", e)
    print("finished extracting zip - ", datetime.now())

    return dl_place
346
+
347
+
348
+ """---
349
+
350
+ **new versions**
351
+ """
352
+
353
+ # @markdown downloading URL files with python
354
+
355
+
356
def clean_file_name(file_path):
    """Return a sanitised file name for ``file_path``.

    Characters of the stem that are neither word characters nor whitespace
    are removed, each whitespace character becomes "_", and the original
    suffix is appended unchanged. Directory components are dropped.
    """
    path = Path(file_path)
    stem_no_symbols = re.sub(r"[^\w\s]", "", path.stem)
    return re.sub(r"\s", "_", stem_no_symbols) + path.suffix
364
+
365
+
366
def download_URL(url: str, file=None, dlpath=None, verbose=False):
    """
    download_URL - download a file from a URL and show progress bar
    Parameters
    ----------
    url : str, URL to download
    file : str, optional, default None, name of file to save to. If None, will use the filename from the URL
    dlpath : str, optional, default None, path to save the file to. If None, will save to the current working directory
    verbose : bool, optional, default False, print the download location when finished
    Returns
    -------
    str - path to the downloaded file
    """

    if file is None:
        # last URL segment, without the "?dl=..." suffix of dropbox links
        filename = url.split("/")[-1].split("?dl=")[0]
        file = clean_file_name(filename)
    # save to current working directory unless told otherwise
    dlpath = Path.cwd() if dlpath is None else Path(dlpath)
    dl_loc = dlpath / file

    # the context manager releases the streamed connection when done
    with requests.get(url, stream=True, allow_redirects=True) as r:
        # servers may omit Content-Length; tqdm accepts total=None then
        size_header = r.headers.get("content-length")
        total_size = int(size_header) if size_header is not None else None
        with open(str(dl_loc.resolve()), "wb") as f:
            with tqdm(
                total=total_size,
                unit="B",
                unit_scale=True,
                desc=file,
                initial=0,
                ascii=True,
            ) as pbar:
                for ch in r.iter_content(chunk_size=1024):
                    if ch:  # skip keep-alive chunks
                        f.write(ch)
                        pbar.update(len(ch))
    if verbose:
        print(f"\ndownloaded {file} to {dlpath}\n")
    return str(dl_loc.resolve())
412
+
413
+
414
+ """## pdf2text functions
415
+
416
+ - these notes were written for **easyocr**; the OCR in this file is run with docTR (`doctr`)
417
+ - link to [docs](https://www.jaided.ai/easyocr/documentation/)
418
+ - the [tutorial](https://www.jaided.ai/easyocr/tutorial/)
419
+ - a list of available languages is [here](https://www.jaided.ai/easyocr/)
420
+
421
+ """
422
+
423
+
424
+
425
+ # need to run only once to load model into memory
426
+
427
# literal substring replacements applied by postprocess() after the first
# spacing correction (each key is replaced by its value)
custom_replace_list = {
    "t0": "to",
    "'$": "'s",
    ",,": ", ",
    "_ ": " ",
    " '": "'",
}

# literal substring replacements applied by postprocess() after the second
# spacing correction
replace_corr_exceptions = {
    "i. e.": "i.e.",
    "e. g.": "e.g.",
    "e. g": "e.g.",
    " ,": ",",
}
441
+
442
+ # TODO: add logic to 'corr' function to not add space after period when surrounded
443
+ # by numbers, example 5.6
444
+
445
+ from spellchecker import SpellChecker
446
+
447
+ spell = SpellChecker()
448
+
449
+
450
def check_word_spelling(word: str) -> bool:
    """
    check_word_spelling - check the spelling of a word

    Args:
        word (str): word to check

    Returns:
        bool: True if the module-level spell checker does not report the word as unknown, False otherwise
    """
    unknown_words = spell.unknown([word])
    return not len(unknown_words)
464
+
465
+
466
+ def eval_and_replace(text: str, match_token: str = "- ") -> str:
467
+ """
468
+ eval_and_replace - conditionally replace all instances of a substring in a string based on whether the eliminated substring results in a valid word
469
+
470
+ Args:
471
+ text (str): text to evaluate
472
+ match_token (str, optional): token to replace. Defaults to "- ".
473
+
474
+ Returns:
475
+ str: text with replaced tokens
476
+ """
477
+
478
+ if match_token not in text:
479
+ return text
480
+ else:
481
+ while True:
482
+ full_before_text = text.split(match_token, maxsplit=1)[0]
483
+ before_text = [
484
+ char for char in full_before_text.split()[-1] if char.isalpha()
485
+ ]
486
+ before_text = "".join(before_text)
487
+ full_after_text = text.split(match_token, maxsplit=1)[-1]
488
+ after_text = [char for char in full_after_text.split()[0] if char.isalpha()]
489
+ after_text = "".join(after_text)
490
+ full_text = before_text + after_text
491
+ if check_word_spelling(full_text):
492
+ text = full_before_text + full_after_text
493
+ else:
494
+ text = full_before_text + " " + full_after_text
495
+ if match_token not in text:
496
+ break
497
+ return text
498
+
499
+
500
def cleantxt_ocr(ugly_text):
    """Run clean-text's ``clean`` on OCR output with non-default options.

    Case, punctuation, numbers, phone numbers and currency symbols are kept;
    URLs and emails are replaced by placeholder tokens and line breaks are
    stripped. See https://pypi.org/project/clean-text/

    Args:
        ugly_text (str): the text to clean

    Returns:
        the cleaned text returned by ``clean``
    """
    options = {
        "fix_unicode": True,  # fix various unicode errors
        "to_ascii": True,  # transliterate to closest ASCII representation
        "lower": False,  # keep the original case
        "no_line_breaks": True,  # fully strip line breaks
        "no_urls": True,  # replace all URLs with a special token
        "no_emails": True,  # replace all email addresses with a special token
        "no_phone_numbers": False,  # keep phone numbers
        "no_numbers": False,  # keep numbers
        "no_digits": False,  # keep digits
        "no_currency_symbols": False,  # keep currency symbols
        "no_punct": False,  # keep punctuation
        "replace_with_punct": "",
        "replace_with_url": "<URL>",
        "replace_with_email": "<EMAIL>",
        "replace_with_phone_number": "<PHONE>",
        "replace_with_number": "<NUM>",
        "replace_with_digit": "0",
        "replace_with_currency_symbol": "<CUR>",
        "lang": "en",  # set to 'de' for German special handling
    }
    return clean(ugly_text, **options)
528
+
529
+
530
def format_ocr_out(OCR_data):
    """Turn raw OCR output into one cleaned, spacing-corrected string.

    A list is joined with single spaces; anything else is converted with
    ``str``. The text then goes through cleantxt_ocr and corr.
    """
    text = " ".join(OCR_data) if isinstance(OCR_data, list) else str(OCR_data)
    return corr(cleantxt_ocr(text))
538
+
539
+
540
def postprocess(text: str) -> str:
    """to be used after recombining the lines

    Cleans and spacing-corrects ``text``, applies the substitutions of
    custom_replace_list, corrects spacing again, applies
    replace_corr_exceptions and finally resolves "- " tokens with
    eval_and_replace.
    """

    def _apply(table, value):
        # plain substring replacement, in the table's insertion order
        for old, new in table.items():
            value = value.replace(str(old), str(new))
        return value

    proc = corr(cleantxt_ocr(text))
    proc = corr(_apply(custom_replace_list, proc))
    # TODO: upgrade corr() function to handle commas
    # proc = proc.replace(" ,", ",")
    proc = _apply(replace_corr_exceptions, proc)

    return eval_and_replace(proc)
556
+
557
def result2text(result) -> str:
    """Convert OCR result to text

    Each page becomes one string in which every block starts with a newline
    and a tab and every word is followed by a space; the page strings are
    joined with newlines.
    """
    page_texts = []
    for page in result.pages:
        block_texts = [
            "\n\t"
            + "".join(word.value + " " for line in block.lines for word in line.words)
            for block in page.blocks
        ]
        page_texts.append("".join(block_texts))

    return "\n".join(page_texts)
575
+
576
+ import warnings
577
+ from datetime import date
578
+ from os.path import join
579
+
580
+
581
+ # @title define main fn - `convert_PDF_to_Text()`
582
+ # @markdown `convert_PDF_to_Text(PDF_file, multilang=False, use_page_labels=False, saveloc="")`
583
def convert_PDF_to_Text(
    PDF_file,
    ocr_model=None,
    max_pages: int = 20,
):
    """Run OCR on a PDF file and return the cleaned text with some statistics.

    Args:
        PDF_file: path of the PDF file to convert
        ocr_model (optional): OCR predictor to call on the document; a
            pretrained ``ocr_predictor`` is created when None. Defaults to None.
        max_pages (int, optional): only the first ``max_pages`` pages are
            processed. Defaults to 20.

    Returns:
        dict with the keys "num_pages", "runtime" (seconds, rounded to two
        decimals), "date", "converted_text" and "length"
    """
    start = time.perf_counter()
    PDF_file = Path(PDF_file)
    if ocr_model is None:
        ocr_model = ocr_predictor(pretrained=True)
    logging.info(f"starting OCR on {PDF_file.name}")
    doc = DocumentFile.from_pdf(PDF_file)

    if len(doc) > max_pages:
        logging.warning(f"PDF has {len(doc)} pages, which is more than {max_pages}.. truncating")
        doc = doc[:max_pages]

    # Analyze
    logging.info(f"running OCR on {len(doc)} pages")
    raw_text = result2text(ocr_model(doc))
    output_text = postprocess(format_ocr_out(raw_text))

    elapsed = time.perf_counter() - start

    logging.info("OCR complete")

    return {
        "num_pages": len(doc),  # pages actually processed, after truncation
        "runtime": round(elapsed, 2),
        "date": str(date.today()),
        "converted_text": output_text,
        "length": len(output_text),
    }
623
+
624
+
625
+ from os.path import basename, dirname, join
626
+
627
+ # @title translation functions
628
+ from libretranslatepy import LibreTranslateAPI
629
+
630
+ lt = LibreTranslateAPI("https://translate.astian.org/")
631
+
632
+
633
def translate_text(text, source_l, target_l="en"):
    """Translate ``text`` from ``source_l`` to ``target_l`` with the module-level LibreTranslate client and return the result as ``str``."""
    translated = lt.translate(text, source_l, target_l)
    return str(translated)
636
+
637
+
638
def translate_doc(filepath, lang_start, lang_end="en", verbose=False):
    """translate a document from lang_start to lang_end

    The file is translated line by line and written to a
    "translated to <lang_end>" folder next to the source file.

    Example language codes:
    {'code': 'en', 'name': 'English'},
    {'code': 'fr', 'name': 'French'},
    {'code': 'de', 'name': 'German'},
    {'code': 'it', 'name': 'Italian'},

    Args:
        filepath (str): path of the text file to translate
        lang_start (str): language code of the source text
        lang_end (str, optional): language code to translate to. Defaults to "en".
        verbose (bool, optional): print a message when done. Defaults to False.

    Returns:
        str: path of the translated file
    """

    src_folder = dirname(filepath)
    trgt_folder = join(src_folder, "translated to {}".format(lang_end))
    create_folder(trgt_folder)
    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
        foreign_t = f.readlines()
    in_name = basename(filepath)
    translated_doc = [
        translate_text(line, lang_start, lang_end)
        for line in tqdm(
            foreign_t,
            total=len(foreign_t),
            desc="translating {}...".format(in_name[:10]),
        )
    ]
    # simple_rename already appends ".txt"; adding it again produced a
    # doubled ".txt.txt" extension
    t_out_name = "[To {}]".format(lang_end) + simple_rename(in_name)
    out_path = join(trgt_folder, t_out_name)
    with open(out_path, "w", encoding="utf-8", errors="ignore") as f_o:
        f_o.writelines(translated_doc)
    if verbose:
        print("finished translating the document! - ", datetime.now())
    return out_path
665
+
666
+
667
+ """translation codes
668
+
669
+
670
+ ```
671
+
672
+
673
+ print(lt.languages())
674
+ call ^
675
+ ```
676
+
677
+ - link to their github [here](https://github.com/argosopentech/LibreTranslate-py)
678
+
679
+ # Load Files
680
+ """