pszemraj committed on
Commit e9be69e
1 Parent(s): 2205c39

Signed-off-by: peter szemraj <peterszemraj@gmail.com>

Files changed (1)
  1. pdf2text.py +24 -255
pdf2text.py CHANGED
@@ -29,10 +29,17 @@ import wordninja
 from cleantext import clean
 from natsort import natsorted
 from tqdm.auto import tqdm
-
+import os
+import shutil
+from os.path import join
+from spellchecker import SpellChecker
 from doctr.io import DocumentFile
 from doctr.models import ocr_predictor
-
+from libretranslatepy import LibreTranslateAPI
+from os.path import basename, dirname, join
+import warnings
+from datetime import date
+from os.path import join

 def fast_scandir(dirname):
     # return all subfolders in a given filepath
@@ -120,90 +127,7 @@ def corr(
     return s


-def is_this_needed_in_output(in_string):
-    if in_string.isalnum():
-        return True
-    elif in_string == ".":
-        return True
-    elif in_string == " ":
-        return True
-    elif in_string == "\n":
-        return True
-    elif in_string == "-":
-        return True
-    else:
-        return False
-
-
-# @title clean filenames
-def cleantxt_wrap(ugly_text, txt_lan="en"):
-    # a wrapper for clean text with options different than default
-
-    # https://pypi.org/project/clean-text/
-    cleaned_text = clean(
-        ugly_text,
-        fix_unicode=True,  # fix various unicode errors
-        to_ascii=True,  # transliterate to closest ASCII representation
-        lower=True,  # lowercase text
-        no_line_breaks=True,  # fully strip line breaks as opposed to only normalizing them
-        no_urls=True,  # replace all URLs with a special token
-        no_emails=True,  # replace all email addresses with a special token
-        no_phone_numbers=True,  # replace all phone numbers with a special token
-        no_numbers=False,  # replace all numbers with a special token
-        no_digits=False,  # replace all digits with a special token
-        no_currency_symbols=True,  # replace all currency symbols with a special token
-        no_punct=True,  # remove punctuations
-        replace_with_punct="",  # instead of removing punctuations you may replace them
-        replace_with_url="<URL>",
-        replace_with_email="<EMAIL>",
-        replace_with_phone_number="<PHONE>",
-        replace_with_number="<NUM>",
-        replace_with_digit="0",
-        replace_with_currency_symbol="<CUR>",
-        lang=txt_lan,  # set to 'de' for German special handling
-    )
-
-    return cleaned_text
-
-
-def beautify_filename(
-    filename, num_words=25, start_reverse=False, word_separator="_"
-) -> str:
-    """
-    beautify_filename takes a filename and returns a beautified version of it
-
-    Args:
-        filename (str): the filename to beautify
-        num_words (int, optional): _description_. Defaults to 25.
-        start_reverse (bool, optional): _description_. Defaults to False.
-        word_separator (str, optional): _description_. Defaults to "_".
-
-    Returns:
-        str: the beautified filename
-    """
-
-    filename = str(filename)
-    index_file_Ext = filename.rfind(".")
-    current_name = str(filename)[:index_file_Ext]  # get rid of extension
-    if current_name[-1].isnumeric():
-        current_name = current_name + "s"
-    clean_name = cleantxt_wrap(current_name)
-    file_words = wordninja.split(clean_name)
-    # splits concatenated text into a list of words based on common word freq
-    if len(file_words) <= num_words:
-        num_words = len(file_words)
-
-    if start_reverse:
-        t_file_words = file_words[-num_words:]
-    else:
-        t_file_words = file_words[:num_words]
-
-    pretty_name = word_separator.join(t_file_words)  # see function argument

-    # NOTE IT DOES NOT RETURN THE EXTENSION
-    return pretty_name[
-        : (len(pretty_name) - 1)
-    ]  # there is a space always at the end, so -1


 def fix_punct_spaces(string):
@@ -252,11 +176,6 @@ def clean_OCR(ugly_text: str):
     return fix_punct_spaces(cleaned_text)


-import os
-import shutil
-from os.path import join
-
-# @markdown move2completed


 def move2completed(from_dir, filename, new_folder="completed", verbose=False):
@@ -283,147 +202,11 @@ def move2completed(from_dir, filename, new_folder="completed", verbose=False):
     )


-"""### download files
-
-**old versions**
-"""
-
-import re
-
-
-def URL_string_filter(text):
-    custom_printable = (
-        "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ._"
-    )
-
-    filtered = "".join((filter(lambda i: i in custom_printable, text)))
-
-    return filtered
-
-
-import shutil  # zipfile formats
-from datetime import datetime
-from os.path import getsize
-
-import requests
-
-# @markdown old download MAIN
-
-
-def get_zip_URL(
-    URLtoget, extract_loc=None, file_header="dropboxexport_", verbose=False
-):
-
-    r = requests.get(URLtoget, allow_redirects=True)
-    names = "my_file.zip"
-    fixed_fnames = names.split(";")  # split the multiple results
-    this_filename = file_header + URL_string_filter(fixed_fnames[0])
-
-    # define paths and save the zip file
-    if extract_loc is None:
-        extract_loc = "dropbox_dl"
-    dl_place = Path.cwd() / extract_loc
-    create_folder(dl_place)
-    save_loc = Path.cwd() / this_filename
-    open(save_loc, "wb").write(r.content)
-    if verbose:
-        print("downloaded file size was {} MB".format(getsize(save_loc) / 1000000))
-
-    # unpack the archive
-    shutil.unpack_archive(save_loc, extract_dir=dl_place)
-    if verbose:
-        print("extracted zip file - ", datetime.now())
-    x = load_dir_files(dl_place, req_extension="", verbose=verbose)
-    # remove original
-    try:
-        os.remove(save_loc)
-        del save_loc
-    except:
-        logging.info(
-            "unable to delete original zipfile - check if exists", datetime.now()
-        )
-    print("finished extracting zip - ", datetime.now())
-
-    return dl_place
-
-
-"""---
-
-**new versions**
-"""
-
-# @markdown downloading URL files with python
-
-
-def clean_file_name(file_path):
-    """helper to clean filenames"""
-    file_path = Path(file_path)
-    # Remove all non-alphanumeric characters
-    cln_base = re.sub(r"[^\w\s]", "", file_path.stem)
-    # Replace all spaces with underscores
-    cln_base = re.sub(r"\s", "_", cln_base)
-    return cln_base + file_path.suffix
-
-
-def download_URL(url: str, file=None, dlpath=None, verbose=False):
-    """
-    download_URL - download a file from a URL and show progress bar
-    Parameters
-    ----------
-    url : str, URL to download
-    file : str, optional, default None, name of file to save to. If None, will use the filename from the URL
-    dlpath : str, optional, default None, path to save the file to. If None, will save to the current working directory
-    verbose : bool, optional, default False, print progress bar
-    Returns
-    -------
-    str - path to the downloaded file
-    """
-
-    if file is None:
-        if "?dl=" in url:
-            # is a dropbox link
-            prefile = url.split("/")[-1]
-            filename = str(prefile).split("?dl=")[0]
-        else:
-            filename = url.split("/")[-1]
-        file = clean_file_name(filename)
-    if dlpath is None:
-        dlpath = Path.cwd()  # save to current working directory
-    else:
-        dlpath = Path(dlpath)  # make a path object
-    r = requests.get(url, stream=True, allow_redirects=True)
-    total_size = int(r.headers.get("content-length"))
-    initial_pos = 0
-    dl_loc = dlpath / file
-    with open(str(dl_loc.resolve()), "wb") as f:
-        with tqdm(
-            total=total_size,
-            unit="B",
-            unit_scale=True,
-            desc=file,
-            initial=initial_pos,
-            ascii=True,
-        ) as pbar:
-            for ch in r.iter_content(chunk_size=1024):
-                if ch:
-                    f.write(ch)
-                    pbar.update(len(ch))
-    if verbose:
-        print(f"\ndownloaded {file} to {dlpath}\n")
-    return str(dl_loc.resolve())
-
-
 """## pdf2text functions

-- now uses **easyocr**
-- link to [docs](https://www.jaided.ai/easyocr/documentation/)
-- the [tutorial](https://www.jaided.ai/easyocr/tutorial/)
-- a list of available languages is [here](https://www.jaided.ai/easyocr/)
-
 """


-# need to run only once to load model into memory

 custom_replace_list = {
     "t0": "to",
@@ -440,10 +223,7 @@ replace_corr_exceptions = {
     " ,": ",",
 }

-# TODO: add logic to 'corr' function to not add space after period when surrounded
-# by numbers, example 5.6

-from spellchecker import SpellChecker

 spell = SpellChecker()

@@ -498,7 +278,18 @@ def eval_and_replace(text: str, match_token: str = "- ") -> str:
     return text


-def cleantxt_ocr(ugly_text):
+def cleantxt_ocr(ugly_text, lower=False, lang: str = "en") -> str:
+    """
+    cleantxt_ocr - clean text from OCR
+
+    Args:
+        ugly_text (str): text to clean
+        lower (bool, optional): _description_. Defaults to False.
+        lang (str, optional): _description_. Defaults to "en".
+
+    Returns:
+        str: cleaned text
+    """
     # a wrapper for clean text with options different than default

     # https://pypi.org/project/clean-text/
@@ -506,7 +297,7 @@ def cleantxt_ocr(ugly_text):
         ugly_text,
         fix_unicode=True,  # fix various unicode errors
         to_ascii=True,  # transliterate to closest ASCII representation
-        lower=False,  # lowercase text
+        lower=lower,  # lowercase text
         no_line_breaks=True,  # fully strip line breaks as opposed to only normalizing them
         no_urls=True,  # replace all URLs with a special token
         no_emails=True,  # replace all email addresses with a special token
@@ -522,7 +313,7 @@ def cleantxt_ocr(ugly_text):
         replace_with_number="<NUM>",
         replace_with_digit="0",
         replace_with_currency_symbol="<CUR>",
-        lang="en",  # set to 'de' for German special handling
+        lang=lang,  # set to 'de' for German special handling
     )

     return cleaned_text
@@ -547,8 +338,6 @@ def postprocess(text: str) -> str:
         proc = proc.replace(str(k), str(v))

     proc = corr(proc)
-    # TODO: upgrade corr() function to handle commas
-    # proc = proc.replace(" ,", ",")

     for k, v in replace_corr_exceptions.items():
         proc = proc.replace(str(k), str(v))
@@ -573,13 +362,9 @@ def result2text(result, as_text=False) -> str or list:
     return "\n".join(full_doc) if as_text else full_doc


-import warnings
-from datetime import date
-from os.path import join


-# @title define main fn - `convert_PDF_to_Text()`
-# @markdown `convert_PDF_to_Text(PDF_file, multilang=False, use_page_labels=False, saveloc="")`
+
 def convert_PDF_to_Text(
     PDF_file,
     ocr_model=None,
@@ -624,10 +409,8 @@ def convert_PDF_to_Text(
     return results_dict


-from os.path import basename, dirname, join

 # @title translation functions
-from libretranslatepy import LibreTranslateAPI

 lt = LibreTranslateAPI("https://translate.astian.org/")

@@ -666,17 +449,3 @@ def translate_doc(filepath, lang_start, lang_end="en", verbose=False):
     return out_path


-"""translation codes
-
-
-```
-
-
-print(lt.languages())
-call ^
-```
-
-- link to their github [here](https://github.com/argosopentech/LibreTranslate-py)
-
-# Load FIles
-"""
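
For reference, a minimal usage sketch of the module as it stands after this commit. This sketch is not part of the commit: the file name `example.pdf` is illustrative, `convert_PDF_to_Text()` may accept additional optional arguments, and the keys of its returned results dict are defined inside the function, which this diff does not show.

```python
# hypothetical usage sketch of the refactored pdf2text.py (not part of this commit)
from doctr.models import ocr_predictor

import pdf2text  # the module modified in this commit

# load a docTR OCR model once and reuse it for every PDF
ocr_model = ocr_predictor(det_arch="db_resnet50", reco_arch="crnn_vgg16_bn", pretrained=True)

# run OCR on a single PDF; convert_PDF_to_Text() returns a dict of results
results = pdf2text.convert_PDF_to_Text("example.pdf", ocr_model=ocr_model)
print(results.keys())  # the exact keys are defined inside convert_PDF_to_Text()

# clean a raw OCR string with the updated helper; lowercasing and language are now caller options
cleaned = pdf2text.cleantxt_ocr("Th1s   is s0me OCR text", lower=False, lang="en")
```

The new `lower` and `lang` parameters correspond to the hunks above, where the hard-coded `lower=False` and `lang="en"` arguments to `clean()` are replaced by values passed through from the caller.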