pszemraj committed on
Commit
85b5da3
•
1 Parent(s): e9be69e

🎨 format

Browse files

Signed-off-by: peter szemraj <peterszemraj@gmail.com>

Files changed (1) hide show
  1. pdf2text.py +8 -30
pdf2text.py CHANGED
@@ -14,32 +14,23 @@ logging.basicConfig(
14
  )
15
 
16
 
17
- import gc
18
  import os
19
  import pprint as pp
20
  import re
21
  import shutil
22
  import time
23
- from datetime import datetime
24
- from os.path import basename, isfile, join
25
  from pathlib import Path
26
- import re
27
- import pandas as pd
28
- import wordninja
29
  from cleantext import clean
30
- from natsort import natsorted
31
- from tqdm.auto import tqdm
32
- import os
33
- import shutil
34
- from os.path import join
35
- from spellchecker import SpellChecker
36
  from doctr.io import DocumentFile
37
  from doctr.models import ocr_predictor
38
  from libretranslatepy import LibreTranslateAPI
39
- from os.path import basename, dirname, join
40
- import warnings
41
- from datetime import date
42
- from os.path import join
43
 
44
  def fast_scandir(dirname):
45
  # return all subfolders in a given filepath
@@ -127,9 +118,6 @@ def corr(
127
  return s
128
 
129
 
130
-
131
-
132
-
133
  def fix_punct_spaces(string):
134
  """
135
  fix_punct_spaces - replace spaces around punctuation with punctuation. For example, "hello , there" -> "hello, there"
@@ -176,8 +164,6 @@ def clean_OCR(ugly_text: str):
176
  return fix_punct_spaces(cleaned_text)
177
 
178
 
179
-
180
-
181
  def move2completed(from_dir, filename, new_folder="completed", verbose=False):
182
 
183
  # this is the better version
@@ -207,7 +193,6 @@ def move2completed(from_dir, filename, new_folder="completed", verbose=False):
207
  """
208
 
209
 
210
-
211
  custom_replace_list = {
212
  "t0": "to",
213
  "'$": "'s",
@@ -224,7 +209,6 @@ replace_corr_exceptions = {
224
  }
225
 
226
 
227
-
228
  spell = SpellChecker()
229
 
230
 
@@ -278,7 +262,7 @@ def eval_and_replace(text: str, match_token: str = "- ") -> str:
278
  return text
279
 
280
 
281
- def cleantxt_ocr(ugly_text, lower=False, lang:str="en") -> str:
282
  """
283
  cleantxt_ocr - clean text from OCR
284
 
@@ -362,9 +346,6 @@ def result2text(result, as_text=False) -> str or list:
362
  return "\n".join(full_doc) if as_text else full_doc
363
 
364
 
365
-
366
-
367
-
368
  def convert_PDF_to_Text(
369
  PDF_file,
370
  ocr_model=None,
@@ -409,7 +390,6 @@ def convert_PDF_to_Text(
409
  return results_dict
410
 
411
 
412
-
413
  # @title translation functions
414
 
415
  lt = LibreTranslateAPI("https://translate.astian.org/")
@@ -447,5 +427,3 @@ def translate_doc(filepath, lang_start, lang_end="en", verbose=False):
447
  if verbose:
448
  print("finished translating the document! - ", datetime.now())
449
  return out_path
450
-
451
-
 
14
  )
15
 
16
 
 
17
  import os
18
  import pprint as pp
19
  import re
20
  import shutil
21
  import time
22
+ from datetime import date, datetime
23
+ from os.path import basename, dirname, join
24
  from pathlib import Path
25
+
 
 
26
  from cleantext import clean
 
 
 
 
 
 
27
  from doctr.io import DocumentFile
28
  from doctr.models import ocr_predictor
29
  from libretranslatepy import LibreTranslateAPI
30
+ from natsort import natsorted
31
+ from spellchecker import SpellChecker
32
+ from tqdm.auto import tqdm
33
+
34
 
35
  def fast_scandir(dirname):
36
  # return all subfolders in a given filepath
 
118
  return s
119
 
120
 
 
 
 
121
  def fix_punct_spaces(string):
122
  """
123
  fix_punct_spaces - replace spaces around punctuation with punctuation. For example, "hello , there" -> "hello, there"
 
164
  return fix_punct_spaces(cleaned_text)
165
 
166
 
 
 
167
  def move2completed(from_dir, filename, new_folder="completed", verbose=False):
168
 
169
  # this is the better version
 
193
  """
194
 
195
 
 
196
  custom_replace_list = {
197
  "t0": "to",
198
  "'$": "'s",
 
209
  }
210
 
211
 
 
212
  spell = SpellChecker()
213
 
214
 
 
262
  return text
263
 
264
 
265
+ def cleantxt_ocr(ugly_text, lower=False, lang: str = "en") -> str:
266
  """
267
  cleantxt_ocr - clean text from OCR
268
 
 
346
  return "\n".join(full_doc) if as_text else full_doc
347
 
348
 
 
 
 
349
  def convert_PDF_to_Text(
350
  PDF_file,
351
  ocr_model=None,
 
390
  return results_dict
391
 
392
 
 
393
  # @title translation functions
394
 
395
  lt = LibreTranslateAPI("https://translate.astian.org/")
 
427
  if verbose:
428
  print("finished translating the document! - ", datetime.now())
429
  return out_path