🎨 format
Browse files
Signed-off-by: peter szemraj <peterszemraj@gmail.com>
- pdf2text.py +8 -30
pdf2text.py
CHANGED
@@ -14,32 +14,23 @@ logging.basicConfig(
|
|
14 |
)
|
15 |
|
16 |
|
17 |
-
import gc
|
18 |
import os
|
19 |
import pprint as pp
|
20 |
import re
|
21 |
import shutil
|
22 |
import time
|
23 |
-
from datetime import datetime
|
24 |
-
from os.path import basename,
|
25 |
from pathlib import Path
|
26 |
-
|
27 |
-
import pandas as pd
|
28 |
-
import wordninja
|
29 |
from cleantext import clean
|
30 |
-
from natsort import natsorted
|
31 |
-
from tqdm.auto import tqdm
|
32 |
-
import os
|
33 |
-
import shutil
|
34 |
-
from os.path import join
|
35 |
-
from spellchecker import SpellChecker
|
36 |
from doctr.io import DocumentFile
|
37 |
from doctr.models import ocr_predictor
|
38 |
from libretranslatepy import LibreTranslateAPI
|
39 |
-
from
|
40 |
-
import
|
41 |
-
from
|
42 |
-
|
43 |
|
44 |
def fast_scandir(dirname):
|
45 |
# return all subfolders in a given filepath
|
@@ -127,9 +118,6 @@ def corr(
|
|
127 |
return s
|
128 |
|
129 |
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
def fix_punct_spaces(string):
|
134 |
"""
|
135 |
fix_punct_spaces - replace spaces around punctuation with punctuation. For example, "hello , there" -> "hello, there"
|
@@ -176,8 +164,6 @@ def clean_OCR(ugly_text: str):
|
|
176 |
return fix_punct_spaces(cleaned_text)
|
177 |
|
178 |
|
179 |
-
|
180 |
-
|
181 |
def move2completed(from_dir, filename, new_folder="completed", verbose=False):
|
182 |
|
183 |
# this is the better version
|
@@ -207,7 +193,6 @@ def move2completed(from_dir, filename, new_folder="completed", verbose=False):
|
|
207 |
"""
|
208 |
|
209 |
|
210 |
-
|
211 |
custom_replace_list = {
|
212 |
"t0": "to",
|
213 |
"'$": "'s",
|
@@ -224,7 +209,6 @@ replace_corr_exceptions = {
|
|
224 |
}
|
225 |
|
226 |
|
227 |
-
|
228 |
spell = SpellChecker()
|
229 |
|
230 |
|
@@ -278,7 +262,7 @@ def eval_and_replace(text: str, match_token: str = "- ") -> str:
|
|
278 |
return text
|
279 |
|
280 |
|
281 |
-
def cleantxt_ocr(ugly_text, lower=False, lang:str="en") -> str:
|
282 |
"""
|
283 |
cleantxt_ocr - clean text from OCR
|
284 |
|
@@ -362,9 +346,6 @@ def result2text(result, as_text=False) -> str or list:
|
|
362 |
return "\n".join(full_doc) if as_text else full_doc
|
363 |
|
364 |
|
365 |
-
|
366 |
-
|
367 |
-
|
368 |
def convert_PDF_to_Text(
|
369 |
PDF_file,
|
370 |
ocr_model=None,
|
@@ -409,7 +390,6 @@ def convert_PDF_to_Text(
|
|
409 |
return results_dict
|
410 |
|
411 |
|
412 |
-
|
413 |
# @title translation functions
|
414 |
|
415 |
lt = LibreTranslateAPI("https://translate.astian.org/")
|
@@ -447,5 +427,3 @@ def translate_doc(filepath, lang_start, lang_end="en", verbose=False):
|
|
447 |
if verbose:
|
448 |
print("finished translating the document! - ", datetime.now())
|
449 |
return out_path
|
450 |
-
|
451 |
-
|
|
|
14 |
)
|
15 |
|
16 |
|
|
|
17 |
import os
|
18 |
import pprint as pp
|
19 |
import re
|
20 |
import shutil
|
21 |
import time
|
22 |
+
from datetime import date, datetime
|
23 |
+
from os.path import basename, dirname, join
|
24 |
from pathlib import Path
|
25 |
+
|
|
|
|
|
26 |
from cleantext import clean
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
from doctr.io import DocumentFile
|
28 |
from doctr.models import ocr_predictor
|
29 |
from libretranslatepy import LibreTranslateAPI
|
30 |
+
from natsort import natsorted
|
31 |
+
from spellchecker import SpellChecker
|
32 |
+
from tqdm.auto import tqdm
|
33 |
+
|
34 |
|
35 |
def fast_scandir(dirname):
|
36 |
# return all subfolders in a given filepath
|
|
|
118 |
return s
|
119 |
|
120 |
|
|
|
|
|
|
|
121 |
def fix_punct_spaces(string):
|
122 |
"""
|
123 |
fix_punct_spaces - replace spaces around punctuation with punctuation. For example, "hello , there" -> "hello, there"
|
|
|
164 |
return fix_punct_spaces(cleaned_text)
|
165 |
|
166 |
|
|
|
|
|
167 |
def move2completed(from_dir, filename, new_folder="completed", verbose=False):
|
168 |
|
169 |
# this is the better version
|
|
|
193 |
"""
|
194 |
|
195 |
|
|
|
196 |
custom_replace_list = {
|
197 |
"t0": "to",
|
198 |
"'$": "'s",
|
|
|
209 |
}
|
210 |
|
211 |
|
|
|
212 |
spell = SpellChecker()
|
213 |
|
214 |
|
|
|
262 |
return text
|
263 |
|
264 |
|
265 |
+
def cleantxt_ocr(ugly_text, lower=False, lang: str = "en") -> str:
|
266 |
"""
|
267 |
cleantxt_ocr - clean text from OCR
|
268 |
|
|
|
346 |
return "\n".join(full_doc) if as_text else full_doc
|
347 |
|
348 |
|
|
|
|
|
|
|
349 |
def convert_PDF_to_Text(
|
350 |
PDF_file,
|
351 |
ocr_model=None,
|
|
|
390 |
return results_dict
|
391 |
|
392 |
|
|
|
393 |
# @title translation functions
|
394 |
|
395 |
lt = LibreTranslateAPI("https://translate.astian.org/")
|
|
|
427 |
if verbose:
|
428 |
print("finished translating the document! - ", datetime.now())
|
429 |
return out_path
|
|
|
|