pdf-ocr2

Sleeping

App Files Files Community

pszemraj committed on Oct 4, 2022

Commit

85b5da3

1 Parent(s): e9be69e

🎨 format

Browse files

Signed-off-by: peter szemraj <peterszemraj@gmail.com>

Files changed (1) hide show

pdf2text.py +8 -30

pdf2text.py CHANGED Viewed

@@ -14,32 +14,23 @@ logging.basicConfig(
 )
-import gc
 import os
 import pprint as pp
 import re
 import shutil
 import time
-from datetime import datetime
-from os.path import basename, isfile, join
 from pathlib import Path
-import re
-import pandas as pd
-import wordninja
 from cleantext import clean
-from natsort import natsorted
-from tqdm.auto import tqdm
-import os
-import shutil
-from os.path import join
-from spellchecker import SpellChecker
 from doctr.io import DocumentFile
 from doctr.models import ocr_predictor
 from libretranslatepy import LibreTranslateAPI
-from os.path import basename, dirname, join
-import warnings
-from datetime import date
-from os.path import join
 def fast_scandir(dirname):
     # return all subfolders in a given filepath
@@ -127,9 +118,6 @@ def corr(
     return s
 def fix_punct_spaces(string):
     """
     fix_punct_spaces - replace spaces around punctuation with punctuation. For example, "hello , there" -> "hello, there"
@@ -176,8 +164,6 @@ def clean_OCR(ugly_text: str):
     return fix_punct_spaces(cleaned_text)
 def move2completed(from_dir, filename, new_folder="completed", verbose=False):
     # this is the better version
@@ -207,7 +193,6 @@ def move2completed(from_dir, filename, new_folder="completed", verbose=False):
 """
 custom_replace_list = {
     "t0": "to",
     "'$": "'s",
@@ -224,7 +209,6 @@ replace_corr_exceptions = {
 }
 spell = SpellChecker()
@@ -278,7 +262,7 @@ def eval_and_replace(text: str, match_token: str = "- ") -> str:
         return text
-def cleantxt_ocr(ugly_text, lower=False, lang:str="en") -> str:
     """
     cleantxt_ocr - clean text from OCR
@@ -362,9 +346,6 @@ def result2text(result, as_text=False) -> str or list:
     return "\n".join(full_doc) if as_text else full_doc
 def convert_PDF_to_Text(
     PDF_file,
     ocr_model=None,
@@ -409,7 +390,6 @@ def convert_PDF_to_Text(
     return results_dict
 # @title translation functions
 lt = LibreTranslateAPI("https://translate.astian.org/")
@@ -447,5 +427,3 @@ def translate_doc(filepath, lang_start, lang_end="en", verbose=False):
     if verbose:
         print("finished translating the document! - ", datetime.now())
     return out_path

 )
 import os
 import pprint as pp
 import re
 import shutil
 import time
+from datetime import date, datetime
+from os.path import basename, dirname, join
 from pathlib import Path
 from cleantext import clean
 from doctr.io import DocumentFile
 from doctr.models import ocr_predictor
 from libretranslatepy import LibreTranslateAPI
+from natsort import natsorted
+from spellchecker import SpellChecker
+from tqdm.auto import tqdm
 def fast_scandir(dirname):
     # return all subfolders in a given filepath
     return s
 def fix_punct_spaces(string):
     """
     fix_punct_spaces - replace spaces around punctuation with punctuation. For example, "hello , there" -> "hello, there"
     return fix_punct_spaces(cleaned_text)
 def move2completed(from_dir, filename, new_folder="completed", verbose=False):
     # this is the better version
 """
 custom_replace_list = {
     "t0": "to",
     "'$": "'s",
 }
 spell = SpellChecker()
         return text
+def cleantxt_ocr(ugly_text, lower=False, lang: str = "en") -> str:
     """
     cleantxt_ocr - clean text from OCR
     return "\n".join(full_doc) if as_text else full_doc
 def convert_PDF_to_Text(
     PDF_file,
     ocr_model=None,
     return results_dict
 # @title translation functions
 lt = LibreTranslateAPI("https://translate.astian.org/")
     if verbose:
         print("finished translating the document! - ", datetime.now())
     return out_path