pszemraj committed on
Commit e9be69e
1 Parent(s): 2205c39

Signed-off-by: peter szemraj <peterszemraj@gmail.com>

Files changed (1)
  1. pdf2text.py +24 -255
pdf2text.py CHANGED
@@ -29,10 +29,17 @@ import wordninja
 from cleantext import clean
 from natsort import natsorted
 from tqdm.auto import tqdm
-
+import os
+import shutil
+from os.path import join
+from spellchecker import SpellChecker
 from doctr.io import DocumentFile
 from doctr.models import ocr_predictor
-
+from libretranslatepy import LibreTranslateAPI
+from os.path import basename, dirname, join
+import warnings
+from datetime import date
+from os.path import join

 def fast_scandir(dirname):
     # return all subfolders in a given filepath
@@ -120,90 +127,7 @@ def corr(
     return s


-def is_this_needed_in_output(in_string):
-    if in_string.isalnum():
-        return True
-    elif in_string == ".":
-        return True
-    elif in_string == " ":
-        return True
-    elif in_string == "\n":
-        return True
-    elif in_string == "-":
-        return True
-    else:
-        return False
-
-
-# @title clean filenames
-def cleantxt_wrap(ugly_text, txt_lan="en"):
-    # a wrapper for clean text with options different than default
-
-    # https://pypi.org/project/clean-text/
-    cleaned_text = clean(
-        ugly_text,
-        fix_unicode=True,  # fix various unicode errors
-        to_ascii=True,  # transliterate to closest ASCII representation
-        lower=True,  # lowercase text
-        no_line_breaks=True,  # fully strip line breaks as opposed to only normalizing them
-        no_urls=True,  # replace all URLs with a special token
-        no_emails=True,  # replace all email addresses with a special token
-        no_phone_numbers=True,  # replace all phone numbers with a special token
-        no_numbers=False,  # replace all numbers with a special token
-        no_digits=False,  # replace all digits with a special token
-        no_currency_symbols=True,  # replace all currency symbols with a special token
-        no_punct=True,  # remove punctuations
-        replace_with_punct="",  # instead of removing punctuations you may replace them
-        replace_with_url="<URL>",
-        replace_with_email="<EMAIL>",
-        replace_with_phone_number="<PHONE>",
-        replace_with_number="<NUM>",
-        replace_with_digit="0",
-        replace_with_currency_symbol="<CUR>",
-        lang=txt_lan,  # set to 'de' for German special handling
-    )
-
-    return cleaned_text
-
-
-def beautify_filename(
-    filename, num_words=25, start_reverse=False, word_separator="_"
-) -> str:
-    """
-    beautify_filename takes a filename and returns a beautified version of it
-
-    Args:
-        filename (str): the filename to beautify
-        num_words (int, optional): _description_. Defaults to 25.
-        start_reverse (bool, optional): _description_. Defaults to False.
-        word_separator (str, optional): _description_. Defaults to "_".
-
-    Returns:
-        str: the beautified filename
-    """
-
-    filename = str(filename)
-    index_file_Ext = filename.rfind(".")
-    current_name = str(filename)[:index_file_Ext]  # get rid of extension
-    if current_name[-1].isnumeric():
-        current_name = current_name + "s"
-    clean_name = cleantxt_wrap(current_name)
-    file_words = wordninja.split(clean_name)
-    # splits concatenated text into a list of words based on common word freq
-    if len(file_words) <= num_words:
-        num_words = len(file_words)
-
-    if start_reverse:
-        t_file_words = file_words[-num_words:]
-    else:
-        t_file_words = file_words[:num_words]
-
-    pretty_name = word_separator.join(t_file_words)  # see function argument

-    # NOTE IT DOES NOT RETURN THE EXTENSION
-    return pretty_name[
-        : (len(pretty_name) - 1)
-    ]  # there is a space always at the end, so -1


 def fix_punct_spaces(string):
@@ -252,11 +176,6 @@ def clean_OCR(ugly_text: str):
     return fix_punct_spaces(cleaned_text)


-import os
-import shutil
-from os.path import join
-
-# @markdown move2completed


 def move2completed(from_dir, filename, new_folder="completed", verbose=False):
@@ -283,147 +202,11 @@ def move2completed(from_dir, filename, new_folder="completed", verbose=False):
     )


-"""### download files
-
-**old versions**
-"""
-
-import re
-
-
-def URL_string_filter(text):
-    custom_printable = (
-        "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ._"
-    )
-
-    filtered = "".join((filter(lambda i: i in custom_printable, text)))
-
-    return filtered
-
-
-import shutil  # zipfile formats
-from datetime import datetime
-from os.path import getsize
-
-import requests
-
-# @markdown old download MAIN
-
-
-def get_zip_URL(
-    URLtoget, extract_loc=None, file_header="dropboxexport_", verbose=False
-):
-
-    r = requests.get(URLtoget, allow_redirects=True)
-    names = "my_file.zip"
-    fixed_fnames = names.split(";")  # split the multiple results
-    this_filename = file_header + URL_string_filter(fixed_fnames[0])
-
-    # define paths and save the zip file
-    if extract_loc is None:
-        extract_loc = "dropbox_dl"
-    dl_place = Path.cwd() / extract_loc
-    create_folder(dl_place)
-    save_loc = Path.cwd() / this_filename
-    open(save_loc, "wb").write(r.content)
-    if verbose:
-        print("downloaded file size was {} MB".format(getsize(save_loc) / 1000000))
-
-    # unpack the archive
-    shutil.unpack_archive(save_loc, extract_dir=dl_place)
-    if verbose:
-        print("extracted zip file - ", datetime.now())
-    x = load_dir_files(dl_place, req_extension="", verbose=verbose)
-    # remove original
-    try:
-        os.remove(save_loc)
-        del save_loc
-    except:
-        logging.info(
-            "unable to delete original zipfile - check if exists", datetime.now()
-        )
-    print("finished extracting zip - ", datetime.now())
-
-    return dl_place
-
-
-"""---
-
-**new versions**
-"""
-
-# @markdown downloading URL files with python
-
-
-def clean_file_name(file_path):
-    """helper to clean filenames"""
-    file_path = Path(file_path)
-    # Remove all non-alphanumeric characters
-    cln_base = re.sub(r"[^\w\s]", "", file_path.stem)
-    # Replace all spaces with underscores
-    cln_base = re.sub(r"\s", "_", cln_base)
-    return cln_base + file_path.suffix
-
-
-def download_URL(url: str, file=None, dlpath=None, verbose=False):
-    """
-    download_URL - download a file from a URL and show progress bar
-    Parameters
-    ----------
-    url : str, URL to download
-    file : str, optional, default None, name of file to save to. If None, will use the filename from the URL
-    dlpath : str, optional, default None, path to save the file to. If None, will save to the current working directory
-    verbose : bool, optional, default False, print progress bar
-    Returns
-    -------
-    str - path to the downloaded file
-    """
-
-    if file is None:
-        if "?dl=" in url:
-            # is a dropbox link
-            prefile = url.split("/")[-1]
-            filename = str(prefile).split("?dl=")[0]
-        else:
-            filename = url.split("/")[-1]
-        file = clean_file_name(filename)
-    if dlpath is None:
-        dlpath = Path.cwd()  # save to current working directory
-    else:
-        dlpath = Path(dlpath)  # make a path object
-    r = requests.get(url, stream=True, allow_redirects=True)
-    total_size = int(r.headers.get("content-length"))
-    initial_pos = 0
-    dl_loc = dlpath / file
-    with open(str(dl_loc.resolve()), "wb") as f:
-        with tqdm(
-            total=total_size,
-            unit="B",
-            unit_scale=True,
-            desc=file,
-            initial=initial_pos,
-            ascii=True,
-        ) as pbar:
-            for ch in r.iter_content(chunk_size=1024):
-                if ch:
-                    f.write(ch)
-                    pbar.update(len(ch))
-    if verbose:
-        print(f"\ndownloaded {file} to {dlpath}\n")
-    return str(dl_loc.resolve())
-
-
 """## pdf2text functions

-- now uses **easyocr**
-- link to [docs](https://www.jaided.ai/easyocr/documentation/)
-- the [tutorial](https://www.jaided.ai/easyocr/tutorial/)
-- a list of available languages is [here](https://www.jaided.ai/easyocr/)
-
 """


-# need to run only once to load model into memory

 custom_replace_list = {
     "t0": "to",
@@ -440,10 +223,7 @@ replace_corr_exceptions = {
     " ,": ",",
 }

-# TODO: add logic to 'corr' function to not add space after period when surrounded
-# by numbers, example 5.6

-from spellchecker import SpellChecker

 spell = SpellChecker()

@@ -498,7 +278,18 @@ def eval_and_replace(text: str, match_token: str = "- ") -> str:
     return text


-def cleantxt_ocr(ugly_text):
+def cleantxt_ocr(ugly_text, lower=False, lang: str = "en") -> str:
+    """
+    cleantxt_ocr - clean text from OCR
+
+    Args:
+        ugly_text (str): text to clean
+        lower (bool, optional): _description_. Defaults to False.
+        lang (str, optional): _description_. Defaults to "en".
+
+    Returns:
+        str: cleaned text
+    """
     # a wrapper for clean text with options different than default

     # https://pypi.org/project/clean-text/
@@ -506,7 +297,7 @@ def cleantxt_ocr(ugly_text):
         ugly_text,
         fix_unicode=True,  # fix various unicode errors
         to_ascii=True,  # transliterate to closest ASCII representation
-        lower=False,  # lowercase text
+        lower=lower,  # lowercase text
         no_line_breaks=True,  # fully strip line breaks as opposed to only normalizing them
         no_urls=True,  # replace all URLs with a special token
         no_emails=True,  # replace all email addresses with a special token
@@ -522,7 +313,7 @@ def cleantxt_ocr(ugly_text):
         replace_with_number="<NUM>",
         replace_with_digit="0",
         replace_with_currency_symbol="<CUR>",
-        lang="en",  # set to 'de' for German special handling
+        lang=lang,  # set to 'de' for German special handling
     )

     return cleaned_text
@@ -547,8 +338,6 @@ def postprocess(text: str) -> str:
         proc = proc.replace(str(k), str(v))

     proc = corr(proc)
-    # TODO: upgrade corr() function to handle commas
-    # proc = proc.replace(" ,", ",")

     for k, v in replace_corr_exceptions.items():
         proc = proc.replace(str(k), str(v))
@@ -573,13 +362,9 @@ def result2text(result, as_text=False) -> str or list:
     return "\n".join(full_doc) if as_text else full_doc


-import warnings
-from datetime import date
-from os.path import join


-# @title define main fn - `convert_PDF_to_Text()`
-# @markdown `convert_PDF_to_Text(PDF_file, multilang=False, use_page_labels=False, saveloc="")`
+
 def convert_PDF_to_Text(
     PDF_file,
     ocr_model=None,
@@ -624,10 +409,8 @@ def convert_PDF_to_Text(
     return results_dict


-from os.path import basename, dirname, join

 # @title translation functions
-from libretranslatepy import LibreTranslateAPI

 lt = LibreTranslateAPI("https://translate.astian.org/")

@@ -666,17 +449,3 @@ def translate_doc(filepath, lang_start, lang_end="en", verbose=False):
     return out_path


-"""translation codes
-
-
-```
-
-
-print(lt.languages())
-call ^
-```
-
-- link to their github [here](https://github.com/argosopentech/LibreTranslate-py)
-
-# Load FIles
-"""
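
For reference, a minimal usage sketch of the module as it stands after this commit. This sketch is not part of the commit: the file name `example.pdf` is illustrative, `convert_PDF_to_Text()` may accept additional optional arguments, and the keys of its returned results dict are defined inside the function, which this diff does not show.

```python
# hypothetical usage sketch of the refactored pdf2text.py (not part of this commit)
from doctr.models import ocr_predictor

import pdf2text  # the module modified in this commit

# load a docTR OCR model once and reuse it for every PDF
ocr_model = ocr_predictor(det_arch="db_resnet50", reco_arch="crnn_vgg16_bn", pretrained=True)

# run OCR on a single PDF; convert_PDF_to_Text() returns a dict of results
results = pdf2text.convert_PDF_to_Text("example.pdf", ocr_model=ocr_model)
print(results.keys())  # the exact keys are defined inside convert_PDF_to_Text()

# clean a raw OCR string with the updated helper; lowercasing and language are now caller options
cleaned = pdf2text.cleantxt_ocr("Th1s   is s0me OCR text", lower=False, lang="en")
```

The new `lower` and `lang` parameters correspond to the hunks above, where the hard-coded `lower=False` and `lang="en"` arguments to `clean()` are replaced by values passed through from the caller.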