pszemraj committed on
Commit
684bb04
1 Parent(s): 58b2281

Signed-off-by: peter szemraj <peterszemraj@gmail.com>

Files changed (1) hide show
  1. pdf2text.py +11 -43
pdf2text.py CHANGED
@@ -32,56 +32,24 @@ from spellchecker import SpellChecker
32
  from tqdm.auto import tqdm
33
 
34
 
35
- def fast_scandir(dirname):
36
- # return all subfolders in a given filepath
37
-
38
- subfolders = [f.path for f in os.scandir(dirname) if f.is_dir()]
39
- for dirname in list(subfolders):
40
- subfolders.extend(fast_scandir(dirname))
41
- return subfolders # list
42
-
43
-
44
- def create_folder(directory):
45
- os.makedirs(directory, exist_ok=True)
46
-
47
 
48
  def simple_rename(filepath, target_ext=".txt"):
49
  _fp = Path(filepath)
50
  basename = _fp.stem
51
  return f"OCR_{basename}_{target_ext}"
52
 
 
 
 
53
 
54
- def load_dir_files(directory, req_extension=".txt", return_type="list", verbose=False):
55
- appr_files = []
56
- # r=root, d=directories, f = files
57
- for r, d, f in os.walk(directory):
58
- for prefile in f:
59
- if prefile.endswith(req_extension):
60
- fullpath = os.path.join(r, prefile)
61
- appr_files.append(fullpath)
62
-
63
- appr_files = natsorted(appr_files)
64
-
65
- if verbose:
66
- print("A list of files in the {} directory are: \n".format(directory))
67
- if len(appr_files) < 10:
68
- pp.pprint(appr_files)
69
- else:
70
- pp.pprint(appr_files[:10])
71
- print("\n and more. There are a total of {} files".format(len(appr_files)))
72
-
73
- if return_type.lower() == "list":
74
- return appr_files
75
- else:
76
- if verbose:
77
- print("returning dictionary")
78
-
79
- appr_file_dict = {}
80
- for this_file in appr_files:
81
- appr_file_dict[basename(this_file)] = this_file
82
-
83
- return appr_file_dict
84
-
85
 
86
  def corr(
87
  s: str,
 
32
  from tqdm.auto import tqdm
33
 
34
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  def simple_rename(filepath, target_ext=".txt"):
37
  _fp = Path(filepath)
38
  basename = _fp.stem
39
  return f"OCR_{basename}_{target_ext}"
40
 
41
+ def rm_local_text_files(name_contains="RESULT_"):
42
+ """
43
+ rm_local_text_files - remove .txt files in the current working directory whose names contain `name_contains`
44
 
45
+ Args:
46
+ name_contains (str, optional): substring a file name must contain for the file to be removed. Defaults to "RESULT_".
47
+ """
48
+ files = [f for f in Path.cwd().iterdir() if f.is_file() and f.suffix == '.txt' and name_contains in f.name]
49
+ logging.info(f"removing {len(files)} text files")
50
+ for f in files:
51
+ os.remove(f)
52
+ logging.info("done")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
  def corr(
55
  s: str,