Spaces:
Sleeping
Sleeping
🎨 clean up code
Browse files
Signed-off-by: peter szemraj <peterszemraj@gmail.com>
- app.py +21 -17
- pdf2text.py +40 -94
app.py
CHANGED
@@ -78,11 +78,11 @@ def predict(
|
|
78 |
def proc_submission(
|
79 |
input_text: str,
|
80 |
model_name: str,
|
81 |
-
num_beams,
|
82 |
-
token_batch_length,
|
83 |
-
length_penalty,
|
84 |
-
repetition_penalty,
|
85 |
-
no_repeat_ngram_size,
|
86 |
max_input_length: int = 1024,
|
87 |
):
|
88 |
"""
|
@@ -117,7 +117,7 @@ def proc_submission(
|
|
117 |
history = {}
|
118 |
clean_text = clean(input_text, lower=False)
|
119 |
max_input_length = 2048 if "base" in model_name.lower() else max_input_length
|
120 |
-
processed = truncate_word_count(clean_text, max_input_length)
|
121 |
|
122 |
if processed["was_truncated"]:
|
123 |
tr_in = processed["truncated_text"]
|
@@ -184,7 +184,7 @@ def proc_submission(
|
|
184 |
|
185 |
def load_single_example_text(
|
186 |
example_path: str or Path,
|
187 |
-
max_pages=20,
|
188 |
) -> str:
|
189 |
"""
|
190 |
load_single_example_text - loads a single example text file
|
@@ -279,13 +279,19 @@ if __name__ == "__main__":
|
|
279 |
with gr.Row(variant="compact"):
|
280 |
with gr.Column(scale=0.5, variant="compact"):
|
281 |
model_name = gr.Dropdown(
|
282 |
-
choices=MODEL_OPTIONS,
|
|
|
|
|
283 |
)
|
284 |
num_beams = gr.Radio(
|
285 |
choices=[2, 3, 4],
|
286 |
label="Beam Search: # of Beams",
|
287 |
value=2,
|
288 |
)
|
|
|
|
|
|
|
|
|
289 |
with gr.Column(variant="compact"):
|
290 |
example_name = gr.Dropdown(
|
291 |
_examples,
|
@@ -303,11 +309,6 @@ if __name__ == "__main__":
|
|
303 |
label="Input Text (for summarization)",
|
304 |
placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)",
|
305 |
)
|
306 |
-
with gr.Column(min_width=100, scale=0.5):
|
307 |
-
load_examples_button = gr.Button(
|
308 |
-
"Load Example",
|
309 |
-
)
|
310 |
-
load_file_button = gr.Button("Upload File")
|
311 |
|
312 |
with gr.Column():
|
313 |
gr.Markdown("## Generate Summary")
|
@@ -332,7 +333,7 @@ if __name__ == "__main__":
|
|
332 |
)
|
333 |
|
334 |
text_file = gr.File(
|
335 |
-
label="Download
|
336 |
file_count="single",
|
337 |
type="file",
|
338 |
interactive=False,
|
@@ -342,7 +343,7 @@ if __name__ == "__main__":
|
|
342 |
with gr.Column():
|
343 |
gr.Markdown("### Advanced Settings")
|
344 |
with gr.Row(variant="compact"):
|
345 |
-
length_penalty = gr.
|
346 |
minimum=0.5,
|
347 |
maximum=1.0,
|
348 |
label="length penalty",
|
@@ -356,7 +357,7 @@ if __name__ == "__main__":
|
|
356 |
)
|
357 |
|
358 |
with gr.Row(variant="compact"):
|
359 |
-
repetition_penalty = gr.
|
360 |
minimum=1.0,
|
361 |
maximum=5.0,
|
362 |
label="repetition penalty",
|
@@ -371,7 +372,10 @@ if __name__ == "__main__":
|
|
371 |
with gr.Column():
|
372 |
gr.Markdown("### About")
|
373 |
gr.Markdown(
|
374 |
-
"
|
|
|
|
|
|
|
375 |
)
|
376 |
gr.Markdown("---")
|
377 |
|
|
|
78 |
def proc_submission(
|
79 |
input_text: str,
|
80 |
model_name: str,
|
81 |
+
num_beams: int,
|
82 |
+
token_batch_length: int,
|
83 |
+
length_penalty: float,
|
84 |
+
repetition_penalty: float,
|
85 |
+
no_repeat_ngram_size: int,
|
86 |
max_input_length: int = 1024,
|
87 |
):
|
88 |
"""
|
|
|
117 |
history = {}
|
118 |
clean_text = clean(input_text, lower=False)
|
119 |
max_input_length = 2048 if "base" in model_name.lower() else max_input_length
|
120 |
+
processed = truncate_word_count(clean_text, max_words=max_input_length)
|
121 |
|
122 |
if processed["was_truncated"]:
|
123 |
tr_in = processed["truncated_text"]
|
|
|
184 |
|
185 |
def load_single_example_text(
|
186 |
example_path: str or Path,
|
187 |
+
max_pages: int = 20,
|
188 |
) -> str:
|
189 |
"""
|
190 |
load_single_example_text - loads a single example text file
|
|
|
279 |
with gr.Row(variant="compact"):
|
280 |
with gr.Column(scale=0.5, variant="compact"):
|
281 |
model_name = gr.Dropdown(
|
282 |
+
choices=MODEL_OPTIONS,
|
283 |
+
value=MODEL_OPTIONS[0],
|
284 |
+
label="Model Name",
|
285 |
)
|
286 |
num_beams = gr.Radio(
|
287 |
choices=[2, 3, 4],
|
288 |
label="Beam Search: # of Beams",
|
289 |
value=2,
|
290 |
)
|
291 |
+
load_examples_button = gr.Button(
|
292 |
+
"Load Example in Dropdown",
|
293 |
+
)
|
294 |
+
load_file_button = gr.Button("Load an Uploaded File")
|
295 |
with gr.Column(variant="compact"):
|
296 |
example_name = gr.Dropdown(
|
297 |
_examples,
|
|
|
309 |
label="Input Text (for summarization)",
|
310 |
placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)",
|
311 |
)
|
|
|
|
|
|
|
|
|
|
|
312 |
|
313 |
with gr.Column():
|
314 |
gr.Markdown("## Generate Summary")
|
|
|
333 |
)
|
334 |
|
335 |
text_file = gr.File(
|
336 |
+
label="Download as Text File",
|
337 |
file_count="single",
|
338 |
type="file",
|
339 |
interactive=False,
|
|
|
343 |
with gr.Column():
|
344 |
gr.Markdown("### Advanced Settings")
|
345 |
with gr.Row(variant="compact"):
|
346 |
+
length_penalty = gr.Slider(
|
347 |
minimum=0.5,
|
348 |
maximum=1.0,
|
349 |
label="length penalty",
|
|
|
357 |
)
|
358 |
|
359 |
with gr.Row(variant="compact"):
|
360 |
+
repetition_penalty = gr.Slider(
|
361 |
minimum=1.0,
|
362 |
maximum=5.0,
|
363 |
label="repetition penalty",
|
|
|
372 |
with gr.Column():
|
373 |
gr.Markdown("### About")
|
374 |
gr.Markdown(
|
375 |
+
"- Models are fine-tuned on the [BookSum dataset](https://arxiv.org/abs/2105.08209). The goal was to create a model that generalizes well and is useful for summarizing text in academic and everyday use."
|
376 |
+
)
|
377 |
+
gr.Markdown(
|
378 |
+
"- _Update April 2023:_ Additional models fine-tuned on the [PLOS](https://huggingface.co/datasets/pszemraj/scientific_lay_summarisation-plos-norm) and [ELIFE](https://huggingface.co/datasets/pszemraj/scientific_lay_summarisation-elife-norm) subsets of the [scientific lay summaries](https://arxiv.org/abs/2210.09932) dataset are available (see dropdown at the top)."
|
379 |
)
|
380 |
gr.Markdown("---")
|
381 |
|
pdf2text.py
CHANGED
@@ -1,10 +1,15 @@
|
|
1 |
# -*- coding: utf-8 -*-
|
2 |
"""
|
3 |
-
|
4 |
-
easyocr.py - A wrapper for easyocr to convert pdf to images to text
|
5 |
"""
|
6 |
-
|
7 |
import logging
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
from pathlib import Path
|
9 |
|
10 |
logging.basicConfig(
|
@@ -14,25 +19,18 @@ logging.basicConfig(
|
|
14 |
)
|
15 |
|
16 |
|
17 |
-
|
18 |
-
import pprint as pp
|
19 |
-
import re
|
20 |
-
import shutil
|
21 |
-
import time
|
22 |
-
from datetime import date, datetime
|
23 |
-
from os.path import basename, dirname, join
|
24 |
-
from pathlib import Path
|
25 |
|
26 |
from cleantext import clean
|
27 |
from doctr.io import DocumentFile
|
28 |
from doctr.models import ocr_predictor
|
29 |
from libretranslatepy import LibreTranslateAPI
|
30 |
-
from natsort import natsorted
|
31 |
from spellchecker import SpellChecker
|
32 |
from tqdm.auto import tqdm
|
33 |
|
34 |
|
35 |
def simple_rename(filepath, target_ext=".txt"):
|
|
|
36 |
_fp = Path(filepath)
|
37 |
basename = _fp.stem
|
38 |
return f"OCR_{basename}_{target_ext}"
|
@@ -41,9 +39,6 @@ def simple_rename(filepath, target_ext=".txt"):
|
|
41 |
def rm_local_text_files(name_contains="RESULT_"):
|
42 |
"""
|
43 |
rm_local_text_files - remove local text files
|
44 |
-
|
45 |
-
Args:
|
46 |
-
name_contains (str, optional): [description]. Defaults to "OCR_".
|
47 |
"""
|
48 |
files = [
|
49 |
f
|
@@ -91,17 +86,12 @@ def corr(
|
|
91 |
return s
|
92 |
|
93 |
|
94 |
-
def fix_punct_spaces(string):
|
95 |
"""
|
96 |
-
fix_punct_spaces -
|
97 |
-
|
98 |
-
Parameters
|
99 |
-
----------
|
100 |
-
string : str, required, input string to be corrected
|
101 |
|
102 |
-
|
103 |
-
|
104 |
-
str, corrected string
|
105 |
"""
|
106 |
|
107 |
fix_spaces = re.compile(r"\s*([?!.,]+(?:\s+[?!.,]+)*)\s*")
|
@@ -111,17 +101,12 @@ def fix_punct_spaces(string):
|
|
111 |
return string.strip()
|
112 |
|
113 |
|
114 |
-
def clean_OCR(ugly_text: str):
|
115 |
"""
|
116 |
-
clean_OCR - clean the OCR text
|
117 |
|
118 |
-
|
119 |
-
|
120 |
-
ugly_text : str, required, input string to be cleaned
|
121 |
-
|
122 |
-
Returns
|
123 |
-
-------
|
124 |
-
str, cleaned string
|
125 |
"""
|
126 |
# Remove all the newlines.
|
127 |
cleaned_text = ugly_text.replace("\n", " ")
|
@@ -137,9 +122,12 @@ def clean_OCR(ugly_text: str):
|
|
137 |
return fix_punct_spaces(cleaned_text)
|
138 |
|
139 |
|
140 |
-
def move2completed(
|
141 |
-
|
142 |
-
|
|
|
|
|
|
|
143 |
old_filepath = join(from_dir, filename)
|
144 |
|
145 |
new_filedirectory = join(from_dir, new_folder)
|
@@ -161,11 +149,6 @@ def move2completed(from_dir, filename, new_folder="completed", verbose=False):
|
|
161 |
)
|
162 |
|
163 |
|
164 |
-
"""## pdf2text functions
|
165 |
-
|
166 |
-
"""
|
167 |
-
|
168 |
-
|
169 |
custom_replace_list = {
|
170 |
"t0": "to",
|
171 |
"'$": "'s",
|
@@ -239,17 +222,16 @@ def cleantxt_ocr(ugly_text, lower=False, lang: str = "en") -> str:
|
|
239 |
"""
|
240 |
cleantxt_ocr - clean text from OCR
|
241 |
|
|
|
242 |
Args:
|
243 |
ugly_text (str): text to clean
|
244 |
-
lower (bool, optional):
|
245 |
-
lang (str, optional):
|
246 |
|
247 |
Returns:
|
248 |
str: cleaned text
|
249 |
"""
|
250 |
-
# a wrapper for clean text with options different than default
|
251 |
|
252 |
-
# https://pypi.org/project/clean-text/
|
253 |
cleaned_text = clean(
|
254 |
ugly_text,
|
255 |
fix_unicode=True, # fix various unicode errors
|
@@ -258,18 +240,15 @@ def cleantxt_ocr(ugly_text, lower=False, lang: str = "en") -> str:
|
|
258 |
no_line_breaks=True, # fully strip line breaks as opposed to only normalizing them
|
259 |
no_urls=True, # replace all URLs with a special token
|
260 |
no_emails=True, # replace all email addresses with a special token
|
261 |
-
no_phone_numbers=
|
262 |
no_numbers=False, # replace all numbers with a special token
|
263 |
no_digits=False, # replace all digits with a special token
|
264 |
no_currency_symbols=False, # replace all currency symbols with a special token
|
265 |
no_punct=False, # remove punctuations
|
266 |
replace_with_punct="", # instead of removing punctuations you may replace them
|
267 |
-
replace_with_url="
|
268 |
-
replace_with_email="
|
269 |
-
replace_with_phone_number="
|
270 |
-
replace_with_number="<NUM>",
|
271 |
-
replace_with_digit="0",
|
272 |
-
replace_with_currency_symbol="<CUR>",
|
273 |
lang=lang, # set to 'de' for German special handling
|
274 |
)
|
275 |
|
@@ -277,7 +256,7 @@ def cleantxt_ocr(ugly_text, lower=False, lang: str = "en") -> str:
|
|
277 |
|
278 |
|
279 |
def format_ocr_out(OCR_data):
|
280 |
-
|
281 |
if isinstance(OCR_data, list):
|
282 |
text = " ".join(OCR_data)
|
283 |
else:
|
@@ -323,8 +302,15 @@ def convert_PDF_to_Text(
|
|
323 |
PDF_file,
|
324 |
ocr_model=None,
|
325 |
max_pages: int = 20,
|
326 |
-
):
|
|
|
|
|
327 |
|
|
|
|
|
|
|
|
|
|
|
328 |
st = time.perf_counter()
|
329 |
PDF_file = Path(PDF_file)
|
330 |
ocr_model = ocr_predictor(pretrained=True) if ocr_model is None else ocr_model
|
@@ -361,43 +347,3 @@ def convert_PDF_to_Text(
|
|
361 |
}
|
362 |
|
363 |
return results_dict
|
364 |
-
|
365 |
-
|
366 |
-
# @title translation functions
|
367 |
-
|
368 |
-
lt = LibreTranslateAPI("https://translate.astian.org/")
|
369 |
-
|
370 |
-
|
371 |
-
def translate_text(text, source_l, target_l="en"):
|
372 |
-
|
373 |
-
return str(lt.translate(text, source_l, target_l))
|
374 |
-
|
375 |
-
|
376 |
-
def translate_doc(filepath, lang_start, lang_end="en", verbose=False):
|
377 |
-
"""translate a document from lang_start to lang_end
|
378 |
-
|
379 |
-
{'code': 'en', 'name': 'English'},
|
380 |
-
{'code': 'fr', 'name': 'French'},
|
381 |
-
{'code': 'de', 'name': 'German'},
|
382 |
-
{'code': 'it', 'name': 'Italian'},"""
|
383 |
-
|
384 |
-
src_folder = dirname(filepath)
|
385 |
-
src_folder = Path(src_folder)
|
386 |
-
trgt_folder = src_folder / f"translated_{lang_end}"
|
387 |
-
trgt_folder.mkdir(exist_ok=True)
|
388 |
-
with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
|
389 |
-
foreign_t = f.readlines()
|
390 |
-
in_name = basename(filepath)
|
391 |
-
translated_doc = []
|
392 |
-
for line in tqdm(
|
393 |
-
foreign_t, total=len(foreign_t), desc="translating {}...".format(in_name[:10])
|
394 |
-
):
|
395 |
-
translated_line = translate_text(line, lang_start, lang_end)
|
396 |
-
translated_doc.append(translated_line)
|
397 |
-
t_out_name = "[To {}]".format(lang_end) + simple_rename(in_name) + ".txt"
|
398 |
-
out_path = join(trgt_folder, t_out_name)
|
399 |
-
with open(out_path, "w", encoding="utf-8", errors="ignore") as f_o:
|
400 |
-
f_o.writelines(translated_doc)
|
401 |
-
if verbose:
|
402 |
-
print("finished translating the document! - ", datetime.now())
|
403 |
-
return out_path
|
|
|
1 |
# -*- coding: utf-8 -*-
|
2 |
"""
|
3 |
+
pdf2text.py - convert pdf files to text files using OCR
|
|
|
4 |
"""
|
|
|
5 |
import logging
|
6 |
+
import os
|
7 |
+
import pprint as pp
|
8 |
+
import re
|
9 |
+
import shutil
|
10 |
+
import time
|
11 |
+
from datetime import date, datetime
|
12 |
+
from os.path import basename, dirname, join
|
13 |
from pathlib import Path
|
14 |
|
15 |
logging.basicConfig(
|
|
|
19 |
)
|
20 |
|
21 |
|
22 |
+
os.environ["USE_TORCH"] = "1"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
from cleantext import clean
|
25 |
from doctr.io import DocumentFile
|
26 |
from doctr.models import ocr_predictor
|
27 |
from libretranslatepy import LibreTranslateAPI
|
|
|
28 |
from spellchecker import SpellChecker
|
29 |
from tqdm.auto import tqdm
|
30 |
|
31 |
|
32 |
def simple_rename(filepath, target_ext=".txt"):
|
33 |
+
"""simple_rename - get a new str to rename a file"""
|
34 |
_fp = Path(filepath)
|
35 |
basename = _fp.stem
|
36 |
return f"OCR_{basename}_{target_ext}"
|
|
|
39 |
def rm_local_text_files(name_contains="RESULT_"):
|
40 |
"""
|
41 |
rm_local_text_files - remove local text files
|
|
|
|
|
|
|
42 |
"""
|
43 |
files = [
|
44 |
f
|
|
|
86 |
return s
|
87 |
|
88 |
|
89 |
+
def fix_punct_spaces(string: str) -> str:
|
90 |
"""
|
91 |
+
fix_punct_spaces - fix spaces around punctuation
|
|
|
|
|
|
|
|
|
92 |
|
93 |
+
:param str string: input string
|
94 |
+
:return str: string with spaces fixed
|
|
|
95 |
"""
|
96 |
|
97 |
fix_spaces = re.compile(r"\s*([?!.,]+(?:\s+[?!.,]+)*)\s*")
|
|
|
101 |
return string.strip()
|
102 |
|
103 |
|
104 |
+
def clean_OCR(ugly_text: str) -> str:
|
105 |
"""
|
106 |
+
clean_OCR - clean up the OCR text
|
107 |
|
108 |
+
:param str ugly_text: input text to be cleaned
|
109 |
+
:return str: cleaned text
|
|
|
|
|
|
|
|
|
|
|
110 |
"""
|
111 |
# Remove all the newlines.
|
112 |
cleaned_text = ugly_text.replace("\n", " ")
|
|
|
122 |
return fix_punct_spaces(cleaned_text)
|
123 |
|
124 |
|
125 |
+
def move2completed(
|
126 |
+
from_dir, filename, new_folder: str = "completed", verbose: bool = False
|
127 |
+
):
|
128 |
+
"""
|
129 |
+
move2completed - move a file to a new folder
|
130 |
+
"""
|
131 |
old_filepath = join(from_dir, filename)
|
132 |
|
133 |
new_filedirectory = join(from_dir, new_folder)
|
|
|
149 |
)
|
150 |
|
151 |
|
|
|
|
|
|
|
|
|
|
|
152 |
custom_replace_list = {
|
153 |
"t0": "to",
|
154 |
"'$": "'s",
|
|
|
222 |
"""
|
223 |
cleantxt_ocr - clean text from OCR
|
224 |
|
225 |
+
https://pypi.org/project/clean-text/
|
226 |
Args:
|
227 |
ugly_text (str): text to clean
|
228 |
+
lower (bool, optional): lowercase text. Defaults to False.
|
229 |
+
lang (str, optional): language of text. Defaults to "en".
|
230 |
|
231 |
Returns:
|
232 |
str: cleaned text
|
233 |
"""
|
|
|
234 |
|
|
|
235 |
cleaned_text = clean(
|
236 |
ugly_text,
|
237 |
fix_unicode=True, # fix various unicode errors
|
|
|
240 |
no_line_breaks=True, # fully strip line breaks as opposed to only normalizing them
|
241 |
no_urls=True, # replace all URLs with a special token
|
242 |
no_emails=True, # replace all email addresses with a special token
|
243 |
+
no_phone_numbers=True, # replace all phone numbers with a special token
|
244 |
no_numbers=False, # replace all numbers with a special token
|
245 |
no_digits=False, # replace all digits with a special token
|
246 |
no_currency_symbols=False, # replace all currency symbols with a special token
|
247 |
no_punct=False, # remove punctuations
|
248 |
replace_with_punct="", # instead of removing punctuations you may replace them
|
249 |
+
replace_with_url="this url",
|
250 |
+
replace_with_email="this email",
|
251 |
+
replace_with_phone_number="this phone number",
|
|
|
|
|
|
|
252 |
lang=lang, # set to 'de' for German special handling
|
253 |
)
|
254 |
|
|
|
256 |
|
257 |
|
258 |
def format_ocr_out(OCR_data):
|
259 |
+
"""format OCR output to text"""
|
260 |
if isinstance(OCR_data, list):
|
261 |
text = " ".join(OCR_data)
|
262 |
else:
|
|
|
302 |
PDF_file,
|
303 |
ocr_model=None,
|
304 |
max_pages: int = 20,
|
305 |
+
) -> str:
|
306 |
+
"""
|
307 |
+
convert_PDF_to_Text - convert a PDF file to text
|
308 |
|
309 |
+
:param str PDF_file: path to PDF file
|
310 |
+
:param ocr_model: model to use for OCR, defaults to None (uses the default model)
|
311 |
+
:param int max_pages: maximum number of pages to process, defaults to 20
|
312 |
+
:return str: text from PDF
|
313 |
+
"""
|
314 |
st = time.perf_counter()
|
315 |
PDF_file = Path(PDF_file)
|
316 |
ocr_model = ocr_predictor(pretrained=True) if ocr_model is None else ocr_model
|
|
|
347 |
}
|
348 |
|
349 |
return results_dict
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|