pszemraj committed
Commit 3927544
1 Parent(s): c006617

🎨 clean up code


Signed-off-by: peter szemraj <peterszemraj@gmail.com>

Files changed (2):
  1. app.py +21 -17
  2. pdf2text.py +40 -94
app.py CHANGED
@@ -78,11 +78,11 @@ def predict(
 def proc_submission(
     input_text: str,
     model_name: str,
-    num_beams,
-    token_batch_length,
-    length_penalty,
-    repetition_penalty,
-    no_repeat_ngram_size,
+    num_beams: int,
+    token_batch_length: int,
+    length_penalty: float,
+    repetition_penalty: float,
+    no_repeat_ngram_size: int,
     max_input_length: int = 1024,
 ):
     """
@@ -117,7 +117,7 @@ def proc_submission(
     history = {}
     clean_text = clean(input_text, lower=False)
     max_input_length = 2048 if "base" in model_name.lower() else max_input_length
-    processed = truncate_word_count(clean_text, max_input_length)
+    processed = truncate_word_count(clean_text, max_words=max_input_length)
 
     if processed["was_truncated"]:
         tr_in = processed["truncated_text"]
@@ -184,7 +184,7 @@ def proc_submission(
 
 def load_single_example_text(
     example_path: str or Path,
-    max_pages=20,
+    max_pages: int = 20,
 ) -> str:
     """
     load_single_example_text - loads a single example text file
@@ -279,13 +279,19 @@ if __name__ == "__main__":
     with gr.Row(variant="compact"):
         with gr.Column(scale=0.5, variant="compact"):
             model_name = gr.Dropdown(
-                choices=MODEL_OPTIONS, value=MODEL_OPTIONS[0], label="Model"
+                choices=MODEL_OPTIONS,
+                value=MODEL_OPTIONS[0],
+                label="Model Name",
             )
             num_beams = gr.Radio(
                 choices=[2, 3, 4],
                 label="Beam Search: # of Beams",
                 value=2,
             )
+            load_examples_button = gr.Button(
+                "Load Example in Dropdown",
+            )
+            load_file_button = gr.Button("Load an Uploaded File")
         with gr.Column(variant="compact"):
             example_name = gr.Dropdown(
                 _examples,
@@ -303,11 +309,6 @@ if __name__ == "__main__":
                 label="Input Text (for summarization)",
                 placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)",
             )
-        with gr.Column(min_width=100, scale=0.5):
-            load_examples_button = gr.Button(
-                "Load Example",
-            )
-            load_file_button = gr.Button("Upload File")
 
     with gr.Column():
         gr.Markdown("## Generate Summary")
@@ -332,7 +333,7 @@ if __name__ == "__main__":
             )
 
             text_file = gr.File(
-                label="Download Summary as Text File",
+                label="Download as Text File",
                 file_count="single",
                 type="file",
                 interactive=False,
@@ -342,7 +343,7 @@ if __name__ == "__main__":
     with gr.Column():
         gr.Markdown("### Advanced Settings")
         with gr.Row(variant="compact"):
-            length_penalty = gr.inputs.Slider(
+            length_penalty = gr.Slider(
                 minimum=0.5,
                 maximum=1.0,
                 label="length penalty",
@@ -356,7 +357,7 @@ if __name__ == "__main__":
             )
 
         with gr.Row(variant="compact"):
-            repetition_penalty = gr.inputs.Slider(
+            repetition_penalty = gr.Slider(
                 minimum=1.0,
                 maximum=5.0,
                 label="repetition penalty",
@@ -371,7 +372,10 @@ if __name__ == "__main__":
     with gr.Column():
         gr.Markdown("### About")
         gr.Markdown(
-            "These models are fine-tuned on the [BookSum dataset](https://arxiv.org/abs/2105.08209).The goal was to create a model that can generalize well and is useful in summarizing lots of text in academic and daily usage."
+            "- Models are fine-tuned on the [BookSum dataset](https://arxiv.org/abs/2105.08209). The goal was to create a model that generalizes well and is useful for summarizing text in academic and everyday use."
+        )
+        gr.Markdown(
+            "- _Update April 2023:_ Additional models fine-tuned on the [PLOS](https://huggingface.co/datasets/pszemraj/scientific_lay_summarisation-plos-norm) and [ELIFE](https://huggingface.co/datasets/pszemraj/scientific_lay_summarisation-elife-norm) subsets of the [scientific lay summaries](https://arxiv.org/abs/2210.09932) dataset are available (see dropdown at the top)."
         )
         gr.Markdown("---")
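Note on the `truncate_word_count` change: passing `max_words=` as a keyword rather than positionally guards against the argument binding to the wrong parameter if the helper's signature ever changes. The helper itself is not part of this diff; below is a minimal sketch of what it plausibly looks like, inferred only from the call site in `proc_submission` (the name and dict keys match the diff; the body is an assumption):

```python
# Hypothetical reconstruction of truncate_word_count, inferred from its call
# site in proc_submission; the real helper lives elsewhere in the repo.
def truncate_word_count(text: str, max_words: int = 1024) -> dict:
    """Keep at most max_words whitespace-delimited words of text."""
    words = text.split()
    result = {"was_truncated": len(words) > max_words}
    result["truncated_text"] = (
        " ".join(words[:max_words]) if result["was_truncated"] else text
    )
    return result
```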
 
 
381
 
pdf2text.py CHANGED
@@ -1,10 +1,15 @@
 # -*- coding: utf-8 -*-
 """
-
-    easyocr.py - A wrapper for easyocr to convert pdf to images to text
+    pdf2text.py - convert pdf files to text files using OCR
 """
-
 import logging
+import os
+import pprint as pp
+import re
+import shutil
+import time
+from datetime import date, datetime
+from os.path import basename, dirname, join
 from pathlib import Path
 
 logging.basicConfig(
@@ -14,25 +19,18 @@ logging.basicConfig(
 )
 
 
-import os
-import pprint as pp
-import re
-import shutil
-import time
-from datetime import date, datetime
-from os.path import basename, dirname, join
-from pathlib import Path
+os.environ["USE_TORCH"] = "1"
 
 from cleantext import clean
 from doctr.io import DocumentFile
 from doctr.models import ocr_predictor
 from libretranslatepy import LibreTranslateAPI
-from natsort import natsorted
 from spellchecker import SpellChecker
 from tqdm.auto import tqdm
 
 
 def simple_rename(filepath, target_ext=".txt"):
+    """simple_rename - get a new str to rename a file"""
     _fp = Path(filepath)
     basename = _fp.stem
     return f"OCR_{basename}_{target_ext}"
@@ -41,9 +39,6 @@ def simple_rename(filepath, target_ext=".txt"):
 def rm_local_text_files(name_contains="RESULT_"):
     """
     rm_local_text_files - remove local text files
-
-    Args:
-        name_contains (str, optional): [description]. Defaults to "OCR_".
     """
     files = [
         f
@@ -91,17 +86,12 @@ def corr(
     return s
 
 
-def fix_punct_spaces(string):
+def fix_punct_spaces(string: str) -> str:
     """
-    fix_punct_spaces - replace spaces around punctuation with punctuation. For example, "hello , there" -> "hello, there"
-
-    Parameters
-    ----------
-    string : str, required, input string to be corrected
+    fix_punct_spaces - fix spaces around punctuation
 
-    Returns
-    -------
-    str, corrected string
+    :param str string: input string
+    :return str: string with spaces fixed
     """
 
     fix_spaces = re.compile(r"\s*([?!.,]+(?:\s+[?!.,]+)*)\s*")
@@ -111,17 +101,12 @@ def fix_punct_spaces(string):
     return string.strip()
 
 
-def clean_OCR(ugly_text: str):
+def clean_OCR(ugly_text: str) -> str:
     """
-    clean_OCR - clean the OCR text files.
+    clean_OCR - clean up the OCR text
 
-    Parameters
-    ----------
-    ugly_text : str, required, input string to be cleaned
-
-    Returns
-    -------
-    str, cleaned string
+    :param str ugly_text: input text to be cleaned
+    :return str: cleaned text
     """
     # Remove all the newlines.
     cleaned_text = ugly_text.replace("\n", " ")
@@ -137,9 +122,12 @@ def clean_OCR(ugly_text: str):
     return fix_punct_spaces(cleaned_text)
 
 
-def move2completed(from_dir, filename, new_folder="completed", verbose=False):
-
-    # this is the better version
+def move2completed(
+    from_dir, filename, new_folder: str = "completed", verbose: bool = False
+):
+    """
+    move2completed - move a file to a new folder
+    """
     old_filepath = join(from_dir, filename)
 
     new_filedirectory = join(from_dir, new_folder)
@@ -161,11 +149,6 @@ def move2completed(from_dir, filename, new_folder="completed", verbose=False):
     )
 
 
-"""## pdf2text functions
-
-"""
-
-
 custom_replace_list = {
     "t0": "to",
     "'$": "'s",
@@ -239,17 +222,16 @@ def cleantxt_ocr(ugly_text, lower=False, lang: str = "en") -> str:
     """
     cleantxt_ocr - clean text from OCR
 
+    https://pypi.org/project/clean-text/
     Args:
         ugly_text (str): text to clean
-        lower (bool, optional): _description_. Defaults to False.
-        lang (str, optional): _description_. Defaults to "en".
+        lower (bool, optional): lowercase text. Defaults to False.
+        lang (str, optional): language of text. Defaults to "en".
 
     Returns:
         str: cleaned text
     """
-    # a wrapper for clean text with options different than default
 
-    # https://pypi.org/project/clean-text/
     cleaned_text = clean(
         ugly_text,
         fix_unicode=True,  # fix various unicode errors
@@ -258,18 +240,15 @@ def cleantxt_ocr(ugly_text, lower=False, lang: str = "en") -> str:
         no_line_breaks=True,  # fully strip line breaks as opposed to only normalizing them
         no_urls=True,  # replace all URLs with a special token
         no_emails=True,  # replace all email addresses with a special token
-        no_phone_numbers=False,  # replace all phone numbers with a special token
+        no_phone_numbers=True,  # replace all phone numbers with a special token
         no_numbers=False,  # replace all numbers with a special token
         no_digits=False,  # replace all digits with a special token
         no_currency_symbols=False,  # replace all currency symbols with a special token
         no_punct=False,  # remove punctuations
         replace_with_punct="",  # instead of removing punctuations you may replace them
-        replace_with_url="<URL>",
-        replace_with_email="<EMAIL>",
-        replace_with_phone_number="<PHONE>",
-        replace_with_number="<NUM>",
-        replace_with_digit="0",
-        replace_with_currency_symbol="<CUR>",
+        replace_with_url="this url",
+        replace_with_email="this email",
+        replace_with_phone_number="this phone number",
         lang=lang,  # set to 'de' for German special handling
     )
 
@@ -277,7 +256,7 @@ def cleantxt_ocr(ugly_text, lower=False, lang: str = "en") -> str:
 
 
 def format_ocr_out(OCR_data):
-
+    """format OCR output to text"""
     if isinstance(OCR_data, list):
         text = " ".join(OCR_data)
     else:
@@ -323,8 +302,15 @@ def convert_PDF_to_Text(
     PDF_file,
     ocr_model=None,
     max_pages: int = 20,
-):
+) -> str:
+    """
+    convert_PDF_to_Text - convert a PDF file to text
+
+    :param str PDF_file: path to PDF file
+    :param ocr_model: model to use for OCR, defaults to None (uses the default model)
+    :param int max_pages: maximum number of pages to process, defaults to 20
+    :return str: text from PDF
+    """
     st = time.perf_counter()
     PDF_file = Path(PDF_file)
     ocr_model = ocr_predictor(pretrained=True) if ocr_model is None else ocr_model
@@ -361,43 +347,3 @@ def convert_PDF_to_Text(
     }
 
     return results_dict
-
-
-# @title translation functions
-
-lt = LibreTranslateAPI("https://translate.astian.org/")
-
-
-def translate_text(text, source_l, target_l="en"):
-
-    return str(lt.translate(text, source_l, target_l))
-
-
-def translate_doc(filepath, lang_start, lang_end="en", verbose=False):
-    """translate a document from lang_start to lang_end
-
-    {'code': 'en', 'name': 'English'},
-    {'code': 'fr', 'name': 'French'},
-    {'code': 'de', 'name': 'German'},
-    {'code': 'it', 'name': 'Italian'},"""
-
-    src_folder = dirname(filepath)
-    src_folder = Path(src_folder)
-    trgt_folder = src_folder / f"translated_{lang_end}"
-    trgt_folder.mkdir(exist_ok=True)
-    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
-        foreign_t = f.readlines()
-    in_name = basename(filepath)
-    translated_doc = []
-    for line in tqdm(
-        foreign_t, total=len(foreign_t), desc="translating {}...".format(in_name[:10])
-    ):
-        translated_line = translate_text(line, lang_start, lang_end)
-        translated_doc.append(translated_line)
-    t_out_name = "[To {}]".format(lang_end) + simple_rename(in_name) + ".txt"
-    out_path = join(trgt_folder, t_out_name)
-    with open(out_path, "w", encoding="utf-8", errors="ignore") as f_o:
-        f_o.writelines(translated_doc)
-    if verbose:
-        print("finished translating the document! - ", datetime.now())
-    return out_path
 
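One behavioral note on `cleantxt_ocr`: the `replace_with_*` values change from markup-style tokens (`<URL>`, `<EMAIL>`, `<PHONE>`) to natural phrases, and `no_phone_numbers` flips to `True`, so cleaned OCR text reads as plain prose before it reaches the summarizer. A quick clean-text illustration (the sample string and commented output are illustrative):

```python
from cleantext import clean

sample = "Email me at jane@doe.com or see https://example.com for details."
cleaned = clean(
    sample,
    lower=False,
    no_urls=True,
    no_emails=True,
    replace_with_url="this url",
    replace_with_email="this email",
)
print(cleaned)  # roughly: "Email me at this email or see this url for details."
```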