Spaces:

pszemraj
/

ballpark-trivia

Runtime error

App Files Files Community

peter szemraj committed on Dec 31, 2021

Commit

c566631

•

1 Parent(s): 47de588

:art: format code to black

Browse files

Files changed (3) hide show

app.py +11 -3
grammar_improve.py +88 -48
utils.py +0 -2

app.py CHANGED Viewed

@@ -27,7 +27,14 @@ import os
 import sys
 from os.path import dirname
 import nltk
-from grammar_improve import load_ns_checker, neuspell_correct, remove_repeated_words, remove_trailing_punctuation, build_symspell_obj, symspeller
 from scratch.grammar_tests import load_ns_checker, neuspell_correct
 from utils import (
@@ -112,7 +119,8 @@ def get_parser():
         description="submit a question, GPT model responds"
     )
     parser.add_argument(
-        "-m", "--model",
         required=False,
         type=str,
         default="ballpark-trivia-L",
@@ -170,4 +178,4 @@ if __name__ == "__main__":
     iface.launch(
         share=True,
         enable_queue=True,  # also allows for dealing with multiple users simultaneously (per newer gradio version)
-    )

 import sys
 from os.path import dirname
 import nltk
+from grammar_improve import (
+    load_ns_checker,
+    neuspell_correct,
+    remove_repeated_words,
+    remove_trailing_punctuation,
+    build_symspell_obj,
+    symspeller,
+)
 from scratch.grammar_tests import load_ns_checker, neuspell_correct
 from utils import (
         description="submit a question, GPT model responds"
     )
     parser.add_argument(
+        "-m",
+        "--model",
         required=False,
         type=str,
         default="ballpark-trivia-L",
     iface.launch(
         share=True,
         enable_queue=True,  # also allows for dealing with multiple users simultaneously (per newer gradio version)
+    )

grammar_improve.py CHANGED Viewed

@@ -14,7 +14,6 @@ import re
 from symspellpy.symspellpy import SymSpell
 def fix_punct_spaces(string):
     """
     fix_punct_spaces - replace spaces around punctuation with punctuation. For example, "hello , there" -> "hello, there"
@@ -28,9 +27,8 @@ def fix_punct_spaces(string):
     str, corrected string
     """
-    fix_spaces = re.compile(r'\s*([?!.,]+(?:\s+[?!.,]+)*)\s*')
-    string = fix_spaces.sub(lambda x: "{} ".format(
-        x.group(1).replace(" ", "")), string)
     return string.strip()
@@ -90,9 +88,16 @@ start of SymSpell code
 """
-def symspeller(my_string: str, sym_checker=None, max_dist: int = 3, prefix_length: int = 7,
-               ignore_non_words=True,
-               dictionary_path: str = None, bigram_path: str = None, verbose=False):
     """
     symspeller - a wrapper for the SymSpell class from symspellpy
@@ -110,13 +115,21 @@ def symspeller(my_string: str, sym_checker=None, max_dist: int = 3, prefix_lengt
         if verbose:
             print("creating new SymSpell object")
         sym_checker = build_symspell_obj(
-            edit_dist=max_dist, prefix_length=prefix_length, dictionary_path=dictionary_path, bigram_path=bigram_path,)
     else:
         if verbose:
             print("using existing SymSpell object")
     # max edit distance per lookup (per single word, not per whole input string)
     suggestions = sym_checker.lookup_compound(
-        my_string, max_edit_distance=max_dist, ignore_non_words=ignore_non_words, ignore_term_with_digits=True, transfer_casing=True,
     )
     if verbose:
@@ -132,7 +145,12 @@ def symspeller(my_string: str, sym_checker=None, max_dist: int = 3, prefix_lengt
         return first_result._term
-def build_symspell_obj(edit_dist=3, prefix_length=7, dictionary_path=None, bigram_path=None,):
     """
     build_symspell_obj [build a SymSpell object]
@@ -142,18 +160,27 @@ def build_symspell_obj(edit_dist=3, prefix_length=7, dictionary_path=None, bigra
     Returns:
         SymSpell: a SymSpell object
     """
-    dictionary_path = r"symspell_rsc/frequency_dictionary_en_82_765.txt" if dictionary_path is None else dictionary_path
-    bigram_path = r"symspell_rsc/frequency_bigramdictionary_en_243_342.txt" if bigram_path is None else bigram_path
     sym_checker = SymSpell(
-        max_dictionary_edit_distance=edit_dist, prefix_length=prefix_length)
     # term_index is the column of the term and count_index is the
     # column of the term frequency
     sym_checker.load_dictionary(dictionary_path, term_index=0, count_index=1)
-    sym_checker.load_bigram_dictionary(
-        bigram_path, term_index=0, count_index=2)
     return sym_checker
 """
 NEEDED FOR T5
 import torch
@@ -167,6 +194,7 @@ gc_model = T5ForConditionalGeneration.from_pretrained(model_name).to(torch_devic
 """
 def t5b_correction(prompt: str, korrektor, verbose=False, beams=4):
     """
     t5b_correction - correct a string using a text2textgen pipeline model from transformers
@@ -186,18 +214,19 @@ def t5b_correction(prompt: str, korrektor, verbose=False, beams=4):
     p_min_len = int(math.ceil(0.9 * len(prompt)))
     p_max_len = int(math.ceil(1.1 * len(prompt)))
     if verbose:
-        print(f'setting min to {p_min_len} and max to {p_max_len}\n')
-    gcorr_result = korrektor(f"grammar: {prompt}",
-                             return_text=True,
-                             clean_up_tokenization_spaces=True,
-                             num_beams=beams,
-                             max_length=p_max_len,
-                             repetition_penalty=1.3,
-                             length_penalty=0.2,
-                             no_repeat_ngram_size=3,
-                             )
     if verbose:
-        print(f'grammar correction result: \n\t{gcorr_result}\n')
     return gcorr_result
@@ -244,7 +273,7 @@ def load_ns_checker(customckr=None):
 def neuspell_correct(input_text: str, checker=None, verbose=False):
     """
-    neuspell_correct - correct a string using neuspell.
                         note that modifications to the checker are needed if doing list-based corrections
     Parameters
@@ -264,7 +293,7 @@ def neuspell_correct(input_text: str, checker=None, verbose=False):
     cleaned_txt = fix_punct_spaces(corrected)
     if verbose:
-        print(f'neuspell correction result: \n\t{cleaned_txt}\n')
     return cleaned_txt
@@ -310,11 +339,17 @@ def DLA_correct(qphrase: str):
         return " ".join(full_cor)
-def correct_grammar(input_text: str, tokenizer, model,
-                    n_results: int = 1,
-                    beams: int = 8,
-                    temp=1, uniq_ngrams=2, rep_penalty=1.5,
-                    device='cpu'):
     """
     correct_grammar - correct a string using a text2textgen pipeline model from transformers.
                         This function is an alternative to the t5b_correction function.
@@ -337,21 +372,26 @@ def correct_grammar(input_text: str, tokenizer, model,
     """
     if len(input_text) < 5:
         return input_text
-    max_length = min(int(math.ceil(len(input_text)*1.2)), 128)
-    batch = tokenizer([input_text], truncation=True,
-                      padding='max_length',
-                      max_length=max_length, return_tensors="pt").to(device)
-    translated = model.generate(**batch,
-                                max_length=max_length,
-                                min_length=min(10, len(input_text)),
-                                no_repeat_ngram_size=uniq_ngrams,
-                                repetition_penalty=rep_penalty,
-                                num_beams=beams,
-                                num_return_sequences=n_results,
-                                temperature=temp)
-    tgt_text = tokenizer.batch_decode(translated,
-                                      skip_special_tokens=True)
     if isinstance(tgt_text, list):
         return tgt_text[0]

 from symspellpy.symspellpy import SymSpell
 def fix_punct_spaces(string):
     """
     fix_punct_spaces - replace spaces around punctuation with punctuation. For example, "hello , there" -> "hello, there"
     str, corrected string
     """
+    fix_spaces = re.compile(r"\s*([?!.,]+(?:\s+[?!.,]+)*)\s*")
+    string = fix_spaces.sub(lambda x: "{} ".format(x.group(1).replace(" ", "")), string)
     return string.strip()
 """
+def symspeller(
+    my_string: str,
+    sym_checker=None,
+    max_dist: int = 3,
+    prefix_length: int = 7,
+    ignore_non_words=True,
+    dictionary_path: str = None,
+    bigram_path: str = None,
+    verbose=False,
+):
     """
     symspeller - a wrapper for the SymSpell class from symspellpy
         if verbose:
             print("creating new SymSpell object")
         sym_checker = build_symspell_obj(
+            edit_dist=max_dist,
+            prefix_length=prefix_length,
+            dictionary_path=dictionary_path,
+            bigram_path=bigram_path,
+        )
     else:
         if verbose:
             print("using existing SymSpell object")
     # max edit distance per lookup (per single word, not per whole input string)
     suggestions = sym_checker.lookup_compound(
+        my_string,
+        max_edit_distance=max_dist,
+        ignore_non_words=ignore_non_words,
+        ignore_term_with_digits=True,
+        transfer_casing=True,
     )
     if verbose:
         return first_result._term
+def build_symspell_obj(
+    edit_dist=3,
+    prefix_length=7,
+    dictionary_path=None,
+    bigram_path=None,
+):
     """
     build_symspell_obj [build a SymSpell object]
     Returns:
         SymSpell: a SymSpell object
     """
+    dictionary_path = (
+        r"symspell_rsc/frequency_dictionary_en_82_765.txt"
+        if dictionary_path is None
+        else dictionary_path
+    )
+    bigram_path = (
+        r"symspell_rsc/frequency_bigramdictionary_en_243_342.txt"
+        if bigram_path is None
+        else bigram_path
+    )
     sym_checker = SymSpell(
+        max_dictionary_edit_distance=edit_dist, prefix_length=prefix_length
+    )
     # term_index is the column of the term and count_index is the
     # column of the term frequency
     sym_checker.load_dictionary(dictionary_path, term_index=0, count_index=1)
+    sym_checker.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)
     return sym_checker
 """
 NEEDED FOR T5
 import torch
 """
 def t5b_correction(prompt: str, korrektor, verbose=False, beams=4):
     """
     t5b_correction - correct a string using a text2textgen pipeline model from transformers
     p_min_len = int(math.ceil(0.9 * len(prompt)))
     p_max_len = int(math.ceil(1.1 * len(prompt)))
     if verbose:
+        print(f"setting min to {p_min_len} and max to {p_max_len}\n")
+    gcorr_result = korrektor(
+        f"grammar: {prompt}",
+        return_text=True,
+        clean_up_tokenization_spaces=True,
+        num_beams=beams,
+        max_length=p_max_len,
+        repetition_penalty=1.3,
+        length_penalty=0.2,
+        no_repeat_ngram_size=3,
+    )
     if verbose:
+        print(f"grammar correction result: \n\t{gcorr_result}\n")
     return gcorr_result
 def neuspell_correct(input_text: str, checker=None, verbose=False):
     """
+    neuspell_correct - correct a string using neuspell.
                         note that modifications to the checker are needed if doing list-based corrections
     Parameters
     cleaned_txt = fix_punct_spaces(corrected)
     if verbose:
+        print(f"neuspell correction result: \n\t{cleaned_txt}\n")
     return cleaned_txt
         return " ".join(full_cor)
+def correct_grammar(
+    input_text: str,
+    tokenizer,
+    model,
+    n_results: int = 1,
+    beams: int = 8,
+    temp=1,
+    uniq_ngrams=2,
+    rep_penalty=1.5,
+    device="cpu",
+):
     """
     correct_grammar - correct a string using a text2textgen pipeline model from transformers.
                         This function is an alternative to the t5b_correction function.
     """
     if len(input_text) < 5:
         return input_text
+    max_length = min(int(math.ceil(len(input_text) * 1.2)), 128)
+    batch = tokenizer(
+        [input_text],
+        truncation=True,
+        padding="max_length",
+        max_length=max_length,
+        return_tensors="pt",
+    ).to(device)
+    translated = model.generate(
+        **batch,
+        max_length=max_length,
+        min_length=min(10, len(input_text)),
+        no_repeat_ngram_size=uniq_ngrams,
+        repetition_penalty=rep_penalty,
+        num_beams=beams,
+        num_return_sequences=n_results,
+        temperature=temp,
+    )
+    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
     if isinstance(tgt_text, list):
         return tgt_text[0]

utils.py CHANGED Viewed

@@ -39,7 +39,6 @@ def print_spacer(n=1):
     print("\n   --------    " * n)
 def fast_scandir(dirname: str):
     """
     fast_scandir [an os.path-based means to return all subfolders in a given filepath]
@@ -350,7 +349,6 @@ def dl_extract_zip(
     return extract_loc
 def cleantxt_wrap(ugly_text):
     """
     cleantxt_wrap - applies the clean function to a string.

     print("\n   --------    " * n)
 def fast_scandir(dirname: str):
     """
     fast_scandir [an os.path-based means to return all subfolders in a given filepath]
     return extract_loc
 def cleantxt_wrap(ugly_text):
     """
     cleantxt_wrap - applies the clean function to a string.