peter szemraj committed on
Commit
2830ef7
1 Parent(s): 63bb54c

:construction: adding features for using symspell as a checker

Browse files
Files changed (2) hide show
  1. app.py +15 -5
  2. utils.py +47 -18
app.py CHANGED
@@ -33,6 +33,8 @@ from utils import (
33
  remove_trailing_punctuation,
34
  cleantxt_wrap,
35
  corr,
 
 
36
  )
37
 
38
  nltk.download("stopwords") # TODO: find where this requirement originates from
@@ -44,7 +46,7 @@ cwd = Path.cwd()
44
  my_cwd = str(cwd.resolve()) # string so it can be passed to os.path() objects
45
 
46
 
47
- def gramformer_correct(corrector, qphrase: str):
48
  """
49
  gramformer_correct - correct a string using a text2textgen pipeline model from transformers
50
  Args:
@@ -58,8 +60,8 @@ def gramformer_correct(corrector, qphrase: str):
58
  clean(qphrase), return_text=True, clean_up_tokenization_spaces=True
59
  )
60
  return corrected[0]["generated_text"]
61
- except:
62
- print("NOTE - failed to correct with gramformer")
63
  return clean(qphrase)
64
 
65
 
@@ -86,8 +88,8 @@ def ask_gpt(message: str):
86
  temp=0.75,
87
  top_p=0.65,
88
  )
89
- uniques = remove_repeated_words(resp["out_text"])
90
- bot_resp = corr((uniques))
91
  rt = round(time.perf_counter() - st, 2)
92
  print(f"took {rt} sec to respond")
93
  return remove_trailing_punctuation(bot_resp)
@@ -134,6 +136,13 @@ def get_parser():
134
  help="folder - with respect to git directory of your repo that has the model files in it (pytorch.bin + "
135
  "config.json)",
136
  )
 
 
 
 
 
 
 
137
  parser.add_argument(
138
  "--gram-model",
139
  required=False,
@@ -151,6 +160,7 @@ if __name__ == "__main__":
151
  model_loc = cwd.parent / default_model
152
  model_loc = str(model_loc.resolve())
153
  gram_model = args.gram_model
 
154
  print(f"using model stored here: \n {model_loc} \n")
155
  iface = gr.Interface(
156
  chat,
 
33
  remove_trailing_punctuation,
34
  cleantxt_wrap,
35
  corr,
36
+ build_symspell_obj,
37
+ symspeller,
38
  )
39
 
40
  nltk.download("stopwords") # TODO: find where this requirement originates from
 
46
  my_cwd = str(cwd.resolve()) # string so it can be passed to os.path() objects
47
 
48
 
49
+ def grammarpipe(corrector, qphrase: str):
50
  """
51
  gramformer_correct - correct a string using a text2textgen pipeline model from transformers
52
  Args:
 
60
  clean(qphrase), return_text=True, clean_up_tokenization_spaces=True
61
  )
62
  return corrected[0]["generated_text"]
63
+ except Exception as e:
64
+ print(f"NOTE - failed to correct with grammarpipe:\n {e}")
65
  return clean(qphrase)
66
 
67
 
 
88
  temp=0.75,
89
  top_p=0.65,
90
  )
91
+ cln_resp = symspeller(resp["out_text"], sym_checker=schnellspell)
92
+ bot_resp = corr(remove_repeated_words(cln_resp))
93
  rt = round(time.perf_counter() - st, 2)
94
  print(f"took {rt} sec to respond")
95
  return remove_trailing_punctuation(bot_resp)
 
136
  help="folder - with respect to git directory of your repo that has the model files in it (pytorch.bin + "
137
  "config.json)",
138
  )
139
+ parser.add_argument(
140
+ "--adv-correct",
141
+ required=False,
142
+ default=False,
143
+ action="store_true",
144
+ help="turn off symspell (baseline) correction to use a more advanced spell checker",
145
+ )
146
  parser.add_argument(
147
  "--gram-model",
148
  required=False,
 
160
  model_loc = cwd.parent / default_model
161
  model_loc = str(model_loc.resolve())
162
  gram_model = args.gram_model
163
+ schnellspell = build_symspell_obj()
164
  print(f"using model stored here: \n {model_loc} \n")
165
  iface = gr.Interface(
166
  chat,
utils.py CHANGED
@@ -39,37 +39,66 @@ def print_spacer(n=1):
39
  print("\n -------- " * n)
40
 
41
 
42
- def correct_phrase_load(my_string: str):
43
  """
44
- correct_phrase_load [basic / unoptimized implementation of SymSpell to correct a string]
45
 
46
  Args:
47
- my_string (str): [text to be corrected]
48
 
49
  Returns:
50
- str: the corrected string
51
  """
52
- sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
53
-
54
- dictionary_path = (
55
- r"symspell_rsc/frequency_dictionary_en_82_765.txt" # from repo root
56
- )
57
- bigram_path = (
58
- r"symspell_rsc/frequency_bigramdictionary_en_243_342.txt" # from repo root
59
- )
60
  # term_index is the column of the term and count_index is the
61
  # column of the term frequency
62
- sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
63
- sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)
 
 
 
64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  # max edit distance per lookup (per single word, not per whole input string)
66
- suggestions = sym_spell.lookup_compound(
67
- clean(my_string), max_edit_distance=2, ignore_non_words=True
68
  )
 
 
 
 
 
 
 
69
  if len(suggestions) < 1:
70
- return my_string
71
  else:
72
- first_result = suggestions[0]
73
  return first_result._term
74
 
75
 
 
39
  print("\n -------- " * n)
40
 
41
 
42
def build_symspell_obj(
    edit_dist=3,
    prefix_length=7,
    dictionary_path=None,
    bigram_path=None,
):
    """
    build_symspell_obj - create a SymSpell checker with dictionaries loaded.

    Args:
        edit_dist (int, optional): max dictionary edit distance. Defaults to 3.
        prefix_length (int, optional): SymSpell prefix length. Defaults to 7.
        dictionary_path (str, optional): path to a unigram frequency dictionary.
            Defaults to the bundled file under symspell_rsc/ (relative to repo root).
        bigram_path (str, optional): path to a bigram frequency dictionary.
            Defaults to the bundled file under symspell_rsc/ (relative to repo root).

    Returns:
        SymSpell: an initialized SymSpell object ready for lookups
    """
    # fall back to the dictionaries shipped with the repo
    if dictionary_path is None:
        dictionary_path = r"symspell_rsc/frequency_dictionary_en_82_765.txt"
    if bigram_path is None:
        bigram_path = r"symspell_rsc/frequency_bigramdictionary_en_243_342.txt"

    sym_checker = SymSpell(
        max_dictionary_edit_distance=edit_dist,
        prefix_length=prefix_length,
    )
    # term_index is the column of the term and count_index is the
    # column of the term frequency
    sym_checker.load_dictionary(dictionary_path, term_index=0, count_index=1)
    sym_checker.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

    return sym_checker
62
+
63
 
64
def symspeller(
    my_string: str,
    sym_checker=None,
    max_dist: int = 3,
    prefix_length: int = 7,
    ignore_non_words=True,
    dictionary_path: str = None,
    bigram_path: str = None,
    verbose=False,
):
    """
    symspeller - a wrapper for the SymSpell class from symspellpy

    Parameters
    ----------
    my_string : str, required, the string to be checked
    sym_checker : SymSpell, optional, default=None, the SymSpell object to use;
        a new one is built (honoring dictionary_path/bigram_path) when None
    max_dist : int, optional, default=3, the maximum edit distance per word
    prefix_length : int, optional, default=7, prefix length for a new SymSpell
    ignore_non_words : bool, optional, default=True, skip non-word tokens
    dictionary_path : str, optional, default=None, custom unigram dictionary file
    bigram_path : str, optional, default=None, custom bigram dictionary file
    verbose : bool, optional, default=False, print diagnostic information

    Returns
    -------
    str : the corrected string, or the cleaned input if no suggestions found

    Raises
    ------
    ValueError : if my_string is empty
    """
    # explicit validation: `assert` would be stripped under `python -O`
    if not my_string:
        raise ValueError("entered string for correction is empty")

    if sym_checker is None:
        # need to create a new class object. user can specify their own
        # dictionary and bigram files
        if verbose:
            print("creating new SymSpell object")
        sym_checker = build_symspell_obj(
            edit_dist=max_dist,
            prefix_length=prefix_length,
            dictionary_path=dictionary_path,
            bigram_path=bigram_path,
        )
    elif verbose:
        print("using existing SymSpell object")

    # max edit distance per lookup (per single word, not per whole input string)
    suggestions = sym_checker.lookup_compound(
        my_string,
        max_edit_distance=max_dist,
        ignore_non_words=ignore_non_words,
        ignore_term_with_digits=True,
        transfer_casing=True,
    )

    if verbose:
        print(f"{len(suggestions)} suggestions found")
        print(f"the original string is:\n\t{my_string}")
        sug_list = [sug.term for sug in suggestions]
        print(f"suggestions:\n\t{sug_list}\n")

    if len(suggestions) < 1:
        return clean(my_string)  # no correction because no suggestions
    # first result is the most likely; use the public `.term` attribute
    # rather than the private `_term`
    return suggestions[0].term
103
 
104