Spaces:
Runtime error
Runtime error
peter szemraj
committed on
Commit
•
7b10a08
1
Parent(s):
cfd67f6
:truck: consolidate grammar-related functions to one file
Browse files- grammar_improve.py +123 -4
- utils.py +0 -112
grammar_improve.py
CHANGED
@@ -10,7 +10,7 @@ import math
|
|
10 |
from cleantext import clean
|
11 |
import time
|
12 |
import re
|
13 |
-
|
14 |
|
15 |
|
16 |
def fix_punct_spaces(string):
|
@@ -45,6 +45,126 @@ def split_sentences(text: str):
|
|
45 |
return re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", text)
|
46 |
|
47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
def t5b_correction(prompt: str, korrektor, verbose=False, beams=4):
|
49 |
"""
|
50 |
t5b_correction - correct a string using a text2textgen pipeline model from transformers
|
@@ -100,7 +220,7 @@ def disp_neuspell_chkrs():
|
|
100 |
|
101 |
def load_ns_checker(customckr=None):
|
102 |
"""
|
103 |
-
load_ns_checker - helper function, load a neuspell checker from huggingface transformers
|
104 |
|
105 |
Args:
|
106 |
customckr (neuspell.NeuSpell): [neuspell checker object], optional, if not provided, will load the default checker
|
@@ -211,8 +331,7 @@ def correct_grammar(input_text: str, tokenizer, model,
|
|
211 |
|
212 |
Returns
|
213 |
-------
|
214 |
-
|
215 |
-
[description]
|
216 |
"""
|
217 |
if len(input_text) < 5:
|
218 |
return input_text
|
|
|
10 |
from cleantext import clean
|
11 |
import time
|
12 |
import re
|
13 |
+
|
14 |
|
15 |
|
16 |
def fix_punct_spaces(string):
|
|
|
45 |
return re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", text)
|
46 |
|
47 |
|
48 |
+
def remove_repeated_words(bot_response):
    """
    remove_repeated_words - remove repeated words from a string, returning only the first instance of each word

    Parameters
    ----------
    bot_response : str
        string to remove repeated words from

    Returns
    -------
    str
        string containing the first instance of each word, in original order
    """
    # dict.fromkeys preserves insertion order (Python 3.7+) and gives O(1)
    # membership checks, replacing the original O(n^2) list-membership loop.
    # Comparison is exact and case-sensitive, matching the original behavior.
    return " ".join(dict.fromkeys(bot_response.split()))
|
68 |
+
|
69 |
+
|
70 |
+
def remove_trailing_punctuation(text: str, fuLL_strip=False):
    """
    remove_trailing_punctuation - strip punctuation from the ends of a string.
    Purpose is to seem more natural to end users.

    NOTE: str.strip removes the characters from the *leading* end as well as
    the trailing end. Parameter name `fuLL_strip` (typo) is kept as-is for
    backward compatibility with existing callers.

    Args:
        text (str): string to be cleaned
        fuLL_strip (bool, optional): also strip '?' and '!' in addition to
            '.,;:'. Defaults to False.

    Returns:
        str: cleaned string
    """
    strip_chars = "?!.,;:" if fuLL_strip else ".,;:"
    return text.strip(strip_chars)
|
84 |
+
|
85 |
+
|
86 |
+
"""
|
87 |
+
start of SymSpell code
|
88 |
+
"""
|
89 |
+
|
90 |
+
|
91 |
+
def symspeller(my_string: str, sym_checker=None, max_dist: int = 3, prefix_length: int = 7,
               ignore_non_words=True,
               dictionary_path: str = None, bigram_path: str = None, verbose=False):
    """
    symspeller - a wrapper for the SymSpell class from symspellpy

    Parameters
    ----------
    my_string : str, required, the string to be checked (must be non-empty)
    sym_checker : SymSpell, optional, default=None, the SymSpell object to use;
        a new one is built via build_symspell_obj() if not provided
    max_dist : int, optional, default=3, the maximum edit distance to look for replacements
    prefix_length : int, optional, default=7, prefix length used when building a new checker
    ignore_non_words : bool, optional, default=True, passed through to lookup_compound
    dictionary_path : str, optional, custom unigram dictionary for a new checker
    bigram_path : str, optional, custom bigram dictionary for a new checker
    verbose : bool, optional, default=False, print progress and suggestion info

    Returns
    -------
    str : the corrected string (cleaned original if no suggestions were found)

    Raises
    ------
    ValueError : if my_string is empty
    """
    # raise instead of assert: assertions are stripped under `python -O`,
    # which would silently disable this input validation
    if len(my_string) < 1:
        raise ValueError("entered string for correction is empty")

    if sym_checker is None:
        # need to create a new class object. user can specify their own dictionary and bigram files
        if verbose:
            print("creating new SymSpell object")
        sym_checker = build_symspell_obj(
            edit_dist=max_dist, prefix_length=prefix_length, dictionary_path=dictionary_path, bigram_path=bigram_path,)
    else:
        if verbose:
            print("using existing SymSpell object")
    # max edit distance per lookup (per single word, not per whole input string)
    suggestions = sym_checker.lookup_compound(
        my_string, max_edit_distance=max_dist, ignore_non_words=ignore_non_words, ignore_term_with_digits=True, transfer_casing=True,
    )

    if verbose:
        print(f"{len(suggestions)} suggestions found")
        print(f"the original string is:\n\t{my_string}")
        sug_list = [sug.term for sug in suggestions]
        print(f"suggestions:\n\t{sug_list}\n")

    if len(suggestions) < 1:
        return clean(my_string)  # no correction because no suggestions
    else:
        first_result = suggestions[0]  # first result is the most likely
        # use the public .term attribute (as the verbose path above already
        # does) instead of the private ._term
        return first_result.term
|
131 |
+
|
132 |
+
|
133 |
+
def build_symspell_obj(edit_dist=3, prefix_length=7, dictionary_path=None, bigram_path=None,):
    """
    build_symspell_obj - construct and initialize a SymSpell checker object.

    Args:
        edit_dist (int, optional): max dictionary edit distance. Defaults to 3.
        prefix_length (int, optional): prefix length for the SymSpell index. Defaults to 7.
        dictionary_path (str, optional): path to a unigram frequency dictionary;
            falls back to the bundled symspell_rsc file when None.
        bigram_path (str, optional): path to a bigram frequency dictionary;
            falls back to the bundled symspell_rsc file when None.

    Returns:
        SymSpell: a SymSpell object with both dictionaries loaded
    """
    if dictionary_path is None:
        dictionary_path = r"symspell_rsc/frequency_dictionary_en_82_765.txt"
    if bigram_path is None:
        bigram_path = r"symspell_rsc/frequency_bigramdictionary_en_243_342.txt"

    checker = SymSpell(
        max_dictionary_edit_distance=edit_dist, prefix_length=prefix_length)
    # term_index is the column of the term and count_index is the column of
    # the term frequency in each dictionary file
    checker.load_dictionary(dictionary_path, term_index=0, count_index=1)
    checker.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

    return checker
|
154 |
+
|
155 |
+
"""
|
156 |
+
NEEDED FOR T5
|
157 |
+
import torch
|
158 |
+
from transformers import T5Tokenizer, T5ForConditionalGeneration
|
159 |
+
|
160 |
+
model_name = 'deep-learning-analytics/GrammarCorrector'
|
161 |
+
# torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
162 |
+
torch_device = 'cpu'
|
163 |
+
gc_tokenizer = T5Tokenizer.from_pretrained(model_name)
|
164 |
+
gc_model = T5ForConditionalGeneration.from_pretrained(model_name).to(torch_device)
|
165 |
+
|
166 |
+
"""
|
167 |
+
|
168 |
def t5b_correction(prompt: str, korrektor, verbose=False, beams=4):
|
169 |
"""
|
170 |
t5b_correction - correct a string using a text2textgen pipeline model from transformers
|
|
|
220 |
|
221 |
def load_ns_checker(customckr=None):
|
222 |
"""
|
223 |
+
load_ns_checker - helper function, load / "set up" a neuspell checker from huggingface transformers
|
224 |
|
225 |
Args:
|
226 |
customckr (neuspell.NeuSpell): [neuspell checker object], optional, if not provided, will load the default checker
|
|
|
331 |
|
332 |
Returns
|
333 |
-------
|
334 |
+
str, corrected string (or list of strings if n_results > 1)
|
|
|
335 |
"""
|
336 |
if len(input_text) < 5:
|
337 |
return input_text
|
utils.py
CHANGED
@@ -39,68 +39,6 @@ def print_spacer(n=1):
|
|
39 |
print("\n -------- " * n)
|
40 |
|
41 |
|
42 |
-
def build_symspell_obj(edit_dist=3, prefix_length=7, dictionary_path=None, bigram_path=None,):
|
43 |
-
"""
|
44 |
-
build_symspell_obj [build a SymSpell object]
|
45 |
-
|
46 |
-
Args:
|
47 |
-
verbose (bool, optional): Defaults to False.
|
48 |
-
|
49 |
-
Returns:
|
50 |
-
SymSpell: a SymSpell object
|
51 |
-
"""
|
52 |
-
dictionary_path = r"symspell_rsc/frequency_dictionary_en_82_765.txt" if dictionary_path is None else dictionary_path
|
53 |
-
bigram_path = r"symspell_rsc/frequency_bigramdictionary_en_243_342.txt" if bigram_path is None else bigram_path
|
54 |
-
sym_checker = SymSpell(
|
55 |
-
max_dictionary_edit_distance=edit_dist, prefix_length=prefix_length)
|
56 |
-
# term_index is the column of the term and count_index is the
|
57 |
-
# column of the term frequency
|
58 |
-
sym_checker.load_dictionary(dictionary_path, term_index=0, count_index=1)
|
59 |
-
sym_checker.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)
|
60 |
-
|
61 |
-
return sym_checker
|
62 |
-
|
63 |
-
|
64 |
-
def symspeller(my_string: str, sym_checker=None, max_dist: int = 3, prefix_length: int = 7,
|
65 |
-
ignore_non_words=True,
|
66 |
-
dictionary_path:str=None, bigram_path:str=None, verbose=False):
|
67 |
-
"""
|
68 |
-
symspeller - a wrapper for the SymSpell class from symspellpy
|
69 |
-
|
70 |
-
Parameters
|
71 |
-
----------
|
72 |
-
my_string : str, required, default=None, the string to be checked
|
73 |
-
sym_checker : SymSpell, optional, default=None, the SymSpell object to use
|
74 |
-
max_dist : int, optional, default=3, the maximum distance to look for replacements
|
75 |
-
"""
|
76 |
-
|
77 |
-
assert len(my_string) > 0, "entered string for correction is empty"
|
78 |
-
|
79 |
-
if sym_checker is None:
|
80 |
-
# need to create a new class object. user can specify their own dictionary and bigram files
|
81 |
-
if verbose:
|
82 |
-
print("creating new SymSpell object")
|
83 |
-
sym_checker = build_symspell_obj(
|
84 |
-
edit_dist=max_dist, prefix_length=prefix_length, dictionary_path=dictionary_path, bigram_path=bigram_path,)
|
85 |
-
else:
|
86 |
-
if verbose: print("using existing SymSpell object")
|
87 |
-
# max edit distance per lookup (per single word, not per whole input string)
|
88 |
-
suggestions = sym_checker.lookup_compound(
|
89 |
-
my_string, max_edit_distance=max_dist, ignore_non_words=ignore_non_words, ignore_term_with_digits=True, transfer_casing=True,
|
90 |
-
)
|
91 |
-
|
92 |
-
if verbose:
|
93 |
-
print(f"{len(suggestions)} suggestions found")
|
94 |
-
print(f"the original string is:\n\t{my_string}")
|
95 |
-
sug_list = [sug.term for sug in suggestions]
|
96 |
-
print(f"suggestions:\n\t{sug_list}\n")
|
97 |
-
|
98 |
-
if len(suggestions) < 1:
|
99 |
-
return clean(my_string) # no correction because no suggestions
|
100 |
-
else:
|
101 |
-
first_result = suggestions[0] # first result is the most likely
|
102 |
-
return first_result._term
|
103 |
-
|
104 |
|
105 |
def fast_scandir(dirname: str):
|
106 |
"""
|
@@ -412,56 +350,6 @@ def dl_extract_zip(
|
|
412 |
return extract_loc
|
413 |
|
414 |
|
415 |
-
def remove_repeated_words(bot_response):
|
416 |
-
"""
|
417 |
-
remove_repeated_words - remove repeated words from a string, returning only the first instance of each word
|
418 |
-
|
419 |
-
Parameters
|
420 |
-
----------
|
421 |
-
bot_response : str
|
422 |
-
string to remove repeated words from
|
423 |
-
|
424 |
-
Returns
|
425 |
-
-------
|
426 |
-
str
|
427 |
-
string containing the first instance of each word
|
428 |
-
"""
|
429 |
-
words = bot_response.split()
|
430 |
-
unique_words = []
|
431 |
-
for word in words:
|
432 |
-
if word not in unique_words:
|
433 |
-
unique_words.append(word)
|
434 |
-
return " ".join(unique_words)
|
435 |
-
|
436 |
-
|
437 |
-
def remove_trailing_punctuation(text: str, fuLL_strip=False):
|
438 |
-
"""
|
439 |
-
remove_trailing_punctuation - remove trailing punctuation from a string
|
440 |
-
|
441 |
-
Args:
|
442 |
-
text (str): [string to be cleaned]
|
443 |
-
|
444 |
-
Returns:
|
445 |
-
[str]: [cleaned string]
|
446 |
-
"""
|
447 |
-
if fuLL_strip:
|
448 |
-
return text.strip("?!.,;:")
|
449 |
-
else:
|
450 |
-
return text.strip(".,;:")
|
451 |
-
|
452 |
-
|
453 |
-
def split_sentences(text: str):
|
454 |
-
"""
|
455 |
-
split_sentences - split a string into a list of sentences that keep their ending punctuation
|
456 |
-
|
457 |
-
Args:
|
458 |
-
text (str): [string to be split]
|
459 |
-
|
460 |
-
Returns:
|
461 |
-
[list]: [list of sentences]
|
462 |
-
"""
|
463 |
-
return re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", text)
|
464 |
-
|
465 |
|
466 |
def cleantxt_wrap(ugly_text):
|
467 |
"""
|
|
|
39 |
print("\n -------- " * n)
|
40 |
|
41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
|
43 |
def fast_scandir(dirname: str):
|
44 |
"""
|
|
|
350 |
return extract_loc
|
351 |
|
352 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
353 |
|
354 |
def cleantxt_wrap(ugly_text):
|
355 |
"""
|