Spaces:

ethzanalytics
/

gpt2-xl-conversational

Runtime error

App Files Files Community

Peter committed on Feb 22, 2022

Commit

74b8229

•

1 Parent(s): 2e158ce

:tada: init from template

Browse files

Files changed (8) hide show

.gitignore +12 -0
app.py +251 -0
converse.py +244 -0
grammar_improve.py +463 -0
requirements.txt +16 -0
symspell_rsc/frequency_bigramdictionary_en_243_342.txt +0 -0
symspell_rsc/frequency_dictionary_en_82_765.txt +0 -0
utils.py +385 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,12 @@

+# basics
+*__pycache__*
+# local testing
+*aitextgen*
+*scratch*
+*tmp*
+# gradio database files
+*gradio_db_files*
+*gradio*
+*flagged*

app.py ADDED Viewed

	@@ -0,0 +1,251 @@

+"""
+app.py - the main file for the app. This builds the Gradio interface and serves the chatbot through it.
+"""
+import torch
+from transformers import pipeline
+from cleantext import clean
+from pathlib import Path
+import warnings
+import time
+import argparse
+import logging
+import gradio as gr
+import os
+import sys
+from os.path import dirname
+import nltk
+from converse import discussion
+from grammar_improve import (
+    detect_propers,
+    load_ns_checker,
+    neuspell_correct,
+    remove_repeated_words,
+    remove_trailing_punctuation,
+    build_symspell_obj,
+    symspeller,
+    fix_punct_spacing,
+)
+from utils import (
+    cleantxt_wrap,
+    corr,
+)
+# module-level setup, executed once when app.py is imported or run
+# download the NLTK "stopwords" corpus
+nltk.download("stopwords")  # TODO: find where this requirement originates from
+# put the parent of the directory holding this file on the module search path
+sys.path.append(dirname(dirname(os.path.abspath(__file__))))
+# silence warnings whose message mentions gradient_checkpointing
+warnings.filterwarnings(action="ignore", message=".*gradient_checkpointing*")
+import transformers
+# only let error-level messages from transformers through
+transformers.logging.set_verbosity_error()
+logging.basicConfig()
+cwd = Path.cwd()
+my_cwd = str(cwd.resolve())  # string so it can be passed to os.path() objects
+def chat(trivia_query):
+    """
+    chat - helper function that makes the whole gradio thing work.
+    Args:
+        trivia_query (str): the question to ask the bot
+    Returns:
+        [str]: the bot's response
+    """
+    history = []
+    response = ask_gpt(message=trivia_query, chat_pipe=my_chatbot)
+    history = [trivia_query, response]
+    html = ""
+    for item in history:
+        html += f"<b>{item}</b> <br>"
+    html += ""
+    return html
+def ask_gpt(
+    message: str,
+    chat_pipe,
+    speaker="person alpha",
+    responder="person beta",
+    max_len=196,
+    top_p=0.95,
+    top_k=50,
+    temperature=0.6,
+):
+    """
+    ask_gpt - a function that takes in a prompt and generates a response using the pipeline. This interacts the discussion function.
+    Parameters:
+        message (str): the question to ask the bot
+        chat_pipe (str): the chat_pipe to use for the bot (default: "pszemraj/Ballpark-Trivia-XL")
+        speaker (str): the name of the speaker (default: "person alpha")
+        responder (str): the name of the responder (default: "person beta")
+        max_len (int): the maximum length of the response (default: 128)
+        top_p (float): the top probability threshold (default: 0.95)
+        top_k (int): the top k threshold (default: 50)
+        temperature (float): the temperature of the response (default: 0.7)
+    """
+    st = time.perf_counter()
+    prompt = clean(message)  # clean user input
+    prompt = prompt.strip()  # get rid of any extra whitespace
+    in_len = len(prompt)
+    if in_len > 512:
+        prompt = prompt[-512:]  # truncate to 512 chars
+        print(f"Truncated prompt to last 512 chars: started with {in_len} chars")
+        max_len = min(max_len, 512)
+    resp = discussion(
+        prompt_text=prompt,
+        pipeline=chat_pipe,
+        speaker=speaker,
+        responder=responder,
+        top_p=top_p,
+        top_k=top_k,
+        temperature=temperature,
+        max_length=max_len,
+    )
+    gpt_et = time.perf_counter()
+    gpt_rt = round(gpt_et - st, 2)
+    rawtxt = resp["out_text"]
+    # check for proper nouns
+    if basic_sc and not detect_propers(rawtxt):
+        cln_resp = symspeller(rawtxt, sym_checker=schnellspell)
+    elif not detect_propers(rawtxt):
+        cln_resp = neuspell_correct(rawtxt, checker=ns_checker)
+    else:
+        # no correction needed
+        cln_resp = rawtxt.strip()
+    bot_resp_a = corr(remove_repeated_words(cln_resp))
+    bot_resp = fix_punct_spacing(bot_resp_a)
+    print(f"the prompt was:\n\t{message}\nand the response was:\n\t{bot_resp}\n")
+    corr_rt = round(time.perf_counter() - gpt_et, 4)
+    print(
+        f"took {gpt_rt + corr_rt} sec to respond, {gpt_rt} for GPT, {corr_rt} for correction\n"
+    )
+    return remove_trailing_punctuation(bot_resp)
+def get_parser():
+    """
+    get_parser - a helper function for the argparse module
+    """
+    parser = argparse.ArgumentParser(
+        description="submit a question, GPT model responds"
+    )
+    parser.add_argument(
+        "-m",
+        "--model",
+        required=False,
+        type=str,
+        default="pszemraj/GPT-Converse-1pt3B-Neo-WoW-DD-17",  # default model
+        help="the model to use for the chatbot on https://huggingface.co/models OR a path to a local model",
+    )
+    parser.add_argument(
+        "--basic-sc",
+        required=False,
+        default=True, # TODO: change this back to False once Neuspell issues are resolved.
+        action="store_true",
+        help="turn on symspell (baseline) correction instead of the more advanced neural net models",
+    )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        default=False,
+        help="turn on verbose logging",
+    )
+    return parser
+if __name__ == "__main__":
+    args = get_parser().parse_args()
+    default_model = str(args.model)
+    model_loc = Path(default_model)  # if the model is a path, use it
+    basic_sc = args.basic_sc  # whether to use the baseline spellchecker
+    device = 0 if torch.cuda.is_available() else -1
+    print(f"CUDA avail is {torch.cuda.is_available()}")
+    my_chatbot = (
+        pipeline("text-generation", model=model_loc.resolve(), device=device)
+        if model_loc.exists() and model_loc.is_dir()
+        else pipeline("text-generation", model=default_model, device=device)
+    )  # if the model is a name, use it. stays on CPU if no GPU available
+    print(f"using model {my_chatbot.model}")
+    if basic_sc:
+        print("Using the baseline spellchecker")
+        schnellspell = build_symspell_obj()
+    else:
+        print("using Neuspell spell checker")
+        ns_checker = load_ns_checker(fast=False)
+    print(f"using model stored here: \n {model_loc} \n")
+    iface = gr.Interface(
+        chat,
+        inputs=["text"],
+        outputs="html",
+        examples_per_page=10,
+        examples=[
+            "How can you help me?",
+            "what can you do?",
+            "Hi, my name is……",
+            "Happy birthday!",
+            "I have a question, can you help me?",
+            "Do you know a joke?",
+            "Will you marry me?",
+            "Are you single?",
+            "Do you like people?",
+            "Are you part of the Matrix?",
+            "Do you have a hobby?",
+            "You’re clever",
+            "Tell me about your personality",
+            "You’re annoying",
+            "you suck",
+            "I want to speak to a human now.",
+            "Don’t you speak English?!",
+            "Are you human?",
+            "Are you a robot?",
+            "What is your name?",
+            "How old are you?",
+            "What’s your age?",
+            "What day is it today?",
+            "Who made you?",
+            "Which languages can you speak?",
+            "What is your mother’s name?",
+            "Where do you live?",
+            "What’s the weather like today?",
+            "Are you expensive?",
+            "Do you get smarter?",
+            "rate your overall satisfaction with the chatbot",
+            "How many icebergs are in the ocean?",
+        ],
+        title=f"NLP template space: {default_model} Model",
+        description=f"this space is used as a template. please copy the files in the space to your own space repo, AND THEN edit them ",
+        article="here you can add more details about your model. \n\n"
+        "**Important Notes & About:**\n\n"
+        "1. the model can take up to 60 seconds to respond sometimes, patience is a virtue.\n"
+        "2. the model started from a pretrained checkpoint, and was trained on several different datasets. Anything it says should be fact-checked before being regarded as a true statement.\n"
+        "3. Some params are still being tweaked (in the future, will be inputs) any feedback is welcome :)\n",
+        css="""
+            .chatbox {display:flex;flex-direction:column}
+            .user_msg, .resp_msg {padding:4px;margin-bottom:4px;border-radius:4px;width:80%}
+            .user_msg {background-color:cornflowerblue;color:white;align-self:start}
+            .resp_msg {background-color:lightgray;align-self:self-end}
+        """,
+        allow_screenshot=True,
+        allow_flagging="never",
+        theme="dark",
+    )
+    # launch the gradio interface and start the server
+    iface.launch(
+        # prevent_thread_lock=True,
+        enable_queue=True,  # also allows for dealing with multiple users simultaneously (per newer gradio version)
+    )

converse.py ADDED Viewed

	@@ -0,0 +1,244 @@

+"""
+    converse.py - this script has functions for handling the conversation between the user and the bot.
+    https://huggingface.co/docs/transformers/v4.15.0/en/main_classes/model#transformers.generation_utils.GenerationMixin.generate.no_repeat_ngram_size
+"""
+import pprint as pp
+import time
+import torch
+import transformers
+from grammar_improve import remove_trailing_punctuation
+def discussion(
+    prompt_text: str,
+    speaker: str,
+    responder: str,
+    pipeline,
+    timeout=45,
+    max_length=128,
+    top_p=0.95,
+    top_k=50,
+    temperature=0.7,
+    full_text=False,
+    num_return_sequences=1,
+    device=-1,
+    verbose=False,
+):
+    """
+    discussion - a function that takes in a prompt and generates a response. This function is meant to be used in a conversation loop, and is the main function for the bot.
+    Parameters
+    ----------
+        prompt_text : str, the prompt to ask the bot, usually the user's question
+        speaker : str, the name of the person who is speaking the prompt
+        responder : str, the name of the person who is responding to the prompt
+        pipeline : transformers.Pipeline, the pipeline to use for generating the response
+        timeout : int, optional, the number of seconds to wait before timing out, by default 45
+        max_length : int, optional, the maximum number of tokens to generate, defaults to 128
+        top_p : float, optional, the top probability to use for sampling, defaults to 0.95
+        top_k : int, optional, the top k to use for sampling, defaults to 50
+        temperature : float, optional, the temperature to use for sampling, defaults to 0.7
+        full_text : bool, optional, whether to return the full text or just the generated text, defaults to False
+        num_return_sequences : int, optional, the number of sequences to return, defaults to 1
+        device : int, optional, the device to use for generation, defaults to -1 (CPU)
+        verbose : bool, optional, whether to print the generated text, defaults to False
+    Returns
+    -------
+        str, the generated text
+    """
+    p_list = []  # track conversation
+    p_list.append(speaker.lower() + ":" + "\n")
+    p_list.append(prompt_text.lower() + "\n")
+    p_list.append("\n")
+    p_list.append(responder.lower() + ":" + "\n")
+    this_prompt = "".join(p_list)
+    if verbose:
+        print("overall prompt:\n")
+        pp.pprint(this_prompt, indent=4)
+    # call the model
+    print("\n... generating...")
+    bot_dialogue = gen_response(
+        this_prompt,
+        pipeline,
+        speaker,
+        responder,
+        timeout=timeout,
+        max_length=max_length,
+        top_p=top_p,
+        top_k=top_k,
+        temperature=temperature,
+        full_text=full_text,
+        num_return_sequences=num_return_sequences,
+        device=device,
+        verbose=verbose,
+    )
+    if isinstance(bot_dialogue, list) and len(bot_dialogue) > 1:
+        bot_resp = ", ".join(bot_dialogue)
+    elif isinstance(bot_dialogue, list) and len(bot_dialogue) == 1:
+        bot_resp = bot_dialogue[0]
+    else:
+        bot_resp = bot_dialogue
+    bot_resp = " ".join(bot_resp) if isinstance(bot_resp, list) else bot_resp
+    bot_resp = bot_resp.strip()
+    # remove the last ',' '.' chars
+    bot_resp = remove_trailing_punctuation(bot_resp)
+    if verbose:
+        print("\n... bot response:\n")
+        pp.pprint(bot_resp)
+    p_list.append(bot_resp + "\n")
+    p_list.append("\n")
+    print("\nfinished!")
+    # return the bot response and the full conversation
+    return {"out_text": bot_resp, "full_conv": p_list}
+def gen_response(
+    query: str,
+    pipeline,
+    speaker: str,
+    responder: str,
+    timeout=45,
+    max_length=128,
+    top_p=0.95,
+    top_k=50,
+    temperature=0.7,
+    full_text=False,
+    num_return_sequences=1,
+    device=-1,
+    verbose=False,
+    **kwargs,
+):
+    """
+    gen_response - a function that takes in a prompt and generates a response using the pipeline. This operates underneath the discussion function.
+    Parameters
+    ----------
+        query : str, the prompt to ask the bot, usually the user's question
+        speaker : str, the name of the person who is speaking the prompt
+        responder : str, the name of the person who is responding to the prompt
+        pipeline : transformers.Pipeline, the pipeline to use for generating the response
+        timeout : int, optional, the number of seconds to wait before timing out, by default 45
+        max_length : int, optional, the maximum number of tokens to generate, defaults to 128
+        top_p : float, optional, the top probability to use for sampling, defaults to 0.95
+        top_k : int, optional, the top k to use for sampling, defaults to 50
+        temperature : float, optional, the temperature to use for sampling, defaults to 0.7
+        full_text : bool, optional, whether to return the full text or just the generated text, defaults to False
+        num_return_sequences : int, optional, the number of sequences to return, defaults to 1
+        device : int, optional, the device to use for generation, defaults to -1 (CPU)
+        verbose : bool, optional, whether to print the generated text, defaults to False
+    Returns
+    -------
+        str, the generated text
+    """
+    if max_length > 1024:
+        max_length = 1024
+        print("max_length is too large, setting to 1024")
+    st = time.perf_counter()
+    response = pipeline(
+        query,
+        max_length=max_length,
+        temperature=temperature,
+        top_k=top_k,
+        top_p=top_p,
+        num_return_sequences=num_return_sequences,
+        max_time=timeout,
+        return_full_text=full_text,
+        no_repeat_ngram_size=3,
+        length_penalty=0.3,
+        repetition_penalty=3.4,
+        clean_up_tokenization_spaces=True,
+        **kwargs,
+    )  # the likely better beam-less method
+    rt = round(time.perf_counter() - st, 2)
+    if verbose:
+        print(f"took {rt} sec to respond")
+    if verbose:
+        print("\n[DEBUG] generated:\n")
+        pp.pprint(response)  # for debugging
+    # process the full result to get the ~bot response~ piece
+    this_result = str(response[0]["generated_text"]).split(
+        "\n"
+    )  # TODO: adjust hardcoded value for index to dynamic (if n>1)
+    bot_dialogue = consolidate_texts(
+        name_resp=responder,
+        model_resp=this_result,
+        name_spk=speaker,
+        verbose=verbose,
+        print_debug=True,
+    )
+    if verbose:
+        print(f"DEBUG: {bot_dialogue} was original response pre-SC")
+    return bot_dialogue  #
+def consolidate_texts(
+    model_resp: list,
+    name_resp: str = None,
+    name_spk: str = None,
+    verbose=False,
+    print_debug=False,
+):
+    """
+    consolidate_texts - given a list with speaker name followed by speaker text, returns all consecutive values of the first speaker name
+    Parameters:
+        name_resp (str): the name of the person who is responding
+        model_resp (list): the list of strings to consolidate (usually from the model)
+        name_spk (str): the name of the person who is speaking
+        verbose (bool): whether to print the results
+        print_debug (bool): whether to print the debug info during looping
+    Returns:
+        list, a list of all the consecutive messages of the first speaker name
+    """
+    assert len(model_resp) > 0, "model_resp is empty"
+    if len(model_resp) == 1:
+        return model_resp[0]
+    name_resp = "person beta" if name_resp is None else name_resp
+    name_spk = "person alpha" if name_spk is None else name_spk
+    if verbose:
+        print("====" * 10)
+        print(f"\n[DEBUG] initial model_resp has {len(model_resp)} lines: \n\t{model_resp}")
+        print(f" the first element is \n\t{model_resp[0]} and it is {type(model_resp[0])}")
+    fn_resp = []
+    name_counter = 0
+    break_safe = False
+    for resline in model_resp:
+        if name_resp.lower() in resline:
+            name_counter += 1
+            break_safe = True  # know the line is from bot as this line starts with the name of the bot
+            continue  # don't add this line to the list
+        if name_spk.lower() in resline.lower():
+            if print_debug:
+                print(f"\nDEBUG: \n\t{resline}\ncaused the break")
+            break  # the name of the speaker is in the line, so we're done
+        if any([": " in resline,":\n" in resline]) and name_resp.lower() not in resline.lower():
+            if print_debug:
+                print(f"\nDEBUG: \n\t{resline}\ncaused the break")
+            break
+        else:
+            fn_resp.append(resline)
+            break_safe = False
+    if verbose:
+        print("--" * 10)
+        print("\nthe full response is:\n")
+        print("\n".join(fn_resp))
+        print("--" * 10)
+    return fn_resp

grammar_improve.py ADDED Viewed

	@@ -0,0 +1,463 @@

+"""
+grammar_improve.py - this .py script contains functions to improve the grammar of a user's input or the model's output.
+"""
+from datetime import datetime
+import os
+import pprint as pp
+from neuspell import BertChecker, SclstmChecker
+import neuspell
+import math
+from cleantext import clean
+import time
+import re
+import sys
+from symspellpy.symspellpy import SymSpell
+from utils import suppress_stdout
+def detect_propers(text: str):
+    """
+    detect_propers - detect if a string contains proper nouns
+    Args:
+        text (str): [string to be checked]
+    Returns:
+        [bool]: [True if string contains proper nouns]
+    """
+    pat = re.compile(r"(?:\w+['’])?\w+(?:-(?:\w+['’])?\w+)*")
+    return bool(pat.search(text))
+def fix_punct_spaces(string):
+    """
+    fix_punct_spaces - replace spaces around punctuation with punctuation. For example, "hello , there" -> "hello, there"
+    Parameters
+    ----------
+    string : str, required, input string to be corrected
+    Returns
+    -------
+    str, corrected string
+    """
+    fix_spaces = re.compile(r"\s*([?!.,]+(?:\s+[?!.,]+)*)\s*")
+    string = fix_spaces.sub(lambda x: "{} ".format(x.group(1).replace(" ", "")), string)
+    return string.strip()
+def split_sentences(text: str):
+    """
+    split_sentences - split a string into a list of sentences that keep their ending punctuation. powered by regex witchcraft
+    Args:
+        text (str): [string to be split]
+    Returns:
+        [list]: [list of strings]
+    """
+    return re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", text)
+def remove_repeated_words(bot_response):
+    """
+    remove_repeated_words - remove repeated words from a string, returning only the first instance of each word
+    Parameters
+    ----------
+    bot_response : str
+        string to remove repeated words from
+    Returns
+    -------
+    str
+        string containing the first instance of each word
+    """
+    words = bot_response.split()
+    unique_words = []
+    for word in words:
+        if word not in unique_words:
+            unique_words.append(word)
+    return " ".join(unique_words)
+def remove_trailing_punctuation(text: str, fuLL_strip=False):
+    """
+    remove_trailing_punctuation - remove trailing punctuation from a string. Purpose is to seem more natural to end users
+    Args:
+        text (str): [string to be cleaned]
+    Returns:
+        [str]: [cleaned string]
+    """
+    if fuLL_strip:
+        return text.strip("?!.,;:")
+    else:
+        return text.strip(".,;:")
+def fix_punct_spacing(text: str):
+    fix_spaces = re.compile(r"\s*([?!.,]+(?:\s+[?!.,]+)*)\s*")
+    spc_text = fix_spaces.sub(lambda x: "{} ".format(x.group(1).replace(" ", "")), text)
+    cln_text = re.sub(r"(\W)(?=\1)", "", spc_text)
+    return cln_text
+"""
+start of SymSpell code
+"""
+def symspeller(
+    my_string: str,
+    sym_checker=None,
+    max_dist: int = 2,
+    prefix_length: int = 7,
+    ignore_non_words=True,
+    dictionary_path: str = None,
+    bigram_path: str = None,
+    verbose=False,
+):
+    """
+    symspeller - a wrapper for the SymSpell class from symspellpy
+    Parameters
+    ----------
+        my_string : str, required, default=None, the string to be checked
+        sym_checker : SymSpell, optional, default=None, the SymSpell object to use
+        max_dist : int, optional, default=3, the maximum distance to look for replacements
+        prefix_length : int, optional, default=7, the length of the prefixes to use
+        ignore_non_words : bool, optional, default=True, whether to ignore non-words
+        dictionary_path : str, optional, default=None, the path to the dictionary file
+        bigram_path : str, optional, default=None, the path to the bigram dictionary file
+        verbose : bool, optional, default=False, whether to print the results
+    Returns
+    -------
+        list,
+    """
+    assert len(my_string) > 0, "entered string for correction is empty"
+    if sym_checker is None:
+        # need to create a new class object. user can specify their own dictionary and bigram files
+        if verbose:
+            print("creating new SymSpell object")
+        sym_checker = build_symspell_obj(
+            edit_dist=max_dist,
+            prefix_length=prefix_length,
+            dictionary_path=dictionary_path,
+            bigram_path=bigram_path,
+        )
+    else:
+        if verbose:
+            print("using existing SymSpell object")
+    # max edit distance per lookup (per single word, not per whole input string)
+    suggestions = sym_checker.lookup_compound(
+        my_string,
+        max_edit_distance=max_dist,
+        ignore_non_words=ignore_non_words,
+        ignore_term_with_digits=True,
+        transfer_casing=True,
+    )
+    if verbose:
+        print(f"{len(suggestions)} suggestions found")
+        print(f"the original string is:\n\t{my_string}")
+        sug_list = [sug.term for sug in suggestions]
+        print(f"suggestions:\n\t{sug_list}\n")
+    if len(suggestions) < 1:
+        return clean(my_string)  # no correction because no suggestions
+    else:
+        first_result = suggestions[0]  # first result is the most likely
+        return first_result._term
+def build_symspell_obj(
+    edit_dist=2,
+    prefix_length=7,
+    dictionary_path=None,
+    bigram_path=None,
+):
+    """
+    build_symspell_obj [build a SymSpell object]
+    Args:
+        verbose (bool, optional): Defaults to False.
+    Returns:
+        SymSpell: a SymSpell object
+    """
+    dictionary_path = (
+        r"symspell_rsc/frequency_dictionary_en_82_765.txt"
+        if dictionary_path is None
+        else dictionary_path
+    )
+    bigram_path = (
+        r"symspell_rsc/frequency_bigramdictionary_en_243_342.txt"
+        if bigram_path is None
+        else bigram_path
+    )
+    sym_checker = SymSpell(
+        max_dictionary_edit_distance=edit_dist + 2, prefix_length=prefix_length
+    )
+    # term_index is the column of the term and count_index is the
+    # column of the term frequency
+    sym_checker.load_dictionary(dictionary_path, term_index=0, count_index=1)
+    sym_checker.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)
+    return sym_checker
+"""
+# if using t5b_correction to check for spelling errors, use this code to initialize the objects
+import torch
+from transformers import T5Tokenizer, T5ForConditionalGeneration
+model_name = 'deep-learning-analytics/GrammarCorrector'
+# torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
+torch_device = 'cpu'
+gc_tokenizer = T5Tokenizer.from_pretrained(model_name)
+gc_model = T5ForConditionalGeneration.from_pretrained(model_name).to(torch_device)
+"""
+def t5b_correction(prompt: str, korrektor, verbose=False, beams=4):
+    """
+    t5b_correction - correct a string using a text2textgen pipeline model from transformers
+    Parameters
+    ----------
+    prompt : str, required, input prompt to be corrected
+    korrektor : transformers.pipeline, required, pipeline object
+    verbose : bool, optional, whether to print the corrected prompt. Defaults to False.
+    beams : int, optional, number of beams to use for the correction. Defaults to 4.
+    Returns
+    -------
+    str, corrected prompt
+    """
+    p_min_len = int(math.ceil(0.9 * len(prompt)))
+    p_max_len = int(math.ceil(1.1 * len(prompt)))
+    if verbose:
+        print(f"setting min to {p_min_len} and max to {p_max_len}\n")
+    gcorr_result = korrektor(
+        f"grammar: {prompt}",
+        return_text=True,
+        clean_up_tokenization_spaces=True,
+        num_beams=beams,
+        max_length=p_max_len,
+        repetition_penalty=1.3,
+        length_penalty=0.2,
+        no_repeat_ngram_size=2,
+    )
+    if verbose:
+        print(f"grammar correction result: \n\t{gcorr_result}\n")
+    return gcorr_result
+def all_neuspell_chkrs():
+    """
+    disp_neuspell_chkrs - display the neuspell checkers available
+    Parameters
+    ----------
+    None
+    Returns
+    -------
+    checker_opts - list of checkers available
+    """
+    checker_opts = dir(neuspell)
+    print(f"\navailable checkers:")
+    pp.pprint(checker_opts, indent=4, compact=True)
+    return checker_opts
+def load_ns_checker(customckr=None, fast=False):
+    """
+    load_ns_checker - helper function, load / "set up" a neuspell checker from huggingface transformers
+    Args:
+        customckr (neuspell.NeuSpell): [neuspell checker object], optional, if not provided, will load the default checker
+    Returns:
+        [neuspell.NeuSpell]: [neuspell checker object]
+    """
+    st = time.perf_counter()
+    # stop all printing to the console
+    with suppress_stdout():
+        if customckr is None and not fast:
+            checker = BertChecker(
+                pretrained=True
+            )  # load the default checker, has the best balance
+        elif customckr is None and fast:
+            checker = SclstmChecker(
+                pretrained=True
+            )  # this one is faster but not as accurate
+        else:
+            checker = customckr(pretrained=True)
+    rt_min = (time.perf_counter() - st) / 60
+    # return to standard logging level
+    print(f"\n\nloaded checker in {rt_min} minutes")
+    return checker
+def neuspell_correct(input_text: str, checker=None, verbose=False):
+    """
+    neuspell_correct - correct a string using neuspell.
+                        note that modificaitons to the checker are needed if doing list-based corrections
+    Parameters
+    ----------
+    input_text : str, required, input string to be corrected
+    checker : neuspell.NeuSpell, optional, neuspell checker object. Defaults to None.
+    verbose : bool, optional, whether to print the corrected string. Defaults to False.
+    Returns
+    -------
+    str, corrected string
+    """
+    if isinstance(input_text, str) and len(input_text) < 4:
+        print(f"input text of {input_text} is too short to be corrected")
+        return input_text
+    if checker is None:
+        print("NOTE - no checker provided, loading default checker")
+        checker = SclstmChecker(pretrained=True)
+    corrected = checker.correct(input_text)
+    cleaned_txt = fix_punct_spaces(corrected)
+    if verbose:
+        print(f"neuspell correction result: \n\t{cleaned_txt}\n")
+    return cleaned_txt
+def grammarpipe(corrector, qphrase: str):
+    """
+    gramformer_correct - THE ORIGINAL ONE USED IN PROJECT AND NEEDS TO BE CHANGED.
+                            Idea is to correct a string using a text2textgen pipeline model from transformers
+    Args:
+        corrector (transformers.pipeline): [transformers pipeline object, already created w/ relevant model]
+        qphrase (str): [text to be corrected]
+    Returns:
+        [str]: [corrected text]
+    """
+    if isinstance(qphrase, str) and len(qphrase) < 4:
+        print(f"input text of {qphrase} is too short to be corrected")
+        return qphrase
+    try:
+        corrected = corrector(
+            clean(qphrase), return_text=True, clean_up_tokenization_spaces=True
+        )
+        return corrected[0]["generated_text"]
+    except Exception as e:
+        print(f"NOTE - failed to correct with grammarpipe:\n {e}")
+        return clean(qphrase)
+def DLA_correct(qphrase: str):
+    """
+    DLA_correct - an "overhead" function to call correct_grammar() on a string, allowing for each newline to be corrected individually
+    Args:
+        qphrase (str): [string to be corrected]
+    Returns:
+        str, the list of the corrected strings joined under " "
+    """
+    if isinstance(qphrase, str) and len(qphrase) < 4:
+        print(f"input text of {qphrase} is too short to be corrected")
+        return qphrase
+    sentences = split_sentences(qphrase)
+    if len(sentences) == 1:
+        corrected = correct_grammar(sentences[0])
+        return corrected
+    else:
+        full_cor = []
+        for sen in sentences:
+            corr_sen = correct_grammar(clean(sen))
+            full_cor.append(corr_sen)
+        return " ".join(full_cor)
+def correct_grammar(
+    input_text: str,
+    tokenizer,
+    model,
+    n_results: int = 1,
+    beams: int = 8,
+    temp=1,
+    uniq_ngrams=2,
+    rep_penalty=1.5,
+    device="cpu",
+):
+    """
+    correct_grammar - correct a string using a text2textgen pipeline model from transformers.
+                        This function is an alternative to the t5b_correction function.
+    Parameters
+    ----------
+    input_text : str, required, input string to be corrected
+    tokenizer : transformers.T5Tokenizer, required, tokenizer object, already created w/ relevant model
+    model : transformers.T5ForConditionalGeneration, required, model object, already created w/ relevant model
+    n_results : int, optional, number of results to return. Defaults to 1.
+    beams : int, optional, number of beams to use for the correction. Defaults to 8.
+    temp : int, optional, temperature to use for the correction. Defaults to 1.
+    uniq_ngrams : int, optional, number of ngrams to use for the correction. Defaults to 2.
+    rep_penalty : float, optional, penalty to use for the correction. Defaults to 1.5.
+    device : str, optional, device to use for the correction. Defaults to 'cpu'.
+    Returns
+    -------
+    str, corrected string (or list of strings if n_results > 1)
+    """
+    st = time.perf_counter()
+    if len(input_text) < 5:
+        return input_text
+    max_length = min(int(math.ceil(len(input_text) * 1.2)), 128)
+    batch = tokenizer(
+        [input_text],
+        truncation=True,
+        padding="max_length",
+        max_length=max_length,
+        return_tensors="pt",
+    ).to(device)
+    translated = model.generate(
+        **batch,
+        max_length=max_length,
+        min_length=min(10, len(input_text)),
+        no_repeat_ngram_size=uniq_ngrams,
+        repetition_penalty=rep_penalty,
+        num_beams=beams,
+        num_return_sequences=n_results,
+        temperature=temp,
+    )
+    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
+    rt_min = (time.perf_counter() - st) / 60
+    print(f"\n\ncorrected in {rt_min} minutes")
+    if isinstance(tgt_text, list):
+        return tgt_text[0]
+    else:
+        return tgt_text

requirements.txt ADDED Viewed

	@@ -0,0 +1,16 @@

+transformers>=4.12.5
+sentencepiece>=0.1.96
+tqdm>=4.43.0
+symspellpy>=6.7.0
+requests>=2.24.0
+gradio>=2.4.6
+natsort>=7.1.1
+pandas>=1.3.0
+aitextgen>=0.5.2
+clean-text>=0.5.0
+openwa>=1.3.16
+python-telegram-bot>=13.0
+webwhatsapi>=2.0.5
+Flask>=2.0.2
+nltk>=3.6.6
+neuspell>=1.0.0

symspell_rsc/frequency_bigramdictionary_en_243_342.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

symspell_rsc/frequency_dictionary_en_82_765.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

utils.py ADDED Viewed

	@@ -0,0 +1,385 @@

+"""
+    utils - general utility functions for loading, saving, and manipulating data
+"""
+import os
+from pathlib import Path
+import pprint as pp
+import re
+import shutil  # zipfile formats
+from datetime import datetime
+from os.path import basename
+from os.path import getsize, join
+import requests
+from cleantext import clean
+from natsort import natsorted
+from symspellpy import SymSpell
+import pandas as pd
+from tqdm.auto import tqdm
+from contextlib import contextmanager
+import sys
+import os
+@contextmanager
+def suppress_stdout():
+    """
+    suppress_stdout - suppress stdout for a given block of code. credit to https://newbedev.com/how-to-suppress-console-output-in-python
+    """
+    with open(os.devnull, "w") as devnull:
+        old_stdout = sys.stdout
+        sys.stdout = devnull
+        try:
+            yield
+        finally:
+            sys.stdout = old_stdout
+def remove_string_extras(mytext):
+    # removes everything from a string except A-Za-z0-9 .,;
+    return re.sub(r"[^A-Za-z0-9 .,;]+", "", mytext)
+def corr(s):
+    # adds space after period if there isn't one
+    # removes extra spaces
+    return re.sub(r"\.(?! )", ". ", re.sub(r" +", " ", s))
+def get_timestamp():
+    # get timestamp for file names
+    return datetime.now().strftime("%b-%d-%Y_t-%H")
+def print_spacer(n=1):
+    """print_spacer - print a spacer line"""
+    print("\n   --------    " * n)
+def fast_scandir(dirname: str):
+    """
+    fast_scandir [an os.path-based means to return all subfolders in a given filepath]
+    """
+    subfolders = [f.path for f in os.scandir(dirname) if f.is_dir()]
+    for dirname in list(subfolders):
+        subfolders.extend(fast_scandir(dirname))
+    return subfolders  # list
+def create_folder(directory: str):
+    # you will never guess what this does
+    os.makedirs(directory, exist_ok=True)
+def chunks(lst: list, n: int):
+    """
+    chunks   -  Yield successive n-sized chunks from lst
+    Args:   lst (list): list to be chunked
+    n (int): size of chunks
+    """
+    for i in range(0, len(lst), n):
+        yield lst[i : i + n]
+def chunky_pandas(my_df, num_chunks: int = 4):
+    """
+    chunky_pandas [split dataframe into `num_chunks` equal chunks, return each inside a list]
+    Args:
+        my_df (pd.DataFrame)
+        num_chunks (int, optional): Defaults to 4.
+    Returns:
+        list: a list of dataframes
+    """
+    n = int(len(my_df) // num_chunks)
+    list_df = [my_df[i : i + n] for i in range(0, my_df.shape[0], n)]
+    return list_df
+def load_dir_files(
+    directory: str, req_extension=".txt", return_type="list", verbose=False
+):
+    """
+    load_dir_files - an os.path based method of returning all files with extension `req_extension` in a given directory and subdirectories
+    Args:
+    Returns:
+        list or dict: an iterable of filepaths or a dict of filepaths and their respective filenames
+    """
+    appr_files = []
+    # r=root, d=directories, f = files
+    for r, d, f in os.walk(directory):
+        for prefile in f:
+            if prefile.endswith(req_extension):
+                fullpath = os.path.join(r, prefile)
+                appr_files.append(fullpath)
+    appr_files = natsorted(appr_files)
+    if verbose:
+        print("A list of files in the {} directory are: \n".format(directory))
+        if len(appr_files) < 10:
+            pp.pprint(appr_files)
+        else:
+            pp.pprint(appr_files[:10])
+            print("\n and more. There are a total of {} files".format(len(appr_files)))
+    if return_type.lower() == "list":
+        return appr_files
+    else:
+        if verbose:
+            print("returning dictionary")
+        appr_file_dict = {}
+        for this_file in appr_files:
+            appr_file_dict[basename(this_file)] = this_file
+        return appr_file_dict
+def URL_string_filter(text):
+    """
+    URL_string_filter - filter out nonstandard "text" characters
+    """
+    custom_printable = (
+        "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ._"
+    )
+    filtered = "".join((filter(lambda i: i in custom_printable, text)))
+    return filtered
+def getFilename_fromCd(cd):
+    """getFilename_fromCd - get the filename from a given cd str"""
+    if not cd:
+        return None
+    fname = re.findall("filename=(.+)", cd)
+    if len(fname) > 0:
+        output = fname[0]
+    elif cd.find("/"):
+        possible_fname = cd.rsplit("/", 1)[1]
+        output = URL_string_filter(possible_fname)
+    else:
+        output = None
+    return output
+def get_zip_URL(
+    URLtoget: str,
+    extract_loc: str = None,
+    file_header: str = "dropboxexport_",
+    verbose: bool = False,
+):
+    """get_zip_URL - download a zip file from a given URL and extract it to a given location"""
+    r = requests.get(URLtoget, allow_redirects=True)
+    names = getFilename_fromCd(r.headers.get("content-disposition"))
+    fixed_fnames = names.split(";")  # split the multiple results
+    this_filename = file_header + URL_string_filter(fixed_fnames[0])
+    # define paths and save the zip file
+    if extract_loc is None:
+        extract_loc = "dropbox_dl"
+    dl_place = join(os.getcwd(), extract_loc)
+    create_folder(dl_place)
+    save_loc = join(os.getcwd(), this_filename)
+    open(save_loc, "wb").write(r.content)
+    if verbose:
+        print("downloaded file size was {} MB".format(getsize(save_loc) / 1000000))
+    # unpack the archive
+    shutil.unpack_archive(save_loc, extract_dir=dl_place)
+    if verbose:
+        print("extracted zip file - ", datetime.now())
+        x = load_dir_files(dl_place, req_extension="", verbose=verbose)
+    # remove original
+    try:
+        os.remove(save_loc)
+        del save_loc
+    except Exception:
+        print("unable to delete original zipfile - check if exists", datetime.now())
+    print("finished extracting zip - ", datetime.now())
+    return dl_place
+def merge_dataframes(data_dir: str, ext=".xlsx", verbose=False):
+    """
+    merge_dataframes - given a filepath, loads and attempts to merge all files as dataframes
+    Args:
+        data_dir (str): [root directory to search in]
+        ext (str, optional): [anticipate file extension for the dataframes ]. Defaults to '.xlsx'.
+    Returns:
+        pd.DataFrame(): merged dataframe of all files
+    """
+    src = Path(data_dir)
+    src_str = str(src.resolve())
+    mrg_df = pd.DataFrame()
+    all_reports = load_dir_files(directory=src_str, req_extension=ext, verbose=verbose)
+    failed = []
+    for df_path in tqdm(all_reports, total=len(all_reports), desc="joining data..."):
+        try:
+            this_df = pd.read_excel(df_path).convert_dtypes()
+            mrg_df = pd.concat([mrg_df, this_df], axis=0)
+        except Exception:
+            short_p = os.path.basename(df_path)
+            print(
+                f"WARNING - file with extension {ext} and name {short_p} could not be read."
+            )
+            failed.append(short_p)
+    if len(failed) > 0:
+        print("failed to merge {} files, investigate as needed")
+    if verbose:
+        pp.pprint(mrg_df.info(True))
+    return mrg_df
+def download_URL(url: str, file=None, dlpath=None, verbose=False):
+    """
+    download_URL - download a file from a URL and show progress bar
+    Parameters
+    ----------
+    url : str
+        URL to download
+    file : [type], optional
+        [description], by default None
+    dlpath : [type], optional
+        [description], by default None
+    verbose : bool, optional
+        [description], by default False
+    Returns
+    -------
+    str - path to the downloaded file
+    """
+    if file is None:
+        if "?dl=" in url:
+            # is a dropbox link
+            prefile = url.split("/")[-1]
+            filename = str(prefile).split("?dl=")[0]
+        else:
+            filename = url.split("/")[-1]
+        file = clean(filename)
+    if dlpath is None:
+        dlpath = Path.cwd()  # save to current working directory
+    else:
+        dlpath = Path(dlpath)  # make a path object
+    r = requests.get(url, stream=True, allow_redirects=True)
+    total_size = int(r.headers.get("content-length"))
+    initial_pos = 0
+    dl_loc = dlpath / file
+    with open(str(dl_loc.resolve()), "wb") as f:
+        with tqdm(
+            total=total_size,
+            unit="B",
+            unit_scale=True,
+            desc=file,
+            initial=initial_pos,
+            ascii=True,
+        ) as pbar:
+            for ch in r.iter_content(chunk_size=1024):
+                if ch:
+                    f.write(ch)
+                    pbar.update(len(ch))
+    if verbose:
+        print(f"\ndownloaded {file} to {dlpath}\n")
+    return str(dl_loc.resolve())
+def dl_extract_zip(
+    URLtoget: str,
+    extract_loc: str = None,
+    file_header: str = "TEMP_archive_dl_",
+    verbose: bool = False,
+):
+    """
+    dl_extract_zip - generic function to download a zip file and extract it
+    Parameters
+    ----------
+    URLtoget : str
+        zip file URL to download
+    extract_loc : str, optional
+        directory to extract zip to , by default None
+    file_header : str, optional
+        [description], by default "TEMP_archive_dl_"
+    verbose : bool, optional
+        [description], by default False
+    Returns
+    -------
+    str - path to the downloaded and extracted folder
+    """
+    extract_loc = Path(extract_loc)
+    extract_loc.mkdir(parents=True, exist_ok=True)
+    save_loc = download_URL(
+        url=URLtoget, file=f"{file_header}.zip", dlpath=None, verbose=verbose
+    )
+    shutil.unpack_archive(save_loc, extract_dir=extract_loc)
+    if verbose:
+        print("extracted zip file - ", datetime.now())
+        x = load_dir_files(extract_loc, req_extension="", verbose=verbose)
+    # remove original
+    try:
+        os.remove(save_loc)
+        del save_loc
+    except Exception:
+        print("unable to delete original zipfile - check if exists", datetime.now())
+    if verbose:
+        print("finished extracting zip - ", datetime.now())
+    return extract_loc
+def cleantxt_wrap(ugly_text, all_lower=False):
+    """
+    cleantxt_wrap - applies the clean function to a string.
+    Args:
+        ugly_text (str): [string to be cleaned]
+    Returns:
+        [str]: [cleaned string]
+    """
+    if isinstance(ugly_text, str) and len(ugly_text) > 0:
+        return clean(ugly_text, lower=all_lower)
+    else:
+        return ugly_text