Hellisotherpeople committed
Commit 2dce12e • 1 Parent(s): efa9e89

Upload Text-Generation.py

Files changed (1)
  1. Text-Generation.py +113 -130
Text-Generation.py CHANGED
@@ -8,10 +8,37 @@ from transformers import (AutoModelForCausalLM, AutoModelForQuestionAnswering,
                           AutoModelForSeq2SeqLM,
                           AutoModelForSequenceClassification, AutoTokenizer,
                           GPT2Tokenizer, LogitsProcessor, LogitsProcessorList,
-                          pipeline, top_k_top_p_filtering)
+                          pipeline, top_k_top_p_filtering, PhrasalConstraint, DisjunctiveConstraint)
+import ast



+
+class ModifyLogitsProcessor(LogitsProcessor):
+    ### Penalize or ban any token containing a given character, e.g. anything with the letter "e" in it
+    def __init__(self, tokenizer, chars_to_modify, filter_mode=True):
+        super().__init__()
+        self.tokenizer = tokenizer
+        self.filter_mode = filter_mode
+        self.chars_to_modify = chars_to_modify
+
+        # Compute the tokens to modify at initialization
+        self.tokens_to_modify = {}
+        for char, factor in chars_to_modify.items():
+            mod_tokens = [token_id for token, token_id in self.tokenizer.get_vocab().items() if char in token]  # iterate items() so token_id is the true vocab id, not an enumerate() position
+            self.tokens_to_modify[char] = mod_tokens
+
+    def __call__(self, input_ids, scores):
+        for char, tokens in self.tokens_to_modify.items():
+            if self.filter_mode:
+                scores[:, tokens] = -float('inf')
+            else:
+                # Fetch the corresponding factor from the chars_to_modify dictionary
+                factor = self.chars_to_modify[char]
+                scores[:, tokens] += factor
+        return scores
+
+
 st.set_page_config(page_title="Gadsby")
 st.title("Gadsby - Constrained Text Generation with Transformers")
 st.image("https://upload.wikimedia.org/wikipedia/commons/1/1d/Gadsby_%28book_cover%29.jpg")
@@ -20,146 +47,102 @@ st.caption("The inspiration for this space: https://en.wikipedia.org/wiki/Gadsby


 form = st.sidebar.form("choose_settings")
-form.header("Main Settings")
+form.header("Model Settings")

-model_name = form.text_area("Enter the name of the pre-trained model from transformers that we are using for Text Generation", value = "gpt2")
+model_name = form.text_area("Enter the name of the pre-trained model from transformers that we are using for Text Generation", value = "eachadea/vicuna-7b-1.1")
 form.caption("This will download a new model, so it may take a while or even break if the model is too large")
-mode = form.selectbox("What kind of constrained generation are we doing?", ["lipogram", "reverse_lipogram", "e-prime", "rhopalism", "length_constrained", "greater_than_length", "Pangram", "rhopalism-lipogram"])
-form.caption("Lipograms mean that a letter (or substring) is not allowed in the generated string; reverse lipograms force a letter to be in the generated string")
-
-if mode == "lipogram":
-    naughty_strings_list = st.text_area("Enter the list of strings that you don't want in each word, separated by a space", value = "E e")
-    naughty_strings = naughty_strings_list.split(" ")
-elif mode == "e-prime":
-    e_prime_string = """be being been am is isn't are aren't was wasn't were weren't i'm you're we're they're he's she's it's there's here's where's how's what's who's that's aint isnt arent wasnt werent im youre were theyre hes shes its theres heres wheres hows whats whos thats aint Be Being Been Am Is Isn't Are Aren't Was Wasn't Were Weren't I'm You're We're They're He's She's It's There's Here's Where's How's What's Who's That's Aint Isnt Arent Wasnt Werent Im Youre Were Theyre Hes Shes Its Theres Heres Wheres Hows Whats Whos Thats Aint BE BEING BEEN AM IS ISN'T ARE AREN'T WAS WASN'T WERE WEREN'T I'M YOU'RE WE'RE THEY'RE HE'S SHE'S IT'S THERE'S HERE'S WHERE'S HOW'S WHAT'S WHO'S THAT'S AINT ISNT ARENT WASNT WERENT IM YOURE WERE THEYRE HES SHES ITS THERES HERES WHERES HOWS WHATS WHOS THATS AINT"""
-    st.caption("The default word list is the list needed to force the language model to generate English without using the verb to be")
-    naughty_strings_list = st.text_area("Enter the list of strings that you don't want to be generated (exact match)", value = e_prime_string)
-    naughty_strings = naughty_strings_list.split(" ")
-elif mode == "reverse_lipogram":
-    nice_strings_list = st.text_area("Enter the list of strings that you DO want in each word, separated by a space", value = "t T")
-    nice_strings = nice_strings_list.split(" ")
-elif mode == "rhopalism":
-    length_constraint = form.number_input("Enter the length that the rhopalism should start with", value = 1)
-    st.caption("Rhopalisms are usually reliable, but sometimes you need to try generating two or three times for a perfect one")
-elif mode == "rhopalism-lipogram":
-    naughty_strings_list = st.text_area("Enter the list of strings that you don't want in each word, separated by a space", value = "E e")
-    naughty_strings = naughty_strings_list.split(" ")
-    length_constraint = form.number_input("Enter the length that the rhopalism should start with", value = 1)
-    st.caption("Rhopalisms are usually reliable, but sometimes you need to try generating two or three times for a perfect one")
+precision = form.selectbox("What precision are we loading the model with?", ["8bit", "16bit", "32bit"])
+form.caption("The lower the precision, the less RAM the model takes and the faster it runs, but the quality is reduced")
+
+form.header("Token Level Constraint Settings")
+form.subheader("Lipogram Constraint")
+form.caption("Lipograms are compositions where a certain letter or certain letters of the alphabet are omitted or discouraged")
+filter_mode = form.checkbox("Filter Mode?", value=False)
+form.caption("Enabling filter mode sets all selected tokens' probabilities to negative infinity")
+naughty_strings_list = form.text_input('Enter letters or words to filter or modify the probabilities of (comma separated):', value = "that,e")
+factor_input = form.text_input('Enter corresponding factors to add to the logits (comma separated, ignored if in filter mode):', value = "5,-99")
+
+form.header("Sequence Level Constraint Settings")
+form.header("Phrasal Constraint")
+force_word = form.text_input("Enter a word or sentence that is guaranteed to appear in the output", value = "lipogram")
+form.header("Disjunctive Constraint")
+force_flexible_input = form.text_input('Enter a list of words or sentences that the model must include at least one item from (in Python list format)', '["constraint", "banana"]')
+
+if force_flexible_input:
+    try:
+        force_flexible = ast.literal_eval(force_flexible_input)
+    except Exception as e:
+        st.write('Failed to parse the list. Please check your input.')
+        st.write('Error:', e)
+        force_flexible = []
 else:
-    length_constraint = form.number_input("Enter the length that each word should be restricted to (or greater/less than)", value = 5) + 1
+    pass


-length = form.number_input("Select how long you want the generated text to be", value = 100)
-number_of_tokens_to_sample = form.number_input("Select how many tokens we want to search through when we do the filtering", value = 25000)
-form.caption("Setting this to higher numbers will improve the experience but will slow generation. Low numbers may cause lots of blank or failed generations")
-temperature = form.number_input("How spicy/interesting do we want our model's output to be", value = 0.90, min_value = 0.0)
-form.caption("Setting this higher decreases the likelihood of high-probability words and increases the likelihood of low-probability (and presumably more interesting) words")
-form.caption("For more details on what these settings mean, see here: https://huggingface.co/blog/how-to-generate")
+if naughty_strings_list:
+    chars = naughty_strings_list.split(',')
+    factors = list(map(float, factor_input.split(',')))
+    chars_to_modify = dict(zip(chars, factors))
+else:
+    chars = ""
+    factors = []
+    chars_to_modify = {}
+
+generate_args = st.text_input('model.generate() arguments (in Python dictionary format)', '{"max_length": 50, "min_length": 50, "temperature": 2.0, "num_return_sequences": 1, "do_sample": False, "num_beams": 2, "repetition_penalty": 3.0}')
+st.caption("For more details on what these settings mean and a complete list of all settings, see here: https://huggingface.co/blog/how-to-generate and https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig and https://huggingface.co/docs/transformers/v4.29.1/en/main_classes/text_generation#transformers.GenerationMixin.generate")


-sequence = st.text_area("Enter a custom prompt", value = "I do ")
+sequence = st.text_area("Enter a custom prompt", value = "Tell me about ")

 form.form_submit_button("Generate some Constrained Text!")

+def parse_generate_args(args_str):  # note: currently unused; generate_args is parsed with ast.literal_eval below
+    args_list = args_str.split(',')
+    args_dict = {arg.split(':')[0]: int(arg.split(':')[1]) for arg in args_list if len(arg.split(':')) == 2}
+    return args_dict

-@st.cache(allow_output_mutation=True)
+@st.cache_resource
 def load_the_tokenizer():
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    return tokenizer
-
-@st.cache(allow_output_mutation=True)
-def load_the_model():
-    model = AutoModelForCausalLM.from_pretrained(model_name)
-    return model
-
-
-model = load_the_model()
-tokenizer = load_the_tokenizer()
-
-
-def isPalindrome(s):
-    return s == s[::-1]
-
-
-if mode == "rhopalism" or mode == "rhopalism-lipogram":
-    rhopalism_len = length_constraint
-
-
-nice_strings_pangram = list(string.ascii_lowercase)
-
-
-def get_next_word_without_e(input_sequence):
-    input_ids = tokenizer.encode(sequence, return_tensors="pt")
-    # get logits of last hidden state
-    next_token_candidates_logits = model(input_ids)[0][:, -1, :]
-    if temperature != 1.0:
-        next_token_candidates_logits = next_token_candidates_logits / temperature
-    # filter
-    filtered_next_token_candidates_logits = top_k_top_p_filtering(next_token_candidates_logits, top_k=int(number_of_tokens_to_sample), top_p=int(number_of_tokens_to_sample))
-    # sample and get a probability distribution
-    probs = F.softmax(filtered_next_token_candidates_logits, dim=-1)
-    next_token_candidates = torch.multinomial(probs, num_samples=int(number_of_tokens_to_sample)) ## 10000 random samples
-    word_list = []
-    for candidate_string in next_token_candidates:
-        for candidate in candidate_string:
-            resulting_string = tokenizer.decode(candidate) #skip_special_tokens=True, clean_up_tokenization_spaces=True)
-            ### Constrained text generation starts HERE
-            ## Lipogram - no naughty strings used
-            if mode == "lipogram" or mode == "e-prime":
-                if all(naughty_string not in resulting_string for naughty_string in naughty_strings): ## This returns at the first candidate free of naughty strings
-                    return resulting_string
-            ## Reverse lipogram - must use things in nice_strings
-            elif mode == "reverse_lipogram":
-                if any(nice_string in resulting_string for nice_string in nice_strings):
-                    return resulting_string
-            ## Length constraints
-            elif mode == "length_constrained":
-                ## Seems reliable if length is greater than 4
-                if len(resulting_string) == length_constraint:
-                    return resulting_string
-            elif mode == "greater_than_length":
-                ## Only sort of works
-                if len(resulting_string) >= length_constraint:
-                    return resulting_string
-            elif mode == "rhopalism":
-                ## Mostly works
-                if len(resulting_string) == rhopalism_len:
-                    return resulting_string
-            elif mode == "Pangram":
-                if any(c in nice_strings_pangram for c in resulting_string):
-                    return resulting_string
-            elif mode == "rhopalism-lipogram":
-                if len(resulting_string) == rhopalism_len:
-                    if all(naughty_string not in resulting_string for naughty_string in naughty_strings):
-                        return resulting_string
-
-    return " "
-
-
-j = 0
-i = length
-while i > 0:
-    new_word = get_next_word_without_e(input_sequence=sequence)
-    sequence = sequence + new_word
-    if mode == "rhopalism" or mode == "rhopalism-lipogram":
-        rhopalism_len += 1
-    i = i - 1
-    if mode == "Pangram":
-        for character in sequence:
-            if character in nice_strings_pangram:
-                nice_strings_pangram.remove(character)
-    j += 1
-
-st.write("GENERATED SEQUENCE: ")
-st.write(sequence)
-#st.write(nice_strings_pangram)
+    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast = False)
+    return tokenizer
+
+@st.cache_resource
+def load_the_model(precision):
+    if precision == "32bit":
+        model = AutoModelForCausalLM.from_pretrained(model_name, device_map='auto', load_in_8bit=False)
+    elif precision == "16bit":
+        model = AutoModelForCausalLM.from_pretrained(model_name, device_map='auto', load_in_8bit=False, torch_dtype=torch.float16)
+    else:
+        model = AutoModelForCausalLM.from_pretrained(model_name, device_map='auto', load_in_8bit=True)
+    return model
+
+if len(chars) != len(factors):
+    st.write("Please ensure that the number of characters matches the number of factors.")
+else:
+    model = load_the_model(precision)
+    tokenizer = load_the_tokenizer()
+    constraints = []
+    if force_word:
+        constraints.append(PhrasalConstraint(
+            tokenizer(force_word, add_special_tokens=False).input_ids
+        ))
+    if force_flexible_input:
+        constraints.append(DisjunctiveConstraint(
+            tokenizer(force_flexible, add_special_tokens=False).input_ids
+        ))
+    if filter_mode:
+        logits_processor = LogitsProcessorList([ModifyLogitsProcessor(tokenizer, chars_to_modify, filter_mode=True)])
+    else:
+        logits_processor = LogitsProcessorList([ModifyLogitsProcessor(tokenizer, chars_to_modify, filter_mode=False)])
+    input_ids = tokenizer.encode(sequence, return_tensors="pt").to('cuda')
+    generate_kwargs = ast.literal_eval(generate_args)
+    if constraints:
+        output_ids = model.generate(input_ids, constraints=constraints, logits_processor=logits_processor, **generate_kwargs)
+    else:
+        output_ids = model.generate(input_ids, logits_processor=logits_processor, **generate_kwargs)
+    st.write("GENERATED SEQUENCE(S): ")
+    for output in output_ids:
+        st.write(tokenizer.decode(output, skip_special_tokens = True, clean_up_tokenization_spaces = True))
+
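
Review note: this commit replaces the old per-word sample-and-filter loop with two composable constraint mechanisms from transformers. The first is token-level: a custom LogitsProcessor that edits next-token scores before decoding. The sketch below shows that idea in isolation; it assumes plain gpt2 for illustration (the app itself defaults to a Vicuna checkpoint on GPU), and BanCharsLogitsProcessor is a hypothetical, stripped-down stand-in for the committed ModifyLogitsProcessor, reduced to filter mode.

    from transformers import (AutoModelForCausalLM, AutoTokenizer,
                              LogitsProcessor, LogitsProcessorList)

    class BanCharsLogitsProcessor(LogitsProcessor):
        # Hypothetical stand-in: hard-bans every token containing a banned character
        def __init__(self, tokenizer, banned_chars):
            # Precompute ids of all vocab tokens whose surface form contains a banned character
            self.banned_ids = [token_id for token, token_id in tokenizer.get_vocab().items()
                               if any(c in token for c in banned_chars)]

        def __call__(self, input_ids, scores):
            scores[:, self.banned_ids] = -float("inf")  # filter mode: banned tokens become unsampleable
            return scores

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")
    input_ids = tokenizer.encode("Tell me about ", return_tensors="pt")
    output = model.generate(
        input_ids,
        logits_processor=LogitsProcessorList([BanCharsLogitsProcessor(tokenizer, ["e", "E"])]),
        do_sample=True, top_k=50, max_length=40,
    )
    print(tokenizer.decode(output[0], skip_special_tokens=True))

The committed processor generalizes this by optionally adding a signed per-character factor to the logits instead of hard-banning, which gives softer steering than filter mode.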
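Review note: the second mechanism is sequence-level. PhrasalConstraint forces an exact phrase to appear in the output, and DisjunctiveConstraint forces at least one phrase from a set to appear. One caveat for reviewers: constrained decoding in transformers runs under beam search, so generate() must be called with num_beams > 1 and do_sample=False, which the default generate_args shipped in this commit happen to satisfy. A minimal sketch, again assuming gpt2 for illustration:

    from transformers import (AutoModelForCausalLM, AutoTokenizer,
                              PhrasalConstraint, DisjunctiveConstraint)

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")

    constraints = [
        # This exact phrase must appear somewhere in every returned sequence
        PhrasalConstraint(tokenizer("lipogram", add_special_tokens=False).input_ids),
        # At least one of these alternatives must appear
        DisjunctiveConstraint(tokenizer(["constraint", "banana"], add_special_tokens=False).input_ids),
    ]

    input_ids = tokenizer.encode("Tell me about ", return_tensors="pt")
    output = model.generate(input_ids, constraints=constraints,
                            num_beams=2, do_sample=False, max_length=50)
    print(tokenizer.decode(output[0], skip_special_tokens=True))

If a user pastes sampling-enabled generate_args into the new free-form box, recent transformers versions reject the constraints rather than silently ignoring them, so the defaults matter here.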
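Review note: parsing that free-form generate_args box with ast.literal_eval rather than eval is the right call for a public Space, since literal_eval accepts only Python literals (dicts, lists, strings, numbers, booleans) and raises instead of executing anything else. A quick illustration of the behavior the app relies on:

    import ast

    # Well-formed literal: parsed into a dict and forwarded as model.generate(**kwargs)
    kwargs = ast.literal_eval('{"max_length": 50, "num_beams": 2, "do_sample": False}')
    assert kwargs["num_beams"] == 2

    # Anything that is not a pure literal is rejected rather than executed
    try:
        ast.literal_eval('__import__("os").system("echo pwned")')
    except ValueError as err:
        print("rejected:", err)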