File size: 9,356 Bytes
7bc1fb2
a90aae5
8784af6
 
9677df9
8784af6
 
 
e234712
fc092d3
f80653e
d733a1d
a90aae5
292fc01
49e6c2a
fc092d3
df899db
3d73dee
fe2f37b
80ab8fe
6ba8329
6510c54
a54a5d2
e64a5c2
09e268e
21ee0e4
83a1127
738039e
 
 
fbe6573
726d708
 
11a15f4
 
26f67a4
 
5957100
 
4f05762
 
7b650b8
 
30e3713
2c6750d
7b650b8
6cb6628
 
dc6b708
2473dde
dc6b708
8784af6
752107c
8784af6
 
 
e29aa1d
8784af6
80ab8fe
8784af6
 
 
2966e63
8784af6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
08112a4
8451261
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
08112a4
 
e7e1f57
 
08112a4
 
 
 
e7e1f57
08112a4
 
 
 
 
 
 
 
 
 
 
 
e7e1f57
08112a4
 
 
752107c
 
 
 
 
 
 
 
 
 
08112a4
8784af6
 
 
e234712
08112a4
 
 
 
8451261
 
752107c
 
8784af6
 
 
e234712
8784af6
 
 
 
 
 
 
 
 
80ab8fe
8784af6
08112a4
 
 
 
 
752107c
 
 
 
 
 
 
 
 
 
 
8451261
752107c
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
import torch

# Default contents of the "Enter sentence" text area: one worked example
# ("informal english: ..." followed by its "Translated into the Style of
# Abraham Lincoln: ..." rewrite) and then an open "informal english: " slot
# for the user to complete.
first = """informal english: corn fields are all across illinois, visible once you leave chicago.\nTranslated into the Style of Abraham Lincoln: corn fields ( permeate illinois / span the state of illinois / ( occupy / persist in ) all corners of illinois / line the horizon of illinois / envelop the landscape of illinois ), manifesting themselves visibly as one ventures beyond chicago.\n\ninformal english: """

# ``st.cache`` is deprecated; ``st.cache_resource`` is its replacement for
# global, unserialisable objects such as models. Fall back to the legacy
# decorator on Streamlit versions that predate ``cache_resource``.
if hasattr(st, "cache_resource"):
    _cache_models = st.cache_resource
else:
    _cache_models = st.cache(allow_output_mutation=True)


@_cache_models
def get_model():
    """Load both causal language models and their tokenizers.

    The result is cached by Streamlit, so the checkpoints are loaded once
    rather than on every script rerun.

    Earlier revisions pointed the primary model at various
    ``BigSalmon/InformalToFormalLincoln*`` / ``GPTNeo*`` checkpoints; change
    ``primary_model_repo`` below to try another one.

    Returns:
        A ``(model, model2, tokenizer, tokenizer2)`` tuple, where
        ``model``/``tokenizer`` are the primary pair and
        ``model2``/``tokenizer2`` the secondary ("other model") pair.
    """
    primary_model_repo = "cerebras/Cerebras-GPT-1.3B"
    # NOTE(review): the primary tokenizer is loaded from a different repo
    # than the primary model -- confirm the two vocabularies are compatible.
    primary_tokenizer_repo = "BigSalmon/InformalToFormalLincoln87Paraphrase"
    secondary_repo = "BigSalmon/InformalToFormalLincolnMedium"

    model = AutoModelForCausalLM.from_pretrained(primary_model_repo)
    tokenizer = AutoTokenizer.from_pretrained(primary_tokenizer_repo)
    tokenizer2 = AutoTokenizer.from_pretrained(secondary_repo)
    model2 = AutoModelForCausalLM.from_pretrained(secondary_repo)
    return model, model2, tokenizer, tokenizer2
    
# Primary and secondary model/tokenizer pairs, shared by every handler below.
model, model2, tokenizer, tokenizer2 = get_model()

# Pointer to the prompt templates documented on the model card.
st.text('''For Prompt Templates: https://huggingface.co/BigSalmon/InformalToFormalLincoln82Paraphrase''')

# Generation controls. These module-level values are read directly by the
# generation helpers and the form handlers:
#   temp              -> sampling temperature passed to generate()
#   number_of_outputs -> num_return_sequences, and the top-k size in BestProbs5
#   lengths           -> number of tokens added to the prompt length for
#                        both max_length and min_length
#   bad_words         -> whitespace-separated words to ban from generation
#   logs_outputs      -> how many top next-token candidates the
#                        "Log Probs" buttons display
temp = st.sidebar.slider("Temperature", 0.7, 1.5)
number_of_outputs = st.sidebar.slider("Number of Outputs", 5, 50)
lengths = st.sidebar.slider("Length", 3, 500)
bad_words = st.text_input("Words You Do Not Want Generated", " core lemon height time ")
logs_outputs = st.sidebar.slider("Logit Outputs", 50, 300)

def run_generate(text, bad_words):
    """Sample continuations of ``text`` from the primary model.

    Reads the module-level ``model``, ``tokenizer``, ``lengths``, ``temp``
    and ``number_of_outputs`` settings.

    Args:
        text: Prompt string to continue.
        bad_words: Whitespace-separated words that must not be generated.
            May be empty or blank, in which case nothing is banned.

    Returns:
        A list with one string per returned sequence, each containing only
        the newly generated text (the prompt tokens are dropped).
    """
    input_ids = tokenizer.encode(text, return_tensors='pt')
    prompt_len = input_ids.shape[-1]

    # Each banned word is tokenised with a leading space so it matches the
    # word as it appears mid-sentence.
    bad_word_ids = [tokenizer(" " + word).input_ids for word in bad_words.split()]

    # min_length and max_length are pinned to the same value so every sample
    # runs for the requested number of tokens past the prompt.
    target_len = prompt_len + lengths
    sample_outputs = model.generate(
        input_ids,
        do_sample=True,
        max_length=target_len,
        min_length=target_len,
        top_k=50,
        temperature=temp,
        num_return_sequences=number_of_outputs,
        # generate() rejects an empty list, so pass None when nothing is banned.
        bad_words_ids=bad_word_ids or None,
    )

    # Slice the prompt off by token position instead of str.replace on the
    # decoded text: replace() silently fails when decoding does not reproduce
    # the prompt exactly, and would also delete later repeats of the prompt.
    return [tokenizer.decode(sequence[prompt_len:]) for sequence in sample_outputs]
  
def BestProbs5(prompt, bad_words="hey"):
    """Branch ``prompt`` on its most likely next tokens and sample from each.

    The primary model scores the next token after ``prompt``; for each of the
    ``number_of_outputs`` highest-scoring tokens the extended prompt is
    written to the page, followed by the continuations ``run_generate``
    samples for it.

    Args:
        prompt: Text to extend; surrounding whitespace is stripped first.
        bad_words: Whitespace-separated words banned during sampling.
            Defaults to ``"hey"``, the value previously hard-coded here.
    """
    prompt = prompt.strip()
    input_ids = torch.tensor([tokenizer.encode(prompt)])

    # Inference only: no_grad avoids building an autograd graph for the
    # forward pass.
    with torch.no_grad():
        next_token_logits = model(input_ids, return_dict=True).logits[0, -1]

    _, best_indices = next_token_logits.topk(number_of_outputs)
    for token_id in best_indices:
        candidate = prompt + tokenizer.decode([token_id.item()])
        st.write(candidate)
        st.write(run_generate(candidate, bad_words))
  
def run_generate2(text, bad_words):
    """Sample continuations of ``text`` from the secondary model.

    Reads the module-level ``model2``, ``tokenizer2``, ``lengths``, ``temp``
    and ``number_of_outputs`` settings.

    Args:
        text: Prompt string to continue.
        bad_words: Whitespace-separated words that must not be generated.
            May be empty or blank, in which case nothing is banned.

    Returns:
        A list with one string per returned sequence, each containing only
        the newly generated text (the prompt tokens are dropped).
    """
    input_ids = tokenizer2.encode(text, return_tensors='pt')
    prompt_len = input_ids.shape[-1]

    # Each banned word is tokenised with a leading space so it matches the
    # word as it appears mid-sentence.
    bad_word_ids = [tokenizer2(" " + word).input_ids for word in bad_words.split()]

    # min_length and max_length are pinned to the same value so every sample
    # runs for the requested number of tokens past the prompt.
    target_len = prompt_len + lengths
    sample_outputs = model2.generate(
        input_ids,
        do_sample=True,
        max_length=target_len,
        min_length=target_len,
        top_k=50,
        temperature=temp,
        num_return_sequences=number_of_outputs,
        # generate() rejects an empty list, so pass None when nothing is banned.
        bad_words_ids=bad_word_ids or None,
    )

    # Slice the prompt off by token position instead of str.replace on the
    # decoded text: replace() silently fails when decoding does not reproduce
    # the prompt exactly, and would also delete later repeats of the prompt.
    return [tokenizer2.decode(sequence[prompt_len:]) for sequence in sample_outputs]
  
def prefix_format(sentence):
    """Write ``sentence`` to the page in ``<Prefix>/<Suffix>/<Middle>`` form.

    The text before the first ``[MASK]`` becomes the ``<Prefix>`` section and
    the text after it the ``<Suffix>`` section, followed by an open
    ``<Middle>`` tag. If ``sentence`` has no ``[MASK]``, a hint asking the
    user to add one is written instead.

    Args:
        sentence: Input text, expected to contain the literal ``[MASK]``.
    """
    # partition() finds the marker even when it touches punctuation
    # ("[MASK]." or "([MASK])"); looking it up as a whitespace-delimited word
    # raised ValueError in those cases despite the substring check passing.
    before, marker, after = sentence.partition("[MASK]")
    if not marker:
        st.write("Add [MASK] to sentence")
        return

    # split/join collapses runs of whitespace and trims both ends.
    prefix = ' '.join(before.split())
    suffix = ' '.join(after.split())
    output = ("<Prefix> " + prefix + " <Prefix> " + "<Suffix> " + suffix + " <Suffix>" + " <Middle>")
    st.write(output)
 
def _top_next_tokens(lm, tok, prompt, k):
    """Return the ``k`` highest-scoring next tokens after ``prompt``, decoded."""
    token_ids = torch.tensor([tok.encode(prompt)])
    # Inference only: no_grad avoids building an autograd graph.
    with torch.no_grad():
        next_token_logits = lm(token_ids, return_dict=True).logits[0, -1]
    _, best_indices = next_token_logits.topk(k)
    return [tok.decode([token_id.item()]) for token_id in best_indices]


# Main input form: one text area plus six submit buttons, each routing the
# entered text to a different action. Results are rendered inside the form.
with st.form(key='my_form'):
    text = st.text_area(label='Enter sentence', value=first)
    submit_button = st.form_submit_button(label='Submit')
    submit_button2 = st.form_submit_button(label='Submit Log Probs')

    submit_button3 = st.form_submit_button(label='Submit Other Model')
    submit_button4 = st.form_submit_button(label='Submit Log Probs Other Model')

    submit_button5 = st.form_submit_button(label='Most Prob')

    submit_button6 = st.form_submit_button(label='Turn Sentence with [MASK] into <Prefix> Format')

    if submit_button:
        # Sample continuations from the primary model.
        translated_text = run_generate(text, bad_words)
        st.write(translated_text if translated_text else "No translation found")
    if submit_button2:
        # Show the primary model's top next-token candidates.
        print(text)
        st.write(_top_next_tokens(model, tokenizer, text, logs_outputs))
    if submit_button3:
        # Sample continuations from the secondary model.
        translated_text = run_generate2(text, bad_words)
        st.write(translated_text if translated_text else "No translation found")
    if submit_button4:
        # Show the secondary model's top next-token candidates (now also
        # under no_grad, matching the primary-model handler).
        print(text)
        st.write(_top_next_tokens(model2, tokenizer2, text, logs_outputs))
    if submit_button5:
        # Branch on the most likely next tokens and sample from each branch.
        BestProbs5(text)
    if submit_button6:
        prefix_format(text)