Mila committed on
Commit
3139db4
1 Parent(s): 33e257e

This time for sure x4

Files changed (39)
  1. app_context.py +253 -257
  2. flan-t5-train.py +234 -301
  3. results/checkpoint-16000/added_tokens.json +102 -0
  4. results/checkpoint-16000/config.json +62 -0
  5. results/checkpoint-16000/generation_config.json +6 -0
  6. results/checkpoint-16000/model.safetensors +3 -0
  7. results/checkpoint-16000/optimizer.pt +3 -0
  8. results/checkpoint-16000/rng_state.pth +3 -0
  9. results/checkpoint-16000/scheduler.pt +3 -0
  10. results/checkpoint-16000/special_tokens_map.json +125 -0
  11. results/checkpoint-16000/spiece.model +3 -0
  12. results/checkpoint-16000/tokenizer_config.json +939 -0
  13. results/checkpoint-16000/trainer_state.json +319 -0
  14. results/checkpoint-16000/training_args.bin +3 -0
  15. results/checkpoint-16500/added_tokens.json +102 -0
  16. results/checkpoint-16500/config.json +62 -0
  17. results/checkpoint-16500/generation_config.json +6 -0
  18. results/checkpoint-16500/model.safetensors +3 -0
  19. results/checkpoint-16500/optimizer.pt +3 -0
  20. results/checkpoint-16500/rng_state.pth +3 -0
  21. results/checkpoint-16500/scheduler.pt +3 -0
  22. results/checkpoint-16500/special_tokens_map.json +125 -0
  23. results/checkpoint-16500/spiece.model +3 -0
  24. results/checkpoint-16500/tokenizer_config.json +939 -0
  25. results/checkpoint-16500/trainer_state.json +325 -0
  26. results/checkpoint-16500/training_args.bin +3 -0
  27. results/checkpoint-17000/added_tokens.json +102 -0
  28. results/checkpoint-17000/config.json +62 -0
  29. results/checkpoint-17000/generation_config.json +6 -0
  30. results/checkpoint-17000/model.safetensors +3 -0
  31. results/checkpoint-17000/optimizer.pt +3 -0
  32. results/checkpoint-17000/rng_state.pth +3 -0
  33. results/checkpoint-17000/scheduler.pt +3 -0
  34. results/checkpoint-17000/special_tokens_map.json +125 -0
  35. results/checkpoint-17000/spiece.model +3 -0
  36. results/checkpoint-17000/tokenizer_config.json +939 -0
  37. results/checkpoint-17000/trainer_state.json +331 -0
  38. results/checkpoint-17000/training_args.bin +3 -0
  39. word_embedding.py +619 -0
app_context.py CHANGED
@@ -1,258 +1,254 @@
- import gradio as gr
- import math
- import spacy
- from datasets import load_dataset
- from sentence_transformers import SentenceTransformer
- from sentence_transformers import InputExample
- from sentence_transformers import losses
- from sentence_transformers import util
- from transformers import pipeline, T5Tokenizer
- from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
- from transformers import TrainingArguments, Trainer, T5ForConditionalGeneration
- import torch
- import torch.nn.functional as F
- from torch.utils.data import DataLoader
- import numpy as np
- import evaluate
- import nltk
- from nltk.corpus import stopwords
- import subprocess
- import sys
- import random
- from textwrap import fill
-
- # !pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
- subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl'])
- # tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
- model_base = "results/checkpoint-17000"
- nltk.download('stopwords')
- nlp = spacy.load("en_core_web_sm")
- stops = stopwords.words("english")
- ROMAN_CONSTANTS = (
-     ( "", "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX" ),
-     ( "", "X", "XX", "XXX", "XL", "L", "LX", "LXX", "LXXX", "XC" ),
-     ( "", "C", "CC", "CCC", "CD", "D", "DC", "DCC", "DCCC", "CM" ),
-     ( "", "M", "MM", "MMM", "", "", "-", "", "", "" ),
-     ( "", "i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix" ),
-     ( "", "x", "xx", "xxx", "xl", "l", "lx", "lxx", "lxxx", "xc" ),
-     ( "", "c", "cc", "ccc", "cd", "d", "dc", "dcc", "dccc", "cm" ),
-     ( "", "m", "mm", "mmm", "", "", "-", "", "", "" ),
- )
-
- # answer = "Pizza"
- guesses = []
- return_guesses = []
- answer = "Moon"
- word1 = "Black"
- word2 = "White"
- word3 = "Sun"
- base_prompts = ["Sun is to Moon as ", "Black is to White as ", "Atom is to Element as",
-                 "Athens is to Greece as ", "Cat is to Dog as ", "Robin is to Bird as",
-                 "Hunger is to Ambition as "]
-
-
- #Mean Pooling - Take attention mask into account for correct averaging
- def mean_pooling(model_output, attention_mask):
-     token_embeddings = model_output['token_embeddings'] #First element of model_output contains all token embeddings
-     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
-
-
- def normalize(comment, lowercase, remove_stopwords):
-     if lowercase:
-         comment = comment.lower()
-     comment = nlp(comment)
-     lemmatized = list()
-     for word in comment:
-         lemma = word.lemma_.strip()
-         if lemma:
-             if not remove_stopwords or (remove_stopwords and lemma not in stops):
-                 lemmatized.append(lemma)
-     return " ".join(lemmatized)
-
-
- # def tokenize_function(examples):
- #     return tokenizer(examples["text"])
-
-
- def compute_metrics(eval_pred):
-     logits, labels = eval_pred
-     predictions = np.argmax(logits, axis=-1)
-     metric = evaluate.load("accuracy")
-     return metric.compute(predictions=predictions, references=labels)
-
-
- def get_model():
-     global model_base
-     # last_checkpoint = "./results/checkpoint-22500"
-
-     finetuned_model = T5ForConditionalGeneration.from_pretrained(model_base)
-     tokenizer = T5Tokenizer.from_pretrained(model_base)
-     # model = SentenceTransformer(model_base)
-     gpu_available = torch.cuda.is_available()
-     device = torch.device("cuda" if gpu_available else "cpu")
-     finetuned_model = finetuned_model.to(device)
-     return finetuned_model, tokenizer
-
-
- def cosine_scores(model, sentence):
-     global word1
-     global word2
-     global word3
-     # sentence1 = f"{word1} is to {word2} as"
-     embeddings1 = model.encode(sentence, convert_to_tensor=True)
-
- def embeddings(model, sentences, tokenizer):
-     global word1
-     global word2
-     global word3
-     global model_base
-     gpu_available = torch.cuda.is_available()
-     device = torch.device("cuda" if gpu_available else "cpu")
-     # device = torch.device('cuda:0')
-     # embeddings = model.encode(sentences)
-     question = "Please answer to this question: " + sentences
-
-     inputs = tokenizer(question, return_tensors="pt")
-
-     print(inputs)
-     # print(inputs.device)
-     print(model.device)
-     print(inputs['input_ids'].device)
-     print(inputs['attention_mask'].device)
-
-     inputs['attention_mask'] = inputs['attention_mask'].to(device)
-     inputs['input_ids'] = inputs['input_ids'].to(device)
-
-     outputs = model.generate(**inputs)
-     answer = tokenizer.decode(outputs[0])
-     answer = answer[6:-4]
-     # print(fill(answer, width=80))
-
-     print("ANSWER IS", answer)
-
-     return answer
-
-
- def random_word(model, tokenizer):
-     global model_base
-     vocab = tokenizer.get_vocab()
-     # with open(model_base + '/vocab.txt', 'r') as file:
-     line = ""
-     # content = file.readlines()
-     length = tokenizer.vocab_size
-     # print(vocab)
-     while line == "":
-         rand_line = random.randrange(0, length)
-         # print("TRYING TO FIND", rand_line, "OUT OF", length, "WITH VOCAB OF TYPE", type(vocab))
-         for word, id in vocab.items():
-             if id == rand_line and word[0].isalpha() and word not in stops and word not in ROMAN_CONSTANTS:
-                 # if vocab[rand_line][0].isalpha() and vocab[rand_line][:-1] not in stops and vocab[rand_line][:-1] not in ROMAN_CONSTANTS:
-                 line = word
-             elif id == rand_line:
-                 print(f"{word} is not alpha or is a stop word")
-     # for num, aline in enumerate(file, 1997):
-     #     if random.randrange(num) and aline.isalpha():
-     #         continue
-     #     # elif not aline.isalpha():
-
-     #     line = aline
-     print(line)
-     return line
-
-
- def generate_prompt(model, tokenizer):
-     global word1
-     global word2
-     global word3
-     global answer
-     global base_prompts
-     word1 = random_word(model, tokenizer)
-     # word2 = random_word()
-
-     word2 = embeddings(model, f"{base_prompts[random.randint(0, len(base_prompts) - 1)]}{word1} is to ___.", tokenizer)
-     word3 = random_word(model, tokenizer)
-     sentence = f"{word1} is to {word2} as {word3} is to ___."
-     print(sentence)
-     answer = embeddings(model, sentence, tokenizer)
-     print("ANSWER IS", answer)
-     return f"# {word1} is to {word2} as {word3} is to ___."
-     # cosine_scores(model, sentence)
-
-
- def greet(name):
-     return "Hello " + name + "!!"
-
- def check_answer(guess:str):
-     global guesses
-     global answer
-     global return_guesses
-     global word1
-     global word2
-     global word3
-
-     model, tokenizer = get_model()
-     output = ""
-     protected_guess = guess
-     sentence = f"{word1} is to {word2} as [MASK] is to {guess}."
-
-     other_word = embeddings(model, sentence, tokenizer)
-     guesses.append(guess)
-
-
-
-     for guess in return_guesses:
-         output += ("- " + guess + "<br>")
-
-     # output = output[:-1]
-     prompt = f"{word1} is to {word2} as {word3} is to ___."
-     # print("IS", protected_guess, "EQUAL TO", answer, ":", protected_guess.lower() == answer.lower())
-
-     if protected_guess.lower() == answer.lower():
-         return_guesses.append(f"{protected_guess}: {word1} is to {word2} as {word3} is to {protected_guess}.")
-         output += f"<span style='color:green'>- {return_guesses[-1]}</span><br>"
-         new_prompt = generate_prompt(model, tokenizer)
-         return new_prompt, "Correct!", output
-     else:
-         return_guess = f"{protected_guess}: {word1} is to {word2} as {other_word} is to {protected_guess}."
-         return_guesses.append(return_guess)
-         output += ("- " + return_guess + " <br>")
-         return prompt, "Try again!", output
-
- def main():
-     global word1
-     global word2
-     global word3
-     global answer
-     # answer = "Moon"
-     global guesses
-
-
-     # num_rows, data_type, value, example, embeddings = training()
-     # sent_embeddings = embeddings()
-     model, tokenizer = get_model()
-     generate_prompt(model, tokenizer)
-
-     prompt = f"{word1} is to {word2} as {word3} is to ____"
-     print(prompt)
-     print("TESTING EMBEDDINGS")
-     with gr.Blocks() as iface:
-         mark_question = gr.Markdown(prompt)
-         with gr.Tab("Guess"):
-             text_input = gr.Textbox()
-             text_output = gr.Textbox()
-             text_button = gr.Button("Submit")
-         with gr.Accordion("Open for previous guesses"):
-             text_guesses = gr.Markdown()
-         # with gr.Tab("Testing"):
-         #     gr.Markdown(f"""The Embeddings are {sent_embeddings}.""")
-         text_button.click(check_answer, inputs=[text_input], outputs=[mark_question, text_output, text_guesses])
-     # iface = gr.Interface(fn=greet, inputs="text", outputs="text")
-     iface.launch()
-
-
-
-
-
- if __name__ == "__main__":
+ import gradio as gr
+ import math
+ import spacy
+ from datasets import load_dataset
+ from transformers import pipeline, T5Tokenizer
+ from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
+ from transformers import TrainingArguments, Trainer, T5ForConditionalGeneration
+ import torch
+ import torch.nn.functional as F
+ from torch.utils.data import DataLoader
+ import numpy as np
+ import evaluate
+ import nltk
+ from nltk.corpus import stopwords
+ import subprocess
+ import sys
+ import random
+ from textwrap import fill
+
+ # !pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
+ subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl'])
+ # tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+ model_base = "results/checkpoint-17000"
+ nltk.download('stopwords')
+ nlp = spacy.load("en_core_web_sm")
+ stops = stopwords.words("english")
+ ROMAN_CONSTANTS = (
+     ( "", "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX" ),
+     ( "", "X", "XX", "XXX", "XL", "L", "LX", "LXX", "LXXX", "XC" ),
+     ( "", "C", "CC", "CCC", "CD", "D", "DC", "DCC", "DCCC", "CM" ),
+     ( "", "M", "MM", "MMM", "", "", "-", "", "", "" ),
+     ( "", "i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix" ),
+     ( "", "x", "xx", "xxx", "xl", "l", "lx", "lxx", "lxxx", "xc" ),
+     ( "", "c", "cc", "ccc", "cd", "d", "dc", "dcc", "dccc", "cm" ),
+     ( "", "m", "mm", "mmm", "", "", "-", "", "", "" ),
+ )
+
+ # answer = "Pizza"
+ guesses = []
+ return_guesses = []
+ answer = "Moon"
+ word1 = "Black"
+ word2 = "White"
+ word3 = "Sun"
+ base_prompts = ["Sun is to Moon as ", "Black is to White as ", "Atom is to Element as",
+                 "Athens is to Greece as ", "Cat is to Dog as ", "Robin is to Bird as",
+                 "Hunger is to Ambition as "]
+
+
+ #Mean Pooling - Take attention mask into account for correct averaging
+ def mean_pooling(model_output, attention_mask):
+     token_embeddings = model_output['token_embeddings'] #First element of model_output contains all token embeddings
+     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+
+ def normalize(comment, lowercase, remove_stopwords):
+     if lowercase:
+         comment = comment.lower()
+     comment = nlp(comment)
+     lemmatized = list()
+     for word in comment:
+         lemma = word.lemma_.strip()
+         if lemma:
+             if not remove_stopwords or (remove_stopwords and lemma not in stops):
+                 lemmatized.append(lemma)
+     return " ".join(lemmatized)
+
+
+ # def tokenize_function(examples):
+ #     return tokenizer(examples["text"])
+
+
+ def compute_metrics(eval_pred):
+     logits, labels = eval_pred
+     predictions = np.argmax(logits, axis=-1)
+     metric = evaluate.load("accuracy")
+     return metric.compute(predictions=predictions, references=labels)
+
+
+ def get_model():
+     global model_base
+     # last_checkpoint = "./results/checkpoint-22500"
+
+     finetuned_model = T5ForConditionalGeneration.from_pretrained(model_base)
+     tokenizer = T5Tokenizer.from_pretrained(model_base)
+     # model = SentenceTransformer(model_base)
+     gpu_available = torch.cuda.is_available()
+     device = torch.device("cuda" if gpu_available else "cpu")
+     finetuned_model = finetuned_model.to(device)
+     return finetuned_model, tokenizer
+
+
+ def cosine_scores(model, sentence):
+     global word1
+     global word2
+     global word3
+     # sentence1 = f"{word1} is to {word2} as"
+     embeddings1 = model.encode(sentence, convert_to_tensor=True)
+
+ def embeddings(model, sentences, tokenizer):
+     global word1
+     global word2
+     global word3
+     global model_base
+     gpu_available = torch.cuda.is_available()
+     device = torch.device("cuda" if gpu_available else "cpu")
+     # device = torch.device('cuda:0')
+     # embeddings = model.encode(sentences)
+     question = "Please answer to this question: " + sentences
+
+     inputs = tokenizer(question, return_tensors="pt")
+
+     print(inputs)
+     # print(inputs.device)
+     print(model.device)
+     print(inputs['input_ids'].device)
+     print(inputs['attention_mask'].device)
+
+     inputs['attention_mask'] = inputs['attention_mask'].to(device)
+     inputs['input_ids'] = inputs['input_ids'].to(device)
+
+     outputs = model.generate(**inputs)
+     answer = tokenizer.decode(outputs[0])
+     answer = answer[6:-4]
+     # print(fill(answer, width=80))
+
+     print("ANSWER IS", answer)
+
+     return answer
+
+
+ def random_word(model, tokenizer):
+     global model_base
+     vocab = tokenizer.get_vocab()
+     # with open(model_base + '/vocab.txt', 'r') as file:
+     line = ""
+     # content = file.readlines()
+     length = tokenizer.vocab_size
+     # print(vocab)
+     while line == "":
+         rand_line = random.randrange(0, length)
+         # print("TRYING TO FIND", rand_line, "OUT OF", length, "WITH VOCAB OF TYPE", type(vocab))
+         for word, id in vocab.items():
+             if id == rand_line and word[0].isalpha() and word not in stops and word not in ROMAN_CONSTANTS:
+                 # if vocab[rand_line][0].isalpha() and vocab[rand_line][:-1] not in stops and vocab[rand_line][:-1] not in ROMAN_CONSTANTS:
+                 line = word
+             elif id == rand_line:
+                 print(f"{word} is not alpha or is a stop word")
+     # for num, aline in enumerate(file, 1997):
+     #     if random.randrange(num) and aline.isalpha():
+     #         continue
+     #     # elif not aline.isalpha():
+
+     #     line = aline
+     print(line)
+     return line
+
+
+ def generate_prompt(model, tokenizer):
+     global word1
+     global word2
+     global word3
+     global answer
+     global base_prompts
+     word1 = random_word(model, tokenizer)
+     # word2 = random_word()
+
+     word2 = embeddings(model, f"{base_prompts[random.randint(0, len(base_prompts) - 1)]}{word1} is to ___.", tokenizer)
+     word3 = random_word(model, tokenizer)
+     sentence = f"{word1} is to {word2} as {word3} is to ___."
+     print(sentence)
+     answer = embeddings(model, sentence, tokenizer)
+     print("ANSWER IS", answer)
+     return f"# {word1} is to {word2} as {word3} is to ___."
+     # cosine_scores(model, sentence)
+
+
+ def greet(name):
+     return "Hello " + name + "!!"
+
+ def check_answer(guess:str):
+     global guesses
+     global answer
+     global return_guesses
+     global word1
+     global word2
+     global word3
+
+     model, tokenizer = get_model()
+     output = ""
+     protected_guess = guess
+     sentence = f"{word1} is to {word2} as [MASK] is to {guess}."
+
+     other_word = embeddings(model, sentence, tokenizer)
+     guesses.append(guess)
+
+
+
+     for guess in return_guesses:
+         output += ("- " + guess + "<br>")
+
+     # output = output[:-1]
+     prompt = f"{word1} is to {word2} as {word3} is to ___."
+     # print("IS", protected_guess, "EQUAL TO", answer, ":", protected_guess.lower() == answer.lower())
+
+     if protected_guess.lower() == answer.lower():
+         return_guesses.append(f"{protected_guess}: {word1} is to {word2} as {word3} is to {protected_guess}.")
+         output += f"<span style='color:green'>- {return_guesses[-1]}</span><br>"
+         new_prompt = generate_prompt(model, tokenizer)
+         return new_prompt, "Correct!", output
+     else:
+         return_guess = f"{protected_guess}: {word1} is to {word2} as {other_word} is to {protected_guess}."
+         return_guesses.append(return_guess)
+         output += ("- " + return_guess + " <br>")
+         return prompt, "Try again!", output
+
+ def main():
+     global word1
+     global word2
+     global word3
+     global answer
+     # answer = "Moon"
+     global guesses
+
+
+     # num_rows, data_type, value, example, embeddings = training()
+     # sent_embeddings = embeddings()
+     model, tokenizer = get_model()
+     generate_prompt(model, tokenizer)
+
+     prompt = f"{word1} is to {word2} as {word3} is to ____"
+     print(prompt)
+     print("TESTING EMBEDDINGS")
+     with gr.Blocks() as iface:
+         mark_question = gr.Markdown(prompt)
+         with gr.Tab("Guess"):
+             text_input = gr.Textbox()
+             text_output = gr.Textbox()
+             text_button = gr.Button("Submit")
+         with gr.Accordion("Open for previous guesses"):
+             text_guesses = gr.Markdown()
+         # with gr.Tab("Testing"):
+         #     gr.Markdown(f"""The Embeddings are {sent_embeddings}.""")
+         text_button.click(check_answer, inputs=[text_input], outputs=[mark_question, text_output, text_guesses])
+     # iface = gr.Interface(fn=greet, inputs="text", outputs="text")
+     iface.launch()
+
+
+
+
+
+ if __name__ == "__main__":
      main()
flan-t5-train.py CHANGED
@@ -1,302 +1,235 @@
- import gradio as gr
- import math
- from datasets import load_dataset
- from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
- from transformers import TrainingArguments, Trainer
- from transformers import T5Tokenizer, T5ForConditionalGeneration
- import torch
- import torch.nn.functional as F
- from torch.utils.data import DataLoader
- import numpy as np
- import evaluate
- import nltk
- from nltk.corpus import stopwords
- import subprocess
- import sys
- from transformers import T5Tokenizer, DataCollatorForSeq2Seq
- from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
- from transformers import DataCollatorWithPadding, DistilBertTokenizerFast
- from transformers import TrainingArguments
- from transformers import (
-     BertModel,
-     BertTokenizerFast,
-     Trainer,
-     EvalPrediction
- )
-
- # !pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
- # subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl'])
- # tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
- # data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
- # nltk.download('stopwords')
- # nlp = spacy.load("en_core_web_sm")
- # stops = stopwords.words("english")
- nltk.download("punkt", quiet=True)
- metric = evaluate.load("rouge")
-
- # Global Parameters
- L_RATE = 3e-4
- BATCH_SIZE = 8
- PER_DEVICE_EVAL_BATCH = 4
- WEIGHT_DECAY = 0.01
- SAVE_TOTAL_LIM = 3
- NUM_EPOCHS = 10
-
- # Set up training arguments
- training_args = Seq2SeqTrainingArguments(
-     output_dir="./results",
-     evaluation_strategy="epoch",
-     learning_rate=L_RATE,
-     per_device_train_batch_size=BATCH_SIZE,
-     per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
-     weight_decay=WEIGHT_DECAY,
-     save_total_limit=SAVE_TOTAL_LIM,
-     num_train_epochs=NUM_EPOCHS,
-     predict_with_generate=True,
-     push_to_hub=False
- )
-
- model_id = "google/flan-t5-base"
- tokenizer = T5Tokenizer.from_pretrained(model_id)
- # tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
- # metric = evaluate.load("accuracy")
-
- def tokenize_function(examples):
-     return tokenizer(examples["stem"], padding="max_length", truncation=True)
-
-
- #Mean Pooling - Take attention mask into account for correct averaging
- def mean_pooling(model_output, attention_mask):
-     token_embeddings = model_output[0] #First element of model_output contains all token embeddings
-     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
-
-
- # def compute_metrics(eval_pred):
- #     logits, labels = eval_pred
- #     predictions = np.argmax(logits, axis=-1)
- #     metric = evaluate.load("accuracy")
- #     return metric.compute(predictions=predictions, references=labels)
-
- def compute_metrics(eval_preds):
-     preds, labels = eval_preds
-
-     # decode preds and labels
-     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
-     decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
-     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
-
-     # rougeLSum expects newline after each sentence
-     decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
-     decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
-
-     result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
-
-     return result
-
-
- def training():
-     dataset_id = "tomasmcz/word2vec_analogy"
-     # dataset_id = "relbert/scientific_and_creative_analogy"
-     # dataset_sub = "Quadruples_Kmiecik_random_split"
-     print("GETTING DATASET")
-     dataset = load_dataset(dataset_id)
-     # dataset = dataset["train"]
-     # tokenized_datasets = dataset.map(tokenize_function, batched=True)
-
-     print(dataset)
-     print(f"- The {dataset_id} dataset has {dataset['train'].num_rows} examples.")
-     print(f"- Each example is a {type(dataset['train'][0])} with a {type(dataset['train'][0])} as value.")
-     print(f"- Examples look like this: {dataset['train'][0]}")
-
-     # for i in dataset["train"]:
-     #     print(i["AB"], "to", i["CD"], "is", i["label"])
-
-     dataset = dataset["train"].train_test_split(test_size=0.3)
-
-     # We prefix our tasks with "answer the question"
-     prefix = "Please answer this question: "
-
-     # Define the preprocessing function
-
-     # def preprocess_function(examples):
-     #     """Add prefix to the sentences, tokenize the text, and set the labels"""
-     #     # The "inputs" are the tokenized answer:
-     #     inputs = []
-     #     # print(examples)
-     #     # inputs = [prefix + doc for doc in examples["question"]]
-     #     for doc in examples['source']:
-     #         # print("THE DOC IS:", doc)
-     #         # print("THE DOC IS:", examples[i]['AB'], examples[i]['CD'], examples[i]['label'])
-     #         prompt = f"{prefix}map "
-     #         for item in doc:
-     #             prompt += f"{item}, and "
-     #         prompt = prompt[:-6]
-     #         inputs.append(prompt)
-     #     # inputs = [prefix + doc for doc in examples["question"]]
-     #     for indx, doc in enumerate(examples["target_random"]):
-     #         prompt = f" to "
-     #         for item in doc:
-     #             prompt += f"{item}, and "
-     #         prompt = prompt[:-6] + "."
-     #         inputs[indx] += prompt
-     #     model_inputs = tokenizer(inputs, max_length=128, truncation=True)
-
-     def preprocess_function(examples):
-         """Add prefix to the sentences, tokenize the text, and set the labels"""
-         # The "inputs" are the tokenized answer:
-         inputs = []
-         # print(examples)
-         # inputs = [prefix + doc for doc in examples["question"]]
-         for doc in examples['word_a']:
-             # print("THE DOC IS:", doc)
-             # print("THE DOC IS:", examples[i]['AB'], examples[i]['CD'], examples[i]['label'])
-             prompt = f"{prefix}{doc} is to "
-             inputs.append(prompt)
-         # inputs = [prefix + doc for doc in examples["question"]]
-         for indx, doc in enumerate(examples["word_b"]):
-             prompt = f"{doc} as "
-             inputs[indx] += prompt
-
-         for indx, doc in enumerate(examples["word_c"]):
-             prompt = f"{doc} is to ___."
-             inputs[indx] += prompt
-         model_inputs = tokenizer(inputs, max_length=128, truncation=True)
-
-         # print(examples["label"], type(examples["label"]))
-
-         # The "labels" are the tokenized outputs:
-         labels = tokenizer(text_target=examples["word_d"],
-                            max_length=512,
-                            truncation=True)
-
-         model_inputs["labels"] = labels["input_ids"]
-         return model_inputs
-
-
-
-     # Map the preprocessing function across our dataset
-     tokenized_dataset = dataset.map(preprocess_function, batched=True)
-     # train_examples = []
-     # train_data = dataset["test"]
-     # # For agility we only 1/2 of our available data
-     # n_examples = dataset["test"].num_rows // 2
-
-     # for i in range(n_examples):
-     #     example = train_data[i]
-     #     temp_word_1 = example["stem"][0]
-     #     temp_word_2 = example["stem"][1]
-     #     temp_word_3 = example["choice"][example["answer"]][0]
-     #     temp_word_4 = example["choice"][example["answer"]][1]
-     #     comp1 = f"{temp_word_1} to {temp_word_2}"
-     #     comp2 = f"{temp_word_3} to {temp_word_4}"
-     #     # example_opposite = dataset_clean[-(i)]
-     #     # print(example["text"])
-     #     train_examples.append(InputExample(texts=[comp1, comp2]))
-
-
-     # train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=25)
-
-     print("END DATALOADER")
-
-     # print(train_examples)
-
-     embeddings = finetune(tokenized_dataset)
-
-     return 0
-
-
- def finetune(dataset):
-     # model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
-     # model_id = "sentence-transformers/all-MiniLM-L6-v2"
-     model_id = "google/flan-t5-base"
-     # model_id = "distilbert-base-uncased"
-     # tokenizer = DistilBertTokenizerFast.from_pretrained(model_id)
-     tokenizer = T5Tokenizer.from_pretrained(model_id)
-     model = T5ForConditionalGeneration.from_pretrained(model_id)
-     data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
-     device = torch.device('cuda:0')
-     model = model.to(device)
-
-     # training_args = TrainingArguments(output_dir="test_trainer")
-
-     # USE THIS LINK
-     # https://huggingface.co/blog/how-to-train-sentence-transformers
-
-     # train_loss = losses.MegaBatchMarginLoss(model=model)
-     # ds_train, ds_valid = dataset.train_test_split(test_size=0.2, seed=42)
-
-     print("BEGIN FIT")
-
-     trainer = Seq2SeqTrainer(
-         model=model,
-         args=training_args,
-         train_dataset=dataset["train"],
-         eval_dataset=dataset["test"],
-         # evaluation_strategy="no"
-         tokenizer=tokenizer,
-         data_collator=data_collator,
-         compute_metrics=compute_metrics
-     )
-
-     # model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10)
-
-     trainer.train()
-
-     # model.save("flan-analogies")
-
-     # model.save_to_hub("smhavens/bert-base-analogies")
-     # accuracy = compute_metrics(eval, metric)
-     return 0
-
- def greet(name):
-     return "Hello " + name + "!!"
-
- def check_answer(guess:str):
-     global guesses
-     global answer
-     guesses.append(guess)
-     output = ""
-     for guess in guesses:
-         output += ("- " + guess + "\n")
-     output = output[:-1]
-
-     if guess.lower() == answer.lower():
-         return "Correct!", output
-     else:
-         return "Try again!", output
-
- def main():
-     print("BEGIN")
-     word1 = "Black"
-     word2 = "White"
-     word3 = "Sun"
-     global answer
-     answer = "Moon"
-     global guesses
-
-     training()
-
-     # prompt = f"{word1} is to {word2} as {word3} is to ____"
-     # with gr.Blocks() as iface:
-     #     gr.Markdown(prompt)
-     #     with gr.Tab("Guess"):
-     #         text_input = gr.Textbox()
-     #         text_output = gr.Textbox()
-     #         text_button = gr.Button("Submit")
-     #     with gr.Accordion("Open for previous guesses"):
-     #         text_guesses = gr.Textbox()
-     #     with gr.Tab("Testing"):
-     #         gr.Markdown(f"""Number of rows in dataset is {num_rows}, with each having type {data_type} and value {value}.
-     #         An example is {example}.
-     #         The Embeddings are {embeddings}.""")
-     #     text_button.click(check_answer, inputs=[text_input], outputs=[text_output, text_guesses])
-     #     # iface = gr.Interface(fn=greet, inputs="text", outputs="text")
-     #     iface.launch()
-
-
-
-
-
- if __name__ == "__main__":
+ import gradio as gr
+ import math
+ from datasets import load_dataset
+ from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
+ from transformers import TrainingArguments, Trainer
+ from transformers import T5Tokenizer, T5ForConditionalGeneration
+ import torch
+ import torch.nn.functional as F
+ from torch.utils.data import DataLoader
+ import numpy as np
+ import evaluate
+ import nltk
+ from nltk.corpus import stopwords
+ import subprocess
+ import sys
+ from transformers import T5Tokenizer, DataCollatorForSeq2Seq
+ from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
+ from transformers import DataCollatorWithPadding, DistilBertTokenizerFast
+ from transformers import TrainingArguments
+ from transformers import (
+     BertModel,
+     BertTokenizerFast,
+     Trainer,
+     EvalPrediction
+ )
+
+ nltk.download("punkt", quiet=True)
+ metric = evaluate.load("rouge")
+
+ # Global Parameters
+ L_RATE = 3e-4
+ BATCH_SIZE = 8
+ PER_DEVICE_EVAL_BATCH = 4
+ WEIGHT_DECAY = 0.01
+ SAVE_TOTAL_LIM = 3
+ NUM_EPOCHS = 10
+
+ # Set up training arguments
+ training_args = Seq2SeqTrainingArguments(
+     output_dir="./results",
+     evaluation_strategy="epoch",
+     learning_rate=L_RATE,
+     per_device_train_batch_size=BATCH_SIZE,
+     per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
+     weight_decay=WEIGHT_DECAY,
+     save_total_limit=SAVE_TOTAL_LIM,
+     num_train_epochs=NUM_EPOCHS,
+     predict_with_generate=True,
+     push_to_hub=False
+ )
+
+ model_id = "google/flan-t5-base"
+ tokenizer = T5Tokenizer.from_pretrained(model_id)
+ # tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
+ # metric = evaluate.load("accuracy")
+
+ def tokenize_function(examples):
+     return tokenizer(examples["stem"], padding="max_length", truncation=True)
+
+
+ #Mean Pooling - Take attention mask into account for correct averaging
+ def mean_pooling(model_output, attention_mask):
+     token_embeddings = model_output[0] #First element of model_output contains all token embeddings
+     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+
+ # def compute_metrics(eval_pred):
+ #     logits, labels = eval_pred
+ #     predictions = np.argmax(logits, axis=-1)
+ #     metric = evaluate.load("accuracy")
+ #     return metric.compute(predictions=predictions, references=labels)
+
+ def compute_metrics(eval_preds):
+     preds, labels = eval_preds
+
+     # decode preds and labels
+     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+     decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
+     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+
+     # rougeLSum expects newline after each sentence
+     decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
+     decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
+
+     result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
+
+     return result
+
+
+ def training():
+     dataset_id = "tomasmcz/word2vec_analogy"
+     # dataset_id = "relbert/scientific_and_creative_analogy"
+     # dataset_sub = "Quadruples_Kmiecik_random_split"
+     print("GETTING DATASET")
+     dataset = load_dataset(dataset_id)
+     # dataset = dataset["train"]
+     # tokenized_datasets = dataset.map(tokenize_function, batched=True)
+
+     print(dataset)
+     print(f"- The {dataset_id} dataset has {dataset['train'].num_rows} examples.")
+     print(f"- Each example is a {type(dataset['train'][0])} with a {type(dataset['train'][0])} as value.")
+     print(f"- Examples look like this: {dataset['train'][0]}")
+
+     # for i in dataset["train"]:
+     #     print(i["AB"], "to", i["CD"], "is", i["label"])
+
+     dataset = dataset["train"].train_test_split(test_size=0.3)
+
+     # We prefix our tasks with "answer the question"
+     prefix = "Please answer this question: "
+
+
+     def preprocess_function(examples):
+         """Add prefix to the sentences, tokenize the text, and set the labels"""
+         # The "inputs" are the tokenized answer:
+         inputs = []
+         # print(examples)
+         # inputs = [prefix + doc for doc in examples["question"]]
+         for doc in examples['word_a']:
+             # print("THE DOC IS:", doc)
+             # print("THE DOC IS:", examples[i]['AB'], examples[i]['CD'], examples[i]['label'])
+             prompt = f"{prefix}{doc} is to "
+             inputs.append(prompt)
+         # inputs = [prefix + doc for doc in examples["question"]]
+         for indx, doc in enumerate(examples["word_b"]):
+             prompt = f"{doc} as "
+             inputs[indx] += prompt
+
+         for indx, doc in enumerate(examples["word_c"]):
+             prompt = f"{doc} is to ___."
+             inputs[indx] += prompt
+         model_inputs = tokenizer(inputs, max_length=128, truncation=True)
+
+         # print(examples["label"], type(examples["label"]))
+
+         # The "labels" are the tokenized outputs:
+         labels = tokenizer(text_target=examples["word_d"],
+                            max_length=512,
+                            truncation=True)
+
+         model_inputs["labels"] = labels["input_ids"]
+         return model_inputs
+
+
+
+     # Map the preprocessing function across our dataset
+     tokenized_dataset = dataset.map(preprocess_function, batched=True)
+
+     print("END DATALOADER")
+
+     # print(train_examples)
+
+     embeddings = finetune(tokenized_dataset)
+
+     return 0
+
+
+ def finetune(dataset):
+     # model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+     # model_id = "sentence-transformers/all-MiniLM-L6-v2"
+     model_id = "google/flan-t5-base"
+     # model_id = "distilbert-base-uncased"
+     # tokenizer = DistilBertTokenizerFast.from_pretrained(model_id)
+     tokenizer = T5Tokenizer.from_pretrained(model_id)
+     model = T5ForConditionalGeneration.from_pretrained(model_id)
+     data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
+     device = torch.device('cuda:0')
+     model = model.to(device)
+
+     # training_args = TrainingArguments(output_dir="test_trainer")
+
+     # USE THIS LINK
+     # https://huggingface.co/blog/how-to-train-sentence-transformers
+
+     # train_loss = losses.MegaBatchMarginLoss(model=model)
+     # ds_train, ds_valid = dataset.train_test_split(test_size=0.2, seed=42)
+
+     print("BEGIN FIT")
+
+     trainer = Seq2SeqTrainer(
+         model=model,
+         args=training_args,
+         train_dataset=dataset["train"],
+         eval_dataset=dataset["test"],
+         # evaluation_strategy="no"
+         tokenizer=tokenizer,
+         data_collator=data_collator,
+         compute_metrics=compute_metrics
+     )
+
+     # model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10)
+
+     trainer.train()
+
+     # model.save("flan-analogies")
+
+     # model.save_to_hub("smhavens/bert-base-analogies")
+     # accuracy = compute_metrics(eval, metric)
+     return 0
+
+ def greet(name):
+     return "Hello " + name + "!!"
+
+ def check_answer(guess:str):
+     global guesses
+     global answer
+     guesses.append(guess)
+     output = ""
+     for guess in guesses:
+         output += ("- " + guess + "\n")
+     output = output[:-1]
+
+     if guess.lower() == answer.lower():
+         return "Correct!", output
+     else:
+         return "Try again!", output
+
+ def main():
+     print("BEGIN")
+     word1 = "Black"
+     word2 = "White"
+     word3 = "Sun"
+     global answer
+     answer = "Moon"
+     global guesses
+
+     training()
+
+
+
+
+
+ if __name__ == "__main__":
      main()
results/checkpoint-16000/added_tokens.json ADDED
@@ -0,0 +1,102 @@
+ {
+   "<extra_id_0>": 32099,
+   "<extra_id_10>": 32089,
+   "<extra_id_11>": 32088,
+   "<extra_id_12>": 32087,
+   "<extra_id_13>": 32086,
+   "<extra_id_14>": 32085,
+   "<extra_id_15>": 32084,
+   "<extra_id_16>": 32083,
+   "<extra_id_17>": 32082,
+   "<extra_id_18>": 32081,
+   "<extra_id_19>": 32080,
+   "<extra_id_1>": 32098,
+   "<extra_id_20>": 32079,
+   "<extra_id_21>": 32078,
+   "<extra_id_22>": 32077,
+   "<extra_id_23>": 32076,
+   "<extra_id_24>": 32075,
+   "<extra_id_25>": 32074,
+   "<extra_id_26>": 32073,
+   "<extra_id_27>": 32072,
+   "<extra_id_28>": 32071,
+   "<extra_id_29>": 32070,
+   "<extra_id_2>": 32097,
+   "<extra_id_30>": 32069,
+   "<extra_id_31>": 32068,
+   "<extra_id_32>": 32067,
+   "<extra_id_33>": 32066,
+   "<extra_id_34>": 32065,
+   "<extra_id_35>": 32064,
+   "<extra_id_36>": 32063,
+   "<extra_id_37>": 32062,
+   "<extra_id_38>": 32061,
+   "<extra_id_39>": 32060,
+   "<extra_id_3>": 32096,
+   "<extra_id_40>": 32059,
+   "<extra_id_41>": 32058,
+   "<extra_id_42>": 32057,
+   "<extra_id_43>": 32056,
+   "<extra_id_44>": 32055,
+   "<extra_id_45>": 32054,
+   "<extra_id_46>": 32053,
+   "<extra_id_47>": 32052,
+   "<extra_id_48>": 32051,
+   "<extra_id_49>": 32050,
+   "<extra_id_4>": 32095,
+   "<extra_id_50>": 32049,
+   "<extra_id_51>": 32048,
+   "<extra_id_52>": 32047,
+   "<extra_id_53>": 32046,
+   "<extra_id_54>": 32045,
+   "<extra_id_55>": 32044,
+   "<extra_id_56>": 32043,
+   "<extra_id_57>": 32042,
+   "<extra_id_58>": 32041,
+   "<extra_id_59>": 32040,
+   "<extra_id_5>": 32094,
+   "<extra_id_60>": 32039,
+   "<extra_id_61>": 32038,
+   "<extra_id_62>": 32037,
+   "<extra_id_63>": 32036,
+   "<extra_id_64>": 32035,
+   "<extra_id_65>": 32034,
+   "<extra_id_66>": 32033,
+   "<extra_id_67>": 32032,
+   "<extra_id_68>": 32031,
+   "<extra_id_69>": 32030,
+   "<extra_id_6>": 32093,
+   "<extra_id_70>": 32029,
+   "<extra_id_71>": 32028,
+   "<extra_id_72>": 32027,
+   "<extra_id_73>": 32026,
+   "<extra_id_74>": 32025,
+   "<extra_id_75>": 32024,
+   "<extra_id_76>": 32023,
+   "<extra_id_77>": 32022,
+   "<extra_id_78>": 32021,
+   "<extra_id_79>": 32020,
+   "<extra_id_7>": 32092,
+   "<extra_id_80>": 32019,
+   "<extra_id_81>": 32018,
+   "<extra_id_82>": 32017,
+   "<extra_id_83>": 32016,
+   "<extra_id_84>": 32015,
+   "<extra_id_85>": 32014,
+   "<extra_id_86>": 32013,
+   "<extra_id_87>": 32012,
+   "<extra_id_88>": 32011,
+   "<extra_id_89>": 32010,
+   "<extra_id_8>": 32091,
+   "<extra_id_90>": 32009,
+   "<extra_id_91>": 32008,
+   "<extra_id_92>": 32007,
+   "<extra_id_93>": 32006,
+   "<extra_id_94>": 32005,
+   "<extra_id_95>": 32004,
+   "<extra_id_96>": 32003,
+   "<extra_id_97>": 32002,
+   "<extra_id_98>": 32001,
+   "<extra_id_99>": 32000,
+   "<extra_id_9>": 32090
+ }
results/checkpoint-16000/config.json ADDED
@@ -0,0 +1,62 @@
+ {
+   "_name_or_path": "google/flan-t5-base",
+   "architectures": [
+     "T5ForConditionalGeneration"
+   ],
+   "classifier_dropout": 0.0,
+   "d_ff": 2048,
+   "d_kv": 64,
+   "d_model": 768,
+   "decoder_start_token_id": 0,
+   "dense_act_fn": "gelu_new",
+   "dropout_rate": 0.1,
+   "eos_token_id": 1,
+   "feed_forward_proj": "gated-gelu",
+   "initializer_factor": 1.0,
+   "is_encoder_decoder": true,
+   "is_gated_act": true,
+   "layer_norm_epsilon": 1e-06,
+   "model_type": "t5",
+   "n_positions": 512,
+   "num_decoder_layers": 12,
+   "num_heads": 12,
+   "num_layers": 12,
+   "output_past": true,
+   "pad_token_id": 0,
+   "relative_attention_max_distance": 128,
+   "relative_attention_num_buckets": 32,
+   "task_specific_params": {
+     "summarization": {
+       "early_stopping": true,
+       "length_penalty": 2.0,
+       "max_length": 200,
+       "min_length": 30,
+       "no_repeat_ngram_size": 3,
+       "num_beams": 4,
+       "prefix": "summarize: "
+     },
+     "translation_en_to_de": {
+       "early_stopping": true,
+       "max_length": 300,
+       "num_beams": 4,
+       "prefix": "translate English to German: "
+     },
+     "translation_en_to_fr": {
+       "early_stopping": true,
+       "max_length": 300,
+       "num_beams": 4,
+       "prefix": "translate English to French: "
+     },
+     "translation_en_to_ro": {
+       "early_stopping": true,
+       "max_length": 300,
+       "num_beams": 4,
+       "prefix": "translate English to Romanian: "
+     }
+   },
+   "tie_word_embeddings": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.35.2",
+   "use_cache": true,
+   "vocab_size": 32128
+ }
results/checkpoint-16000/generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "decoder_start_token_id": 0,
+   "eos_token_id": 1,
+   "pad_token_id": 0,
+   "transformers_version": "4.35.2"
+ }
results/checkpoint-16000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cd7f96db75733e18d6af8488ab51eea991be641c6c22b24fa5ab3b45101c3398
+ size 990345064
results/checkpoint-16000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:31aa07bcfc63b03b9dbfb77536457e4d0591b64d537e2f4834f5b81c6bd2ab21
+ size 1980860410
results/checkpoint-16000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cc296e1811c88d4548bfa74b8cf96485e58c41652ba8a0db69b6e3a9762f9be0
+ size 14244
results/checkpoint-16000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8c77d751bb87ca04afd8f823ee9102cffea6221900b1a056c2f31d9044f1a0ce
+ size 1064
results/checkpoint-16000/special_tokens_map.json ADDED
@@ -0,0 +1,125 @@
+ {
+   "additional_special_tokens": [
+     "<extra_id_0>",
+     "<extra_id_1>",
+     "<extra_id_2>",
+     "<extra_id_3>",
+     "<extra_id_4>",
+     "<extra_id_5>",
+     "<extra_id_6>",
+     "<extra_id_7>",
+     "<extra_id_8>",
+     "<extra_id_9>",
+     "<extra_id_10>",
+     "<extra_id_11>",
+     "<extra_id_12>",
+     "<extra_id_13>",
+     "<extra_id_14>",
+     "<extra_id_15>",
+     "<extra_id_16>",
+     "<extra_id_17>",
+     "<extra_id_18>",
+     "<extra_id_19>",
+     "<extra_id_20>",
+     "<extra_id_21>",
+     "<extra_id_22>",
+     "<extra_id_23>",
+     "<extra_id_24>",
+     "<extra_id_25>",
+     "<extra_id_26>",
+     "<extra_id_27>",
+     "<extra_id_28>",
+     "<extra_id_29>",
+     "<extra_id_30>",
+     "<extra_id_31>",
+     "<extra_id_32>",
+     "<extra_id_33>",
+     "<extra_id_34>",
+     "<extra_id_35>",
+     "<extra_id_36>",
+     "<extra_id_37>",
+     "<extra_id_38>",
+     "<extra_id_39>",
+     "<extra_id_40>",
+     "<extra_id_41>",
+     "<extra_id_42>",
+     "<extra_id_43>",
+     "<extra_id_44>",
+     "<extra_id_45>",
+     "<extra_id_46>",
+     "<extra_id_47>",
+     "<extra_id_48>",
+     "<extra_id_49>",
+     "<extra_id_50>",
+     "<extra_id_51>",
+     "<extra_id_52>",
+     "<extra_id_53>",
+     "<extra_id_54>",
+     "<extra_id_55>",
+     "<extra_id_56>",
+     "<extra_id_57>",
+     "<extra_id_58>",
+     "<extra_id_59>",
+     "<extra_id_60>",
+     "<extra_id_61>",
+     "<extra_id_62>",
+     "<extra_id_63>",
+     "<extra_id_64>",
+     "<extra_id_65>",
+     "<extra_id_66>",
+     "<extra_id_67>",
+     "<extra_id_68>",
+     "<extra_id_69>",
+     "<extra_id_70>",
+     "<extra_id_71>",
+     "<extra_id_72>",
+     "<extra_id_73>",
+     "<extra_id_74>",
+     "<extra_id_75>",
+     "<extra_id_76>",
+     "<extra_id_77>",
+     "<extra_id_78>",
+     "<extra_id_79>",
+     "<extra_id_80>",
+     "<extra_id_81>",
+     "<extra_id_82>",
+     "<extra_id_83>",
+     "<extra_id_84>",
+     "<extra_id_85>",
+     "<extra_id_86>",
+     "<extra_id_87>",
+     "<extra_id_88>",
+     "<extra_id_89>",
+     "<extra_id_90>",
+     "<extra_id_91>",
+     "<extra_id_92>",
+     "<extra_id_93>",
+     "<extra_id_94>",
+     "<extra_id_95>",
+     "<extra_id_96>",
+     "<extra_id_97>",
+     "<extra_id_98>",
+     "<extra_id_99>"
+   ],
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
results/checkpoint-16000/spiece.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d60acb128cf7b7f2536e8f38a5b18a05535c9e14c7a355904270e15b0945ea86
+ size 791656
results/checkpoint-16000/tokenizer_config.json ADDED
@@ -0,0 +1,939 @@