Mila committed on
Commit ec3e101
1 Parent(s): 949bc1b

Working version. Needs updates for game

Files changed (4)
  1. .gitattributes +2 -0
  2. app.py +291 -291
  3. requirements.txt +5 -5
  4. train.py +276 -276
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.csv filter=lfs diff=lfs merge=lfs -text
+*.safetenstors filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -1,292 +1,292 @@
import gradio as gr
import math
import spacy
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers import InputExample
from sentence_transformers import losses
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
import numpy as np
import evaluate
import nltk
from nltk.corpus import stopwords
import subprocess
import sys

# !pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl'])
# tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
nltk.download('stopwords')
nlp = spacy.load("en_core_web_sm")
stops = stopwords.words("english")

# answer = "Pizza"
guesses = []
answer = "Pizza"


#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output['token_embeddings'] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


def normalize(comment, lowercase, remove_stopwords):
    if lowercase:
        comment = comment.lower()
    comment = nlp(comment)
    lemmatized = list()
    for word in comment:
        lemma = word.lemma_.strip()
        if lemma:
            if not remove_stopwords or (remove_stopwords and lemma not in stops):
                lemmatized.append(lemma)
    return " ".join(lemmatized)


# def tokenize_function(examples):
#     return tokenizer(examples["text"])


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    metric = evaluate.load("accuracy")
    return metric.compute(predictions=predictions, references=labels)


def training():
    dataset_id = "ag_news"
    dataset = load_dataset(dataset_id)
    # dataset = dataset["train"]
    # tokenized_datasets = dataset.map(tokenize_function, batched=True)

    print(f"- The {dataset_id} dataset has {dataset['train'].num_rows} examples.")
    print(f"- Each example is a {type(dataset['train'][0])} with a {type(dataset['train'][0]['text'])} as value.")
    print(f"- Examples look like this: {dataset['train'][0]}")

    # small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
    # small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

    # dataset = dataset["train"].map(tokenize_function, batched=True)
    # dataset.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "label"])
    # dataset.format['type']

    # print(dataset)

    train_examples = []
    train_data = dataset["train"]
    # For agility we only 1/2 of our available data
    n_examples = dataset["train"].num_rows // 2
    # n_remaining = dataset["train"].num_rows - n_examples
    # dataset_clean = {}
    # # dataset_0 = []
    # # dataset_1 = []
    # # dataset_2 = []
    # # dataset_3 = []
    # for i in range(n_examples):
    #     dataset_clean[i] = {}
    #     dataset_clean[i]["text"] = normalize(train_data[i]["text"], lowercase=True, remove_stopwords=True)
    #     dataset_clean[i]["label"] = train_data[i]["label"]
    #     if train_data[i]["label"] == 0:
    #         dataset_0.append(dataset_clean[i])
    #     elif train_data[i]["label"] == 1:
    #         dataset_1.append(dataset_clean[i])
    #     elif train_data[i]["label"] == 2:
    #         dataset_2.append(dataset_clean[i])
    #     elif train_data[i]["label"] == 3:
    #         dataset_3.append(dataset_clean[i])
    # n_0 = len(dataset_0) // 2
    # n_1 = len(dataset_1) // 2
    # n_2 = len(dataset_2) // 2
    # n_3 = len(dataset_3) // 2
    # print("Label lengths:", len(dataset_0), len(dataset_1), len(dataset_2), len(dataset_3))

    for i in range(n_examples):
        example = train_data[i]
        # example_opposite = dataset_clean[-(i)]
        # print(example["text"])
        train_examples.append(InputExample(texts=[example['text']], label=example['label']))

    # for i in range(n_0):
    #     example = dataset_0[i]
    #     # example_opposite = dataset_0[-(i)]
    #     # print(example["text"])
    #     train_examples.append(InputExample(texts=[example['text']], label=0))

    # for i in range(n_1):
    #     example = dataset_1[i]
    #     # example_opposite = dataset_1[-(i)]
    #     # print(example["text"])
    #     train_examples.append(InputExample(texts=[example['text']], label=1))

    # for i in range(n_2):
    #     example = dataset_2[i]
    #     # example_opposite = dataset_2[-(i)]
    #     # print(example["text"])
    #     train_examples.append(InputExample(texts=[example['text']], label=2))

    # for i in range(n_3):
    #     example = dataset_3[i]
    #     # example_opposite = dataset_3[-(i)]
    #     # print(example["text"])
    #     train_examples.append(InputExample(texts=[example['text']], label=3))

    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=25)

    print("END DATALOADER")

    # print(train_examples)

    embeddings = finetune(train_dataloader)

    return (dataset['train'].num_rows, type(dataset['train'][0]), type(dataset['train'][0]['text']), dataset['train'][0], embeddings)


def finetune(train_dataloader):
    # model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
    model_id = "sentence-transformers/all-MiniLM-L6-v2"
    model = SentenceTransformer(model_id)

    # training_args = TrainingArguments(output_dir="test_trainer")

    # USE THIS LINK
    # https://huggingface.co/blog/how-to-train-sentence-transformers

    train_loss = losses.BatchHardSoftMarginTripletLoss(model=model)

    print("BEGIN FIT")

    model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10)

    model.save("ag_news_model")

    model.save_to_hub("smhavens/all-MiniLM-agNews")
    # accuracy = compute_metrics(eval, metric)

    # training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

    # trainer = Trainer(
    #     model=model,
    #     args=training_args,
    #     train_dataset=train,
    #     eval_dataset=eval,
    #     compute_metrics=compute_metrics,
    # )

    # trainer.train()

def embeddings():
    model = SentenceTransformer("ag_news_model")
    device = torch.device('cuda:0')
    model = model.to(device)
    sentences = ["This is an example sentence", "Each sentence is converted"]

    # model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    embeddings = model.encode(sentences)
    # print(embeddings)

    # Sentences we want sentence embeddings for
    sentences = ['This is an example sentence', 'Each sentence is converted']

    # Load model from HuggingFace Hub
    tokenizer = AutoTokenizer.from_pretrained('ag_news_model')
    # model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

    # Tokenize sentences
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    # print(model.device)
    # print(encoded_input["input_ids"].device)
    # print(encoded_input["attention_mask"].device)
    # print(encoded_input["token_type_ids"].device)
    encoded_input["input_ids"] = encoded_input["input_ids"].to(device)
    encoded_input["attention_mask"] = encoded_input["attention_mask"].to(device)
    encoded_input['token_type_ids'] = encoded_input['token_type_ids'].to(device)
    # print(encoded_input)

    # print(encoded_input["input_ids"].device)
    # print(encoded_input["attention_mask"].device)
    # print(encoded_input["token_type_ids"].device)

    encoded_input['input'] = {'input_ids':encoded_input['input_ids'], 'attention_mask':encoded_input['attention_mask']}

    # + encoded_input['token_type_ids'] + encoded_input['attention_mask']
    del encoded_input['input_ids']
    del encoded_input['token_type_ids']
    del encoded_input['attention_mask']

    # print(encoded_input)

    # encoded_input.to(device)
    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    print(model_output)
    # Perform pooling
    sentence_embeddings = mean_pooling(model_output, encoded_input['input']["attention_mask"])

    # Normalize embeddings
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

    print("Sentence embeddings:")
    print(sentence_embeddings)
    return sentence_embeddings



def greet(name):
    return "Hello " + name + "!!"

def check_answer(guess:str):
    global guesses
    global answer
    guesses.append(guess)
    output = ""
    for guess in guesses:
        output += ("- " + guess + "\n")
    output = output[:-1]

    if guess.lower() == answer.lower():
        return "Correct!", output
    else:
        return "Try again!", output

def main():
    word1 = "Black"
    word2 = "White"
    word3 = "Sun"
    global answer
    answer = "Moon"
    global guesses

    # num_rows, data_type, value, example, embeddings = training()
    sent_embeddings = embeddings()

    prompt = f"{word1} is to {word2} as {word3} is to ____"
    with gr.Blocks() as iface:
        gr.Markdown(prompt)
        with gr.Tab("Guess"):
            text_input = gr.Textbox()
            text_output = gr.Textbox()
            text_button = gr.Button("Submit")
        with gr.Accordion("Open for previous guesses"):
            text_guesses = gr.Textbox()
        with gr.Tab("Testing"):
            gr.Markdown(f"""The Embeddings are {sent_embeddings}.""")
        text_button.click(check_answer, inputs=[text_input], outputs=[text_output, text_guesses])
    # iface = gr.Interface(fn=greet, inputs="text", outputs="text")
    iface.launch()




if __name__ == "__main__":
    main()
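
The embeddings() function above reproduces SentenceTransformer pooling by hand: it masks out padding tokens, sums the token embeddings, and divides by the number of real tokens. A minimal, self-contained sketch of that same masked-mean computation on toy tensors (the shapes and values below are invented for illustration, not taken from this commit):

import torch

# (batch, seq_len, hidden) toy token embeddings and a padding mask
token_embeddings = torch.randn(2, 4, 3)
attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 1, 0, 0]])

# expand the mask to the embedding size, exactly as mean_pooling() does
mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
summed = torch.sum(token_embeddings * mask, 1)   # sum over real tokens only
counts = torch.clamp(mask.sum(1), min=1e-9)      # avoid division by zero for all-padding rows
sentence_embeddings = summed / counts            # (batch, hidden) mean-pooled vectors
print(sentence_embeddings.shape)                 # torch.Size([2, 3])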
requirements.txt CHANGED
@@ -1,6 +1,6 @@
spacy
sentence_transformers
transformers
torch
evaluate
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz
train.py CHANGED
@@ -1,277 +1,277 @@
import gradio as gr
import math
import spacy
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers import InputExample
from sentence_transformers import losses
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
import numpy as np
import evaluate
import nltk
from nltk.corpus import stopwords
import subprocess
import sys
from transformers import DataCollatorWithPadding


# !pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl'])
# tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# nltk.download('stopwords')
# nlp = spacy.load("en_core_web_sm")
# stops = stopwords.words("english")

# answer = "Pizza"
guesses = []
answer = "Pizza"


#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# def normalize(comment, lowercase, remove_stopwords):
#     if lowercase:
#         comment = comment.lower()
#     comment = nlp(comment)
#     lemmatized = list()
#     for word in comment:
#         lemma = word.lemma_.strip()
#         if lemma:
#             if not remove_stopwords or (remove_stopwords and lemma not in stops):
#                 lemmatized.append(lemma)
#     return " ".join(lemmatized)


# def tokenize_function(examples):
#     return tokenizer(examples["text"], truncation=True)


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    metric = evaluate.load("accuracy")
    return metric.compute(predictions=predictions, references=labels)


def training():
    dataset_id = "ag_news"

    print("GETTING DATASET")
    dataset = load_dataset(dataset_id)
    # dataset = dataset["train"]
    # tokenized_datasets = dataset.map(tokenize_function, batched=True)

    print(f"- The {dataset_id} dataset has {dataset['train'].num_rows} examples.")
    print(f"- Each example is a {type(dataset['train'][0])} with a {type(dataset['train'][0]['text'])} as value.")
    print(f"- Examples look like this: {dataset['train'][0]}")

    # small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
    # small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

    # dataset = dataset["train"].map(tokenize_function, batched=True)
    # dataset.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "label"])
    # dataset.format['type']

    # tokenized_news = dataset.map(tokenize_function, batched=True)

    # model = AutoModelForSequenceClassification.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", num_labels=2)

    # print(dataset)

    train_examples = []
    train_data = dataset["train"]
    # For agility we only 1/2 of our available data
    n_examples = dataset["train"].num_rows // 2
    # n_remaining = dataset["train"].num_rows - n_examples
    # dataset_clean = {}
    # # dataset_0 = []
    # # dataset_1 = []
    # # dataset_2 = []
    # # dataset_3 = []
    # for i in range(n_examples):
    #     dataset_clean[i] = {}
    #     dataset_clean[i]["text"] = normalize(train_data[i]["text"], lowercase=True, remove_stopwords=True)
    #     dataset_clean[i]["label"] = train_data[i]["label"]
    #     if train_data[i]["label"] == 0:
    #         dataset_0.append(dataset_clean[i])
    #     elif train_data[i]["label"] == 1:
    #         dataset_1.append(dataset_clean[i])
    #     elif train_data[i]["label"] == 2:
    #         dataset_2.append(dataset_clean[i])
    #     elif train_data[i]["label"] == 3:
    #         dataset_3.append(dataset_clean[i])
    # n_0 = len(dataset_0) // 2
    # n_1 = len(dataset_1) // 2
    # n_2 = len(dataset_2) // 2
    # n_3 = len(dataset_3) // 2
    # print("Label lengths:", len(dataset_0), len(dataset_1), len(dataset_2), len(dataset_3))

    for i in range(n_examples):
        example = train_data[i]
        # example_opposite = dataset_clean[-(i)]
        # print(example["text"])
        train_examples.append(InputExample(texts=[example['text']], label=example['label']))

    # for i in range(n_0):
    #     example = dataset_0[i]
    #     # example_opposite = dataset_0[-(i)]
    #     # print(example["text"])
    #     train_examples.append(InputExample(texts=[example['text']], label=0))

    # for i in range(n_1):
    #     example = dataset_1[i]
    #     # example_opposite = dataset_1[-(i)]
    #     # print(example["text"])
    #     train_examples.append(InputExample(texts=[example['text']], label=1))

    # for i in range(n_2):
    #     example = dataset_2[i]
    #     # example_opposite = dataset_2[-(i)]
    #     # print(example["text"])
    #     train_examples.append(InputExample(texts=[example['text']], label=2))

    # for i in range(n_3):
    #     example = dataset_3[i]
    #     # example_opposite = dataset_3[-(i)]
    #     # print(example["text"])
    #     train_examples.append(InputExample(texts=[example['text']], label=3))

    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=25)

    print("END DATALOADER")

    # print(train_examples)

    embeddings = finetune(train_dataloader)

    return (dataset['train'].num_rows, type(dataset['train'][0]), type(dataset['train'][0]['text']), dataset['train'][0], embeddings)


def finetune(train_dataloader):
    # model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
    model_id = "sentence-transformers/all-MiniLM-L6-v2"
    model = SentenceTransformer(model_id)
    device = torch.device('cuda:0')
    model = model.to(device)

    # training_args = TrainingArguments(output_dir="test_trainer")

    # USE THIS LINK
    # https://huggingface.co/blog/how-to-train-sentence-transformers

    train_loss = losses.BatchHardSoftMarginTripletLoss(model=model)

    print("BEGIN FIT")

    model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10)

    model.save("ag_news_model")

    model.save_to_hub("smhavens/all-MiniLM-agNews")
    # accuracy = compute_metrics(eval, metric)

    # training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

    # trainer = Trainer(
    #     model=model,
    #     args=training_args,
    #     train_dataset=train,
    #     eval_dataset=eval,
    #     compute_metrics=compute_metrics,
    # )

    # trainer.train()

    # sentences = ["This is an example sentence", "Each sentence is converted"]

    # # model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    # embeddings = model.encode(sentences)
    # print(embeddings)

    # # Sentences we want sentence embeddings for
    # sentences = ['This is an example sentence', 'Each sentence is converted']

    # # Load model from HuggingFace Hub
    # # tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    # # model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

    # # Tokenize sentences
    # encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    # # Compute token embeddings
    # with torch.no_grad():
    #     model_output = model(**encoded_input)

    # # Perform pooling
    # sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # # Normalize embeddings
    # sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

    # print("Sentence embeddings:")
    # print(sentence_embeddings)
    return 0



def greet(name):
    return "Hello " + name + "!!"

def check_answer(guess:str):
    global guesses
    global answer
    guesses.append(guess)
    output = ""
    for guess in guesses:
        output += ("- " + guess + "\n")
    output = output[:-1]

    if guess.lower() == answer.lower():
        return "Correct!", output
    else:
        return "Try again!", output

def main():
    print("BEGIN")
    word1 = "Black"
    word2 = "White"
    word3 = "Sun"
    global answer
    answer = "Moon"
    global guesses

    num_rows, data_type, value, example, embeddings = training()

    # prompt = f"{word1} is to {word2} as {word3} is to ____"
    # with gr.Blocks() as iface:
    #     gr.Markdown(prompt)
    #     with gr.Tab("Guess"):
    #         text_input = gr.Textbox()
    #         text_output = gr.Textbox()
    #         text_button = gr.Button("Submit")
    #     with gr.Accordion("Open for previous guesses"):
    #         text_guesses = gr.Textbox()
    #     with gr.Tab("Testing"):
    #         gr.Markdown(f"""Number of rows in dataset is {num_rows}, with each having type {data_type} and value {value}.
    #                     An example is {example}.
    #                     The Embeddings are {embeddings}.""")
    #     text_button.click(check_answer, inputs=[text_input], outputs=[text_output, text_guesses])
    # # iface = gr.Interface(fn=greet, inputs="text", outputs="text")
    # iface.launch()




if __name__ == "__main__":
    main()
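
train.py wraps ag_news rows in InputExample objects, batches them with a plain DataLoader, and fine-tunes all-MiniLM-L6-v2 with losses.BatchHardSoftMarginTripletLoss through model.fit. A condensed sketch of that flow on a tiny in-memory dataset (v2-style sentence-transformers API; the sentences, labels, batch size and epoch count are illustrative placeholders, not the commit's settings):

from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

# A few labelled sentences standing in for the ag_news rows built in training();
# each label appears twice so the batch-hard triplet loss can find in-batch positives.
train_examples = [
    InputExample(texts=["Stocks rally after strong earnings"], label=0),
    InputExample(texts=["Markets slip on inflation fears"], label=0),
    InputExample(texts=["Team wins the championship in overtime"], label=1),
    InputExample(texts=["Star striker signs with a new club"], label=1),
]

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=4)
train_loss = losses.BatchHardSoftMarginTripletLoss(model=model)

# fit() attaches its own collate function to the dataloader, as in finetune()
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=1)
model.save("ag_news_model")

# encode() handles tokenization, pooling and (optionally) normalization in one call
embeddings = model.encode(["This is an example sentence"], normalize_embeddings=True)
print(embeddings.shape)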