smhavens committed
Commit 3922a86
1 parent: a2c6b40

Massive changes: using a better dataset and now returning random masks

analogy_train.py ADDED
@@ -0,0 +1,301 @@
+ import gradio as gr
+ import math
+ import spacy
+ from datasets import load_dataset
+ from sentence_transformers import SentenceTransformer
+ from sentence_transformers import InputExample
+ from sentence_transformers import losses
+ from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
+ from transformers import TrainingArguments, Trainer
+ import torch
+ import torch.nn.functional as F
+ from torch.utils.data import DataLoader
+ import numpy as np
+ import evaluate
+ import nltk
+ from nltk.corpus import stopwords
+ import subprocess
+ import sys
+ from transformers import DataCollatorWithPadding
+ from transformers import TrainingArguments
+ from transformers import (
+     BertModel,
+     BertTokenizerFast,
+     Trainer,
+     EvalPrediction
+ )
+
+ # !pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
+ # subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl'])
+ # tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+ # data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+ # nltk.download('stopwords')
+ # nlp = spacy.load("en_core_web_sm")
+ # stops = stopwords.words("english")
+
+ # answer = "Pizza"
+ guesses = []
+ answer = "Pizza"
+
+ tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
+ metric = evaluate.load("accuracy")
+
+ def tokenize_function(examples):
+     return tokenizer(examples["stem"], padding="max_length", truncation=True)
+
+
+ #Mean Pooling - Take attention mask into account for correct averaging
+ def mean_pooling(model_output, attention_mask):
+     token_embeddings = model_output[0] #First element of model_output contains all token embeddings
+     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+
+ def compute_metrics(eval_pred):
+     logits, labels = eval_pred
+     predictions = np.argmax(logits, axis=-1)
+     metric = evaluate.load("accuracy")
+     return metric.compute(predictions=predictions, references=labels)
+
+
+ # def training():
+ # dataset_id = "relbert/analogy_questions"
+ # dataset_sub = "bats"
+ # print("GETTING DATASET")
+ # raw_dataset = load_dataset(dataset_id, dataset_sub)
+ # # data_metric = evaluate.load(dataset_id, dataset_sub)
+ # checkpoint = "bert-base-uncased"
+ # model = BertModel.from_pretrained(checkpoint)
+ # # dataset = dataset["train"]
+ # # tokenized_datasets = dataset.map(tokenize_function, batched=True)
+ # # print(raw_dataset)
+ # test_data = raw_dataset["test"]
+ # # print(test_data["stem"])
+ # all_answers = []
+ # for answer in raw_dataset["answer"]:
+ # answer = raw_dataset["choice"][answer]
+ # raw_dataset = raw_dataset.add_column("label", all_answers)
+
+
+ # print(raw_dataset)
+ # print(raw_dataset["label"])
+ # dataset = raw_dataset.map(
+ # lambda x: tokenizer(x["stem"], truncation=True),
+ # batched=True,
+ # )
+ # print(dataset)
+ # dataset = dataset.remove_columns(["stem", "answer", "choice"])
+ # dataset = dataset.rename_column("label", "labels")
+ # dataset = dataset.with_format("torch")
+
+ # training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
+
+ # print(dataset)
+ # # print(f"- The {dataset_id} dataset has {dataset.num_rows} examples.")
+ # # print(f"- Each example is a {type(dataset[0])} with a {type(dataset[0]['stem'])} as value.")
+ # # print(f"- Examples look like this: {dataset[0]}")
+
+ # # small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
+ # # small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
+
+ # # dataset = dataset["train"].map(tokenize_function, batched=True)
+ # # dataset.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "label"])
+ # # dataset.format['type']
+
+ # # tokenized_news = dataset.map(tokenize_function, batched=True)
+
+ # # model = AutoModelForSequenceClassification.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", num_labels=2)
+
+ # # print(dataset)
+
+ # # Choose the appropriate device based on availability (CUDA or CPU)
+ # # gpu_available = torch.cuda.is_available()
+ # # device = torch.device("cuda" if gpu_available else "cpu")
+ # # model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')
+
+ # # tokenized_datasets = dataset.map(tokenize_function, batched=True)
+ # # print(tokenized_datasets)
+ # # # small_train_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
+ # # # small_eval_dataset = tokenized_datasets["validation"].shuffle(seed=42).select(range(1000))
+
+ # # model = model.to(device)
+
+ # # model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+ # # training_args = TrainingArguments(output_dir="test_trainer")
+
+ # trainer = Trainer(
+ # model=model,
+ # args=training_args,
+ # train_dataset=dataset["test"],
+ # eval_dataset=dataset["validation"],
+ # compute_metrics=compute_metrics,
+ # )
+
+ # output = trainer.train()
+
+ # # train_examples = []
+ # # train_data = dataset["train"]
+ # # # For agility we only 1/2 of our available data
+ # # n_examples = dataset["train"].num_rows // 2
+
+ # # for i in range(n_examples):
+ # # example = train_data[i]
+ # # # example_opposite = dataset_clean[-(i)]
+ # # # print(example["text"])
+ # # train_examples.append(InputExample(texts=[example['stem'], example]))
+
+
+ # # train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=25)
+
+ # # print("END DATALOADER")
+
+ # # # print(train_examples)
+
+ # # embeddings = finetune(train_dataloader)
+ # print(output)
+
+ # model.save("bert-analogies")
+
+ # model.save_to_hub("smhavens/bert-base-analogies")
+ # return output
+
+
+ # def finetune(train_dataloader):
+ # # model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+ # model_id = "sentence-transformers/all-MiniLM-L6-v2"
+ # model = SentenceTransformer(model_id)
+ # device = torch.device('cuda:0')
+ # model = model.to(device)
+
+ # # training_args = TrainingArguments(output_dir="test_trainer")
+
+ # # USE THIS LINK
+ # # https://huggingface.co/blog/how-to-train-sentence-transformers
+
+ # train_loss = losses.BatchHardSoftMarginTripletLoss(model=model)
+
+ # print("BEGIN FIT")
+
+ # model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10)
+
+ # model.save("bert-analogies")
+
+ # model.save_to_hub("smhavens/bert-base-analogies")
+ # return 0
+
+ def training():
+     dataset_id = "relbert/analogy_questions"
+     dataset_sub = "bats"
+     print("GETTING DATASET")
+     dataset = load_dataset(dataset_id, dataset_sub)
+     # dataset = dataset["train"]
+     # tokenized_datasets = dataset.map(tokenize_function, batched=True)
+
+     print(f"- The {dataset_id} dataset has {dataset['test'].num_rows} examples.")
+     print(f"- Each example is a {type(dataset['test'][0])} with a {type(dataset['test'][0]['stem'])} as value.")
+     print(f"- Examples look like this: {dataset['test'][0]}")
+
+     train_examples = []
+     train_data = dataset["test"]
+     # For agility we only 1/2 of our available data
+     n_examples = dataset["test"].num_rows // 2
+
+     for i in range(n_examples):
+         example = train_data[i]
+         temp_word_1 = example["stem"][0]
+         temp_word_2 = example["stem"][1]
+         temp_word_3 = example["choice"][example["answer"]][0]
+         temp_word_4 = example["choice"][example["answer"]][1]
+         comp1 = f"{temp_word_1} to {temp_word_2}"
+         comp2 = f"{temp_word_3} to {temp_word_4}"
+         # example_opposite = dataset_clean[-(i)]
+         # print(example["text"])
+         train_examples.append(InputExample(texts=[comp1, comp2]))
+
+
+     train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=25)
+
+     print("END DATALOADER")
+
+     # print(train_examples)
+
+     embeddings = finetune(train_dataloader)
+
+     return (dataset['test'].num_rows, type(dataset['test'][0]), type(dataset['test'][0]['stem']), dataset['test'][0], embeddings)
+
+
+ def finetune(train_dataloader):
+     # model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+     model_id = "sentence-transformers/all-MiniLM-L6-v2"
+     model = SentenceTransformer(model_id)
+     device = torch.device('cuda:0')
+     model = model.to(device)
+
+     # training_args = TrainingArguments(output_dir="test_trainer")
+
+     # USE THIS LINK
+     # https://huggingface.co/blog/how-to-train-sentence-transformers
+
+     train_loss = losses.MegaBatchMarginLoss(model=model)
+
+     print("BEGIN FIT")
+
+     model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10)
+
+     model.save("bert-analogies")
+
+     # model.save_to_hub("smhavens/bert-base-analogies")
+     # accuracy = compute_metrics(eval, metric)
+     return 0
+
+ def greet(name):
+     return "Hello " + name + "!!"
+
+ def check_answer(guess:str):
+     global guesses
+     global answer
+     guesses.append(guess)
+     output = ""
+     for guess in guesses:
+         output += ("- " + guess + "\n")
+     output = output[:-1]
+
+     if guess.lower() == answer.lower():
+         return "Correct!", output
+     else:
+         return "Try again!", output
+
+ def main():
+     print("BEGIN")
+     word1 = "Black"
+     word2 = "White"
+     word3 = "Sun"
+     global answer
+     answer = "Moon"
+     global guesses
+
+     num_rows, data_type, value, example, embeddings = training()
+
+     # prompt = f"{word1} is to {word2} as {word3} is to ____"
+     # with gr.Blocks() as iface:
+     # gr.Markdown(prompt)
+     # with gr.Tab("Guess"):
+     # text_input = gr.Textbox()
+     # text_output = gr.Textbox()
+     # text_button = gr.Button("Submit")
+     # with gr.Accordion("Open for previous guesses"):
+     # text_guesses = gr.Textbox()
+     # with gr.Tab("Testing"):
+     # gr.Markdown(f"""Number of rows in dataset is {num_rows}, with each having type {data_type} and value {value}.
+     # An example is {example}.
+     # The Embeddings are {embeddings}.""")
+     # text_button.click(check_answer, inputs=[text_input], outputs=[text_output, text_guesses])
+     # # iface = gr.Interface(fn=greet, inputs="text", outputs="text")
+     # iface.launch()
+
+
+
+
+
+ if __name__ == "__main__":
+     main()
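
For reference, the pipeline that `analogy_train.py` implements reduces to the sketch below: each BATS analogy question contributes a positive pair (the stem "X to Y" alongside its correct completion "Z to W"), and MiniLM is fine-tuned on those pairs with `MegaBatchMarginLoss` before being saved as `bert-analogies`. This is a condensed sketch, not part of the commit; it assumes the `stem`/`choice`/`answer` fields of `relbert/analogy_questions` exactly as the script uses them, and the batch size and epoch count simply mirror the file.

```python
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

# Build positive pairs from the BATS analogy questions (fields as used above).
data = load_dataset("relbert/analogy_questions", "bats")["test"]
pairs = []
for row in data:
    a, b = row["stem"]                   # e.g. "black", "white"
    c, d = row["choice"][row["answer"]]  # the correct analogy completion
    pairs.append(InputExample(texts=[f"{a} to {b}", f"{c} to {d}"]))

# Fine-tune MiniLM on the pairs and save the checkpoint that app.py later loads.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
loader = DataLoader(pairs, shuffle=True, batch_size=25)
loss = losses.MegaBatchMarginLoss(model=model)
model.fit(train_objectives=[(loader, loss)], epochs=10)
model.save("bert-analogies")
```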
app.py CHANGED
@@ -5,6 +5,8 @@ from datasets import load_dataset
  from sentence_transformers import SentenceTransformer
  from sentence_transformers import InputExample
  from sentence_transformers import losses
+ from sentence_transformers import util
+ from transformers import pipeline
  from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
  from transformers import TrainingArguments, Trainer
  import torch
@@ -16,6 +18,7 @@ import nltk
  from nltk.corpus import stopwords
  import subprocess
  import sys
+ import random

  # !pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
  subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl'])
@@ -23,10 +26,20 @@ subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'https://huggingf
  nltk.download('stopwords')
  nlp = spacy.load("en_core_web_sm")
  stops = stopwords.words("english")
+ ROMAN_CONSTANTS = (
+     ( "", "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX" ),
+     ( "", "X", "XX", "XXX", "XL", "L", "LX", "LXX", "LXXX", "XC" ),
+     ( "", "C", "CC", "CCC", "CD", "D", "DC", "DCC", "DCCC", "CM" ),
+     ( "", "M", "MM", "MMM", "", "", "-", "", "", "" ),
+ )

  # answer = "Pizza"
  guesses = []
- answer = "Pizza"
+ return_guesses = []
+ answer = "Moon"
+ word1 = "Black"
+ word2 = "White"
+ word3 = "Sun"


  #Mean Pooling - Take attention mask into account for correct averaging
@@ -134,65 +147,108 @@ def finetune(train_dataloader):

  # trainer.train()

- def embeddings():
-     model = SentenceTransformer("ag_news_model")
+ 
+ def get_model():
+     model = SentenceTransformer("bert-analogies")
      device = torch.device('cuda:0')
      model = model.to(device)
-     sentences = ["This is an example sentence", "Each sentence is converted"]
+     return model
+ 
+ 
+ def cosine_scores(model, sentence):
+     global word1
+     global word2
+     global word3
+     # sentence1 = f"{word1} is to {word2} as"
+     embeddings1 = model.encode(sentence, convert_to_tensor=True)

-     # model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+ def embeddings(model, sentences):
+     gpu_available = torch.cuda.is_available()
+     device = torch.device("cuda" if gpu_available else "cpu")
+     # device = torch.device('cuda:0')
      embeddings = model.encode(sentences)
-     # print(embeddings)
- 
-     # Sentences we want sentence embeddings for
-     sentences = ['This is an example sentence', 'Each sentence is converted']
+     global word1
+     global word2
+     global word3

      # Load model from HuggingFace Hub
-     tokenizer = AutoTokenizer.from_pretrained('ag_news_model')
-     # model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
- 
-     # Tokenize sentences
+     tokenizer = AutoTokenizer.from_pretrained('bert-analogies')
      encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
- 
-     # print(model.device)
-     # print(encoded_input["input_ids"].device)
-     # print(encoded_input["attention_mask"].device)
-     # print(encoded_input["token_type_ids"].device)
+     token_ids = tokenizer.encode(sentences, return_tensors='pt')
+     blank_id = tokenizer.mask_token_id
+     blank_id_idx = torch.where(encoded_input["input_ids"] == blank_id)[1]
+ 
      encoded_input["input_ids"] = encoded_input["input_ids"].to(device)
      encoded_input["attention_mask"] = encoded_input["attention_mask"].to(device)
      encoded_input['token_type_ids'] = encoded_input['token_type_ids'].to(device)
-     # print(encoded_input)
- 
-     # print(encoded_input["input_ids"].device)
-     # print(encoded_input["attention_mask"].device)
-     # print(encoded_input["token_type_ids"].device)

      encoded_input['input'] = {'input_ids':encoded_input['input_ids'], 'attention_mask':encoded_input['attention_mask']}

-     # + encoded_input['token_type_ids'] + encoded_input['attention_mask']
      del encoded_input['input_ids']
      del encoded_input['token_type_ids']
      del encoded_input['attention_mask']

-     # print(encoded_input)
- 
-     # encoded_input.to(device)
-     # Compute token embeddings
      with torch.no_grad():
+         # output = model(encoded_input)
+         print(encoded_input)
          model_output = model(**encoded_input)
+         # output = model(encoded_input_topk)
+ 
+     unmasker = pipeline('fill-mask', model='bert-analogies')
+     guesses = unmasker(sentences)
+     print(guesses)

-     print(model_output)
      # Perform pooling
      sentence_embeddings = mean_pooling(model_output, encoded_input['input']["attention_mask"])

      # Normalize embeddings
      sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

-     print("Sentence embeddings:")
-     print(sentence_embeddings)
-     return sentence_embeddings
+     potential_words = []
+     for guess in guesses:
+         temp_word = guess['token_str']
+         if temp_word[0].isalpha() and temp_word not in stops and temp_word not in ROMAN_CONSTANTS:
+             potential_words.append(guess['token_str'])
+ 
+     return potential_words
+ 
+ 
+ def random_word():
+     with open('ag_news_model/vocab.txt', 'r') as file:
+         line = ""
+         content = file.readlines()
+         length = len(content)
+         while line == "":
+             rand_line = random.randrange(1997, length)
+ 
+             if content[rand_line][0].isalpha() and content[rand_line][:-1] not in stops and content[rand_line][:-1] not in ROMAN_CONSTANTS:
+                 line = content[rand_line]
+             else:
+                 print(f"{content[rand_line]} is not alpha or is a stop word")
+         # for num, aline in enumerate(file, 1997):
+         #     if random.randrange(num) and aline.isalpha():
+         #         continue
+         #     # elif not aline.isalpha():
+ 
+         #     line = aline
+     print(line)
+     return line[:-1]
+ 
+ 
+ def generate_prompt(model):
+     global word1
+     global word2
+     global word3
+     global answer
+     word1 = random_word()
+     word2 = random_word()
+     word3 = random_word()
+     sentence = f"{word1} is to {word2} as {word3} is to [MASK]"
+     print(sentence)
+     answer = embeddings(model, sentence)[0]
+     print("ANSWER IS", answer)
+     # cosine_scores(model, sentence)

- 

  def greet(name):
      return "Hello " + name + "!!"
@@ -200,29 +256,46 @@ def greet(name):
  def check_answer(guess:str):
      global guesses
      global answer
-     guesses.append(guess)
+     global return_guesses
+     model = get_model()
      output = ""
-     for guess in guesses:
-         output += ("- " + guess + "\n")
+     protected_guess = guess
+     sentence = f"{word1} is to {word2} as [MASK] is to {guess}"
+     other_word = embeddings(model, sentence)[0]
+     guesses.append(guess)
+     print("GUESS IS", guess)
+     return_guess = f"{guess}: {word1} is to {word2} as {other_word} is to {guess}"
+     print("GUESS IS", guess)
+     return_guesses.append(return_guess)
+     for guess in return_guesses:
+         output += (guess + "\n")
      output = output[:-1]
+     print("GUESS IS", protected_guess)

-     if guess.lower() == answer.lower():
+     print("IS", protected_guess, "EQUAL TO", answer, ":", protected_guess.lower() == answer.lower())
+     if protected_guess.lower() == answer.lower():
          return "Correct!", output
      else:
+ 
          return "Try again!", output

  def main():
-     word1 = "Black"
-     word2 = "White"
-     word3 = "Sun"
+     global word1
+     global word2
+     global word3
      global answer
-     answer = "Moon"
+     # answer = "Moon"
      global guesses

+ 
      # num_rows, data_type, value, example, embeddings = training()
-     sent_embeddings = embeddings()
+     # sent_embeddings = embeddings()
+     model = get_model()
+     generate_prompt(model)

      prompt = f"{word1} is to {word2} as {word3} is to ____"
+     print(prompt)
+     print("TESTING EMBEDDINGS")
      with gr.Blocks() as iface:
          gr.Markdown(prompt)
          with gr.Tab("Guess"):
@@ -231,8 +304,8 @@ def main():
              text_button = gr.Button("Submit")
          with gr.Accordion("Open for previous guesses"):
              text_guesses = gr.Textbox()
-         with gr.Tab("Testing"):
-             gr.Markdown(f"""The Embeddings are {sent_embeddings}.""")
+         # with gr.Tab("Testing"):
+         #     gr.Markdown(f"""The Embeddings are {sent_embeddings}.""")
          text_button.click(check_answer, inputs=[text_input], outputs=[text_output, text_guesses])
          # iface = gr.Interface(fn=greet, inputs="text", outputs="text")
      iface.launch()
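
The "random masks" behavior in the commit message comes from `generate_prompt()` and `embeddings()` above: three random vocabulary words are drawn, a masked analogy prompt is built, and a fill-mask pipeline over the saved checkpoint supplies the hidden answer. Below is a minimal sketch of that flow; it reuses the vocabulary path and the 1997-row offset from `app.py`, but the simplified filtering (alphabetic entries only) is illustrative rather than the exact logic.

```python
import random
from transformers import pipeline

# Fill-mask pipeline over the locally saved checkpoint, as embeddings() does.
unmasker = pipeline("fill-mask", model="bert-analogies")

def random_word(vocab_path="ag_news_model/vocab.txt"):
    # Draw a random plain-word vocabulary entry (skip the special-token rows).
    with open(vocab_path) as f:
        words = [w.strip() for w in f.readlines()[1997:]]
    return random.choice([w for w in words if w and w[0].isalpha()])

word1, word2, word3 = (random_word() for _ in range(3))
prompt = f"{word1} is to {word2} as {word3} is to [MASK]"
answer = unmasker(prompt)[0]["token_str"]  # top fill-mask guess becomes the target
print(prompt, "->", answer)
```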
bert-analogies/1_Pooling/config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "word_embedding_dimension": 384,
+   "pooling_mode_cls_token": false,
+   "pooling_mode_mean_tokens": true,
+   "pooling_mode_max_tokens": false,
+   "pooling_mode_mean_sqrt_len_tokens": false
+ }
bert-analogies/README.md ADDED
@@ -0,0 +1,88 @@
+ ---
+ pipeline_tag: sentence-similarity
+ tags:
+ - sentence-transformers
+ - feature-extraction
+ - sentence-similarity
+ 
+ ---
+ 
+ # {MODEL_NAME}
+ 
+ This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search.
+ 
+ <!--- Describe your model here -->
+ 
+ ## Usage (Sentence-Transformers)
+ 
+ Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
+ 
+ ```
+ pip install -U sentence-transformers
+ ```
+ 
+ Then you can use the model like this:
+ 
+ ```python
+ from sentence_transformers import SentenceTransformer
+ sentences = ["This is an example sentence", "Each sentence is converted"]
+ 
+ model = SentenceTransformer('{MODEL_NAME}')
+ embeddings = model.encode(sentences)
+ print(embeddings)
+ ```
+ 
+ 
+ 
+ ## Evaluation Results
+ 
+ <!--- Describe how your model was evaluated -->
+ 
+ For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name={MODEL_NAME})
+ 
+ 
+ ## Training
+ The model was trained with the parameters:
+ 
+ **DataLoader**:
+ 
+ `torch.utils.data.dataloader.DataLoader` of length 36 with parameters:
+ ```
+ {'batch_size': 25, 'sampler': 'torch.utils.data.sampler.RandomSampler', 'batch_sampler': 'torch.utils.data.sampler.BatchSampler'}
+ ```
+ 
+ **Loss**:
+ 
+ `sentence_transformers.losses.MegaBatchMarginLoss.MegaBatchMarginLoss`
+ 
+ Parameters of the fit()-Method:
+ ```
+ {
+     "epochs": 10,
+     "evaluation_steps": 0,
+     "evaluator": "NoneType",
+     "max_grad_norm": 1,
+     "optimizer_class": "<class 'torch.optim.adamw.AdamW'>",
+     "optimizer_params": {
+         "lr": 2e-05
+     },
+     "scheduler": "WarmupLinear",
+     "steps_per_epoch": null,
+     "warmup_steps": 10000,
+     "weight_decay": 0.01
+ }
+ ```
+ 
+ 
+ ## Full Model Architecture
+ ```
+ SentenceTransformer(
+   (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel
+   (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
+   (2): Normalize()
+ )
+ ```
+ 
+ ## Citing & Authors
+ 
+ <!--- Describe where people can find more information -->
bert-analogies/config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "_name_or_path": "/home/smhavens/.cache/torch/sentence_transformers/sentence-transformers_all-MiniLM-L6-v2/",
+   "architectures": [
+     "BertModel"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 384,
+   "initializer_range": 0.02,
+   "intermediate_size": 1536,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 6,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float32",
+   "transformers_version": "4.35.2",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 30522
+ }
bert-analogies/config_sentence_transformers.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "__version__": {
+     "sentence_transformers": "2.0.0",
+     "transformers": "4.6.1",
+     "pytorch": "1.8.1"
+   }
+ }
bert-analogies/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a0f7c659ba309023355f651ecbe72279f6caef5fe5f274e59168f1bcedb36368
+ size 90864192
bert-analogies/modules.json ADDED
@@ -0,0 +1,20 @@
+ [
+   {
+     "idx": 0,
+     "name": "0",
+     "path": "",
+     "type": "sentence_transformers.models.Transformer"
+   },
+   {
+     "idx": 1,
+     "name": "1",
+     "path": "1_Pooling",
+     "type": "sentence_transformers.models.Pooling"
+   },
+   {
+     "idx": 2,
+     "name": "2",
+     "path": "2_Normalize",
+     "type": "sentence_transformers.models.Normalize"
+   }
+ ]
bert-analogies/sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "max_seq_length": 256,
+   "do_lower_case": false
+ }
bert-analogies/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
bert-analogies/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
bert-analogies/tokenizer_config.json ADDED
@@ -0,0 +1,64 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "101": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "102": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "103": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": true,
+   "mask_token": "[MASK]",
+   "max_length": 128,
+   "model_max_length": 512,
+   "never_split": null,
+   "pad_to_multiple_of": null,
+   "pad_token": "[PAD]",
+   "pad_token_type_id": 0,
+   "padding_side": "right",
+   "sep_token": "[SEP]",
+   "stride": 0,
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "truncation_side": "right",
+   "truncation_strategy": "longest_first",
+   "unk_token": "[UNK]"
+ }
bert-analogies/vocab.txt ADDED
The diff for this file is too large to render. See raw diff