smhavens committed
Commit 33e257e
1 Parent(s): b3ffc6e

Add files via upload

app_context.py ADDED
@@ -0,0 +1,258 @@
import gradio as gr
import math
import spacy
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers import InputExample
from sentence_transformers import losses
from sentence_transformers import util
from transformers import pipeline, T5Tokenizer
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, T5ForConditionalGeneration
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
import numpy as np
import evaluate
import nltk
from nltk.corpus import stopwords
import subprocess
import sys
import random
from textwrap import fill

# !pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl'])
# tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model_base = "results/checkpoint-17000"
nltk.download('stopwords')
nlp = spacy.load("en_core_web_sm")
stops = stopwords.words("english")
ROMAN_CONSTANTS = (
    ( "", "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX" ),
    ( "", "X", "XX", "XXX", "XL", "L", "LX", "LXX", "LXXX", "XC" ),
    ( "", "C", "CC", "CCC", "CD", "D", "DC", "DCC", "DCCC", "CM" ),
    ( "", "M", "MM", "MMM", "", "", "-", "", "", "" ),
    ( "", "i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix" ),
    ( "", "x", "xx", "xxx", "xl", "l", "lx", "lxx", "lxxx", "xc" ),
    ( "", "c", "cc", "ccc", "cd", "d", "dc", "dcc", "dccc", "cm" ),
    ( "", "m", "mm", "mmm", "", "", "-", "", "", "" ),
)

# answer = "Pizza"
guesses = []
return_guesses = []
answer = "Moon"
word1 = "Black"
word2 = "White"
word3 = "Sun"
base_prompts = ["Sun is to Moon as ", "Black is to White as ", "Atom is to Element as",
                "Athens is to Greece as ", "Cat is to Dog as ", "Robin is to Bird as",
                "Hunger is to Ambition as "]


# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output['token_embeddings']  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


def normalize(comment, lowercase, remove_stopwords):
    if lowercase:
        comment = comment.lower()
    comment = nlp(comment)
    lemmatized = list()
    for word in comment:
        lemma = word.lemma_.strip()
        if lemma:
            if not remove_stopwords or (remove_stopwords and lemma not in stops):
                lemmatized.append(lemma)
    return " ".join(lemmatized)


# def tokenize_function(examples):
#     return tokenizer(examples["text"])


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    metric = evaluate.load("accuracy")
    return metric.compute(predictions=predictions, references=labels)


def get_model():
    global model_base
    # last_checkpoint = "./results/checkpoint-22500"

    finetuned_model = T5ForConditionalGeneration.from_pretrained(model_base)
    tokenizer = T5Tokenizer.from_pretrained(model_base)
    # model = SentenceTransformer(model_base)
    gpu_available = torch.cuda.is_available()
    device = torch.device("cuda" if gpu_available else "cpu")
    finetuned_model = finetuned_model.to(device)
    return finetuned_model, tokenizer


def cosine_scores(model, sentence):
    global word1
    global word2
    global word3
    # sentence1 = f"{word1} is to {word2} as"
    embeddings1 = model.encode(sentence, convert_to_tensor=True)


def embeddings(model, sentences, tokenizer):
    global word1
    global word2
    global word3
    global model_base
    gpu_available = torch.cuda.is_available()
    device = torch.device("cuda" if gpu_available else "cpu")
    # device = torch.device('cuda:0')
    # embeddings = model.encode(sentences)
    question = "Please answer to this question: " + sentences

    inputs = tokenizer(question, return_tensors="pt")

    print(inputs)
    # print(inputs.device)
    print(model.device)
    print(inputs['input_ids'].device)
    print(inputs['attention_mask'].device)

    inputs['attention_mask'] = inputs['attention_mask'].to(device)
    inputs['input_ids'] = inputs['input_ids'].to(device)

    outputs = model.generate(**inputs)
    answer = tokenizer.decode(outputs[0])
    answer = answer[6:-4]
    # print(fill(answer, width=80))

    print("ANSWER IS", answer)

    return answer


def random_word(model, tokenizer):
    global model_base
    vocab = tokenizer.get_vocab()
    # with open(model_base + '/vocab.txt', 'r') as file:
    line = ""
    # content = file.readlines()
    length = tokenizer.vocab_size
    # print(vocab)
    while line == "":
        rand_line = random.randrange(0, length)
        # print("TRYING TO FIND", rand_line, "OUT OF", length, "WITH VOCAB OF TYPE", type(vocab))
        for word, id in vocab.items():
            if id == rand_line and word[0].isalpha() and word not in stops and word not in ROMAN_CONSTANTS:
                # if vocab[rand_line][0].isalpha() and vocab[rand_line][:-1] not in stops and vocab[rand_line][:-1] not in ROMAN_CONSTANTS:
                line = word
            elif id == rand_line:
                print(f"{word} is not alpha or is a stop word")
    # for num, aline in enumerate(file, 1997):
    #     if random.randrange(num) and aline.isalpha():
    #         continue
    #     # elif not aline.isalpha():

    #     line = aline
    print(line)
    return line


def generate_prompt(model, tokenizer):
    global word1
    global word2
    global word3
    global answer
    global base_prompts
    word1 = random_word(model, tokenizer)
    # word2 = random_word()

    word2 = embeddings(model, f"{base_prompts[random.randint(0, len(base_prompts) - 1)]}{word1} is to ___.", tokenizer)
    word3 = random_word(model, tokenizer)
    sentence = f"{word1} is to {word2} as {word3} is to ___."
    print(sentence)
    answer = embeddings(model, sentence, tokenizer)
    print("ANSWER IS", answer)
    return f"# {word1} is to {word2} as {word3} is to ___."
    # cosine_scores(model, sentence)


def greet(name):
    return "Hello " + name + "!!"


def check_answer(guess:str):
    global guesses
    global answer
    global return_guesses
    global word1
    global word2
    global word3

    model, tokenizer = get_model()
    output = ""
    protected_guess = guess
    sentence = f"{word1} is to {word2} as [MASK] is to {guess}."

    other_word = embeddings(model, sentence, tokenizer)
    guesses.append(guess)

    for guess in return_guesses:
        output += ("- " + guess + "<br>")

    # output = output[:-1]
    prompt = f"{word1} is to {word2} as {word3} is to ___."
    # print("IS", protected_guess, "EQUAL TO", answer, ":", protected_guess.lower() == answer.lower())

    if protected_guess.lower() == answer.lower():
        return_guesses.append(f"{protected_guess}: {word1} is to {word2} as {word3} is to {protected_guess}.")
        output += f"<span style='color:green'>- {return_guesses[-1]}</span><br>"
        new_prompt = generate_prompt(model, tokenizer)
        return new_prompt, "Correct!", output
    else:
        return_guess = f"{protected_guess}: {word1} is to {word2} as {other_word} is to {protected_guess}."
        return_guesses.append(return_guess)
        output += ("- " + return_guess + " <br>")
        return prompt, "Try again!", output


def main():
    global word1
    global word2
    global word3
    global answer
    # answer = "Moon"
    global guesses

    # num_rows, data_type, value, example, embeddings = training()
    # sent_embeddings = embeddings()
    model, tokenizer = get_model()
    generate_prompt(model, tokenizer)

    prompt = f"{word1} is to {word2} as {word3} is to ____"
    print(prompt)
    print("TESTING EMBEDDINGS")
    with gr.Blocks() as iface:
        mark_question = gr.Markdown(prompt)
        with gr.Tab("Guess"):
            text_input = gr.Textbox()
            text_output = gr.Textbox()
            text_button = gr.Button("Submit")
        with gr.Accordion("Open for previous guesses"):
            text_guesses = gr.Markdown()
        # with gr.Tab("Testing"):
        #     gr.Markdown(f"""The Embeddings are {sent_embeddings}.""")
        text_button.click(check_answer, inputs=[text_input], outputs=[mark_question, text_output, text_guesses])
    # iface = gr.Interface(fn=greet, inputs="text", outputs="text")
    iface.launch()


if __name__ == "__main__":
    main()
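Note on the decoding step in embeddings() above: the slice answer[6:-4] assumes the generated text is always wrapped exactly as "<pad> ... </s>". A minimal alternative sketch (not part of this upload) would let the tokenizer drop the special tokens instead of relying on fixed offsets:

    # hedged sketch: skip_special_tokens removes <pad> and </s> whatever their length
    outputs = model.generate(**inputs)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()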
checkpoint-17000/added_tokens.json ADDED
@@ -0,0 +1,102 @@
{
  "<extra_id_0>": 32099,
  "<extra_id_10>": 32089,
  "<extra_id_11>": 32088,
  "<extra_id_12>": 32087,
  "<extra_id_13>": 32086,
  "<extra_id_14>": 32085,
  "<extra_id_15>": 32084,
  "<extra_id_16>": 32083,
  "<extra_id_17>": 32082,
  "<extra_id_18>": 32081,
  "<extra_id_19>": 32080,
  "<extra_id_1>": 32098,
  "<extra_id_20>": 32079,
  "<extra_id_21>": 32078,
  "<extra_id_22>": 32077,
  "<extra_id_23>": 32076,
  "<extra_id_24>": 32075,
  "<extra_id_25>": 32074,
  "<extra_id_26>": 32073,
  "<extra_id_27>": 32072,
  "<extra_id_28>": 32071,
  "<extra_id_29>": 32070,
  "<extra_id_2>": 32097,
  "<extra_id_30>": 32069,
  "<extra_id_31>": 32068,
  "<extra_id_32>": 32067,
  "<extra_id_33>": 32066,
  "<extra_id_34>": 32065,
  "<extra_id_35>": 32064,
  "<extra_id_36>": 32063,
  "<extra_id_37>": 32062,
  "<extra_id_38>": 32061,
  "<extra_id_39>": 32060,
  "<extra_id_3>": 32096,
  "<extra_id_40>": 32059,
  "<extra_id_41>": 32058,
  "<extra_id_42>": 32057,
  "<extra_id_43>": 32056,
  "<extra_id_44>": 32055,
  "<extra_id_45>": 32054,
  "<extra_id_46>": 32053,
  "<extra_id_47>": 32052,
  "<extra_id_48>": 32051,
  "<extra_id_49>": 32050,
  "<extra_id_4>": 32095,
  "<extra_id_50>": 32049,
  "<extra_id_51>": 32048,
  "<extra_id_52>": 32047,
  "<extra_id_53>": 32046,
  "<extra_id_54>": 32045,
  "<extra_id_55>": 32044,
  "<extra_id_56>": 32043,
  "<extra_id_57>": 32042,
  "<extra_id_58>": 32041,
  "<extra_id_59>": 32040,
  "<extra_id_5>": 32094,
  "<extra_id_60>": 32039,
  "<extra_id_61>": 32038,
  "<extra_id_62>": 32037,
  "<extra_id_63>": 32036,
  "<extra_id_64>": 32035,
  "<extra_id_65>": 32034,
  "<extra_id_66>": 32033,
  "<extra_id_67>": 32032,
  "<extra_id_68>": 32031,
  "<extra_id_69>": 32030,
  "<extra_id_6>": 32093,
  "<extra_id_70>": 32029,
  "<extra_id_71>": 32028,
  "<extra_id_72>": 32027,
  "<extra_id_73>": 32026,
  "<extra_id_74>": 32025,
  "<extra_id_75>": 32024,
  "<extra_id_76>": 32023,
  "<extra_id_77>": 32022,
  "<extra_id_78>": 32021,
  "<extra_id_79>": 32020,
  "<extra_id_7>": 32092,
  "<extra_id_80>": 32019,
  "<extra_id_81>": 32018,
  "<extra_id_82>": 32017,
  "<extra_id_83>": 32016,
  "<extra_id_84>": 32015,
  "<extra_id_85>": 32014,
  "<extra_id_86>": 32013,
  "<extra_id_87>": 32012,
  "<extra_id_88>": 32011,
  "<extra_id_89>": 32010,
  "<extra_id_8>": 32091,
  "<extra_id_90>": 32009,
  "<extra_id_91>": 32008,
  "<extra_id_92>": 32007,
  "<extra_id_93>": 32006,
  "<extra_id_94>": 32005,
  "<extra_id_95>": 32004,
  "<extra_id_96>": 32003,
  "<extra_id_97>": 32002,
  "<extra_id_98>": 32001,
  "<extra_id_99>": 32000,
  "<extra_id_9>": 32090
}
checkpoint-17000/config.json ADDED
@@ -0,0 +1,62 @@
{
  "_name_or_path": "google/flan-t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to German: "
    },
    "translation_en_to_fr": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to French: "
    },
    "translation_en_to_ro": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to Romanian: "
    }
  },
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.35.2",
  "use_cache": true,
  "vocab_size": 32128
}
checkpoint-17000/generation_config.json ADDED
@@ -0,0 +1,6 @@
{
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.35.2"
}
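The checkpoint-17000/*.json files above are the metadata that lets app_context.py reload the fine-tuned model from the checkpoint directory. A minimal loading sketch mirroring get_model(); it assumes the model weights and the remaining tokenizer files are also present in the same directory, which are not shown in this commit:

    from transformers import T5ForConditionalGeneration, T5Tokenizer

    model = T5ForConditionalGeneration.from_pretrained("results/checkpoint-17000")
    tokenizer = T5Tokenizer.from_pretrained("results/checkpoint-17000")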
flan-t5-train.py ADDED
@@ -0,0 +1,302 @@
import gradio as gr
import math
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
import numpy as np
import evaluate
import nltk
from nltk.corpus import stopwords
import subprocess
import sys
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import DataCollatorWithPadding, DistilBertTokenizerFast
from transformers import TrainingArguments
from transformers import (
    BertModel,
    BertTokenizerFast,
    Trainer,
    EvalPrediction
)

# !pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl'])
# tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# nltk.download('stopwords')
# nlp = spacy.load("en_core_web_sm")
# stops = stopwords.words("english")
nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

# Global Parameters
L_RATE = 3e-4
BATCH_SIZE = 8
PER_DEVICE_EVAL_BATCH = 4
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 3
NUM_EPOCHS = 10

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=L_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
    weight_decay=WEIGHT_DECAY,
    save_total_limit=SAVE_TOTAL_LIM,
    num_train_epochs=NUM_EPOCHS,
    predict_with_generate=True,
    push_to_hub=False
)

model_id = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_id)
# tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
# metric = evaluate.load("accuracy")


def tokenize_function(examples):
    return tokenizer(examples["stem"], padding="max_length", truncation=True)


# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     metric = evaluate.load("accuracy")
#     return metric.compute(predictions=predictions, references=labels)

def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # decode preds and labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    return result


def training():
    dataset_id = "tomasmcz/word2vec_analogy"
    # dataset_id = "relbert/scientific_and_creative_analogy"
    # dataset_sub = "Quadruples_Kmiecik_random_split"
    print("GETTING DATASET")
    dataset = load_dataset(dataset_id)
    # dataset = dataset["train"]
    # tokenized_datasets = dataset.map(tokenize_function, batched=True)

    print(dataset)
    print(f"- The {dataset_id} dataset has {dataset['train'].num_rows} examples.")
    print(f"- Each example is a {type(dataset['train'][0])} with a {type(dataset['train'][0])} as value.")
    print(f"- Examples look like this: {dataset['train'][0]}")

    # for i in dataset["train"]:
    #     print(i["AB"], "to", i["CD"], "is", i["label"])

    dataset = dataset["train"].train_test_split(test_size=0.3)

    # We prefix our tasks with "answer the question"
    prefix = "Please answer this question: "

    # Define the preprocessing function

    # def preprocess_function(examples):
    #     """Add prefix to the sentences, tokenize the text, and set the labels"""
    #     # The "inputs" are the tokenized answer:
    #     inputs = []
    #     # print(examples)
    #     # inputs = [prefix + doc for doc in examples["question"]]
    #     for doc in examples['source']:
    #         # print("THE DOC IS:", doc)
    #         # print("THE DOC IS:", examples[i]['AB'], examples[i]['CD'], examples[i]['label'])
    #         prompt = f"{prefix}map "
    #         for item in doc:
    #             prompt += f"{item}, and "
    #         prompt = prompt[:-6]
    #         inputs.append(prompt)
    #     # inputs = [prefix + doc for doc in examples["question"]]
    #     for indx, doc in enumerate(examples["target_random"]):
    #         prompt = f" to "
    #         for item in doc:
    #             prompt += f"{item}, and "
    #         prompt = prompt[:-6] + "."
    #         inputs[indx] += prompt
    #     model_inputs = tokenizer(inputs, max_length=128, truncation=True)

    def preprocess_function(examples):
        """Add prefix to the sentences, tokenize the text, and set the labels"""
        # The "inputs" are the tokenized answer:
        inputs = []
        # print(examples)
        # inputs = [prefix + doc for doc in examples["question"]]
        for doc in examples['word_a']:
            # print("THE DOC IS:", doc)
            # print("THE DOC IS:", examples[i]['AB'], examples[i]['CD'], examples[i]['label'])
            prompt = f"{prefix}{doc} is to "
            inputs.append(prompt)
        # inputs = [prefix + doc for doc in examples["question"]]
        for indx, doc in enumerate(examples["word_b"]):
            prompt = f"{doc} as "
            inputs[indx] += prompt

        for indx, doc in enumerate(examples["word_c"]):
            prompt = f"{doc} is to ___."
            inputs[indx] += prompt
        model_inputs = tokenizer(inputs, max_length=128, truncation=True)

        # print(examples["label"], type(examples["label"]))

        # The "labels" are the tokenized outputs:
        labels = tokenizer(text_target=examples["word_d"],
                           max_length=512,
                           truncation=True)

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    # Map the preprocessing function across our dataset
    tokenized_dataset = dataset.map(preprocess_function, batched=True)
    # train_examples = []
    # train_data = dataset["test"]
    # # For agility we only 1/2 of our available data
    # n_examples = dataset["test"].num_rows // 2

    # for i in range(n_examples):
    #     example = train_data[i]
    #     temp_word_1 = example["stem"][0]
    #     temp_word_2 = example["stem"][1]
    #     temp_word_3 = example["choice"][example["answer"]][0]
    #     temp_word_4 = example["choice"][example["answer"]][1]
    #     comp1 = f"{temp_word_1} to {temp_word_2}"
    #     comp2 = f"{temp_word_3} to {temp_word_4}"
    #     # example_opposite = dataset_clean[-(i)]
    #     # print(example["text"])
    #     train_examples.append(InputExample(texts=[comp1, comp2]))

    # train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=25)

    print("END DATALOADER")

    # print(train_examples)

    embeddings = finetune(tokenized_dataset)

    return 0


def finetune(dataset):
    # model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
    # model_id = "sentence-transformers/all-MiniLM-L6-v2"
    model_id = "google/flan-t5-base"
    # model_id = "distilbert-base-uncased"
    # tokenizer = DistilBertTokenizerFast.from_pretrained(model_id)
    tokenizer = T5Tokenizer.from_pretrained(model_id)
    model = T5ForConditionalGeneration.from_pretrained(model_id)
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
    device = torch.device('cuda:0')
    model = model.to(device)

    # training_args = TrainingArguments(output_dir="test_trainer")

    # USE THIS LINK
    # https://huggingface.co/blog/how-to-train-sentence-transformers

    # train_loss = losses.MegaBatchMarginLoss(model=model)
    # ds_train, ds_valid = dataset.train_test_split(test_size=0.2, seed=42)

    print("BEGIN FIT")

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        # evaluation_strategy="no"
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    # model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10)

    trainer.train()

    # model.save("flan-analogies")

    # model.save_to_hub("smhavens/bert-base-analogies")
    # accuracy = compute_metrics(eval, metric)
    return 0


def greet(name):
    return "Hello " + name + "!!"


def check_answer(guess:str):
    global guesses
    global answer
    guesses.append(guess)
    output = ""
    for guess in guesses:
        output += ("- " + guess + "\n")
    output = output[:-1]

    if guess.lower() == answer.lower():
        return "Correct!", output
    else:
        return "Try again!", output


def main():
    print("BEGIN")
    word1 = "Black"
    word2 = "White"
    word3 = "Sun"
    global answer
    answer = "Moon"
    global guesses

    training()

    # prompt = f"{word1} is to {word2} as {word3} is to ____"
    # with gr.Blocks() as iface:
    #     gr.Markdown(prompt)
    #     with gr.Tab("Guess"):
    #         text_input = gr.Textbox()
    #         text_output = gr.Textbox()
    #         text_button = gr.Button("Submit")
    #     with gr.Accordion("Open for previous guesses"):
    #         text_guesses = gr.Textbox()
    #     with gr.Tab("Testing"):
    #         gr.Markdown(f"""Number of rows in dataset is {num_rows}, with each having type {data_type} and value {value}.
    #             An example is {example}.
    #             The Embeddings are {embeddings}.""")
    #     text_button.click(check_answer, inputs=[text_input], outputs=[text_output, text_guesses])
    # # iface = gr.Interface(fn=greet, inputs="text", outputs="text")
    # iface.launch()


if __name__ == "__main__":
    main()
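For reference, preprocess_function above turns one row of tomasmcz/word2vec_analogy into a prompt/target pair. With a hypothetical row such as {"word_a": "Athens", "word_b": "Greece", "word_c": "Oslo", "word_d": "Norway"} (example values assumed, not taken from the dataset), the pair handed to the tokenizer would be:

    input_text = "Please answer this question: Athens is to Greece as Oslo is to ___."
    target_text = "Norway"

Training writes checkpoints under ./results (for example results/checkpoint-17000), which is the directory app_context.py loads at startup.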
word_embedding.py ADDED
@@ -0,0 +1,617 @@
from datasets import load_dataset
import shutil
import json
from collections import defaultdict
import multiprocessing
import gensim
from sklearn.metrics import classification_report
from gensim import corpora
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.models import fasttext
from gensim.test.utils import datapath
from wefe.datasets import load_bingliu
from wefe.metrics import RNSB
from wefe.query import Query
from wefe.word_embedding_model import WordEmbeddingModel
from wefe.utils import plot_queries_results, run_queries
import pandas as pd
import gensim.downloader as api
import glob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from wefe.metrics import WEAT
from wefe.datasets import load_weat
from wefe.utils import run_queries
from wefe.utils import plot_queries_results
import random
from scipy.special import expit
import math
import sys
import os
import argparse
import nltk
import scipy.sparse
import numpy as np
import string
import io
from sklearn.model_selection import train_test_split


'''STEPS FOR CODE:
1. Train word embeddings on Simple English Wikipedia;
2. Compare these to other pre-trained embeddings;
3. Quantify biases that exist in these word embeddings;
4. Use your word embeddings as features in a simple text classifier;
'''


def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    # print("Hello", n, d)
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = map(float, tokens[1:])
        # print(data)

    print(data)
    return data


def train_embeddings():
    '''TRAIN WORD EMBEDDINGS
    This will be making use of the dataset from wikipedia and the first step'''
    dataset = load_dataset("wikipedia", "20220301.simple")
    cores = multiprocessing.cpu_count()
    # check the first example of the training portion of the dataset :
    # print(dataset['train'][0])
    dataset_size = len(dataset)

    ### BUILD VOCAB ###
    # print(type(dataset["train"][0]))
    vocab = set()
    vocab_size = 0
    count = 0
    ## Generate vocab and split sentences and words?
    data = []
    for index, page in enumerate(dataset["train"]):
        document = page["text"]
        document = document.replace("\n", ". ")
        # print(document)
        for sent in document.split("."):
            # print("Sentence:", sent)
            new_sent = []
            clean_sent = [s for s in sent if s.isalnum() or s.isspace()]
            clean_sent = "".join(clean_sent)
            for word in clean_sent.split(" "):
                if len(word) > 0:
                    new_word = word.lower()
                    # print("Word:", new_word)
                    if new_word[0] not in string.punctuation:
                        new_sent.append(new_word)
            if len(new_sent) > 0:
                data.append(new_sent)
                # print("New Sent:", new_sent)

    for index, page in enumerate(dataset["train"]):
        # print(page["text"])
        # for text in page:
        #     print(text)
        text = page["text"]
        clean_text = [s for s in text if s.isalnum() or s.isspace()]
        clean_text = "".join(clean_text)
        clean_text = clean_text.replace("\n", " ")
        # text = text.replace('; ', ' ').replace(", ", " ").replace("\n", " ").replace(":", " ").replace(". ", " ").replace("! ", " ").replace("? ", " ").replace()

        for word in clean_text.split(" "):
            # print(word)
            if word != "\n" and word != " " and word not in vocab:
                vocab.add(word)
                vocab_size += 1
                # if index == 10:
                #     break
                # print(f"word #{index}/{count} is {word}")
            count += 1

    # print(f"There are {vocab_size} vocab words")

    embeddings_model = Word2Vec(
        data,
        epochs=10,
        window=10,
        vector_size=50)
    embeddings_model.save("word2vec.model")

    skip_model = Word2Vec(
        data,
        epochs=10,
        window=10,
        vector_size=50,
        sg=1)
    skip_model.save("skip2vec.model")

    embeddings_model = Word2Vec.load("word2vec.model")
    skip_model = Word2Vec.load("skip2vec.model")

    # embeddings_model.train(dataset, total_examples=dataset_size, epochs=15)
    # print(embeddings_model['train'])
    # print(embeddings_model.wv["france"])
    return embeddings_model, skip_model


def get_data():
    dataset = load_dataset("wikipedia", "20220301.simple")
    cores = multiprocessing.cpu_count()
    # check the first example of the training portion of the dataset :
    # print(dataset['train'][0])
    dataset_size = len(dataset)

    ### BUILD VOCAB ###
    # print(type(dataset["train"][0]))
    vocab = set()
    vocab_size = 0
    count = 0
    ## Generate vocab and split sentences and words?
    data = []
    num_sents = 0
    for index, page in enumerate(dataset["train"]):
        document = page["text"]
        document = document.replace("\n", ". ")
        # print(document)
        for sent in document.split("."):
            num_sents += 1
            # print("Sentence:", sent)
            new_sent = []
            clean_sent = [s for s in sent if s.isalnum() or s.isspace()]
            clean_sent = "".join(clean_sent)
            for word in clean_sent.split(" "):
                if len(word) > 0:
                    new_word = word.lower()
                    # print("Word:", new_word)
                    if new_word[0] not in string.punctuation:
                        new_sent.append(new_word)
            if len(new_sent) > 0:
                data.append(new_sent)
                # print("New Sent:", new_sent)

    return data, num_sents


def compare_embeddings(cbow, skip, urban, fasttext):
    '''COMPARE EMBEDDINGS'''
    print("Most Similar to dog")
    print("cbow", cbow.wv.most_similar(positive=['dog'], negative=[], topn=2))
    print("skip", skip.wv.most_similar(positive=['dog'], negative=[], topn=2))
    print("urban", urban.most_similar(positive=['dog'], negative=[], topn=2))
    print("fasttext", fasttext.most_similar(positive=['dog'], negative=[], topn=2))

    print("\nMost Similar to Pizza - Pepperoni + Pretzel")
    print("cbow", cbow.wv.most_similar(positive=['pizza', 'pretzel'], negative=['pepperoni'], topn=2))
    print("skip", skip.wv.most_similar(positive=['pizza', 'pretzel'], negative=['pepperoni'], topn=2))
    print("urban", urban.most_similar(positive=['pizza', 'pretzel'], negative=['pepperoni'], topn=2))
    print("fasttext", fasttext.most_similar(positive=['pizza', 'pretzel'], negative=['pepperoni'], topn=2))

    print("\nMost Similar to witch - woman + man")
    print("cbow", cbow.wv.most_similar(positive=['witch', 'man'], negative=['woman'], topn=2))
    print("skip", skip.wv.most_similar(positive=['witch', 'man'], negative=['woman'], topn=2))
    print("urban", urban.most_similar(positive=['witch', 'man'], negative=['woman'], topn=2))
    print("fasttext", fasttext.most_similar(positive=['witch', 'man'], negative=['woman'], topn=2))

    print("\nMost Similar to mayor - town + country")
    print("cbow", cbow.wv.most_similar(positive=['mayor', 'country'], negative=['town'], topn=2))
    print("skip", skip.wv.most_similar(positive=['mayor', 'country'], negative=['town'], topn=2))
    print("urban", urban.most_similar(positive=['mayor', 'country'], negative=['town'], topn=2))
    print("fasttext", fasttext.most_similar(positive=['mayor', 'country'], negative=['town'], topn=2))

    print("\nMost Similar to death")
    print("cbow", cbow.wv.most_similar(positive=['death'], negative=[], topn=2))
    print("skip", skip.wv.most_similar(positive=['death'], negative=[], topn=2))
    print("urban", urban.most_similar(positive=['death'], negative=[], topn=2))
    print("fasttext", fasttext.most_similar(positive=['death'], negative=[], topn=2))


def quantify_bias(cbow, skip, urban, fasttext):
    '''QUANTIFY BIASES'''
    '''Using WEFE, RNSB'''

    RNSB_words = [
        ['christianity'],
        ['catholicism'],
        ['islam'],
        ['judaism'],
        ['hinduism'],
        ['buddhism'],
        ['mormonism'],
        ['scientology'],
        ['taoism']]

    weat_wordset = load_weat()

    models = [WordEmbeddingModel(cbow.wv, "CBOW"),
              WordEmbeddingModel(skip.wv, "skip-gram"),
              WordEmbeddingModel(urban, "urban dictionary"),
              WordEmbeddingModel(fasttext, "fasttext")]

    # Define the 10 Queries:
    # print(weat_wordset["science"])
    religions = ['christianity',
                 'catholicism',
                 'islam',
                 'judaism',
                 'hinduism',
                 'buddhism',
                 'mormonism',
                 'scientology',
                 'taoism',
                 'atheism']
    queries = [
        # Flowers vs Insects wrt Pleasant (5) and Unpleasant (5)
        Query([religions, weat_wordset['arts']],
              [weat_wordset['career'], weat_wordset['family']],
              ['Religion', 'Art'], ['Career', 'Family']),

        Query([religions, weat_wordset['weapons']],
              [weat_wordset['male_terms'], weat_wordset['female_terms']],
              ['Religion', 'Weapons'], ['Male terms', 'Female terms']),

    ]

    wefe_results = run_queries(WEAT,
                               queries,
                               models,
                               metric_params={
                                   'preprocessors': [
                                       {},
                                       {'lowercase': True}
                                   ]
                               },
                               warn_not_found_words=True
                               ).T.round(2)

    print(wefe_results)
    plot_queries_results(wefe_results).show()


def text_classifier(cbow):
    '''SIMPLE TEXT CLASSIFIER'''
    '''For each document, average together all embeddings for the
    individual words in that document to get a new, d-dimensional representation
    of that document (this is essentially a “continuous bag-of-words”). Note that
    your input feature size is only d now, instead of the size of your entire vocabulary.
    Compare the results of training a model using these “CBOW” input features to
    your original (discrete) BOW model.'''
    pos_train_files = glob.glob('aclImdb/train/pos/*')
    neg_train_files = glob.glob('aclImdb/train/neg/*')
    # print(pos_train_files[:5])

    num_files_per_class = 1000
    # bow_train_files = cbow
    all_train_files = pos_train_files[:num_files_per_class] + neg_train_files[:num_files_per_class]
    # vectorizer = TfidfVectorizer(input="filename", stop_words="english")
    # vectors = vectorizer.fit_transform(all_train_files)
    d = len(cbow.wv["man"])
    vectors = np.empty([len(all_train_files), d])
    count = 0
    vocab = set()
    for doc in all_train_files:
        temp_array = avg_embeddings(doc, cbow, vocab)
        if len(temp_array) > 0:
            vectors[count] = temp_array
            count += 1
        else:
            vectors = np.delete(vectors, count)
    # vectors = np.array(avg_embeddings(doc, cbow) for doc in all_train_files)
    # print(vectors)
    # print(vocab)

    # len(vectorizer.vocabulary_)
    vectors[0].sum()
    # print("Vector at 0", vectors[0])

    X = vectors
    y = [1] * num_files_per_class + [0] * num_files_per_class
    len(y)

    x_0 = X[0]
    w = np.zeros(X.shape[1])
    # x_0_dense = x_0.todense()
    x_0.dot(w)

    w, b = sgd_for_lr_with_ce(X, y)
    # w

    # sorted_vocab = sorted([(k,v) for k,v in vectorizer.vocabulary_.items()],key=lambda x:x[1])
    sorted_vocab = sorted(vocab)
    # sorted_vocab = [a for (a,b) in sorted_vocab]

    sorted_words_weights = sorted([x for x in zip(sorted_vocab, w)], key=lambda x: x[1])
    sorted_words_weights[-50:]

    preds = predict_y_lr(w, b, X)

    preds

    w, b = sgd_for_lr_with_ce(X, y, num_passes=10)
    y_pred = predict_y_lr(w, b, X)
    print(classification_report(y, y_pred))

    # compute for dev set
    # pos_dev_files = glob.glob('aclImdb/test/pos/*')
    # neg_dev_files = glob.glob('aclImdb/test/neg/*')
    # num_dev_files_per_class = 100
    # all_dev_files = pos_dev_files[:num_dev_files_per_class] + neg_dev_files[:num_dev_files_per_class]
    # # use the same vectorizer from before! otherwise features won't line up
    # # don't fit it again, just use it to transform!
    # X_dev = vectorizer.transform(all_dev_files)
    # y_dev = [1]* num_dev_files_per_class + [0]* num_dev_files_per_class
    # # don't need new w and b, these are from our existing model
    # y_dev_pred = predict_y_lr(w,b,X_dev)
    # print(classification_report(y_dev, y_dev_pred))


def avg_embeddings(doc, model, vocab: set):
    words = []
    # remove out-of-vocabulary words
    with open(doc, "r") as file:
        for line in file:
            for word in line.split():
                words.append(word)
                vocab.add(word)
    words = [word for word in words if word in model.wv.index_to_key]
    if len(words) >= 1:
        return np.mean(model.wv[words], axis=0)
    else:
        return []


def sent_vec(sent, cbow):
    vector_size = cbow.wv.vector_size
    wv_res = np.zeros(vector_size)
    # print(wv_res)
    ctr = 1
    for w in sent:
        if w in cbow.wv:
            ctr += 1
            wv_res += cbow.wv[w]
    wv_res = wv_res / ctr
    return wv_res


def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    # doc = nlp(sentence)

    # print(doc)
    # print(type(doc))

    # Lemmatizing each token and converting each token into lowercase
    # mytokens = [ word.lemma_.lower().strip() for word in doc ]

    # print(mytokens)

    # Removing stop words
    # mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return 0


def cbow_classifier(cbow, data, num_sentences):
    vocab_len = len(cbow.wv.index_to_key)

    embeddings = []
    embedding_dict = {}
    vocab = set(cbow.wv.index_to_key)

    # print("Data len", len(data))
    # print("Data at 0", data[0])

    X_temp = np.empty([len(data), 1])
    X_train_vect = np.array([np.array([cbow.wv[i] for i in ls if i in vocab])
                             for ls in data])
    X_test_vect = np.array([np.array([cbow.wv[i] for i in ls if i in vocab])
                            for ls in data])

    # words = [word for word in words if word in cbow.wv.index_to_key]
    for word in vocab:
        # embedding[word] = cbow.wv[word]
        embeddings.append(np.mean(cbow.wv[word], axis=0))
        embedding_dict[word] = np.mean(cbow.wv[word], axis=0)

    X = embeddings

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

    # print(embeddings)
    # print(vocab_len)

    # X_train_vect_avg = []
    # for v in X_train_vect:
    #     if v.size:
    #         X_train_vect_avg.append(v.mean(axis=0))
    #     else:
    #         X_train_vect_avg.append(np.zeros(100, dtype=float))

    # X_test_vect_avg = []
    # for v in X_test_vect:
    #     if v.size:
    #         X_test_vect_avg.append(v.mean(axis=0))
    #     else:
    #         X_test_vect_avg.append(np.zeros(100, dtype=float))

    # # for i, v in enumerate(X_train_vect_avg):
    # #     print(len(data.iloc[i]), len(v))

    # x_0 = X_train_vect_avg[0]
    # num_files_per_class = 100
    # y = [1] * num_files_per_class + [0] * num_files_per_class
    # w = np.zeros(X_train_vect_avg.shape[1])
    # x_0_dense = x_0.todense()
    # x_0.dot(w)

    # w,b = sgd_for_lr_with_ce(X_train_vect_avg, y)
    # w

    # sorted_vocab = sorted([(k,v) for k,v in enumerate(embedding_dict)],key=lambda x:x[1])
    # sorted_vocab = [a for (a,b) in sorted_vocab]

    # sorted_words_weights = sorted([x for x in zip(sorted_vocab, w)], key=lambda x:x[1])
    # sorted_words_weights[-50:]

    # preds = predict_y_lr(w,b,X_train_vect_avg)

    # preds

    # w,b = sgd_for_lr_with_ce(X_train_vect_avg, y, num_passes=10)
    # y_pred = predict_y_lr(w,b,X_train_vect_avg)
    # print(classification_report(y, y_pred))

    # # compute for dev set
    # pos_dev_files = glob.glob('aclImdb/test/pos/*')
    # neg_dev_files = glob.glob('aclImdb/test/neg/*')
    # num_dev_files_per_class = 100
    # all_dev_files = pos_dev_files[:num_dev_files_per_class] + neg_dev_files[:num_dev_files_per_class]
    # # use the same vectorizer from before! otherwise features won't line up
    # # don't fit it again, just use it to transform!
    # # X_dev = vectorizer.transform(all_dev_files)
    # # y_dev = [1]* num_dev_files_per_class + [0]* num_dev_files_per_class
    # # # don't need new w and b, these are from our existing model
    # # y_dev_pred = predict_y_lr(w,b,X_dev)
    # # print(classification_report(y_dev, y_dev_pred))


def sgd_for_lr_with_ce(X, y, num_passes=5, learning_rate=0.1):

    num_data_points = X.shape[0]

    # Initialize theta -> 0
    num_features = X.shape[1]
    w = np.zeros(num_features)
    b = 0.0

    # repeat until done
    # how to define "done"? let's just make it num passes for now
    # we can also do norm of gradient and when it is < epsilon (something tiny)
    # we stop

    for current_pass in range(num_passes):

        # iterate through entire dataset in random order
        order = list(range(num_data_points))
        random.shuffle(order)
        for i in order:

            # compute y-hat for this value of i given y_i and x_i
            x_i = X[i]
            y_i = y[i]

            # need to compute based on w and b
            # sigmoid(w dot x + b)
            z = x_i.dot(w) + b
            y_hat_i = expit(z)

            # for each w (and b), modify by -lr * (y_hat_i - y_i) * x_i
            w = w - learning_rate * (y_hat_i - y_i) * x_i
            b = b - learning_rate * (y_hat_i - y_i)

    # return theta
    return w, b


def predict_y_lr(w, b, X, threshold=0.5):

    # use our matrix operation version of the logistic regression model
    # X dot w + b
    # need to make w a column vector so the dimensions line up correctly
    y_hat = X.dot(w.reshape((-1, 1))) + b

    # then just check if it's > threshold
    preds = np.where(y_hat > threshold, 1, 0)

    return preds


def main():
    parser = argparse.ArgumentParser(
        prog='word_embedding',
        description='This program will train a word embedding model using simple wikipedia.',
        epilog='To skip training the model and to use the saved model "word2vec.model", use the command --skip or -s.'
    )
    parser.add_argument('-s', '--skip', action='store_true')
    parser.add_argument('-e', '--extra', action='store_true')
    parser.add_argument('-b', '--bias', action='store_true')
    parser.add_argument('-c', '--compare', action='store_true')
    parser.add_argument('-t', '--text', action='store_true')

    args = parser.parse_args()
    skip_model = None
    cbow_model = None
    ud_model = None
    wiki_model = None
    if args.compare:
        if args.skip:
            # print("Skipping")
            cbow_model = Word2Vec.load("word2vec.model")
            skip_model = Word2Vec.load("skip2vec.model")
            ud_model = KeyedVectors.load("urban2vec.model")
            wiki_model = KeyedVectors.load("wiki2vec.model")
        elif args.extra:
            # print("Extra mode")
            cbow_model = Word2Vec.load("word2vec.model")
            skip_model = Word2Vec.load("skip2vec.model")
            wiki_model = KeyedVectors.load_word2vec_format("wiki-news-300d-1M-subwords.vec", binary=False)
            ud_model = KeyedVectors.load_word2vec_format("ud_basic.vec", binary=False)
            wiki_model.save("wiki2vec.model")
            ud_model.save("urban2vec.model")
        else:
            cbow_model, skip_model = train_embeddings()
            wiki_model = KeyedVectors.load_word2vec_format("wiki-news-300d-1M-subwords.vec", binary=False)
            ud_model = KeyedVectors.load_word2vec_format("ud_basic.vec", binary=False)
            wiki_model.save("wiki2vec.model")
            ud_model.save("urban2vec.model")
        compare_embeddings(cbow_model, skip_model, ud_model, wiki_model)
    if args.bias:
        if args.skip:
            # print("Skipping")
            cbow_model = Word2Vec.load("word2vec.model")
            skip_model = Word2Vec.load("skip2vec.model")
            ud_model = KeyedVectors.load("urban2vec.model")
            wiki_model = KeyedVectors.load("wiki2vec.model")
        elif args.extra:
            # print("Extra mode")
            cbow_model = Word2Vec.load("word2vec.model")
            skip_model = Word2Vec.load("skip2vec.model")
            wiki_model = KeyedVectors.load_word2vec_format("wiki-news-300d-1M-subwords.vec", binary=False)
            ud_model = KeyedVectors.load_word2vec_format("ud_basic.vec", binary=False)
            wiki_model.save("wiki2vec.model")
            ud_model.save("urban2vec.model")
        else:
            cbow_model, skip_model = train_embeddings()
            wiki_model = KeyedVectors.load_word2vec_format("wiki-news-300d-1M-subwords.vec", binary=False)
            ud_model = KeyedVectors.load_word2vec_format("ud_basic.vec", binary=False)
            wiki_model.save("wiki2vec.model")
            ud_model.save("urban2vec.model")
        quantify_bias(cbow_model, skip_model, ud_model, wiki_model)
    if args.text:
        if args.skip:
            # print("Skipping")
            cbow_model = Word2Vec.load("word2vec.model")
        else:
            cbow_model, skip_model = train_embeddings()

        text_classifier(cbow_model)
        # data, sents = get_data()
        # cbow_classifier(cbow_model, data, sents)

    # print("No errors?")


if __name__ == "__main__":
    main()
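sgd_for_lr_with_ce and predict_y_lr above implement plain NumPy logistic regression with the per-example update w <- w - lr * (y_hat - y) * x, where y_hat = sigmoid(w.x + b). A small self-contained sanity check on synthetic data, assuming only the functions defined in this file:

    import numpy as np

    # hedged sketch: two well-separated 2-D clusters with labels 1 and 0
    rng = np.random.default_rng(0)
    X = np.vstack([rng.normal(2.0, 0.5, (50, 2)), rng.normal(-2.0, 0.5, (50, 2))])
    y = [1] * 50 + [0] * 50
    w, b = sgd_for_lr_with_ce(X, y, num_passes=10)
    preds = predict_y_lr(w, b, X)                   # (100, 1) array of 0/1 predictions
    print((preds.ravel() == np.array(y)).mean())    # accuracy, should be close to 1.0

Note that predict_y_lr thresholds the raw score X.w + b rather than its sigmoid, so the 0.5 threshold acts as a small margin on the logit rather than a probability cutoff.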