Spaces:

smhavens
/

AnalogyArcade

Sleeping

App Files Files Community

smhavens committed on Dec 11, 2023

Commit

33e257e

•

1 Parent(s): b3ffc6e

Add files via upload

Browse files

Files changed (6) hide show

app_context.py +258 -0
checkpoint-17000/added_tokens.json +102 -0
checkpoint-17000/config.json +62 -0
checkpoint-17000/generation_config.json +6 -0
flan-t5-train.py +302 -0
word_embedding.py +617 -0

app_context.py ADDED Viewed

	@@ -0,0 +1,258 @@

+import gradio as gr
+import math
+import spacy
+from datasets import load_dataset
+from sentence_transformers import SentenceTransformer
+from sentence_transformers import InputExample
+from sentence_transformers import losses
+from sentence_transformers import util
+from transformers import pipeline, T5Tokenizer
+from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
+from transformers import TrainingArguments, Trainer, T5ForConditionalGeneration
+import torch
+import torch.nn.functional as F
+from torch.utils.data import DataLoader
+import numpy as np
+import evaluate
+import nltk
+from nltk.corpus import stopwords
+import subprocess
+import sys
+import random
+from textwrap import fill
+# Import-time setup. The statement below pip-installs the small English spaCy
+# model into the running interpreter; it is the script form of:
+# !pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
+subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl'])
+# tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+# Path handed to from_pretrained() in get_model() for both model and tokenizer.
+model_base = "results/checkpoint-17000"
+# Fetch the NLTK stop-word corpus before stopwords.words() is called below.
+nltk.download('stopwords')
+# spaCy pipeline used by normalize() for lemmatisation.
+nlp = spacy.load("en_core_web_sm")
+# English stop words; consulted by normalize() and random_word().
+stops = stopwords.words("english")
+# Roman-numeral fragments, one inner tuple per decimal place (units, tens,
+# hundreds, thousands), first in upper case and then in lower case.
+# NOTE(review): this is a tuple of tuples, so `x in ROMAN_CONSTANTS` tests
+# against the inner tuples, not against the numeral strings themselves.
+ROMAN_CONSTANTS = (
+            ( "", "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX" ),
+            ( "", "X", "XX", "XXX", "XL", "L", "LX", "LXX", "LXXX", "XC" ),
+            ( "", "C", "CC", "CCC", "CD", "D", "DC", "DCC", "DCCC", "CM" ),
+            ( "", "M", "MM", "MMM", "",   "",  "-",  "",    "",     ""   ),
+            ( "", "i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix" ),
+            ( "", "x", "xx", "xxx", "xl", "l", "lx", "lxx", "lxxx", "xc" ),
+            ( "", "c", "cc", "ccc", "cd", "d", "dc", "dcc", "dccc", "cm" ),
+            ( "", "m", "mm", "mmm", "",   "",  "-",  "",    "",     ""   ),
+        )
+# answer = "Pizza"
+# Mutable game state shared through `global` statements in the functions below.
+# `guesses` collects the raw guesses; `return_guesses` collects the formatted
+# lines shown in the "previous guesses" panel.
+guesses = []
+return_guesses = []
+# Initial puzzle values; generate_prompt() overwrites all four.
+answer = "Moon"
+word1 = "Black"
+word2 = "White"
+word3 = "Sun"
+# Example analogies; generate_prompt() prepends one of them to the model query.
+# NOTE(review): three entries end in "as" without a trailing space, unlike the
+# others.
+base_prompts = ["Sun is to Moon as ", "Black is to White as ", "Atom is to Element as",
+                "Athens is to Greece as ", "Cat is to Dog as ", "Robin is to Bird as",
+                "Hunger is to Ambition as "]
+#Mean Pooling - Take attention mask into account for correct averaging
+def mean_pooling(model_output, attention_mask):
+    token_embeddings = model_output['token_embeddings'] #First element of model_output contains all token embeddings
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+def normalize(comment, lowercase, remove_stopwords):
+    if lowercase:
+        comment = comment.lower()
+    comment = nlp(comment)
+    lemmatized = list()
+    for word in comment:
+        lemma = word.lemma_.strip()
+        if lemma:
+            if not remove_stopwords or (remove_stopwords and lemma not in stops):
+                lemmatized.append(lemma)
+    return " ".join(lemmatized)
+# def tokenize_function(examples):
+#     return tokenizer(examples["text"])
+def compute_metrics(eval_pred):
+    logits, labels = eval_pred
+    predictions = np.argmax(logits, axis=-1)
+    metric = evaluate.load("accuracy")
+    return metric.compute(predictions=predictions, references=labels)
+def get_model():
+    global model_base
+    # last_checkpoint = "./results/checkpoint-22500"
+    finetuned_model = T5ForConditionalGeneration.from_pretrained(model_base)
+    tokenizer = T5Tokenizer.from_pretrained(model_base)
+    # model = SentenceTransformer(model_base)
+    gpu_available = torch.cuda.is_available()
+    device = torch.device("cuda" if gpu_available else "cpu")
+    finetuned_model = finetuned_model.to(device)
+    return finetuned_model, tokenizer
+def cosine_scores(model, sentence):
+    global word1
+    global word2
+    global word3
+    # sentence1 = f"{word1} is to {word2} as"
+    embeddings1 = model.encode(sentence, convert_to_tensor=True)
+def embeddings(model, sentences, tokenizer):
+    global word1
+    global word2
+    global word3
+    global model_base
+    gpu_available = torch.cuda.is_available()
+    device = torch.device("cuda" if gpu_available else "cpu")
+    # device = torch.device('cuda:0')
+    # embeddings = model.encode(sentences)
+    question = "Please answer to this question: " + sentences
+    inputs = tokenizer(question, return_tensors="pt")
+    print(inputs)
+    # print(inputs.device)
+    print(model.device)
+    print(inputs['input_ids'].device)
+    print(inputs['attention_mask'].device)
+    inputs['attention_mask'] = inputs['attention_mask'].to(device)
+    inputs['input_ids'] = inputs['input_ids'].to(device)
+    outputs = model.generate(**inputs)
+    answer = tokenizer.decode(outputs[0])
+    answer = answer[6:-4]
+    # print(fill(answer, width=80))
+    print("ANSWER IS", answer)
+    return answer
+def random_word(model, tokenizer):
+    global model_base
+    vocab = tokenizer.get_vocab()
+    # with open(model_base + '/vocab.txt', 'r') as file:
+    line = ""
+    # content = file.readlines()
+    length = tokenizer.vocab_size
+    # print(vocab)
+    while line == "":
+        rand_line = random.randrange(0, length)
+        # print("TRYING TO FIND", rand_line, "OUT OF", length, "WITH VOCAB OF TYPE", type(vocab))
+        for word, id in vocab.items():
+            if id == rand_line and word[0].isalpha() and word not in stops and word not in ROMAN_CONSTANTS:
+        # if vocab[rand_line][0].isalpha() and vocab[rand_line][:-1] not in stops and vocab[rand_line][:-1] not in ROMAN_CONSTANTS:
+                line = word
+            elif id == rand_line:
+                print(f"{word} is not alpha or is a stop word")
+    # for num, aline in enumerate(file, 1997):
+    #     if random.randrange(num) and aline.isalpha():
+    #         continue
+    #     # elif not aline.isalpha():
+    #     line = aline
+    print(line)
+    return line
+def generate_prompt(model, tokenizer):
+    global word1
+    global word2
+    global word3
+    global answer
+    global base_prompts
+    word1 = random_word(model, tokenizer)
+    # word2 = random_word()
+    word2 = embeddings(model, f"{base_prompts[random.randint(0, len(base_prompts) - 1)]}{word1} is to ___.", tokenizer)
+    word3 = random_word(model, tokenizer)
+    sentence = f"{word1} is to {word2} as {word3} is to ___."
+    print(sentence)
+    answer = embeddings(model, sentence, tokenizer)
+    print("ANSWER IS", answer)
+    return f"# {word1} is to {word2} as {word3} is to ___."
+    # cosine_scores(model, sentence)
+def greet(name):
+    return "Hello " + name + "!!"
+def check_answer(guess:str):
+    global guesses
+    global answer
+    global return_guesses
+    global word1
+    global word2
+    global word3
+    model, tokenizer = get_model()
+    output = ""
+    protected_guess = guess
+    sentence = f"{word1} is to {word2} as [MASK] is to {guess}."
+    other_word = embeddings(model, sentence, tokenizer)
+    guesses.append(guess)
+    for guess in return_guesses:
+        output += ("- " + guess + "<br>")
+    # output = output[:-1]
+    prompt = f"{word1} is to {word2} as {word3} is to ___."
+    # print("IS", protected_guess, "EQUAL TO", answer, ":", protected_guess.lower() == answer.lower())
+    if protected_guess.lower() == answer.lower():
+        return_guesses.append(f"{protected_guess}: {word1} is to {word2} as {word3} is to {protected_guess}.")
+        output += f"<span style='color:green'>- {return_guesses[-1]}</span><br>"
+        new_prompt = generate_prompt(model, tokenizer)
+        return new_prompt, "Correct!", output
+    else:
+        return_guess = f"{protected_guess}: {word1} is to {word2} as {other_word} is to {protected_guess}."
+        return_guesses.append(return_guess)
+        output += ("- " + return_guess + " <br>")
+        return prompt, "Try again!", output
+def main():
+    global word1
+    global word2
+    global word3
+    global answer
+    # answer = "Moon"
+    global guesses
+    # num_rows, data_type, value, example, embeddings = training()
+    # sent_embeddings = embeddings()
+    model, tokenizer = get_model()
+    generate_prompt(model, tokenizer)
+    prompt = f"{word1} is to {word2} as {word3} is to ____"
+    print(prompt)
+    print("TESTING EMBEDDINGS")
+    with gr.Blocks() as iface:
+        mark_question = gr.Markdown(prompt)
+        with gr.Tab("Guess"):
+            text_input = gr.Textbox()
+            text_output = gr.Textbox()
+            text_button = gr.Button("Submit")
+        with gr.Accordion("Open for previous guesses"):
+            text_guesses = gr.Markdown()
+        # with gr.Tab("Testing"):
+        #     gr.Markdown(f"""The Embeddings are {sent_embeddings}.""")
+        text_button.click(check_answer, inputs=[text_input], outputs=[mark_question, text_output, text_guesses])
+    # iface = gr.Interface(fn=greet, inputs="text", outputs="text")
+    iface.launch()
+if __name__ == "__main__":
+    main()

checkpoint-17000/added_tokens.json ADDED Viewed

	@@ -0,0 +1,102 @@

+{
+  "<extra_id_0>": 32099,
+  "<extra_id_10>": 32089,
+  "<extra_id_11>": 32088,
+  "<extra_id_12>": 32087,
+  "<extra_id_13>": 32086,
+  "<extra_id_14>": 32085,
+  "<extra_id_15>": 32084,
+  "<extra_id_16>": 32083,
+  "<extra_id_17>": 32082,
+  "<extra_id_18>": 32081,
+  "<extra_id_19>": 32080,
+  "<extra_id_1>": 32098,
+  "<extra_id_20>": 32079,
+  "<extra_id_21>": 32078,
+  "<extra_id_22>": 32077,
+  "<extra_id_23>": 32076,
+  "<extra_id_24>": 32075,
+  "<extra_id_25>": 32074,
+  "<extra_id_26>": 32073,
+  "<extra_id_27>": 32072,
+  "<extra_id_28>": 32071,
+  "<extra_id_29>": 32070,
+  "<extra_id_2>": 32097,
+  "<extra_id_30>": 32069,
+  "<extra_id_31>": 32068,
+  "<extra_id_32>": 32067,
+  "<extra_id_33>": 32066,
+  "<extra_id_34>": 32065,
+  "<extra_id_35>": 32064,
+  "<extra_id_36>": 32063,
+  "<extra_id_37>": 32062,
+  "<extra_id_38>": 32061,
+  "<extra_id_39>": 32060,
+  "<extra_id_3>": 32096,
+  "<extra_id_40>": 32059,
+  "<extra_id_41>": 32058,
+  "<extra_id_42>": 32057,
+  "<extra_id_43>": 32056,
+  "<extra_id_44>": 32055,
+  "<extra_id_45>": 32054,
+  "<extra_id_46>": 32053,
+  "<extra_id_47>": 32052,
+  "<extra_id_48>": 32051,
+  "<extra_id_49>": 32050,
+  "<extra_id_4>": 32095,
+  "<extra_id_50>": 32049,
+  "<extra_id_51>": 32048,
+  "<extra_id_52>": 32047,
+  "<extra_id_53>": 32046,
+  "<extra_id_54>": 32045,
+  "<extra_id_55>": 32044,
+  "<extra_id_56>": 32043,
+  "<extra_id_57>": 32042,
+  "<extra_id_58>": 32041,
+  "<extra_id_59>": 32040,
+  "<extra_id_5>": 32094,
+  "<extra_id_60>": 32039,
+  "<extra_id_61>": 32038,
+  "<extra_id_62>": 32037,
+  "<extra_id_63>": 32036,
+  "<extra_id_64>": 32035,
+  "<extra_id_65>": 32034,
+  "<extra_id_66>": 32033,
+  "<extra_id_67>": 32032,
+  "<extra_id_68>": 32031,
+  "<extra_id_69>": 32030,
+  "<extra_id_6>": 32093,
+  "<extra_id_70>": 32029,
+  "<extra_id_71>": 32028,
+  "<extra_id_72>": 32027,
+  "<extra_id_73>": 32026,
+  "<extra_id_74>": 32025,
+  "<extra_id_75>": 32024,
+  "<extra_id_76>": 32023,
+  "<extra_id_77>": 32022,
+  "<extra_id_78>": 32021,
+  "<extra_id_79>": 32020,
+  "<extra_id_7>": 32092,
+  "<extra_id_80>": 32019,
+  "<extra_id_81>": 32018,
+  "<extra_id_82>": 32017,
+  "<extra_id_83>": 32016,
+  "<extra_id_84>": 32015,
+  "<extra_id_85>": 32014,
+  "<extra_id_86>": 32013,
+  "<extra_id_87>": 32012,
+  "<extra_id_88>": 32011,
+  "<extra_id_89>": 32010,
+  "<extra_id_8>": 32091,
+  "<extra_id_90>": 32009,
+  "<extra_id_91>": 32008,
+  "<extra_id_92>": 32007,
+  "<extra_id_93>": 32006,
+  "<extra_id_94>": 32005,
+  "<extra_id_95>": 32004,
+  "<extra_id_96>": 32003,
+  "<extra_id_97>": 32002,
+  "<extra_id_98>": 32001,
+  "<extra_id_99>": 32000,
+  "<extra_id_9>": 32090
+}

checkpoint-17000/config.json ADDED Viewed

	@@ -0,0 +1,62 @@

+{
+  "_name_or_path": "google/flan-t5-base",
+  "architectures": [
+    "T5ForConditionalGeneration"
+  ],
+  "classifier_dropout": 0.0,
+  "d_ff": 2048,
+  "d_kv": 64,
+  "d_model": 768,
+  "decoder_start_token_id": 0,
+  "dense_act_fn": "gelu_new",
+  "dropout_rate": 0.1,
+  "eos_token_id": 1,
+  "feed_forward_proj": "gated-gelu",
+  "initializer_factor": 1.0,
+  "is_encoder_decoder": true,
+  "is_gated_act": true,
+  "layer_norm_epsilon": 1e-06,
+  "model_type": "t5",
+  "n_positions": 512,
+  "num_decoder_layers": 12,
+  "num_heads": 12,
+  "num_layers": 12,
+  "output_past": true,
+  "pad_token_id": 0,
+  "relative_attention_max_distance": 128,
+  "relative_attention_num_buckets": 32,
+  "task_specific_params": {
+    "summarization": {
+      "early_stopping": true,
+      "length_penalty": 2.0,
+      "max_length": 200,
+      "min_length": 30,
+      "no_repeat_ngram_size": 3,
+      "num_beams": 4,
+      "prefix": "summarize: "
+    },
+    "translation_en_to_de": {
+      "early_stopping": true,
+      "max_length": 300,
+      "num_beams": 4,
+      "prefix": "translate English to German: "
+    },
+    "translation_en_to_fr": {
+      "early_stopping": true,
+      "max_length": 300,
+      "num_beams": 4,
+      "prefix": "translate English to French: "
+    },
+    "translation_en_to_ro": {
+      "early_stopping": true,
+      "max_length": 300,
+      "num_beams": 4,
+      "prefix": "translate English to Romanian: "
+    }
+  },
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.35.2",
+  "use_cache": true,
+  "vocab_size": 32128
+}

checkpoint-17000/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "decoder_start_token_id": 0,
+  "eos_token_id": 1,
+  "pad_token_id": 0,
+  "transformers_version": "4.35.2"
+}

flan-t5-train.py ADDED Viewed

	@@ -0,0 +1,302 @@

+import gradio as gr
+import math
+from datasets import load_dataset
+from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
+from transformers import TrainingArguments, Trainer
+from transformers import T5Tokenizer, T5ForConditionalGeneration
+import torch
+import torch.nn.functional as F
+from torch.utils.data import DataLoader
+import numpy as np
+import evaluate
+import nltk
+from nltk.corpus import stopwords
+import subprocess
+import sys
+from transformers import T5Tokenizer, DataCollatorForSeq2Seq
+from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
+from transformers import DataCollatorWithPadding, DistilBertTokenizerFast
+from transformers import TrainingArguments
+from transformers import (
+    BertModel,
+    BertTokenizerFast,
+    Trainer,
+    EvalPrediction
+)
+# !pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl'])
+# tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+# nltk.download('stopwords')
+# nlp = spacy.load("en_core_web_sm")
+# stops = stopwords.words("english")
+# Punkt data is needed by nltk.sent_tokenize(), which compute_metrics() calls.
+nltk.download("punkt", quiet=True)
+# ROUGE scorer shared by every compute_metrics() call.
+metric = evaluate.load("rouge")
+# Global Parameters (consumed by Seq2SeqTrainingArguments below)
+L_RATE = 3e-4
+BATCH_SIZE = 8
+PER_DEVICE_EVAL_BATCH = 4
+WEIGHT_DECAY = 0.01
+SAVE_TOTAL_LIM = 3
+NUM_EPOCHS = 10
+# Set up training arguments: checkpoints are written under ./results, evaluation
+# runs once per epoch and uses generate() so metrics see decoded text.
+training_args = Seq2SeqTrainingArguments(
+   output_dir="./results",
+   evaluation_strategy="epoch",
+   learning_rate=L_RATE,
+   per_device_train_batch_size=BATCH_SIZE,
+   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
+   weight_decay=WEIGHT_DECAY,
+   save_total_limit=SAVE_TOTAL_LIM,
+   num_train_epochs=NUM_EPOCHS,
+   predict_with_generate=True,
+   push_to_hub=False
+)
+# Base checkpoint; this module-level tokenizer is the one used by
+# tokenize_function(), compute_metrics() and training()'s preprocessing.
+model_id = "google/flan-t5-base"
+tokenizer = T5Tokenizer.from_pretrained(model_id)
+# tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
+# metric = evaluate.load("accuracy")
+def tokenize_function(examples):
+    return tokenizer(examples["stem"], padding="max_length", truncation=True)
+#Mean Pooling - Take attention mask into account for correct averaging
+def mean_pooling(model_output, attention_mask):
+    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+# def compute_metrics(eval_pred):
+#     logits, labels = eval_pred
+#     predictions = np.argmax(logits, axis=-1)
+#     metric = evaluate.load("accuracy")
+#     return metric.compute(predictions=predictions, references=labels)
+def compute_metrics(eval_preds):
+   preds, labels = eval_preds
+   # decode preds and labels
+   labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+   decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
+   decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+   # rougeLSum expects newline after each sentence
+   decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
+   decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
+   result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
+   return result
+def training():
+    dataset_id = "tomasmcz/word2vec_analogy"
+    # dataset_id = "relbert/scientific_and_creative_analogy"
+    # dataset_sub = "Quadruples_Kmiecik_random_split"
+    print("GETTING DATASET")
+    dataset = load_dataset(dataset_id)
+    # dataset = dataset["train"]
+    # tokenized_datasets = dataset.map(tokenize_function, batched=True)
+    print(dataset)
+    print(f"- The {dataset_id} dataset has {dataset['train'].num_rows} examples.")
+    print(f"- Each example is a {type(dataset['train'][0])} with a {type(dataset['train'][0])} as value.")
+    print(f"- Examples look like this: {dataset['train'][0]}")
+    # for i in dataset["train"]:
+    #     print(i["AB"], "to", i["CD"], "is", i["label"])
+    dataset = dataset["train"].train_test_split(test_size=0.3)
+    # We prefix our tasks with "answer the question"
+    prefix = "Please answer this question: "
+    # Define the preprocessing function
+    # def preprocess_function(examples):
+    #     """Add prefix to the sentences, tokenize the text, and set the labels"""
+    #     # The "inputs" are the tokenized answer:
+    #     inputs = []
+    #     # print(examples)
+    #     # inputs = [prefix + doc for doc in examples["question"]]
+    #     for doc in examples['source']:
+    #         # print("THE DOC IS:", doc)
+    #         # print("THE DOC IS:", examples[i]['AB'], examples[i]['CD'], examples[i]['label'])
+    #         prompt = f"{prefix}map "
+    #         for item in doc:
+    #             prompt += f"{item}, and "
+    #         prompt = prompt[:-6]
+    #         inputs.append(prompt)
+    #     # inputs = [prefix + doc for doc in examples["question"]]
+    #     for indx, doc in enumerate(examples["target_random"]):
+    #         prompt = f" to "
+    #         for item in doc:
+    #             prompt += f"{item}, and "
+    #         prompt = prompt[:-6] + "."
+    #         inputs[indx] += prompt
+    #     model_inputs = tokenizer(inputs, max_length=128, truncation=True)
+    def preprocess_function(examples):
+        """Add prefix to the sentences, tokenize the text, and set the labels"""
+        # The "inputs" are the tokenized answer:
+        inputs = []
+        # print(examples)
+        # inputs = [prefix + doc for doc in examples["question"]]
+        for doc in examples['word_a']:
+            # print("THE DOC IS:", doc)
+            # print("THE DOC IS:", examples[i]['AB'], examples[i]['CD'], examples[i]['label'])
+            prompt = f"{prefix}{doc} is to "
+            inputs.append(prompt)
+        # inputs = [prefix + doc for doc in examples["question"]]
+        for indx, doc in enumerate(examples["word_b"]):
+            prompt = f"{doc} as "
+            inputs[indx] += prompt
+        for indx, doc in enumerate(examples["word_c"]):
+            prompt = f"{doc} is to ___."
+            inputs[indx] += prompt
+        model_inputs = tokenizer(inputs, max_length=128, truncation=True)
+        # print(examples["label"], type(examples["label"]))
+        # The "labels" are the tokenized outputs:
+        labels = tokenizer(text_target=examples["word_d"],
+                            max_length=512,
+                            truncation=True)
+        model_inputs["labels"] = labels["input_ids"]
+        return model_inputs
+    # Map the preprocessing function across our dataset
+    tokenized_dataset = dataset.map(preprocess_function, batched=True)
+    # train_examples = []
+    # train_data = dataset["test"]
+    # # For agility we only 1/2 of our available data
+    # n_examples = dataset["test"].num_rows // 2
+    # for i in range(n_examples):
+    #     example = train_data[i]
+    #     temp_word_1 = example["stem"][0]
+    #     temp_word_2 = example["stem"][1]
+    #     temp_word_3 = example["choice"][example["answer"]][0]
+    #     temp_word_4 = example["choice"][example["answer"]][1]
+    #     comp1 = f"{temp_word_1} to {temp_word_2}"
+    #     comp2 = f"{temp_word_3} to {temp_word_4}"
+    #     # example_opposite = dataset_clean[-(i)]
+    #     # print(example["text"])
+    #     train_examples.append(InputExample(texts=[comp1, comp2]))
+    # train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=25)
+    print("END DATALOADER")
+    # print(train_examples)
+    embeddings = finetune(tokenized_dataset)
+    return 0
+def finetune(dataset):
+    # model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+    # model_id = "sentence-transformers/all-MiniLM-L6-v2"
+    model_id = "google/flan-t5-base"
+    # model_id = "distilbert-base-uncased"
+    # tokenizer = DistilBertTokenizerFast.from_pretrained(model_id)
+    tokenizer = T5Tokenizer.from_pretrained(model_id)
+    model = T5ForConditionalGeneration.from_pretrained(model_id)
+    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
+    device = torch.device('cuda:0')
+    model = model.to(device)
+    # training_args = TrainingArguments(output_dir="test_trainer")
+    # USE THIS LINK
+    # https://huggingface.co/blog/how-to-train-sentence-transformers
+    # train_loss = losses.MegaBatchMarginLoss(model=model)
+    # ds_train, ds_valid = dataset.train_test_split(test_size=0.2, seed=42)
+    print("BEGIN FIT")
+    trainer = Seq2SeqTrainer(
+        model=model,
+        args=training_args,
+        train_dataset=dataset["train"],
+        eval_dataset=dataset["test"],
+        # evaluation_strategy="no"
+        tokenizer=tokenizer,
+        data_collator=data_collator,
+        compute_metrics=compute_metrics
+        )
+    # model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10)
+    trainer.train()
+    # model.save("flan-analogies")
+    # model.save_to_hub("smhavens/bert-base-analogies")
+    # accuracy = compute_metrics(eval, metric)
+    return 0
+def greet(name):
+    return "Hello " + name + "!!"
+def check_answer(guess:str):
+    global guesses
+    global answer
+    guesses.append(guess)
+    output = ""
+    for guess in guesses:
+        output += ("- " + guess + "\n")
+    output = output[:-1]
+    if guess.lower() == answer.lower():
+        return "Correct!", output
+    else:
+        return "Try again!", output
+def main():
+    print("BEGIN")
+    word1 = "Black"
+    word2 = "White"
+    word3 = "Sun"
+    global answer
+    answer = "Moon"
+    global guesses
+    training()
+    # prompt = f"{word1} is to {word2} as {word3} is to ____"
+    # with gr.Blocks() as iface:
+    #     gr.Markdown(prompt)
+    #     with gr.Tab("Guess"):
+    #         text_input = gr.Textbox()
+    #         text_output = gr.Textbox()
+    #         text_button = gr.Button("Submit")
+    #     with gr.Accordion("Open for previous guesses"):
+    #         text_guesses = gr.Textbox()
+    #     with gr.Tab("Testing"):
+    #         gr.Markdown(f"""Number of rows in dataset is {num_rows}, with each having type {data_type} and value {value}.
+    #                     An example is {example}.
+    #                     The Embeddings are {embeddings}.""")
+    #     text_button.click(check_answer, inputs=[text_input], outputs=[text_output, text_guesses])
+    # # iface = gr.Interface(fn=greet, inputs="text", outputs="text")
+    # iface.launch()
+if __name__ == "__main__":
+    main()

word_embedding.py ADDED Viewed

	@@ -0,0 +1,617 @@

+from datasets import load_dataset
+import shutil
+import json
+from collections import defaultdict
+import multiprocessing
+import gensim
+from sklearn.metrics import classification_report
+from gensim import corpora
+from gensim.test.utils import common_texts
+from gensim.models import Word2Vec
+from gensim.models import KeyedVectors
+from gensim.models import fasttext
+from gensim.test.utils import datapath
+from wefe.datasets import load_bingliu
+from wefe.metrics import RNSB
+from wefe.query import Query
+from wefe.word_embedding_model import WordEmbeddingModel
+from wefe.utils import plot_queries_results, run_queries
+import pandas as pd
+import gensim.downloader as api
+import glob
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.ensemble import RandomForestClassifier
+from wefe.metrics import WEAT
+from wefe.datasets import load_weat
+from wefe.utils import run_queries
+from wefe.utils import plot_queries_results
+import random
+from scipy.special import expit
+import math
+import sys
+import os
+import argparse
+import nltk
+import scipy.sparse
+import numpy as np
+import string
+import io
+from sklearn.model_selection import train_test_split
+'''STEPS FOR CODE:
+1. Train word embeddings on Simple English Wikipedia;
+2. Compare these to other pre-trained embeddings;
+3. Quantify biases that exist in these word embeddings;
+4. Use your word embeddings as features in a simple text classifier;
+'''
+def load_vectors(fname):
+    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
+    n, d = map(int, fin.readline().split())
+    data = {}
+    # print("Hello", n, d)
+    for line in fin:
+        tokens = line.rstrip().split(' ')
+        data[tokens[0]] = map(float, tokens[1:])
+        # print(data)
+    print(data)
+    return data
+def train_embeddings():
+    '''TRAIN WORD EMBEDDINGS
+    This will be making use of the dataset from wikipedia and the first step'''
+    dataset = load_dataset("wikipedia", "20220301.simple")
+    cores = multiprocessing.cpu_count()
+    # check the first example of the training portion of the dataset :
+    # print(dataset['train'][0])
+    dataset_size = len(dataset)
+    ### BUILD VOCAB ###
+    # print(type(dataset["train"][0]))
+    vocab = set()
+    vocab_size = 0
+    count = 0
+    ## Generate vocab and split sentances and words?
+    data = []
+    for index, page in enumerate(dataset["train"]):
+        document = page["text"]
+        document = document.replace("\n", ". ")
+        # print(document)
+        for sent in document.split("."):
+            # print("Sentance:", sent)
+            new_sent = []
+            clean_sent =[s for s in sent if s.isalnum() or s.isspace()]
+            clean_sent = "".join(clean_sent)
+            for word in clean_sent.split(" "):
+                if len(word) > 0:
+                    new_word = word.lower()
+                    # print("Word:", new_word)
+                    if new_word[0] not in string.punctuation:
+                        new_sent.append(new_word)
+            if len(new_sent) > 0:
+                data.append(new_sent)
+                # print("New Sent:", new_sent)
+    for index, page in enumerate(dataset["train"]):
+        # print(page["text"])
+        # for text in page:
+        #     print(text)
+        text = page["text"]
+        clean_text = [s for s in text if s.isalnum() or s.isspace()]
+        clean_text = "".join(clean_text)
+        clean_text = clean_text.replace("\n", " ")
+        # text = text.replace('; ', ' ').replace(", ", " ").replace("\n", " ").replace(":", " ").replace(". ", " ").replace("! ", " ").replace("? ", " ").replace()
+        for word in clean_text.split(" "):
+            # print(word)
+            if word != "\n" and word != " " and word not in vocab:
+                vocab.add(word)
+                vocab_size += 1
+            # if index == 10:
+            #     break
+            # print(f"word #{index}/{count} is {word}")
+        count += 1
+    # print(f"There are {vocab_size} vocab words")
+    embeddings_model = Word2Vec(
+                     data,
+                     epochs= 10,
+                     window=10,
+                     vector_size= 50)
+    embeddings_model.save("word2vec.model")
+    skip_model = Word2Vec(
+                     data,
+                     epochs= 10,
+                     window=10,
+                     vector_size= 50,
+                     sg=1)
+    skip_model.save("skip2vec.model")
+    embeddings_model = Word2Vec.load("word2vec.model")
+    skip_model = Word2Vec.load("skip2vec.model")
+    # embeddings_model.train(dataset, total_examples=dataset_size, epochs=15)
+    # print(embeddings_model['train'])
+    # print(embeddings_model.wv["france"])
+    return embeddings_model, skip_model
+def get_data():
+    dataset = load_dataset("wikipedia", "20220301.simple")
+    cores = multiprocessing.cpu_count()
+    # check the first example of the training portion of the dataset :
+    # print(dataset['train'][0])
+    dataset_size = len(dataset)
+    ### BUILD VOCAB ###
+    # print(type(dataset["train"][0]))
+    vocab = set()
+    vocab_size = 0
+    count = 0
+    ## Generate vocab and split sentances and words?
+    data = []
+    num_sents = 0
+    for index, page in enumerate(dataset["train"]):
+        document = page["text"]
+        document = document.replace("\n", ". ")
+        # print(document)
+        for sent in document.split("."):
+            num_sents += 1
+            # print("Sentance:", sent)
+            new_sent = []
+            clean_sent =[s for s in sent if s.isalnum() or s.isspace()]
+            clean_sent = "".join(clean_sent)
+            for word in clean_sent.split(" "):
+                if len(word) > 0:
+                    new_word = word.lower()
+                    # print("Word:", new_word)
+                    if new_word[0] not in string.punctuation:
+                        new_sent.append(new_word)
+            if len(new_sent) > 0:
+                data.append(new_sent)
+                # print("New Sent:", new_sent)
+    return data, num_sents
+def compare_embeddings(cbow, skip, urban, fasttext):
+    """Print the top-2 nearest neighbours from each embedding for fixed probes.
+
+    Args:
+        cbow: Model whose vectors are reached through ``.wv``.
+        skip: Model whose vectors are reached through ``.wv``.
+        urban: Object queried directly via ``most_similar``.
+        fasttext: Object queried directly via ``most_similar``.
+    """
+    # (headline, positive words, negative words) for each probe, in print order.
+    probes = (
+        ("Most Similar to dog", ('dog',), ()),
+        ("\nMost Similar to Pizza - Pepperoni + Pretzel", ('pizza', 'pretzel'), ('pepperoni',)),
+        ("\nMost Similar to witch - woman + man", ('witch', 'man'), ('woman',)),
+        ("\nMost Similar to mayor - town + country", ('mayor', 'country'), ('town',)),
+        ("\nMost Similar to death", ('death',), ()),
+    )
+    for headline, plus_words, minus_words in probes:
+        print(headline)
+        # Fresh lists are handed to every call, one model at a time.
+        print("cbow", cbow.wv.most_similar(positive=list(plus_words), negative=list(minus_words), topn=2))
+        print("skip", skip.wv.most_similar(positive=list(plus_words), negative=list(minus_words), topn=2))
+        print("urban", urban.most_similar(positive=list(plus_words), negative=list(minus_words), topn=2))
+        print("fasttext", fasttext.most_similar(positive=list(plus_words), negative=list(minus_words), topn=2))
+def quantify_bias(cbow, skip, urban, fasttext):
+    """Run two WEAT bias queries (via WEFE) over the four embeddings.
+
+    The queries pit a list of religion terms against the WEAT ``arts`` and
+    ``weapons`` word sets, measured against career/family and male/female
+    attribute sets respectively.  The transposed, 2-decimal result table is
+    printed and then plotted.
+
+    Args:
+        cbow: Model whose vectors are reached through ``.wv``.
+        skip: Model whose vectors are reached through ``.wv``.
+        urban: Vectors wrapped directly in a ``WordEmbeddingModel``.
+        fasttext: Vectors wrapped directly in a ``WordEmbeddingModel``.
+    """
+    weat_wordset = load_weat()
+    # Wrap every embedding with the label shown in the results table.
+    cbow_wrapped = WordEmbeddingModel(cbow.wv, "CBOW")
+    skip_wrapped = WordEmbeddingModel(skip.wv, "skip-gram")
+    urban_wrapped = WordEmbeddingModel(urban, "urban dictionary")
+    fasttext_wrapped = WordEmbeddingModel(fasttext, "fasttext")
+    models = [cbow_wrapped, skip_wrapped, urban_wrapped, fasttext_wrapped]
+    # Target set shared by both queries.
+    religions = [
+        'christianity', 'catholicism', 'islam', 'judaism', 'hinduism',
+        'buddhism', 'mormonism', 'scientology', 'taoism', 'atheism',
+    ]
+    # Religion vs Art with respect to Career and Family.
+    religion_vs_art = Query(
+        [religions, weat_wordset['arts']],
+        [weat_wordset['career'], weat_wordset['family']],
+        ['Religion', 'Art'], ['Career', 'Family'])
+    # Religion vs Weapons with respect to Male and Female terms.
+    religion_vs_weapons = Query(
+        [religions, weat_wordset['weapons']],
+        [weat_wordset['male_terms'], weat_wordset['female_terms']],
+        ['Religion', 'Weapons'], ['Male terms', 'Female terms'])
+    queries = [religion_vs_art, religion_vs_weapons]
+    # Two preprocessing passes: words as given, then lower-cased.
+    preprocessing = {'preprocessors': [{}, {'lowercase': True}]}
+    raw_results = run_queries(
+        WEAT,
+        queries,
+        models,
+        metric_params=preprocessing,
+        warn_not_found_words=True,
+    )
+    wefe_results = raw_results.T.round(2)
+    print(wefe_results)
+    figure = plot_queries_results(wefe_results)
+    figure.show()
+def text_classifier(cbow):
+    """Train and report a positive/negative review classifier on mean embeddings.
+
+    Up to 1000 positive and 1000 negative files are read from
+    ``aclImdb/train``.  Each file becomes the mean of its in-vocabulary word
+    vectors (a d-dimensional "continuous bag of words"), a logistic-regression
+    model is fit with SGD, and a classification report computed on that same
+    training data is printed.
+
+    Args:
+        cbow: Trained model exposing ``wv`` keyed vectors.
+
+    Raises:
+        ValueError: If no file yields a usable document vector.
+    """
+    pos_train_files = glob.glob('aclImdb/train/pos/*')
+    neg_train_files = glob.glob('aclImdb/train/neg/*')
+    num_files_per_class = 1000
+    # Pair every file with its label up front so the two can never drift apart
+    # when a class has fewer files than requested or a document is skipped.
+    labelled_files = (
+        [(path, 1) for path in pos_train_files[:num_files_per_class]]
+        + [(path, 0) for path in neg_train_files[:num_files_per_class]]
+    )
+    vocab = set()
+    features = []
+    labels = []
+    for path, label in labelled_files:
+        doc_vector = avg_embeddings(path, cbow, vocab)
+        # avg_embeddings returns an empty list when no word is in vocabulary;
+        # such documents are dropped together with their label.
+        if len(doc_vector) > 0:
+            features.append(doc_vector)
+            labels.append(label)
+    if not features:
+        raise ValueError("no usable training documents found under aclImdb/train")
+    X = np.vstack(features)
+    y = np.array(labels)
+    w, b = sgd_for_lr_with_ce(X, y, num_passes=10)
+    # predict_y_lr yields a column vector; flatten it to line up with y.
+    y_pred = np.asarray(predict_y_lr(w, b, X)).ravel()
+    print(classification_report(y, y_pred))
+    # TODO: evaluate on aclImdb/test as well, reusing w and b from above.
+def avg_embeddings(doc, model, vocab: set):
+    """Average the word vectors of every in-vocabulary token in a text file.
+
+    Args:
+        doc: Path of the text file to read.
+        model: Trained embedding model exposing ``wv`` keyed vectors.
+        vocab: Set updated in place with every whitespace-separated token in
+            the file, including tokens the model does not know.
+
+    Returns:
+        The mean vector of the known tokens, or an empty list when the file
+        contains no token known to the model.
+    """
+    with open(doc, "r", encoding="utf-8") as file:
+        words = [word for line in file for word in line.split()]
+    vocab.update(words)
+    # key_to_index is a dict, so each membership test is O(1); index_to_key is
+    # a list and would be scanned in full for every token.
+    known = model.wv.key_to_index
+    words = [word for word in words if word in known]
+    if words:
+        return np.mean(model.wv[words], axis=0)
+    return []
+def sent_vec(sent, cbow):
+    """Return the mean embedding of the in-vocabulary words of a sentence.
+
+    Args:
+        sent: Iterable of word tokens.
+        cbow: Trained model exposing ``wv`` keyed vectors.
+
+    Returns:
+        A vector of length ``cbow.wv.vector_size``; all zeros when none of the
+        words is known to the model.
+    """
+    wv_res = np.zeros(cbow.wv.vector_size)
+    ctr = 0
+    for w in sent:
+        if w in cbow.wv:
+            ctr += 1
+            wv_res += cbow.wv[w]
+    # Divide by the number of vectors actually summed.  When nothing matched
+    # the zero vector is returned unchanged, which also avoids a 0/0.
+    if ctr:
+        wv_res = wv_res / ctr
+    return wv_res
+def spacy_tokenizer(sentence):
+    """Placeholder tokenizer: ignores ``sentence`` and always returns 0.
+
+    The pipeline sketched for this function (not implemented) was: parse the
+    sentence with spaCy, lemmatize, lower-case and strip each token, then drop
+    stop words and punctuation and return the remaining tokens.
+    """
+    # TODO: implement the pipeline above; until then callers receive 0 rather
+    # than a list of tokens.
+    return 0
+def cbow_classifier(cbow, data, num_sentances, y=None):
+    """Fit and evaluate a logistic-regression classifier on sentence embeddings.
+
+    Every sentence is represented by the mean of its in-vocabulary word
+    vectors (a zero vector when no word is known).  The data is split 80/20
+    with stratification, the model is fit with SGD on the training part, and a
+    classification report for the held-out part is printed.
+
+    Args:
+        cbow: Trained model exposing ``wv`` keyed vectors.
+        data: Sequence of tokenized sentences (each a list of words).
+        num_sentances: Unused; kept so existing callers keep working.
+        y: One 0/1 label per entry of ``data``.  Required, because ``data``
+            itself carries no labels.
+
+    Returns:
+        tuple: The fitted ``(w, b)`` weights and bias.
+
+    Raises:
+        ValueError: If ``y`` is missing or its length differs from ``data``.
+    """
+    if y is None:
+        raise ValueError("cbow_classifier needs one label per sentence; pass them as y=...")
+    if len(y) != len(data):
+        raise ValueError("y must contain exactly one label per sentence in data")
+    known = cbow.wv.key_to_index
+    # One row per sentence; rows stay zero for sentences without known words.
+    X = np.zeros((len(data), cbow.wv.vector_size))
+    for row, sentence in enumerate(data):
+        words = [word for word in sentence if word in known]
+        if words:
+            X[row] = np.mean(cbow.wv[words], axis=0)
+    y = np.asarray(y)
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
+    w, b = sgd_for_lr_with_ce(X_train, y_train, num_passes=10)
+    # predict_y_lr yields a column vector; flatten it to line up with y_test.
+    y_pred = np.asarray(predict_y_lr(w, b, X_test)).ravel()
+    print(classification_report(y_test, y_pred))
+    return w, b
+def sgd_for_lr_with_ce(X, y, num_passes=5, learning_rate=0.1):
+    """Fit logistic-regression parameters with stochastic gradient descent.
+
+    Args:
+        X: 2-D array with one row per example (needs ``.shape`` and row
+            indexing).
+        y: Per-example targets, indexable by row number.
+        num_passes: Number of full sweeps over the data; this is the only
+            stopping criterion.
+        learning_rate: Step size applied to every single-example update.
+
+    Returns:
+        tuple: ``(w, b)``, the weight vector and the scalar bias.
+    """
+    n_samples, n_features = X.shape[0], X.shape[1]
+    # Both parameters start at zero.
+    w = np.zeros(n_features)
+    b = 0.0
+    for _ in range(num_passes):
+        # Visit every example once per pass, in a fresh random order.
+        visit_order = list(range(n_samples))
+        random.shuffle(visit_order)
+        for idx in visit_order:
+            features = X[idx]
+            # Predicted probability: sigmoid(w . x + b).
+            prob = expit(features.dot(w) + b)
+            # Cross-entropy gradient is (prob - y) for b and (prob - y) * x for w.
+            step = learning_rate * (prob - y[idx])
+            w = w - step * features
+            b = b - step
+    return w, b
+def predict_y_lr(w,b,X,threshold=0.5):
+    """Predict 0/1 labels with a fitted logistic-regression model.
+
+    Args:
+        w: 1-D weight vector.
+        b: Scalar bias.
+        X: 2-D array with one row per example.
+        threshold: Probability above which an example is labelled 1.
+
+    Returns:
+        An ``(n, 1)`` integer array of 0/1 predictions.
+    """
+    # w is reshaped to a column vector so the matrix product lines up.
+    logits = X.dot(w.reshape((-1, 1))) + b
+    # The threshold is a probability, so map the logits through the sigmoid
+    # before comparing.
+    probs = expit(logits)
+    return np.where(probs > threshold, 1, 0)
+def _load_comparison_models(args):
+    """Return ``(cbow, skip, urban, wiki)`` as selected by --skip / --extra."""
+    if args.skip:
+        # Everything comes from previously saved models.
+        cbow_model = Word2Vec.load("word2vec.model")
+        skip_model = Word2Vec.load("skip2vec.model")
+        ud_model = KeyedVectors.load("urban2vec.model")
+        wiki_model = KeyedVectors.load("wiki2vec.model")
+        return cbow_model, skip_model, ud_model, wiki_model
+    if args.extra:
+        # Reuse the saved CBOW / skip-gram models.
+        cbow_model = Word2Vec.load("word2vec.model")
+        skip_model = Word2Vec.load("skip2vec.model")
+    else:
+        cbow_model, skip_model = train_embeddings()
+    # Convert the external vectors from text format and save them so a later
+    # --skip run can load them directly.
+    wiki_model = KeyedVectors.load_word2vec_format("wiki-news-300d-1M-subwords.vec", binary=False)
+    ud_model = KeyedVectors.load_word2vec_format("ud_basic.vec", binary=False)
+    wiki_model.save("wiki2vec.model")
+    ud_model.save("urban2vec.model")
+    return cbow_model, skip_model, ud_model, wiki_model
+def main():
+    """Parse the command line and run the requested embedding experiments.
+
+    Flags:
+        -c/--compare: print nearest-neighbour comparisons across embeddings.
+        -b/--bias: run the bias queries.
+        -t/--text: train the embedding-based text classifier.
+        -s/--skip: load saved models instead of training.
+        -e/--extra: load saved CBOW/skip-gram models but re-import the
+            external vector files.
+
+    Models are loaded or trained at most once per run, even when several
+    experiment flags are combined.
+    """
+    parser = argparse.ArgumentParser(
+        prog='word_embedding',
+        description='This program will train a word embedding model using simple wikipedia.',
+        epilog='To skip training the model and to used the saved model "word2vec.model", use the command --skip or -s.'
+    )
+    parser.add_argument('-s', '--skip', action='store_true')
+    parser.add_argument('-e', '--extra', action='store_true')
+    parser.add_argument('-b', '--bias', action='store_true')
+    parser.add_argument('-c', '--compare', action='store_true')
+    parser.add_argument('-t', '--text', action='store_true')
+    args = parser.parse_args()
+    cbow_model = None
+    if args.compare or args.bias:
+        cbow_model, skip_model, ud_model, wiki_model = _load_comparison_models(args)
+        if args.compare:
+            compare_embeddings(cbow_model, skip_model, ud_model, wiki_model)
+        if args.bias:
+            quantify_bias(cbow_model, skip_model, ud_model, wiki_model)
+    if args.text:
+        # Reuse the CBOW model obtained above when there is one.
+        if cbow_model is None:
+            if args.skip:
+                cbow_model = Word2Vec.load("word2vec.model")
+            else:
+                cbow_model, _ = train_embeddings()
+        text_classifier(cbow_model)
+# Script entry point: run the CLI only when this file is executed directly,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()