# AnalogyArcade / analogy_train.py
import gradio as gr
import math
import spacy
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers import InputExample
from sentence_transformers import losses
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
import numpy as np
import evaluate
import nltk
from nltk.corpus import stopwords
import subprocess
import sys
from transformers import DataCollatorWithPadding
from transformers import (
    BertModel,
    BertTokenizerFast,
    EvalPrediction,
)
# !pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl'])
# tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# nltk.download('stopwords')
# nlp = spacy.load("en_core_web_sm")
# stops = stopwords.words("english")
# answer = "Pizza"
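# Module-level state: the running list of guesses and the current answer for the
# guessing game (updated by check_answer() and main()), plus a shared BERT
# tokenizer and accuracy metric.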
guesses = []
answer = "Pizza"
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
metric = evaluate.load("accuracy")
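# Tokenize the "stem" column of a batch of examples, padding and truncating to
# the model's maximum length (intended for use with datasets.map).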
def tokenize_function(examples):
return tokenizer(examples["stem"], padding="max_length", truncation=True)
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
token_embeddings = model_output[0] #First element of model_output contains all token embeddings
input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
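# Metric callback for a Hugging Face Trainer: turn logits into class
# predictions and score them against the labels with the accuracy metric.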
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
# def training():
# dataset_id = "relbert/analogy_questions"
# dataset_sub = "bats"
# print("GETTING DATASET")
# raw_dataset = load_dataset(dataset_id, dataset_sub)
# # data_metric = evaluate.load(dataset_id, dataset_sub)
# checkpoint = "bert-base-uncased"
# model = BertModel.from_pretrained(checkpoint)
# # dataset = dataset["train"]
# # tokenized_datasets = dataset.map(tokenize_function, batched=True)
# # print(raw_dataset)
# test_data = raw_dataset["test"]
# # print(test_data["stem"])
# all_answers = []
# for answer in raw_dataset["answer"]:
# answer = raw_dataset["choice"][answer]
# raw_dataset = raw_dataset.add_column("label", all_answers)
# print(raw_dataset)
# print(raw_dataset["label"])
# dataset = raw_dataset.map(
# lambda x: tokenizer(x["stem"], truncation=True),
# batched=True,
# )
# print(dataset)
# dataset = dataset.remove_columns(["stem", "answer", "choice"])
# dataset = dataset.rename_column("label", "labels")
# dataset = dataset.with_format("torch")
# training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
# print(dataset)
# # print(f"- The {dataset_id} dataset has {dataset.num_rows} examples.")
# # print(f"- Each example is a {type(dataset[0])} with a {type(dataset[0]['stem'])} as value.")
# # print(f"- Examples look like this: {dataset[0]}")
# # small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
# # small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
# # dataset = dataset["train"].map(tokenize_function, batched=True)
# # dataset.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "label"])
# # dataset.format['type']
# # tokenized_news = dataset.map(tokenize_function, batched=True)
# # model = AutoModelForSequenceClassification.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", num_labels=2)
# # print(dataset)
# # Choose the appropriate device based on availability (CUDA or CPU)
# # gpu_available = torch.cuda.is_available()
# # device = torch.device("cuda" if gpu_available else "cpu")
# # model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')
# # tokenized_datasets = dataset.map(tokenize_function, batched=True)
# # print(tokenized_datasets)
# # # small_train_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
# # # small_eval_dataset = tokenized_datasets["validation"].shuffle(seed=42).select(range(1000))
# # model = model.to(device)
# # model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
# # training_args = TrainingArguments(output_dir="test_trainer")
# trainer = Trainer(
# model=model,
# args=training_args,
# train_dataset=dataset["test"],
# eval_dataset=dataset["validation"],
# compute_metrics=compute_metrics,
# )
# output = trainer.train()
# # train_examples = []
# # train_data = dataset["train"]
# # # For agility we only 1/2 of our available data
# # n_examples = dataset["train"].num_rows // 2
# # for i in range(n_examples):
# # example = train_data[i]
# # # example_opposite = dataset_clean[-(i)]
# # # print(example["text"])
# # train_examples.append(InputExample(texts=[example['stem'], example]))
# # train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=25)
# # print("END DATALOADER")
# # # print(train_examples)
# # embeddings = finetune(train_dataloader)
# print(output)
# model.save("bert-analogies")
# model.save_to_hub("smhavens/bert-base-analogies")
# return output
# def finetune(train_dataloader):
# # model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
# model_id = "sentence-transformers/all-MiniLM-L6-v2"
# model = SentenceTransformer(model_id)
# device = torch.device('cuda:0')
# model = model.to(device)
# # training_args = TrainingArguments(output_dir="test_trainer")
# # USE THIS LINK
# # https://huggingface.co/blog/how-to-train-sentence-transformers
# train_loss = losses.BatchHardSoftMarginTripletLoss(model=model)
# print("BEGIN FIT")
# model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10)
# model.save("bert-analogies")
# model.save_to_hub("smhavens/bert-base-analogies")
# return 0
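# Build analogy sentence pairs from the relbert/analogy_questions "bats"
# configuration (the script draws them from the "test" split), wrap them as
# InputExamples, and hand a DataLoader over half of them to finetune().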
def training():
dataset_id = "relbert/analogy_questions"
dataset_sub = "bats"
print("GETTING DATASET")
dataset = load_dataset(dataset_id, dataset_sub)
# dataset = dataset["train"]
# tokenized_datasets = dataset.map(tokenize_function, batched=True)
print(f"- The {dataset_id} dataset has {dataset['test'].num_rows} examples.")
print(f"- Each example is a {type(dataset['test'][0])} with a {type(dataset['test'][0]['stem'])} as value.")
print(f"- Examples look like this: {dataset['test'][0]}")
train_examples = []
train_data = dataset["test"]
    # For speed, only half of the available data is used
n_examples = dataset["test"].num_rows // 2
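    # Each example provides a "stem" word pair, a "choice" list of candidate
    # pairs, and an "answer" index into that list; the stem and the correct
    # choice are each rendered as "<word1> to <word2>" and paired together.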
for i in range(n_examples):
example = train_data[i]
temp_word_1 = example["stem"][0]
temp_word_2 = example["stem"][1]
temp_word_3 = example["choice"][example["answer"]][0]
temp_word_4 = example["choice"][example["answer"]][1]
comp1 = f"{temp_word_1} to {temp_word_2}"
comp2 = f"{temp_word_3} to {temp_word_4}"
# example_opposite = dataset_clean[-(i)]
# print(example["text"])
train_examples.append(InputExample(texts=[comp1, comp2]))
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=25)
print("END DATALOADER")
# print(train_examples)
embeddings = finetune(train_dataloader)
return (dataset['test'].num_rows, type(dataset['test'][0]), type(dataset['test'][0]['stem']), dataset['test'][0], embeddings)
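# Fine-tune the sentence-transformers/all-MiniLM-L6-v2 model on the analogy
# pairs with MegaBatchMarginLoss and save the result locally as "bert-analogies".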
def finetune(train_dataloader):
# model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
model_id = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_id)
    # Fall back to CPU when no GPU is available instead of hard-coding cuda:0.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
# training_args = TrainingArguments(output_dir="test_trainer")
# USE THIS LINK
# https://huggingface.co/blog/how-to-train-sentence-transformers
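    # MegaBatchMarginLoss treats each (comp1, comp2) pair as a positive pair and,
    # per the sentence-transformers docs, mines the hardest in-batch negative;
    # it is designed for large batches, so the batch size of 25 used above may be
    # small for it.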
train_loss = losses.MegaBatchMarginLoss(model=model)
print("BEGIN FIT")
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10)
model.save("bert-analogies")
# model.save_to_hub("smhavens/bert-base-analogies")
# accuracy = compute_metrics(eval, metric)
return 0
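# Minimal demo handler, referenced only by the commented-out gr.Interface call in main().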
def greet(name):
return "Hello " + name + "!!"
def check_answer(guess: str):
    global guesses
    global answer
    guesses.append(guess)
    # Build the bulleted history of every guess without shadowing the parameter.
    output = "\n".join("- " + g for g in guesses)
    if guess.lower() == answer.lower():
        return "Correct!", output
    else:
        return "Try again!", output
def main():
print("BEGIN")
word1 = "Black"
word2 = "White"
word3 = "Sun"
global answer
answer = "Moon"
global guesses
num_rows, data_type, value, example, embeddings = training()
# prompt = f"{word1} is to {word2} as {word3} is to ____"
# with gr.Blocks() as iface:
# gr.Markdown(prompt)
# with gr.Tab("Guess"):
# text_input = gr.Textbox()
# text_output = gr.Textbox()
# text_button = gr.Button("Submit")
# with gr.Accordion("Open for previous guesses"):
# text_guesses = gr.Textbox()
# with gr.Tab("Testing"):
# gr.Markdown(f"""Number of rows in dataset is {num_rows}, with each having type {data_type} and value {value}.
# An example is {example}.
# The Embeddings are {embeddings}.""")
# text_button.click(check_answer, inputs=[text_input], outputs=[text_output, text_guesses])
# # iface = gr.Interface(fn=greet, inputs="text", outputs="text")
# iface.launch()
if __name__ == "__main__":
main()