from typing import List, Tuple

import json

import numpy as np
from tqdm import tqdm

import question_categorizer as qc
from tfidf_model import NLPModel
class QuizBowlModel:
    def __init__(self, clear=False):
        """
        Load your model(s) and whatever else you need in this function.
        Do NOT load your model or resources in the guess_and_buzz() function,
        as it will increase latency severely.
        """
        self.categories = ['Geography', 'Religion', 'Philosophy', 'Trash', 'Mythology',
                           'Literature', 'Science', 'Social Science', 'History',
                           'Current Events', 'Fine Arts', 'ALL']

        # One TF-IDF model per category; the final 'ALL' entry is a catch-all model.
        self.tfidf_models = [None for _ in range(len(self.categories))]

        # Question categorizer used to route each question to the relevant TF-IDF model(s).
        self.qc_model = qc.TextClassificationModel.load_model("models/categorizer")

        self.load_tfidf_models(clear=clear)
    def guess_and_buzz(self, question_text: List[str]) -> List[Tuple[str, bool]]:
        """
        This function accepts a list of question strings, and returns a list of tuples
        containing strings representing the guess and corresponding booleans representing
        whether or not to buzz.

        So, guess_and_buzz(["This is a question"]) should return [("answer", False)]

        If you are using a deep learning model, try to use batched prediction instead of
        iterating using a for loop.
        """
        guesses = []
        curr_question = ""

        for question in question_text:
            # Treat the list as incremental clues of one question: accumulate the text
            # seen so far and re-predict on the growing question.
            curr_question += question + "."
            confidence, answer = self.predict(curr_question)

            # Buzz only when the (tanh-squashed) confidence clears 0.5, and return the
            # (guess, buzz) tuples in the order promised by the docstring.
            buzz = confidence > 0.5
            guesses.append((answer, buzz))

        return guesses
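
    # A minimal usage sketch: it assumes trained models already exist under models/,
    # and the question text and answer shown are illustrative only.
    #
    #   model = QuizBowlModel()
    #   model.guess_and_buzz(["This author of Hamlet also wrote Macbeth."])
    #   # -> e.g. [("William_Shakespeare", True)]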
    def load_tfidf_models(self, clear=False):
        print("loading tfidf models")

        if not clear:
            # Load the previously trained per-category models (the 'ALL' model included,
            # since it is the last entry of self.categories) from disk.
            for category in range(len(self.categories)):
                if self.tfidf_models[category] is None:
                    self.tfidf_models[category] = NLPModel().load(f"models/{self.categories[category]}_tfidf.pkl")
        else:
            # Start from fresh, untrained models (used when retraining with --clear).
            for category in range(len(self.categories)):
                if self.tfidf_models[category] is None:
                    self.tfidf_models[category] = NLPModel()
    def train(self, data):
        # Bucket the training examples by category; each index matches self.categories.
        training_data = [[] for _ in range(len(self.categories))]

        with tqdm(total=len(data)) as pbar:
            for data_point in data:
                text = data_point["text"]
                answer = data_point["answer"]
                categories = data_point["category"]

                for category in categories:
                    category_ind = self.categories.index(category)
                    training_data[category_ind].append({"text": text, "answer": answer})

                pbar.update(1)

        for ind, category_data in enumerate(training_data):
            self.tfidf_models[ind].process_data(category_data)

            # Train model
            self.tfidf_models[ind].train_model()

            # Save model, then unload it to keep memory usage down.
            self.tfidf_models[ind].save(f"models/{self.categories[ind]}_tfidf.pkl")
            self.tfidf_models[ind] = None
            training_data[ind] = []

        print("Training complete.")
    def predict(self, input_data, confidence_threshold=1.5):
        # Get per-category confidence scores from the question categorizer
        # (a 1 x num_categories score tensor, given the indexing below).
        category_confidences = self.qc_model.predict(input_data)

        # Indices of the categories whose confidence score is above the threshold.
        confident_indices = (category_confidences > confidence_threshold).nonzero()[:, 1]

        # Ask each confident category's TF-IDF model for an answer and keep the best one.
        # If no category clears the threshold, this returns (0.0, None).
        max_confidence = 0
        max_answer = None

        for category in confident_indices:
            confidence, answer = self.tfidf_models[category].predict(input_data)

            if confidence > max_confidence:
                max_confidence = confidence
                max_answer = answer

        # Squash the raw score into [0, 1) so guess_and_buzz can threshold it at 0.5.
        return (np.tanh(max_confidence), max_answer)
    def evaluate(self, input_data):
        correct = 0
        count = 0

        with tqdm(total=len(input_data)) as pbar:
            for data_point in input_data:
                count += 1
                text = data_point["text"]
                answer = data_point["answer"]

                answer_predict = self.predict(text)[1]
                if answer == answer_predict:
                    correct += 1

                # Report a rolling accuracy every 10 questions.
                if count % 10 == 0:
                    average = float(correct) / count
                    print(f'rolling average: {average}')

                pbar.update(1)

        accuracy = correct / len(input_data)
        return accuracy
if __name__ == "__main__":
    # Train a simple model on QB data, save it to a file
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--data', type=str)
    parser.add_argument('--model', type=str)
    parser.add_argument('--predict', type=str)
    parser.add_argument('--clear', action='store_true')
    parser.add_argument('--evaluate', type=str)

    flags = parser.parse_args()

    # --clear starts from fresh, untrained TF-IDF models instead of loading saved ones.
    model = QuizBowlModel(clear=flags.clear)
    if flags.data:
        # Load the training questions (a JSON list of {"text", "answer", "category"}
        # records) from the file given by --data.
        with open(flags.data, 'r') as data_file:
            data_json = json.load(data_file)

        model.train(data_json)
if flags.model:
model.load(flags.model)
if flags.predict:
print(model.predict(flags.predict))
if flags.evaluate:
with open(flags.evaluate, 'r') as data_file:
data_json = json.load(data_file)
print(f'accuracy: {model.evaluate(data_json)}')
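
# Example invocations (a sketch only; the script name and data file paths are hypothetical):
#   python quizbowl_model.py --clear --data train_questions.json
#   python quizbowl_model.py --evaluate dev_questions.json
#   python quizbowl_model.py --predict "For 10 points, name this author of Hamlet."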