|
from typing import List, Tuple

import json

import numpy as np
from tqdm import tqdm

import question_categorizer as qc
from tfidf_model import NLPModel

class QuizBowlModel: |

    def __init__(self, clear=False):
        """
        Load your model(s) and whatever else you need in this function.

        Do NOT load your models or resources in the guess_and_buzz() function,
        as it will increase latency severely.
        """
        self.categories = ['Geography', 'Religion', 'Philosophy', 'Trash',
                           'Mythology', 'Literature', 'Science', 'Social Science',
                           'History', 'Current Events', 'Fine Arts', 'ALL']
        # One tf-idf model slot per category; populated by load_tfidf_models().
        self.tfidf_models = [None for _ in range(len(self.categories))]
        # Classifier that maps a question to per-category confidence scores.
        self.qc_model = qc.TextClassificationModel.load_model("models/categorizer")

        self.load_tfidf_models(clear=clear)
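
    # Expected on-disk layout, inferred from the paths used in this class:
    #   models/categorizer           -- the saved question-category classifier
    #   models/<Category>_tfidf.pkl  -- one pickled tf-idf model per category,
    #                                   including the catch-all 'ALL' model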

    def guess_and_buzz(self, question_text: List[str]) -> List[Tuple[str, bool]]:
        """
        This function accepts a list of question strings, and returns a list of
        tuples containing strings representing the guess and corresponding
        booleans representing whether or not to buzz.

        So, guess_and_buzz(["This is a question"]) should return [("answer", False)].

        If you are using a deep learning model, try to use batched prediction
        instead of iterating with a for loop.
        """
        guesses = []
        curr_question = ""

        # Successive strings are treated as incremental reveals of one question,
        # so each prediction sees all of the text revealed so far.
        for question in question_text:
            curr_question += question + "."
            confidence, answer = self.predict(curr_question)

            # Buzz only when the tanh-squashed confidence clears 0.5.
            buzz = confidence > 0.5
            guesses.append((answer, buzz))

        return guesses
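
    # Example: with trained models on disk, a call such as
    #   model.guess_and_buzz(["He wrote Moby-Dick"])
    # might return [("Herman_Melville", True)]  (illustrative output only).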

    def load_tfidf_models(self, clear=False):
        print("loading tfidf models")

        if not clear:
            # Load every saved per-category model. 'ALL' is the last entry of
            # self.categories, so the catch-all model is loaded by this loop too.
            for category in range(len(self.categories)):
                if self.tfidf_models[category] is None:
                    self.tfidf_models[category] = NLPModel().load(
                        f"models/{self.categories[category]}_tfidf.pkl")
        else:
            # Start from fresh, untrained models instead of loading from disk.
            for category in range(len(self.categories)):
                if self.tfidf_models[category] is None:
                    self.tfidf_models[category] = NLPModel()

    def train(self, data):
        # Bucket each training example under every category it is tagged with.
        training_data = [[] for _ in range(len(self.categories))]

        with tqdm(total=len(data)) as pbar:
            for data_point in data:
                text = data_point["text"]
                answer = data_point["answer"]
                categories = data_point["category"]

                for category in categories:
                    category_ind = self.categories.index(category)
                    training_data[category_ind].append({"text": text, "answer": answer})

                pbar.update(1)

        # Train, save, and unload one category model at a time to limit memory use.
        for ind, category_data in enumerate(training_data):
            self.tfidf_models[ind].process_data(category_data)
            self.tfidf_models[ind].train_model()
            self.tfidf_models[ind].save(f"models/{self.categories[ind]}_tfidf.pkl")
            self.tfidf_models[ind] = None
            training_data[ind] = []
|
|
|
|
print("Training complete.") |
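
    # Each training record is expected to look like the following, inferred
    # from the fields train() reads (category values must match self.categories):
    #   {"text": "...question text...", "answer": "...", "category": ["History", "ALL"]}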

    def predict(self, input_data, confidence_threshold=1.5):
        # Per-category scores; assumed to be a 2-D tensor of shape
        # (1, num_categories), which the nonzero()[:, 1] indexing relies on.
        category_confidences = self.qc_model.predict(input_data)

        # Indices of every category whose score clears the threshold.
        confident_indices = (category_confidences > confidence_threshold).nonzero()[:, 1]

        # Ask each confident category's tf-idf model and keep the best answer.
        # NOTE: if no category clears the threshold, this returns (0.0, None)
        # and the guess is withheld.
        max_confidence = 0
        max_answer = None
        for category in confident_indices:
            confidence, answer = self.tfidf_models[category].predict(input_data)
            if confidence > max_confidence:
                max_confidence = confidence
                max_answer = answer

        # Squash the raw tf-idf score into [0, 1) so callers can threshold it.
        return (np.tanh(max_confidence), max_answer)

    def evaluate(self, input_data):
        correct = 0
        count = 0

        with tqdm(total=len(input_data)) as pbar:
            for data_point in input_data:
                count += 1
                text = data_point["text"]
                answer = data_point["answer"]

                answer_predict = self.predict(text)[1]
                if answer == answer_predict:
                    correct += 1

                # Report a rolling accuracy every ten questions.
                if count % 10 == 0:
                    print(f'rolling average: {correct / count}')

                pbar.update(1)

        accuracy = correct / len(input_data)
        return accuracy
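

# Minimal usage sketch (assumes trained models already exist under models/):
#
#   model = QuizBowlModel()
#   for answer, buzz in model.guess_and_buzz(["This author wrote Hamlet"]):
#       print(answer, buzz)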


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--data', type=str)
    parser.add_argument('--model', type=str)
    parser.add_argument('--predict', type=str)
    parser.add_argument('--clear', action='store_true')
    parser.add_argument('--evaluate', type=str)

    flags = parser.parse_args()

    # --clear starts from fresh, untrained models instead of loading saved ones.
    model = QuizBowlModel(clear=flags.clear)

    if flags.data:
        # Load the training examples from the given JSON file once.
        with open(flags.data, 'r') as data_file:
            data_json = json.load(data_file)

        model.train(data_json)

    if flags.model:
        # NOTE: QuizBowlModel does not define load(); this call is kept from the
        # original interface and will fail until such a method is added.
        model.load(flags.model)

    if flags.predict:
        print(model.predict(flags.predict))

    if flags.evaluate:
        with open(flags.evaluate, 'r') as data_file:
            data_json = json.load(data_file)
        print(f'accuracy: {model.evaluate(data_json)}')
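
    # Typical invocations (hypothetical file and path names):
    #   python quizbowl_model.py --data data/train.json --clear
    #   python quizbowl_model.py --predict "This poet wrote The Raven"
    #   python quizbowl_model.py --evaluate data/dev.json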
|
|