import json
from collections import defaultdict
from typing import List, Tuple

import nltk
import numpy as np
import pandas as pd
import sklearn
import transformers
from huggingface_hub import hf_hub_download
from tqdm import tqdm

from .question_categorizer import TextClassificationModel
from .tfidf_model import NLPModel


class QuizBowlModel:
    """Quiz-bowl answerer that routes a question to per-category TF-IDF models.

    A question-categorizer model scores the input against every entry of
    ``self.categories``; each category whose score clears a threshold has its
    TF-IDF model queried, and the highest-confidence answer wins.
    """

    def __init__(self, clear=False):
        """
        Load your model(s) and whatever else you need in this function.

        Do NOT load your model or resources in the guess_and_buzz() function,
        as it will increase latency severely.

        Args:
            clear: When true, start from fresh (untrained) TF-IDF models
                instead of downloading and loading the saved ones.
        """
        self.categories = ['Geography', 'Religion', 'Philosophy', 'Trash',
                           'Mythology', 'Literature', 'Science',
                           'Social Science', 'History', 'Current Events',
                           'Fine Arts']
        # One slot per category, index-aligned with self.categories.
        self.tfidf_models = [None for _ in self.categories]
        self.qc_model = TextClassificationModel.load_model("models/categorizer")

        self.load_tfidf_models(clear=clear)

    def guess_and_buzz(self, question_text: List[str]) -> List[Tuple[str, bool]]:
        """
        This function accepts a list of question strings, and returns a list of
        tuples containing strings representing the guess and corresponding
        booleans representing whether or not to buzz.

        So, guess_and_buzz(["This is a question"]) should return
        [("answer", False)]

        If you are using a deep learning model, try to use batched prediction
        instead of iterating using a for loop.
        """
        guesses = []

        # NOTE(review): the running text is never reset, so every entry is
        # predicted together with all entries before it. That is right if the
        # list holds successive sentences of ONE question, but wrong if each
        # entry is an independent question -- confirm against the caller.
        curr_question = ""

        for question in question_text:
            curr_question += question + "."
            confidence, answer = self.predict(curr_question)

            # Buzz only when the calibrated confidence clears 0.5.
            buzz = bool(confidence > 0.5)

            # The contract is (guess, buzz); predict() yields None when no
            # category was confident, so fall back to an empty guess string.
            guess = answer if answer is not None else ""
            guesses.append((guess, buzz))

        return guesses

    def load_tfidf_models(self, clear=False):
        """Fill every empty slot of ``self.tfidf_models``.

        Args:
            clear: When false, download each category's pickle from the
                Hugging Face Hub into the working directory and load it.
                When true, create fresh ``NLPModel`` instances instead.
        """
        print("loading tfidf models")

        if not clear:
            REPO_ID = 'Backedman/TriviaAnsweringMachineREAL'
            for index, category in enumerate(self.categories):
                FILENAME = f"models/{category}_tfidf.pkl"
                hf_hub_download(repo_id=REPO_ID, filename=FILENAME, local_dir='.')

                if self.tfidf_models[index] is None:
                    # NOTE(review): relies on NLPModel.load() returning the
                    # loaded model -- confirm in tfidf_model.
                    self.tfidf_models[index] = NLPModel().load(FILENAME)
        else:
            for index in range(len(self.categories)):
                if self.tfidf_models[index] is None:
                    self.tfidf_models[index] = NLPModel()

        print(self.tfidf_models)

    def train(self, data):
        """Train and save one TF-IDF model per category.

        Args:
            data: Sized iterable of dicts with keys ``"text"``, ``"answer"``
                and ``"category"``. ``"category"`` is an iterable of category
                names; a single name given as a plain string is also accepted.

        Raises:
            ValueError: If a data point names a category that is not in
                ``self.categories``.

        Each model is saved to ``models/<category>_tfidf.pkl`` and its slot in
        ``self.tfidf_models`` is then set to ``None``, so the models must be
        loaded again before predicting.
        """
        # One bucket per category, index-aligned with self.categories.
        training_data = [[] for _ in self.categories]

        with tqdm(total=len(data)) as pbar:
            for data_point in data:
                text = data_point["text"]
                answer = data_point["answer"]
                categories = data_point["category"]

                # A bare string would otherwise be iterated per character.
                if isinstance(categories, str):
                    categories = [categories]

                for category in categories:
                    category_ind = self.categories.index(category)
                    training_data[category_ind].append(
                        {"text": text, "answer": answer})

                pbar.update(1)

        for ind, category_data in enumerate(training_data):
            model = self.tfidf_models[ind]
            model.process_data(category_data)
            model.train_model()
            model.save(f"models/{self.categories[ind]}_tfidf.pkl")

            # Drop the references so only one category's model and data are
            # held at a time.
            self.tfidf_models[ind] = None
            training_data[ind] = []

        print("TRAINING DATA")
        print("Training complete.")

    def predict(self, input_data, confidence_threshold=1.5):
        """Return ``(calibrated_confidence, answer)`` for a question text.

        Args:
            input_data: Question text handed to the categorizer and to the
                TF-IDF models.
            confidence_threshold: Minimum categorizer score for a category's
                TF-IDF model to be consulted.

        Returns:
            A tuple of the best raw confidence passed through ``tanh`` and
            ``confidence_eq``, and the matching answer. The answer is ``None``
            when no consulted model reported a confidence above 0.
        """
        category_confidences = self.qc_model.predict(input_data)

        # presumably a 2-D (batch, category) score tensor: column 1 of
        # nonzero() is then the category index -- confirm against
        # TextClassificationModel.predict.
        confident_indices = (
            category_confidences > confidence_threshold).nonzero()[:, 1]

        max_confidence = 0
        max_answer = None
        for category in confident_indices:
            # int() turns the array/tensor scalar into a plain list index.
            confidence, answer = self.tfidf_models[int(category)].predict(
                input_data)
            if confidence > max_confidence:
                max_confidence = confidence
                max_answer = answer

        return (self.confidence_eq(np.tanh(max_confidence)), max_answer)

    def evaluate(self, input_data):
        """Return exact-match accuracy of ``predict`` over ``input_data``.

        Args:
            input_data: Sized iterable of dicts with keys ``"text"`` and
                ``"answer"``.

        Returns:
            Fraction of data points whose predicted answer equals the
            reference answer; ``0.0`` for empty input.
        """
        if not input_data:
            # Nothing to score; avoids dividing by zero below.
            return 0.0

        correct = 0
        count = 0

        with tqdm(total=len(input_data)) as pbar:
            for data_point in input_data:
                print(count % 10)
                count += 1
                text = data_point["text"]
                answer = data_point["answer"]

                answer_predict = self.predict(text)[1]
                if answer == answer_predict:
                    correct += 1
                    print(correct)

                if count % 10 == 0:
                    average = float(correct) / count
                    print(f'rolling average: {average}')
                pbar.update(1)

        accuracy = correct / len(input_data)
        return accuracy

    def confidence_eq(self, x):
        """Map a raw confidence ``x`` through a piecewise calibration curve.

        Values below 0.5 map to 0, 0.5-0.6 and 0.6-0.7 use linear segments,
        and values from 0.7 up use a quadratic capped at 1.
        """
        if x < 0.5:
            return 0
        elif x < 0.6:
            return 2*x - 1
        elif x < 0.7:
            return 4*x - 2.2
        else:
            return min(1, 1.5*(x + 0.1)**2 - 0.36)


if __name__ == "__main__":
    # Train a simple model on QB data, save it to a file
    import argparse

    parser = argparse.ArgumentParser()
    # One or more JSON training files; a single path still works.
    parser.add_argument('--data', type=str, nargs='+')
    parser.add_argument('--model', type=str)
    parser.add_argument('--predict', type=str)
    parser.add_argument('--clear', action='store_true')
    parser.add_argument('--evaluate', type=str)

    flags = parser.parse_args()

    print(flags.clear)
    model = QuizBowlModel(clear=flags.clear)

    if flags.data:
        data_json = []
        # Load each file exactly once.
        for data_path in flags.data:
            with open(data_path, 'r', encoding='utf-8') as data_file:
                data_json.extend(json.load(data_file))

        # NOTE(review): train() leaves every TF-IDF slot set to None, so a
        # --predict/--evaluate in the same run has no loaded models unless
        # they are reloaded -- confirm the intended workflow.
        model.train(data_json)

    if flags.model:
        # NOTE(review): QuizBowlModel defines no load() in this file, so this
        # raises AttributeError unless the method is provided elsewhere.
        model.load(flags.model)

    if flags.predict:
        print(model.predict(flags.predict))

    if flags.evaluate:
        with open(flags.evaluate, 'r', encoding='utf-8') as data_file:
            data_json = json.load(data_file)
        print(f'accuracy: {model.evaluate(data_json)}')