File size: 8,123 Bytes
a3fa0c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
from typing import List, Tuple
import nltk
import sklearn
import question_categorizer as qc
from question_categorizer import TextClassificationModel
from tfidf_model import NLPModel
import tfidf_model
import transformers
import numpy as np
import pandas as pd
import json
from tqdm import tqdm
from collections import defaultdict





class QuizBowlModel:
    """Answer quiz-bowl questions with per-category TF-IDF models.

    A categorizer (``self.qc_model``) scores a question, every category whose
    score clears a threshold is asked for an answer through its own
    ``NLPModel``, and the most confident answer is returned.
    """

    # guess_and_buzz() buzzes when predict()'s tanh-squashed confidence
    # exceeds this value.
    BUZZ_THRESHOLD = 0.5

    def __init__(self, clear=False):
        """Load the categorizer and populate one TF-IDF model per category.

        All loading happens here rather than in guess_and_buzz(), so that
        answering questions does not pay the loading latency.

        Args:
            clear: When true, start from newly constructed ``NLPModel``
                instances instead of loading saved ones from ``models/``.
        """
        self.categories = ['Geography', 'Religion', 'Philosophy', 'Trash','Mythology', 'Literature','Science', 'Social Science', 'History', 'Current Events', 'Fine Arts', 'ALL']
        self.tfidf_models = [None] * len(self.categories)
        self.qc_model = qc.TextClassificationModel.load_model("models/categorizer")

        self.load_tfidf_models(clear=clear)

    def guess_and_buzz(self, question_text: List[str]) -> List[Tuple[str, bool]]:
        """Return a ``(guess, buzz)`` tuple for every question in the batch.

        Each question is predicted on its own; earlier questions in the list
        do not influence later ones.

        So, guess_and_buzz(["This is a question"]) should return
        [("answer", False)].

        Args:
            question_text: The question strings to answer.

        Returns:
            One ``(guess, buzz)`` tuple per question, in input order. ``guess``
            is the empty string when predict() found no answer; ``buzz`` is
            true when the confidence exceeds ``BUZZ_THRESHOLD``.
        """
        results = []

        for question in question_text:
            confidence, answer = self.predict(question)

            # predict() yields None when no category produced an answer; keep
            # the declared ``str`` type for callers.
            guess = "" if answer is None else answer

            # bool() turns a possible numpy bool into a plain Python bool.
            results.append((guess, bool(confidence > self.BUZZ_THRESHOLD)))

        return results

    def _model_path(self, index):
        """Return the file path of the saved TF-IDF model for category *index*."""
        return f"models/{self.categories[index]}_tfidf.pkl"

    def _tfidf_model(self, index):
        """Return the model for category *index*, loading it from disk if unset.

        train() resets each slot to None after saving, so a slot may be empty
        by the time a prediction is requested.
        """
        model = self.tfidf_models[index]
        if model is None:
            model = NLPModel().load(self._model_path(index))
            self.tfidf_models[index] = model
        return model

    def load_tfidf_models(self, clear=False):
        """Fill every empty slot of ``self.tfidf_models``.

        Slots that already hold a model are left untouched.

        Args:
            clear: When false, each empty slot is loaded from
                ``models/<category>_tfidf.pkl``. When true, each empty slot
                gets a newly constructed ``NLPModel`` and the resulting list
                is printed.
        """
        print("loading tfidf models")

        for index, current in enumerate(self.tfidf_models):
            if current is not None:
                continue
            if clear:
                self.tfidf_models[index] = NLPModel()
            else:
                # 'ALL' is the last category, so this loop covers it as well;
                # no separate load is needed.
                self.tfidf_models[index] = NLPModel().load(self._model_path(index))

        if clear:
            print(self.tfidf_models)

    def train(self, data):
        """Train and save one TF-IDF model per category.

        Every data point is added to the training set of each category it
        lists. Each category's model is then processed, trained, saved to
        ``models/<category>_tfidf.pkl`` and dropped from memory (its slot is
        set to None); predict() reloads models on demand.

        Args:
            data: Sized iterable of dicts with "text", "answer" and "category"
                keys. "category" is an iterable of category names; a single
                string is treated as one category.

        Raises:
            ValueError: If a data point names a category that is not in
                ``self.categories``.
        """
        # One bucket per category, aligned with self.categories by index.
        training_data = [[] for _ in self.categories]

        for data_point in tqdm(data, total=len(data)):
            text = data_point["text"]
            answer = data_point["answer"]
            categories = data_point["category"]

            # A bare string would otherwise be iterated character by character.
            if isinstance(categories, str):
                categories = [categories]

            for category in categories:
                category_ind = self.categories.index(category)
                training_data[category_ind].append({"text": text, "answer": answer})

        for ind, category_data in enumerate(training_data):
            tfidf = self.tfidf_models[ind]

            tfidf.process_data(category_data)
            tfidf.train_model()
            tfidf.save(self._model_path(ind))

            # Release the model and its training data before the next category.
            self.tfidf_models[ind] = None
            training_data[ind] = []

        print("TRAINING DATA")
        print("Training complete.")

    def predict(self, input_data, confidence_threshold=1.5):
        """Return the most confident answer among the confident categories.

        Args:
            input_data: Question text, passed unchanged to the categorizer and
                to each selected TF-IDF model.
            confidence_threshold: A category is consulted only when its
                categorizer score is strictly greater than this value.

        Returns:
            ``(confidence, answer)`` where ``confidence`` is ``np.tanh`` of the
            best TF-IDF confidence. When no category clears the threshold, or
            no model reports a confidence above 0, ``answer`` is None and
            ``confidence`` is ``np.tanh(0)``.
        """
        category_confidences = self.qc_model.predict(input_data)

        # assumes qc_model.predict returns a 2-D torch tensor (rows x
        # categories), so nonzero() yields (row, column) index pairs and
        # column 1 holds the category index -- TODO confirm against
        # question_categorizer.
        confident_indices = (category_confidences > confidence_threshold).nonzero()[:, 1]

        max_confidence = 0
        max_answer = None
        for category in confident_indices:
            confidence, answer = self._tfidf_model(int(category)).predict(input_data)

            if confidence > max_confidence:
                max_confidence = confidence
                max_answer = answer

        return (np.tanh(max_confidence), max_answer)

    def evaluate(self, input_data):
        """Return the fraction of data points whose answer is predicted exactly.

        A rolling average is printed after every tenth data point.

        Args:
            input_data: Sized iterable of dicts with "text" and "answer" keys.

        Returns:
            The number of exact matches divided by ``len(input_data)``, or 0.0
            when ``input_data`` is empty.
        """
        if len(input_data) == 0:
            return 0.0

        correct = 0

        with tqdm(total=len(input_data)) as pbar:
            for count, data_point in enumerate(input_data, start=1):
                answer_predict = self.predict(data_point["text"])[1]

                if data_point["answer"] == answer_predict:
                    correct += 1

                if count % 10 == 0:
                    average = correct / count
                    print(f'rolling average: {average}')

                pbar.update(1)

        return correct / len(input_data)
        
            
        

        
        
        
        
        
        
if __name__ == "__main__":
    # Command-line entry point: optionally train on a JSON data file, then
    # optionally predict a single question and/or evaluate on a JSON file.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--data', type=str)
    parser.add_argument('--model', type=str)
    parser.add_argument('--predict', type=str)
    parser.add_argument('--clear', action='store_true')
    parser.add_argument('--evaluate', type=str)

    flags = parser.parse_args()

    print(flags.clear)

    model = QuizBowlModel(clear=flags.clear)

    if flags.data:
        # --data is a single path: read it once and train once. Looping over
        # the string would repeat this for every character of the path.
        with open(flags.data, 'r', encoding='utf-8') as data_file:
            data_json = json.load(data_file)

        model.train(data_json)

    if flags.model:
        # NOTE(review): the QuizBowlModel class in this file defines no
        # load() method, so this call raises AttributeError unless one is
        # provided elsewhere -- confirm what --model is meant to do.
        model.load(flags.model)

    if flags.predict:
        print(model.predict(flags.predict))

    if flags.evaluate:
        with open(flags.evaluate, 'r', encoding='utf-8') as data_file:
            data_json = json.load(data_file)

        print(f'accuracy: {model.evaluate(data_json)}')