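"""Build data/training_data.json.

Reads question/answer JSON files (and, optionally, Wikipedia page dumps and the
Jeopardy dataset) from data/, cleans each text, labels it with the categories
predicted by the pre-trained question categorizer, and writes the combined
entries to a single JSON training file.
"""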
import json
import re

import numpy as np
from bs4 import BeautifulSoup
from tqdm import tqdm  # progress bars for the processing loops

import question_categorizer as qc
# Imported so the saved categorizer model class resolves when it is loaded below.
from question_categorizer import TextClassificationModel

# Pre-trained question categorizer used to label every training entry.
qc_model = qc.TextClassificationModel.load_model("models/categorizer")

categories = ['Geography', 'Religion', 'Philosophy', 'Trash', 'Mythology', 'Literature',
              'Science', 'Social Science', 'History', 'Current Events', 'Fine Arts']

def remove_newline(string):
    # Collapse runs of newlines into single spaces.
    return re.sub(r'\n+', ' ', string)

def clean_text(text, answer):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Treat question marks as sentence-ending periods
    text = text.replace('?', '.')

    # Keep only letters, periods, whitespace, and hyphens
    text = re.sub(r'[^a-zA-Z.\s-]', '', text)

    # Remove the answer from the text so the model cannot simply read it back
    try:
        # Preprocess the answer: underscores become spaces
        processed_answer = answer.replace('_', ' ')

        # Strip any parenthesized qualifiers from the answer
        processed_answer = re.sub(r'\([^)]*\)', '', processed_answer)

        # Remove every occurrence of the processed answer, ignoring case
        text = re.sub(re.escape(processed_answer), '', text, flags=re.IGNORECASE)
    except Exception as e:
        print("An error occurred during text cleaning:", e)
        print("Text:", text)
        print("Answer:", answer)

    # Collapse extra whitespace
    text = re.sub(r'\s+', ' ', text)

    return text.strip()
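
# The three processing loops in process_data() repeat the same category-prediction
# logic, so it is consolidated here. This is a small sketch that assumes
# qc_model.predict() returns one score per entry in `categories` (either a 1-D
# array or a single-row 2-D array), which is what the indexing in the original
# loops implies.
def predict_categories(text):
    scores = np.ravel(qc_model.predict(text))
    # Indices of every category whose score clears the 1.5 threshold
    category_indices = np.argwhere(scores >= 1.5).flatten()
    question_category = [categories[int(ind)] for ind in category_indices]
    # Every entry is also tagged 'ALL' so it can be retrieved regardless of category
    question_category.append('ALL')
    return question_category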

def process_data():
    # The Jeopardy dataset can be re-enabled by loading it here:
    # with open("data/JEOPARDY_QUESTIONS1.json", "r") as f:
    #     jeopardy_data = json.load(f)
    jeopardy_data = []

    wiki_files = []

    question_files = ["qadata.json"]

    wiki_data = []
    question_data = []

    for file_path in wiki_files:
        with open('data/' + file_path, "r") as f:
            wiki_data.extend(json.load(f))

    for file_path in question_files:
        with open('data/' + file_path, "r") as f:
            question_data.extend(json.load(f))

    with open("data/training_data.json", "w") as f:
        training_data = []

        # Process Jeopardy data
        print("Processing Jeopardy data...")
        for entry in tqdm(jeopardy_data):
            question = entry["question"]
            answer = str(entry["answer"])

            # Preprocess the text
            soup = BeautifulSoup(question, 'html.parser')
            clean_question = ''.join(soup.findAll(text=True, recursive=False))
            
            question_category = []
            
            # Get category from qc_model
            prediction = qc_model.predict(question)
            predictions = np.argwhere(prediction >= 1.5)[1]
            
            for prediction_ind in predictions:
                # Store data in array with respective index
                question_category.append(categories[prediction_ind])
                
            question_category.append('ALL')
            
            

            training_entry = {
                "text": clean_question,
                "answer": answer,#,
                # Mohit, put categorizing code here
                "category": question_category
            }

            training_data.append(training_entry)

        # Process Wikipedia data
        print("Processing Wikipedia data...")
        for entry in tqdm(wiki_data):
            page = str(entry["page"])
            text = entry["text"]

            if text == "":
                continue

            text = remove_newline(text)
            text = clean_text(text, page)

            # Category labels predicted by the categorizer model
            question_category = predict_categories(text)

            training_entry = {
                "text": text,
                "answer": page,
                "category": question_category
            }

            training_data.append(training_entry)

        print("Processing Misc data...")
        for entry in tqdm(question_data):
            
            answer = str(entry["answer"])
            text = entry["text"]
            
            if(text == "" or answer == ""):
                continue
            
            text = remove_newline(text)
            text = clean_text(text, answer)
            
            question_category = []
            
            # Get category from qc_model
            try:
              prediction = qc_model.predict(text)
              predictions = np.argwhere(prediction >= 1.5)[1]
            except:
              print("answer: " + str(answer))
              print("text:" + str(text))
              continue
            
            for prediction_ind in predictions:
                # Store data in array with respective index
                question_category.append(categories[prediction_ind])
                
            question_category.append('ALL')
            


            training_entry = {
                "text": text,
                "answer": answer,
                # Mohit, put categorizing code here
                "category": question_category
            }
            
            training_data.append(training_entry)
            
            

        json.dump(training_data, f, indent=4)
        
if __name__ == "__main__":
    process_data()