import json
import re

import numpy as np
from bs4 import BeautifulSoup
from tqdm import tqdm  # progress tracking

import question_categorizer as qc
from question_categorizer import TextClassificationModel  # kept so the class is in scope when the saved model is loaded

# Load the pretrained question categorizer.
qc_model = qc.TextClassificationModel.load_model("models/categorizer")

categories = ['Geography', 'Religion', 'Philosophy', 'Trash', 'Mythology',
              'Literature', 'Science', 'Social Science', 'History',
              'Current Events', 'Fine Arts']


def remove_newline(string):
    """Collapse runs of newlines into single spaces."""
    return re.sub('\n+', ' ', string)


def clean_text(text, answer):
    """Strip markup from the text and scrub the answer out of it."""
    # Remove HTML tags.
    text = re.sub(r'<.*?>', '', text)

    # Replace question marks with periods.
    text = text.replace('?', '.')

    # Keep only letters, periods, whitespace, and hyphens.
    text = re.sub(r'[^a-zA-Z.\s-]', '', text)

    # Remove the answer from the text so the model cannot learn to cheat.
    try:
        # Underscores in page titles stand in for spaces.
        processed_answer = answer.replace('_', ' ')

        # Drop parenthesized disambiguators, e.g. "Mercury (planet)".
        processed_answer = re.sub(r'\([^)]*\)', '', processed_answer)

        # Remove every occurrence of the answer, ignoring case.
        text = re.sub(re.escape(processed_answer), '', text,
                      flags=re.IGNORECASE)
    except Exception as e:
        print("An error occurred during text cleaning:", e)
        print("Text:", text)
        print("Answer:", answer)

    # Collapse extra whitespace.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()


def predict_categories(text):
    """Return the category labels whose model score clears the 1.5 threshold."""
    prediction = qc_model.predict(text)
    # Assuming predict() returns a (1, num_labels) score array: argwhere
    # yields [row, col] pairs, so take the column indices of every label
    # scoring at or above the threshold.
    label_indices = np.argwhere(prediction >= 1.5)[:, 1]
    question_category = [categories[ind] for ind in label_indices]
    question_category.append('ALL')
    return question_category


def process_data():
    # Jeopardy data is currently disabled; re-enable by loading the file:
    # with open("data/JEOPARDY_QUESTIONS1.json", "r") as f:
    #     jeopardy_data = json.load(f)
    jeopardy_data = []

    wiki_files = []
    question_files = ["qadata.json"]

    wiki_data = []
    question_data = []

    for file_path in wiki_files:
        with open('data/' + file_path, "r") as f:
            wiki_data.extend(json.load(f))

    for file_path in question_files:
        with open('data/' + file_path, "r") as f:
            question_data.extend(json.load(f))

    training_data = []

    # Process Jeopardy data.
    print("Processing Jeopardy data...")
    for entry in tqdm(jeopardy_data):
        question = entry["question"]
        answer = str(entry["answer"])

        # Keep only the top-level text nodes of the question markup.
        soup = BeautifulSoup(question, 'html.parser')
        clean_question = ''.join(soup.find_all(string=True, recursive=False))

        training_data.append({
            "text": clean_question,
            "answer": answer,
            "category": predict_categories(question),
        })

    # Process Wikipedia data.
    print("Processing Wikipedia data...")
    for entry in tqdm(wiki_data):
        page = str(entry["page"])
        text = entry["text"]

        if text == "":
            continue

        text = remove_newline(text)
        text = clean_text(text, page)

        training_data.append({
            "text": text,
            "answer": page,
            "category": predict_categories(text),
        })

    # Process miscellaneous question data.
    print("Processing Misc data...")
    for entry in tqdm(question_data):
        answer = str(entry["answer"])
        text = entry["text"]

        if text == "" or answer == "":
            continue

        text = remove_newline(text)
        text = clean_text(text, answer)

        try:
            question_category = predict_categories(text)
        except Exception:
            print("answer: " + str(answer))
            print("text: " + str(text))
            continue

        training_data.append({
            "text": text,
            "answer": answer,
            "category": question_category,
        })

    with open("data/training_data.json", "w") as f:
        json.dump(training_data, f, indent=4)


if __name__ == "__main__":
    process_data()
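# A minimal smoke test for clean_text, runnable without the model files.
# The inputs and expected outputs below are illustrative examples, not
# values taken from the actual datasets:
#
#   >>> clean_text("Who wrote <b>Hamlet</b>?", "William_Shakespeare")
#   'Who wrote Hamlet.'
#   >>> clean_text("Paris, the capital of France!", "Paris_(city)")
#   'the capital of France'
#
# The second case shows the answer scrub: "Paris_(city)" is normalized to
# "Paris" (underscores to spaces, parenthetical dropped) and removed from
# the text case-insensitively before whitespace is collapsed.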