"""Builds data/training_data.json: cleans question/answer/page text from
Jeopardy, Wikipedia, and miscellaneous QA sources, and tags each entry with
categories predicted by a pre-trained classifier."""

import json
import re

import numpy as np
from bs4 import BeautifulSoup
from tqdm import tqdm

import question_categorizer as qc
# Not referenced directly, but kept: load_model may need the class
# importable when it deserializes the saved model.
from question_categorizer import TextClassificationModel

qc_model = qc.TextClassificationModel.load_model("models/categorizer")
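
# Assumption (inferred from usage below, not from the model's docs):
# qc_model.predict returns a (1, len(categories)) score array, and a
# score >= 1.5 marks that category as predicted.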

# Label names in the index order used by the categorizer's predictions.
categories = ['Geography', 'Religion', 'Philosophy', 'Trash', 'Mythology',
              'Literature', 'Science', 'Social Science', 'History',
              'Current Events', 'Fine Arts']


def remove_newline(string):
    """Collapse runs of newlines into single spaces."""
    return re.sub(r'\n+', ' ', string)


def clean_text(text, answer):
    """Strip markup and stray characters, and blank out the answer so it
    cannot leak into the training text."""
    # Drop HTML tags, turn questions into statements, and keep only
    # letters, periods, whitespace, and hyphens.
    text = re.sub(r'<.*?>', '', text)
    text = text.replace('?', '.')
    text = re.sub(r'[^a-zA-Z.\s-]', '', text)

    try:
        # Normalize the answer (e.g. "Foo_(bar)" -> "Foo ") before removal.
        processed_answer = answer.replace('_', ' ')
        processed_answer = re.sub(r'\([^)]*\)', '', processed_answer)
        text = re.sub(re.escape(processed_answer), '', text, flags=re.IGNORECASE)
    except Exception as e:
        print("An error occurred during text cleaning:", e)
        print("Text:", text)
        print("Answer:", answer)

    # Collapse the whitespace left behind by the removals above.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
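
# Illustrative example (comment only, not executed):
#   clean_text("George Washington was the first president?", "George_Washington")
#   -> "was the first president."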


def process_data():
    # No Jeopardy source files are loaded here; this list stays empty
    # unless populated elsewhere.
    jeopardy_data = []

    # Input files under data/ to merge into the training set.
    wiki_files = []
    question_files = ["qadata.json"]

    wiki_data = []
    question_data = []

    for file_path in wiki_files:
        with open('data/' + file_path, "r") as f:
            wiki_data.extend(json.load(f))

    for file_path in question_files:
        with open('data/' + file_path, "r") as f:
            question_data.extend(json.load(f))
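
    # Expected record shapes (inferred from the loops below):
    #   jeopardy entries: {"question": ..., "answer": ...}
    #   wiki entries:     {"page": ..., "text": ...}
    #   question entries: {"text": ..., "answer": ...}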

    # The output file is opened up front; the JSON is written once at the end.
    with open("data/training_data.json", "w") as f:
        training_data = []

        print("Processing Jeopardy data...")
        for entry in tqdm(jeopardy_data):
            question = entry["question"]
            answer = str(entry["answer"])

            # Keep only the top-level text nodes of the question markup.
            soup = BeautifulSoup(question, 'html.parser')
            clean_question = ''.join(soup.find_all(string=True, recursive=False))

            question_category = []

            prediction = qc_model.predict(question)
            # argwhere returns [row, column] pairs; the column indices are
            # the predicted category ids, so select them with [:, 1].
            predictions = np.argwhere(prediction >= 1.5)[:, 1]

            for prediction_ind in predictions:
                question_category.append(categories[prediction_ind])

            # Every entry also belongs to the catch-all 'ALL' bucket.
            question_category.append('ALL')

            training_entry = {
                "text": clean_question,
                "answer": answer,
                "category": question_category
            }

            training_data.append(training_entry)

        print("Processing Wikipedia data...")
        for entry in tqdm(wiki_data):
            page = str(entry["page"])
            text = entry["text"]

            if text == "":
                continue

            text = remove_newline(text)
            text = clean_text(text, page)

            question_category = []

            prediction = qc_model.predict(text)
            predictions = np.argwhere(prediction >= 1.5)[:, 1]

            for prediction_ind in predictions:
                question_category.append(categories[prediction_ind])

            question_category.append('ALL')

            training_entry = {
                "text": text,
                "answer": page,
                "category": question_category
            }

            training_data.append(training_entry)

        print("Processing Misc data...")
        for entry in tqdm(question_data):
            answer = str(entry["answer"])
            text = entry["text"]

            if text == "" or answer == "":
                continue

            text = remove_newline(text)
            text = clean_text(text, answer)

            question_category = []

            try:
                prediction = qc_model.predict(text)
                predictions = np.argwhere(prediction >= 1.5)[:, 1]
            except Exception as e:
                # Log the offending entry and skip it rather than abort.
                print("Prediction failed:", e)
                print("answer: " + str(answer))
                print("text: " + str(text))
                continue

            for prediction_ind in predictions:
                question_category.append(categories[prediction_ind])

            question_category.append('ALL')

            training_entry = {
                "text": text,
                "answer": answer,
                "category": question_category
            }

            training_data.append(training_entry)

        json.dump(training_data, f, indent=4)


if __name__ == "__main__":
    process_data()