from collections import defaultdict
import heapq
import os
import re

import joblib


def preprocess_text(text):
    """
    Preprocess the text for tokenization.

    Lowercases the text and splits it into word tokens, dropping
    punctuation and other special characters.
    """
    return re.findall(r'\w+', text.lower())
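
# Illustrative check (the input string is hypothetical, not from the
# original data): preprocess_text("The quick, brown fox!") returns
# ['the', 'quick', 'brown', 'fox']. Note that \w+ treats digits and
# underscores as word characters, so a token like "foo_bar2" stays whole.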


def create_inverted_index(wikipedia_dict):
    """
    Create an inverted index from the document dictionary.

    Args:
        wikipedia_dict (dict): A dictionary with document IDs as keys and text as values.

    Returns:
        dict: An inverted index where each term maps to the set of document IDs containing it.
    """
    inverted_index = defaultdict(set)
    for doc_id, text in wikipedia_dict.items():
        tokens = set(preprocess_text(text))  # Unique tokens for each document
        for token in tokens:
            inverted_index[token].add(doc_id)
    return inverted_index
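
# A minimal sketch of the resulting structure, using a made-up
# two-document corpus (the doc IDs and texts are illustrative only):
#
#     toy_docs = {"d1": "cats chase mice", "d2": "mice eat cheese"}
#     create_inverted_index(toy_docs)
#     # -> {'cats': {'d1'}, 'chase': {'d1'}, 'mice': {'d1', 'd2'},
#     #     'eat': {'d2'}, 'cheese': {'d2'}}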


def save_inverted_index(inverted_index, filepath="Baseline/inverted_index.pkl"):
    """
    Save the inverted index to a file using joblib.
    """
    # Ensure the target directory (e.g. "Baseline/") exists before dumping.
    os.makedirs(os.path.dirname(filepath) or ".", exist_ok=True)
    joblib.dump(inverted_index, filepath)


def load_inverted_index(filepath="Baseline/inverted_index.pkl"):
    """
    Load the inverted index from a file using joblib.

    Returns None if no index file exists at the given path.
    """
    if os.path.exists(filepath):
        return joblib.load(filepath)
    return None
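
# Round-trip sketch (paths are the module defaults; `toy_docs` is the
# hypothetical corpus from the earlier comment):
#
#     index = create_inverted_index(toy_docs)
#     save_inverted_index(index, "Baseline/inverted_index.pkl")
#     assert load_inverted_index("Baseline/inverted_index.pkl") == index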


def boolean_retrieval(queries_dict, inverted_index, wikipedia_dict, top_n=100):
    """
    Perform boolean (OR) retrieval for each query, ranking matches by term frequency.

    Args:
        queries_dict (dict): A dictionary with query IDs as keys and query text as values.
        inverted_index (dict): The inverted index created from the document collection.
        wikipedia_dict (dict): The original document dictionary, used for scoring.
        top_n (int): The number of top documents to retrieve for each query.

    Returns:
        dict: A dictionary with query IDs as keys and a list of top document IDs as values.
    """
    query_results = {}
    for query_id, query_text in queries_dict.items():
        query_tokens = preprocess_text(query_text)
        # Collect all document IDs that contain any of the query terms (OR semantics).
        relevant_docs = set()
        for token in query_tokens:
            if token in inverted_index:
                relevant_docs.update(inverted_index[token])
        # Score each candidate document by the total frequency of query terms in it.
        doc_scores = []
        for doc_id in relevant_docs:
            doc_text = preprocess_text(wikipedia_dict[doc_id])
            score = sum(doc_text.count(token) for token in query_tokens)  # Term-frequency score
            doc_scores.append((score, doc_id))
        # Keep the `top_n` highest-scoring documents; the explicit key avoids
        # comparing document IDs when scores tie.
        top_docs = heapq.nlargest(top_n, doc_scores, key=lambda pair: pair[0])
        query_results[query_id] = [doc_id for _, doc_id in top_docs]
    return query_results
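
# Usage sketch with a hypothetical query set (reusing the toy corpus from
# the earlier comment; none of these IDs come from the real data):
#
#     queries = {"q1": "mice"}
#     boolean_retrieval(queries, create_inverted_index(toy_docs), toy_docs)
#     # -> {'q1': [...]} containing both 'd1' and 'd2', since each document
#     #    mentions 'mice' once; tied scores give no guaranteed order.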


# Main flow
def main_boolean_retrieval(wikipedia_dict, queries_dict):
    # Step 1: Create the inverted index from the document collection.
    inverted_index = create_inverted_index(wikipedia_dict)
    # Step 2: Perform boolean retrieval for all queries at once.
    top_docs = boolean_retrieval(queries_dict, inverted_index, wikipedia_dict)
    return top_docs


def retrieve_single_query(query, wikipedia_dict, top_n=100, inverted_index_path="Baseline/inverted_index.pkl"):
    """
    Retrieve documents for a single query using the inverted index.

    If the inverted index is not found on disk, it is created and saved.

    Args:
        query (str): The query text.
        wikipedia_dict (dict): The original document dictionary.
        top_n (int): The number of top documents to retrieve.
        inverted_index_path (str): Path to the saved inverted index file.

    Returns:
        list: A list of top document IDs matching the query.
    """
    # Load the inverted index from disk, or build and persist it on first use.
    inverted_index = load_inverted_index(inverted_index_path)
    if inverted_index is None:
        print("Inverted index not found. Creating one...")
        inverted_index = create_inverted_index(wikipedia_dict)
        save_inverted_index(inverted_index, inverted_index_path)
    # Preprocess the query into tokens.
    query_tokens = preprocess_text(query)
    # Collect every document that contains at least one query term.
    relevant_docs = set()
    for token in query_tokens:
        if token in inverted_index:
            relevant_docs.update(inverted_index[token])
    # Rank candidate documents by total query-term frequency.
    doc_scores = []
    for doc_id in relevant_docs:
        doc_text = preprocess_text(wikipedia_dict[doc_id])
        score = sum(doc_text.count(token) for token in query_tokens)
        doc_scores.append((score, doc_id))
    # Keep the `top_n` highest-scoring documents, comparing scores only.
    top_docs = heapq.nlargest(top_n, doc_scores, key=lambda pair: pair[0])
    return [doc_id for _, doc_id in top_docs]
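
# Single-query sketch (query text and corpus are hypothetical, reusing the
# toy corpus from the earlier comment):
#
#     retrieve_single_query("cheese mice", toy_docs, top_n=10)
#     # builds Baseline/inverted_index.pkl on the first call, then returns
#     # ['d2', 'd1']: 'd2' matches both query terms, 'd1' only one.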


# Example usage:
# Assuming `wikipedia_dict` and `queries_dict` are already prepared:
# top_results = main_boolean_retrieval(wikipedia_dict, queries_dict)
# print(top_results)
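
# A self-contained, runnable demo of the main flow. The three documents and
# two queries below are made up for illustration; they are not part of the
# original pipeline's data.
if __name__ == "__main__":
    demo_docs = {
        "doc1": "The cat sat on the mat.",
        "doc2": "Dogs chase cats in the park.",
        "doc3": "The stock market closed higher today.",
    }
    demo_queries = {"q1": "cat mat", "q2": "stock market"}
    results = main_boolean_retrieval(demo_docs, demo_queries)
    print(results)  # -> {'q1': ['doc1'], 'q2': ['doc3']}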