raghuv-aditya committed on
Commit
9f21f05
·
verified ·
1 Parent(s): d38e818

Upload 24 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Datasets/mini_wiki_collection.json filter=lfs diff=lfs merge=lfs -text
Agents/rankerAgent.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from together import Together
4
+
5
def rerank_best_answer(json_files, config_file='config.json', model="meta-llama/Llama-3-8b-chat-hf"):
    """Ask a Together-hosted LLM to pick the best answer per query across files.

    Each input JSON file holds a list of ``{query_id, input, response}``
    records produced by one retrieval/answering pipeline. For every query_id
    the candidate answers from all files are collected (the file name doubles
    as the "model" label) and the LLM is asked to choose the most accurate one.

    Args:
        json_files (list[str]): Paths to the per-model response JSON files.
        config_file (str): Unused; kept for backward compatibility. The API
            key is read from the TOGETHER_AI environment variable instead.
        model (str): Together AI chat model used for ranking.

    Returns:
        list[dict]: One record per query with keys
            ``query_id``, ``question``, ``best_model``, ``best_answer``.

    Raises:
        ValueError: If the TOGETHER_AI environment variable is not set.
    """
    # API key comes from the environment, not from config_file.
    together_ai_key = os.getenv("TOGETHER_AI")
    if not together_ai_key:
        raise ValueError("TOGETHER_AI environment variable not found. Please set it before running the script.")

    # Initialize Together client
    client = Together(api_key=together_ai_key)

    # Combine all JSON files into a single structure keyed by query_id.
    combined_prompts = {}
    for json_file in json_files:
        with open(json_file, 'r') as file:
            data = json.load(file)

        for item in data:
            query_id = item['query_id']
            if query_id not in combined_prompts:
                combined_prompts[query_id] = {
                    "question": item['input'],
                    "answers": {}
                }
            combined_prompts[query_id]["answers"][json_file] = item['response']

    responses = []

    for query_id, prompt in combined_prompts.items():
        # Ask the model to pick the best answer, constrained to a JSON reply.
        prompt_text = f"""Input JSON:
{json.dumps(prompt, indent=4)}

For the above question, identify which model gave the best response based on accuracy. Ensure the chosen response is an answer and not a follow-up question. Provide the output in the format:
{{
    "best_model": "<model_name>",
    "best_answer": "<answer>"
}}
Just output this JSON and nothing else.
"""

        # Generate response from Together API
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt_text}],
        )
        response_content = response.choices[0].message.content

        # Two follow-up calls extract the fields from the (possibly free-form)
        # reply instead of parsing it locally — extra API calls traded for
        # leniency toward malformed JSON.
        prompt_text_extract_bestModel = f"""Input JSON:
{json.dumps(response_content, indent=4)}

Just Output the best_model from above JSON and nothing else.
"""
        prompt_text_extract_bestAnswer = f"""Input JSON:
{json.dumps(response_content, indent=4)}

Just Output the best_answer from above JSON and nothing else.
"""
        response_bestModel = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt_text_extract_bestModel}],
        )
        response_bestAnswer = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt_text_extract_bestAnswer}],
        )

        responses.append({
            "query_id": query_id,
            "question": prompt["question"],
            "best_model": response_bestModel.choices[0].message.content,
            "best_answer": response_bestAnswer.choices[0].message.content,
        })

    return responses
80
+
81
+
82
def rankerAgent(prompt, config_file='config.json', model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"):
    """Pick the best (model, answer) pair for one combined prompt.

    Unlike rerank_best_answer, this function reads the TOGETHER_AI key from
    *config_file* rather than the environment.

    Args:
        prompt (dict): ``{"question": ..., "answers": {model_name: answer}}``
            structure for a single query (same shape as built by
            rerank_best_answer).
        config_file (str): Path to a JSON file containing a TOGETHER_AI key.
        model (str): Together AI chat model used for ranking.

    Returns:
        tuple[str, str]: Raw model output for (best_model, best_answer).

    Raises:
        ValueError: If TOGETHER_AI is missing from the config file.
    """
    # Load API key from configuration file
    with open(config_file, 'r') as file:
        config = json.load(file)

    together_ai_key = config.get("TOGETHER_AI")
    if not together_ai_key:
        raise ValueError("TOGETHER_AI key not found in the config file.")

    # Initialize Together client
    client = Together(api_key=together_ai_key)

    # Ask the model to choose the best answer, constrained to a JSON reply.
    prompt_text = f"""Input JSON:
{json.dumps(prompt, indent=4)}

For the above question, identify which model gave the best response based on accuracy. Ensure the chosen response is an answer and not a follow-up question. Provide the output in the format:
{{
    "best_model": "<model_name>",
    "best_answer": "<answer>"
}}
Just output this JSON and nothing else.
"""

    # Generate response from Together API
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt_text}],
    )
    response_content = response.choices[0].message.content
    # print(response_content)

    # Follow-up calls pull each field out of the reply via the LLM itself
    # (tolerant of malformed JSON, at the cost of two more API calls).
    prompt_text_extract_bestModel = f"""Input JSON:
{json.dumps(response_content, indent=4)}

Just Output the best_model from above JSON and nothing else.
"""
    prompt_text_extract_bestAnswer = f"""Input JSON:
{json.dumps(response_content, indent=4)}

Just Output the best_answer from above JSON and nothing else.
"""
    response_bestModel = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt_text_extract_bestModel}],
    )
    response_bestAnswer = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt_text_extract_bestAnswer}],
    )

    return response_bestModel.choices[0].message.content, response_bestAnswer.choices[0].message.content
133
+
134
+
135
+ # # Usage example
136
+ # json_files = ["../QnA_Eval/Responses/BOW_1_2_top_100_response.json",
137
+ # "../QnA_Eval/Responses/BOW_1_2_top_100_modified_response.json",
138
+ # "../QnA_Eval/Responses/tf-idf_1_2_top_100_response.json",
139
+ # "../QnA_Eval/Responses/tf-idf_1_2_top_100_modified_response.json",
140
+ # "../QnA_Eval/Responses/bm25_1_2_top_100_response.json",
141
+ # "../QnA_Eval/Responses/bm25_1_2_top_100_modified_response.json",
142
+ # "../QnA_Eval/Responses/open_source_1_2_top_100_response.json",
143
+ # "../QnA_Eval/Responses/open_source_1_2_top_100_modified_response.json",
144
+ # "../QnA_Eval/Responses/vision_1_2_top_100_response.json",
145
+ # "../QnA_Eval/Responses/vision_1_2_top_100_modified_response.json",
146
+ # "../QnA_Eval/Responses/ZeroShot_response.json",
147
+ # "../QnA_Eval/Responses/WikiAgent_response.json",
148
+ # "../QnA_Eval/Responses/WikiAgent_response_modified.json",
149
+ # "../QnA_Eval/Responses/LlamaAgent_response.json",
150
+ # "../QnA_Eval/Responses/LlamaAgent_response_modified.json",
151
+ # "../QnA_Eval/Responses/tf_idf_bm25_open_1_2_top_100_combined_response.json", "../QnA_Eval/Responses/tf_idf_bm25_open_1_2_top_100_combined_modified_response.json", "../QnA_Eval/Responses/tf_idf_bm25_open_1_2_top_100_combined_both_response.json"]
152
+
153
+ # config_file = "../config.json"
154
+
155
+ # result = rerank_best_answer(json_files, config_file)
156
+
157
+ # with open("reranked_best_answers_1_2.json", 'w') as file:
158
+ # json.dump(result, file, indent=4, ensure_ascii=False)
Agents/togetherAIAgent.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json, os
2
+ from together import Together
3
+
4
def generate_article_from_query(query, config_file='config.json', model="meta-llama/Llama-3-8b-chat-hf"):
    """
    Generates an article based on the given query using the Together API.

    Parameters:
    - query (str): The input query for generating the article.
    - config_file (str): Unused; kept for backward compatibility. The API key
      is read from the TOGETHER_AI environment variable instead.
    - model (str): The Together AI model to use. Default is "meta-llama/Llama-3-8b-chat-hf".

    Returns:
    - str: The generated article content.

    Raises:
    - ValueError: If the TOGETHER_AI environment variable is not set.
    """
    together_ai_key = os.getenv("TOGETHER_AI")
    if not together_ai_key:
        raise ValueError("TOGETHER_AI environment variable not found. Please set it before running the script.")

    # Initialize Together client
    client = Together(api_key=together_ai_key)

    # Create the prompt
    prompt = f"""Using the query provided, generate a well-researched and informative short article. The article should be detailed, accurate, and structured to cover various aspects of the topic in an engaging way. Focus on presenting key facts, historical context, notable insights, and any relevant background information that adds value to the reader’s understanding. Ensure the tone is neutral and informative. Keep the article short. Here’s the query:

Query: {query}"""

    # Generate response
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content
37
+
38
+ # # Example usage
39
+ # if __name__ == "__main__":
40
+ # query = "I feel anxious about my health and stressed at work."
41
+ # article = generate_article_from_query(query)
42
+ # print(article)
Agents/wikiAgent.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import wikipediaapi
2
+ from typing import List, Dict
3
+ import logging
4
+ from dataclasses import dataclass
5
+ from datetime import datetime
6
+
7
@dataclass
class WikiSearchResult:
    """Data class to store Wikipedia article information."""
    title: str                # Article title
    summary: str              # Lead-section summary text
    full_text: str            # Full plain-text article body
    url: str                  # Canonical article URL (page.fullurl)
    last_modified: datetime   # Parsed from the page's 'touched' timestamp
    categories: List[str]     # Titles of the categories the article belongs to
16
+
17
def initialize_wikipedia_client(language: str = 'en', user_agent: str = 'WikipediaSearcher/1.0') -> wikipediaapi.Wikipedia:
    """Create a Wikipedia API client.

    Args:
        language: Language code, e.g. 'en' for English.
        user_agent: User agent string sent with API requests.

    Returns:
        A configured wikipediaapi.Wikipedia instance.
    """
    client = wikipediaapi.Wikipedia(
        language=language,
        extract_format=wikipediaapi.ExtractFormat.WIKI,
        user_agent=user_agent,
    )
    return client
33
+
34
def process_page(page: wikipediaapi.WikipediaPage) -> WikiSearchResult:
    """Process a Wikipedia page and extract relevant information.

    Args:
        page: A Wikipedia page object; callers check page.exists() first.

    Returns:
        WikiSearchResult with title, summary, full text, URL, parsed
        last-modified timestamp and category titles.
    """
    categories = [cat.title for cat in page.categories.values()]

    return WikiSearchResult(
        title=page.title,
        summary=page.summary,
        full_text=page.text,
        url=page.fullurl,
        # 'touched' is expected in the form 2020-01-01T00:00:00Z (UTC).
        last_modified=datetime.strptime(page.touched, '%Y-%m-%dT%H:%M:%SZ'),
        categories=categories
    )
46
+
47
def search_wikipedia(client: wikipediaapi.Wikipedia, query: str, results_limit: int = 3) -> List[WikiSearchResult]:
    """
    Search Wikipedia and get detailed information for matching articles.

    The query is resolved as an exact page title; up to results_limit - 1
    additional results are taken from the first links on that page.

    Args:
        client: Wikipedia API client instance
        query: Search query string
        results_limit: Maximum number of results to return

    Returns:
        List of WikiSearchResult objects containing article information;
        empty list when no exact title match exists or on any error.
    """
    try:
        page = client.page(query)

        if not page.exists():
            logging.warning(f"No exact match found for: {query}")
            return []

        results = [process_page(page)]

        # Get related pages through links (if we want more results)
        if results_limit > 1:
            for link_title in list(page.links.keys())[:results_limit - 1]:
                link_page = client.page(link_title)
                if link_page.exists():
                    results.append(process_page(link_page))

        return results

    except Exception as e:
        # Broad catch is deliberate: API/network failures degrade to
        # "no results" with an error log instead of crashing the caller.
        logging.error(f"Error searching Wikipedia: {e}")
        return []
80
+
81
def format_result(result: WikiSearchResult, include_full_text: bool = False) -> str:
    """
    Format a search result for display.

    Args:
        result: WikiSearchResult object to format
        include_full_text: Whether to include the full article text

    Returns:
        Formatted string containing article information. At most the first
        five categories are shown, with "..." marking truncation.
    """
    # The literal's layout (including the leading newline) IS the display
    # format — do not re-indent it.
    formatted = f"""
Title: {result.title}
URL: {result.url}
Last Modified: {result.last_modified}
Categories: {', '.join(result.categories[:5])}{'...' if len(result.categories) > 5 else ''}

Summary:
{result.summary}
"""
    if include_full_text:
        formatted += f"\nFull Text:\n{result.full_text}"

    return formatted
105
+
106
def get_wiki_data(query: str, results_limit: int = 3) -> List[str]:
    """
    Get Wikipedia data for a given query. If the full query returns no
    results, retry with progressively shorter leading word prefixes of the
    query (the variable names say "n-grams", but only prefixes are tried)
    until a result is found or all attempts fail.

    Args:
        query: Search query string
        results_limit: Maximum number of results to return

    Returns:
        List of summaries from Wikipedia search results, or None if no results are found.
    """
    # NOTE(review): configuring logging inside a library function affects the
    # whole process; consider moving this to application startup.
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    client = initialize_wikipedia_client()

    def get_search_result(query):
        """Helper: return the first result's summary for *query*, or None."""
        result = search_wikipedia(client, query, results_limit)
        if result:
            return result[0].summary  # Return the first result's summary if available
        return None

    # Check the search results with the full query
    summary = get_search_result(query)
    if summary:
        return [summary]

    # If no result, retry with shorter and shorter prefixes of the query.
    n = len(query.split())  # Starting with the number of words in the query
    # NOTE(review): the i == n iteration repeats the full query already tried
    # above; effective fallbacks start at i == n - 1.
    for i in range(n, 1, -1):  # Try from n words down to 2 words
        n_grams_query = ' '.join(query.split()[:i])
        logging.info(f"Trying n-gram query: {n_grams_query}")
        summary = get_search_result(n_grams_query)
        if summary:
            return [summary]

    # If no results found after all n-gram reductions, return None
    logging.info("No results found for any query variations.")
    return None
146
+
147
+ # # Example usage
148
+ # if __name__ == "__main__":
149
+ # query = "Clash of Clans"
150
+ # results = get_wiki_data(query, results_limit=3)
151
+
152
+ # if not results:
153
+ # print(f"No results found for query: {query}")
154
+ # else:
155
+ # for idx, result in enumerate(results, 1):
156
+ # print(f"\nResult {idx}:")
157
+ # print("-" * 60)
158
+ # print(format_result(result))
AnswerGeneration/__pycache__/getAnswer.cpython-312.pyc ADDED
Binary file (2.13 kB). View file
 
AnswerGeneration/getAnswer.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from together import Together
4
+
5
def generate_answer_withContext(question, context):
    """Answer *question* in 1-2 lines grounded in *context* via Together AI.

    Args:
        question (str): The user question.
        context (str): Retrieved passage(s) the answer must be based on.

    Returns:
        str: The model's answer text.

    Raises:
        ValueError: If the TOGETHER_AI environment variable is not set.
    """
    together_ai_key = os.getenv("TOGETHER_AI")
    if not together_ai_key:
        raise ValueError("TOGETHER_AI environment variable not found. Please set it before running the script.")

    client = Together(api_key=together_ai_key)

    prompt = f"""Consider the context and generate a brief 1-2 line answer to the question. Output only the answer.

Context: {context}

Question: {question}
"""
    response = client.chat.completions.create(
        # model="meta-llama/Llama-3-8b-chat-hf",
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content
26
+
27
+
28
def generate_answer_zeroShot(question):
    """Answer *question* with no supporting context (zero-shot baseline).

    Args:
        question (str): The user question.

    Returns:
        str: The model's answer text.

    Raises:
        ValueError: If the TOGETHER_AI environment variable is not set.
    """
    together_ai_key = os.getenv("TOGETHER_AI")
    if not together_ai_key:
        raise ValueError("TOGETHER_AI environment variable not found. Please set it before running the script.")

    client = Together(api_key=together_ai_key)

    prompt = f"""Answer the following question:

Question: {question}
"""
    response = client.chat.completions.create(
        model="meta-llama/Llama-3-8b-chat-hf",
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content
Baseline/__pycache__/boolean.cpython-312.pyc ADDED
Binary file (960 Bytes). View file
 
Baseline/__pycache__/boolean_retrieval.cpython-312.pyc ADDED
Binary file (5.61 kB). View file
 
Baseline/__pycache__/data_processor.cpython-312.pyc ADDED
Binary file (2.89 kB). View file
 
Baseline/boolean.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from Baseline.data_processor import process_json_data, process_queries, merge_documents
2
+ from Baseline.boolean_retrieval import main_boolean_retrieval, retrieve_single_query
3
+ import json
4
+
5
def boolean_pipeline(query, wikipedia_data_path="Datasets/mini_wiki_collection.json", top_n=100):
    """Run boolean retrieval for a single query over the wiki collection.

    Args:
        query (str): The query text.
        wikipedia_data_path (str): Path to the wiki collection JSON
            (list of documents with "wikipedia_id" and "text" fields).
        top_n (int): Number of top document IDs to return.

    Returns:
        list: Top document IDs ranked by query-term frequency.
    """
    # Load the JSON files
    with open(wikipedia_data_path, "r") as file1:
        wikipedia_data = json.load(file1)

    # Build the {wikipedia_id: sanitized_text} mapping.
    wikipedia_dict = process_json_data(wikipedia_data)

    # retrieve_single_query loads (or builds and caches) the inverted index.
    top_results = retrieve_single_query(query, wikipedia_dict, top_n)

    return top_results
17
+
18
+ # def main():
19
+ # # Load the JSON files
20
+ # # boolean_retrieval("In the United States, why are positions like Attorney General, Secretary of State, etc. appointed by the president at the federal level but elected by the people at the state level? Had it ever been proposed to do this differently?")
21
+ # # return
22
+ # with open("../Datasets/mini_wiki_collection.json", "r") as file1: # Replace with the actual path to your file
23
+ # wikipedia_data = json.load(file1)
24
+
25
+ # with open("../Datasets/mini_wiki_collection_10000_documents.json", "r") as file1: # Replace with the actual path to your file
26
+ # additional_json_file = json.load(file1)
27
+
28
+ # with open("../Datasets/FinalDataset_WithModifiedQuery.json", "r") as file2: # Replace with the actual path to your file
29
+ # queries_data = json.load(file2)
30
+
31
+ # # Process the JSON files
32
+ # wikipedia_dict = process_json_data(wikipedia_data)
33
+ # updated_main_dict = merge_documents(wikipedia_dict, additional_json_file, limit=2000)
34
+ # queries_dict = process_queries(queries_data)
35
+
36
+ # # Print the processed data
37
+ # print("Processed Wikipedia Data:")
38
+ # print(wikipedia_dict["420538"])
39
+ # print("\nProcessed Queries Data:")
40
+ # print(queries_dict["5xvggq"])
41
+
42
+ # top_results = main_boolean_retrieval(updated_main_dict, queries_dict)
43
+
44
+ # # Print the results for a specific query
45
+ # print("\nTop results for query '5xvggq':")
46
+ # print(top_results.get("5xvggq", []))
47
+
48
+ # # Optionally, save the top results to a JSON file
49
+ # with open("boolean_retrieval_1_2_query.json", "w") as output_file:
50
+ # json.dump(top_results, output_file, indent=4)
51
+
52
+
53
+ # # if __name__ == "__main__":
54
+ # # main()
Baseline/boolean_retrieval.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import defaultdict
2
+ import re
3
+ import heapq
4
+ import joblib
5
+ import os
6
+
7
def preprocess_text(text):
    """Tokenize *text*: lowercase it and return the runs of word characters.

    Args:
        text (str): Raw input text.

    Returns:
        list[str]: Lowercased tokens (alphanumerics and underscores).
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'\w+', lowered)]
13
+
14
def create_inverted_index(wikipedia_dict):
    """Build an inverted index over the document collection.

    Args:
        wikipedia_dict (dict): Maps document IDs to document text.

    Returns:
        defaultdict: Each token maps to the set of document IDs whose text
        contains it.
    """
    index = defaultdict(set)
    for document_id, document_text in wikipedia_dict.items():
        # Deduplicate tokens per document before posting.
        for term in set(preprocess_text(document_text)):
            index[term].add(document_id)
    return index
29
+
30
def save_inverted_index(inverted_index, filepath="Baseline/inverted_index.pkl"):
    """
    Save the inverted index to a file using joblib.

    Args:
        inverted_index (dict): Term -> set of document IDs.
        filepath (str): Destination pickle path.
    """
    joblib.dump(inverted_index, filepath)
35
+
36
def load_inverted_index(filepath="Baseline/inverted_index.pkl"):
    """Load the inverted index from *filepath*, or return None if absent.

    Args:
        filepath (str): Path to the joblib-pickled index.

    Returns:
        dict | None: The loaded index, or None when the file does not exist.
    """
    if not os.path.exists(filepath):
        return None
    return joblib.load(filepath)
43
+
44
def boolean_retrieval(queries_dict, inverted_index, wikipedia_dict, top_n=100):
    """
    Perform boolean retrieval for each query.

    For every query, the union of the posting sets of its tokens is taken,
    each candidate document is scored by total query-term frequency, and the
    top_n highest-scoring documents are kept.

    Args:
        queries_dict (dict): A dictionary with query IDs as keys and query text as values.
        inverted_index (dict): The inverted index created from the document collection.
        wikipedia_dict (dict): The original document dictionary (used for scoring).
        top_n (int): The number of top documents to retrieve for each query.

    Returns:
        dict: A dictionary with query IDs as keys and a list of top document IDs as values.
    """
    from collections import Counter  # local import: file header only brings in defaultdict

    query_results = {}

    for query_id, query_text in queries_dict.items():
        query_tokens = preprocess_text(query_text)

        # Collect all document IDs that contain any of the query terms.
        relevant_docs = set()
        for token in query_tokens:
            if token in inverted_index:
                relevant_docs.update(inverted_index[token])

        # Score each candidate by total term frequency. Counting every token
        # once with Counter is a single O(len(doc)) pass, whereas the previous
        # list.count() rescanned the whole document once per query token.
        # Duplicate query tokens still contribute multiple times, as before.
        doc_scores = []
        for doc_id in relevant_docs:
            term_counts = Counter(preprocess_text(wikipedia_dict[doc_id]))
            score = sum(term_counts[token] for token in query_tokens)
            doc_scores.append((score, doc_id))

        # Keep the `top_n` best; ties break on doc_id ordering, as before.
        top_docs = heapq.nlargest(top_n, doc_scores)
        query_results[query_id] = [doc_id for _, doc_id in top_docs]

    return query_results
79
+
80
+ # Main flow
81
def main_boolean_retrieval(wikipedia_dict, queries_dict):
    """Index the documents, then run boolean retrieval for every query.

    Args:
        wikipedia_dict (dict): Document ID -> document text.
        queries_dict (dict): Query ID -> query text.

    Returns:
        dict: Query ID -> list of top document IDs.
    """
    # Step 1: build the inverted index over the collection.
    index = create_inverted_index(wikipedia_dict)
    # Step 2: rank documents for every query against that index.
    return boolean_retrieval(queries_dict, index, wikipedia_dict)
89
+
90
def retrieve_single_query(query, wikipedia_dict, top_n=100, inverted_index_path="Baseline/inverted_index.pkl"):
    """
    Retrieve documents for a single query using the inverted index.
    If the inverted index is not found on disk, it is created and saved.

    Args:
        query (str): The query text.
        wikipedia_dict (dict): The original document dictionary.
        top_n (int): The number of top documents to retrieve.
        inverted_index_path (str): Path to the saved inverted index file.

    Returns:
        list: A list of top document IDs matching the query.
    """
    from collections import Counter  # local import: file header only brings in defaultdict

    # Load or create the inverted index
    inverted_index = load_inverted_index(inverted_index_path)
    if inverted_index is None:
        print("Inverted index not found. Creating one...")
        inverted_index = create_inverted_index(wikipedia_dict)
        save_inverted_index(inverted_index, inverted_index_path)

    # Preprocess the query
    query_tokens = preprocess_text(query)

    # Collect relevant documents (union of posting sets)
    relevant_docs = set()
    for token in query_tokens:
        if token in inverted_index:
            relevant_docs.update(inverted_index[token])

    # Rank documents by total query-term frequency. One Counter pass per
    # document replaces the previous per-token list.count() rescans.
    doc_scores = []
    for doc_id in relevant_docs:
        term_counts = Counter(preprocess_text(wikipedia_dict[doc_id]))
        score = sum(term_counts[token] for token in query_tokens)
        doc_scores.append((score, doc_id))

    # Get the top `top_n` documents based on the score
    top_docs = heapq.nlargest(top_n, doc_scores)
    return [doc_id for _, doc_id in top_docs]
130
+
131
+ # Example usage:
132
+ # Assuming `wikipedia_dict` and `queries_dict` are already prepared
133
+ # top_results = main_boolean_retrieval(wikipedia_dict, queries_dict)
134
+ # print(top_results)
Baseline/data_processor.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Assuming sanitize_text is a function you've defined elsewhere
2
+
3
+ import re
4
+
5
def merge_documents(main_dict, additional_json, limit=1000):
    """Merge up to *limit* new documents into *main_dict*.

    Documents whose wikipedia_id already exists in *main_dict* are skipped
    (existing entries are never overwritten) and do not count toward the
    limit.

    Args:
        main_dict (dict): Target mapping of wikipedia_id -> sanitized text.
        additional_json (list): Extra documents with "wikipedia_id" and
            "text" (list of strings) fields.
        limit (int): Maximum number of documents to add.

    Returns:
        dict: *main_dict*, updated in place.
    """
    added = 0

    for document in additional_json:
        if added >= limit:
            break

        wiki_id = document.get("wikipedia_id")
        if wiki_id in main_dict:
            continue  # never overwrite an existing entry

        # Join the text fragments and sanitize before storing.
        raw_text = " ".join(document.get("text", []))
        main_dict[wiki_id] = sanitize_text(raw_text)
        added += 1

    print(f"{added} documents added to the main dictionary.")
    return main_dict
40
+
41
def sanitize_text(text):
    """
    Cleans and standardizes text by keeping only alphanumeric characters and spaces.

    Args:
        text (str): Text to sanitize. Non-string input is passed through.

    Returns:
        str: Sanitized text with runs of whitespace collapsed to one space.
    """
    if not isinstance(text, str):
        return text
    # Drop everything that is not alphanumeric or whitespace, then collapse
    # whitespace runs and trim the ends.
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return re.sub(r'\s+', ' ', alnum_only).strip()
55
+
56
+
57
def process_json_data(json_data):
    """Map each document's wikipedia_id to its sanitized, joined text.

    Args:
        json_data (list): Documents with "wikipedia_id" and "text"
            (list of strings) fields.

    Returns:
        dict: wikipedia_id -> sanitized full text. Later duplicates of an
        ID overwrite earlier ones, as in a plain assignment loop.
    """
    return {
        document.get("wikipedia_id"): sanitize_text(" ".join(document.get("text", [])))
        for document in json_data
    }
73
+
74
def process_queries(json_data):
    """Extract a {query_id: query_text} mapping from the queries JSON.

    Args:
        json_data (dict): Maps query IDs to objects carrying an "input" field.

    Returns:
        dict: query_id -> query text ("" when "input" is missing).
    """
    return {query_id: info.get("input", "") for query_id, info in json_data.items()}
94
+
95
+ # Example usage
96
+ # Assuming `query_json_file` contains your JSON data
97
+ # processed_queries = process_queries(query_json_file)
98
+
Baseline/inverted_index.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c47f19521041e7b2a5681da4128cfae538eba1bc653528f04c7dc9df300fbc5
3
+ size 4671080
Datasets/mini_wiki_collection.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:986eedb174550564ce95cf9b08de1207cfb1e2290646b4aeb60257c9edceb27a
3
+ size 41656963
Query_Modification/QueryModification.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json, os
2
+ import pandas as pd
3
+ import google.generativeai as genai
4
+
5
+ # Function to process text input with Gemini model
6
def query_Modifier(input_text):
    """Rewrite *input_text* into a RAG-friendly query using Gemini.

    The rewrite instructions are read from Query_Modification/prompt.txt and
    appended after the input text. All Gemini safety categories are set to
    BLOCK_NONE, i.e. content filtering is disabled for this call.

    Args:
        input_text (str): The original user query.

    Returns:
        str: The modified query text produced by the model.

    Raises:
        ValueError: If the GEMINI environment variable is not set.
    """
    gemini_key = os.getenv("GEMINI")
    if not gemini_key:
        raise ValueError("GEMINI environment variable not found. Please set it before running the script.")

    # Initialize the API key
    genai.configure(api_key=gemini_key)

    # print(gemini_key)

    # Load the prompt from file
    with open("Query_Modification/prompt.txt", 'r') as file:
        PROMPT_TEMPLATE = file.read()

    # Safety settings for Gemini model: every category is disabled.
    safe = [
        {
            "category": "HARM_CATEGORY_DANGEROUS",
            "threshold": "BLOCK_NONE",
        },
        {
            "category": "HARM_CATEGORY_HARASSMENT",
            "threshold": "BLOCK_NONE",
        },
        {
            "category": "HARM_CATEGORY_HATE_SPEECH",
            "threshold": "BLOCK_NONE",
        },
        {
            "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
            "threshold": "BLOCK_NONE",
        },
        {
            "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
            "threshold": "BLOCK_NONE",
        },
    ]

    generation_config = {
        "temperature": 1,
        "top_p": 0.95,
        "top_k": 40,
        "max_output_tokens": 8192,
        "response_mime_type": "text/plain",
    }

    # Initialize the generative model
    model = genai.GenerativeModel("gemini-1.5-flash", generation_config=generation_config)

    # The template follows the query, per the prompt file's instructions.
    full_prompt = f"{input_text}\n\n{PROMPT_TEMPLATE}"

    # Call the generative model for text input
    result = model.generate_content([full_prompt], safety_settings=safe)
    return result.text
62
+
63
+
64
def getKeywords(input_text):
    """Extract keywords from *input_text* using Gemini.

    All Gemini safety categories are set to BLOCK_NONE (filtering disabled).
    The raw model reply is post-processed by stripping a literal "Keywords:"
    prefix and removing commas.

    Args:
        input_text (str): Sentence to extract keywords from.

    Returns:
        str: Whitespace-trimmed keyword string.

    Raises:
        ValueError: If the GEMINI environment variable is not set.
    """
    gemini_key = os.getenv("GEMINI")
    if not gemini_key:
        raise ValueError("GEMINI environment variable not found. Please set it before running the script.")

    # Initialize the API key
    genai.configure(api_key=gemini_key)

    # Safety settings for Gemini model: every category is disabled.
    safe = [
        {
            "category": "HARM_CATEGORY_DANGEROUS",
            "threshold": "BLOCK_NONE",
        },
        {
            "category": "HARM_CATEGORY_HARASSMENT",
            "threshold": "BLOCK_NONE",
        },
        {
            "category": "HARM_CATEGORY_HATE_SPEECH",
            "threshold": "BLOCK_NONE",
        },
        {
            "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
            "threshold": "BLOCK_NONE",
        },
        {
            "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
            "threshold": "BLOCK_NONE",
        },
    ]

    generation_config = {
        "temperature": 1,
        "top_p": 0.95,
        "top_k": 40,
        "max_output_tokens": 8192,
        "response_mime_type": "text/plain",
    }

    # Initialize the generative model
    model = genai.GenerativeModel("gemini-1.5-flash", generation_config=generation_config)

    full_prompt = f"{input_text} \n\n Give the Keywords for the above sentence and output nothing else."

    # Call the generative model for text input
    result = model.generate_content([full_prompt], safety_settings=safe)

    response = result.text
    response = response.replace("Keywords:", "")
    # NOTE(review): this removes EVERY comma, including any inside multi-word
    # keywords — not just the separators.
    response = response.replace(",", "")

    return response.strip()
Query_Modification/__pycache__/QueryModification.cpython-311.pyc ADDED
Binary file (3.42 kB). View file
 
Query_Modification/__pycache__/QueryModification.cpython-312.pyc ADDED
Binary file (2.98 kB). View file
 
Query_Modification/prompt.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Modify the following query to improve its suitability for a Retrieval Augmented Generation (RAG) system that uses a cosine-similarity-based semantic search engine:
2
+
3
+ Original Query: [Original query here]
4
+
5
+ Guidelines:
6
+
7
+ Clarity and Specificity: Make the query more specific and focused.
8
+ Keyword Optimization: Identify and include relevant keywords that align with the dataset.
9
+ Semantic Relevance: Consider the underlying meaning and context of the query.
10
+ Question Formulation: Frame the query as a question to facilitate direct answer extraction.
11
+ Contextual Clues: If applicable, provide additional context or background information.
12
+
13
+ Example:
14
+
15
+ Original Query: "Tell me about the French Revolution"
16
+
17
+ Modified Query: "What were the main causes and effects of the French Revolution, and who were its key figures?"
18
+
19
+ Guardrail: Output only the Modified Query.
Ranking/RRF/RRF_implementation.py ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from collections import defaultdict
4
+
5
def load_and_merge_json_files(directory_path):
    """
    Load and merge JSON files from a directory into a single structure, keeping
    each list from different files separate for each query.

    Args:
        directory_path (str): Path to the directory containing the JSON files.

    Returns:
        list: Merged list of dictionaries, keeping separate lists for each query.
    """
    merged_queries = defaultdict(list)

    # sorted() makes the merge order deterministic; os.listdir order is
    # platform-dependent, which previously made the fused output unstable.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(directory_path, filename)
        try:
            with open(file_path, 'r') as f:
                json_data = json.load(f)
        except Exception as e:
            # BUG FIX: the original message printed a literal "(unknown)"
            # instead of the offending file, making failures undiagnosable.
            print(f"Error reading {file_path}: {e}")
            continue

        # For each file, add the lists to the corresponding query.
        for query_data in json_data:
            for query, rank_list in query_data.items():
                if isinstance(rank_list, list):  # Ensure rank_list is a list
                    merged_queries[query].append(rank_list)
                else:
                    print(f"Warning: Expected a list for query '{query}' but got {type(rank_list)}")

    # Convert defaultdict to a list of one-key dictionaries.
    return [{query: lists} for query, lists in merged_queries.items()]
37
+
38
def reciprocal_rank_fusion(json_input, K=60, top_n=100):
    """
    Fuse rankings from multiple IR systems for multiple queries using
    Reciprocal Rank Fusion.

    Args:
        json_input (list): A list of dictionaries where keys are queries, and
            values are ranked document lists from different systems.
        K (int): A constant used in the RRF formula (default is 60).
        top_n (int): Number of top results to return for each query.

    Returns:
        list: A list of dictionaries with each query and its fused document rankings.
    """
    fused_per_query = []

    for entry in json_input:
        for question, system_rankings in entry.items():
            # Accumulate 1/(rank + K) per document across all systems.
            scores = defaultdict(float)
            for ranking in system_rankings:
                for position, document in enumerate(ranking, start=1):
                    scores[document] += 1.0 / (position + K)

            # Highest RRF score first; keep only the top_n documents.
            best_first = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
            fused_per_query.append({question: [doc for doc, _ in best_first[:top_n]]})

    return fused_per_query
70
+
71
def save_to_json(output_data, output_file_path):
    """
    Write the RRF results to *output_file_path* as pretty-printed JSON,
    preserving the same structure as the input files.

    Args:
        output_data (list): The processed data to save.
        output_file_path (str): Path to the output JSON file.
    """
    with open(output_file_path, 'w') as handle:
        json.dump(output_data, handle, indent=2)
81
+
82
+ # # Example usage
83
+ # directory_path = "Modified_1_2" # Replace with your directory path
84
+ # output_file_path = "Modified_1_2/rrf_1_2_modified.json" # Replace with your desired output file path
85
+
86
+ # # Load and merge JSON files
87
+ # merged_input = load_and_merge_json_files(directory_path)
88
+
89
+ # print(merged_input[0]["5xvggq"])
90
+
91
+ # # Perform RRF on the merged input, keeping only the top 100 results
92
+ # combined_results = reciprocal_rank_fusion(merged_input, top_n=100)
93
+
94
+ # # Save the combined results to a JSON file
95
+ # save_to_json(combined_results, output_file_path)
96
+
97
+ # print(f"Combined results saved to {output_file_path}")
98
+
99
+
100
def reciprocal_rank_fusion_two(rank_list1, rank_list2, K=60, top_n=100):
    """
    Perform Reciprocal Rank Fusion (RRF) over two ranking lists.

    Args:
        rank_list1 (list): First list of ranked documents.
        rank_list2 (list): Second list of ranked documents.
        K (int): A constant used in the RRF formula (default is 60).
        top_n (int): Number of top results to return (default is 100).

    Returns:
        list: Combined list of rankings after applying RRF.
    """
    scores = defaultdict(float)

    # Each list contributes 1/(rank + K) per document, ranks starting at 1.
    for ranking in (rank_list1, rank_list2):
        for position, document in enumerate(ranking, start=1):
            scores[document] += 1.0 / (position + K)

    # Highest combined score first; truncate to the top_n documents.
    best_first = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return [document for document, _ in best_first[:top_n]]
128
+
129
+
130
def reciprocal_rank_fusion_three(rank_list1, rank_list2, rank_list3, K=60, top_n=100):
    """
    Perform Reciprocal Rank Fusion (RRF) over three ranking lists.

    Args:
        rank_list1 (list): First list of ranked documents.
        rank_list2 (list): Second list of ranked documents.
        rank_list3 (list): Third list of ranked documents.
        K (int): A constant used in the RRF formula (default is 60).
        top_n (int): Number of top results to return (default is 100).

    Returns:
        list: Combined list of rankings after applying RRF.
    """
    scores = defaultdict(float)

    # Each list contributes 1/(rank + K) per document, ranks starting at 1.
    for ranking in (rank_list1, rank_list2, rank_list3):
        for position, document in enumerate(ranking, start=1):
            scores[document] += 1.0 / (position + K)

    # Highest combined score first; truncate to the top_n documents.
    best_first = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return [document for document, _ in best_first[:top_n]]
163
+
164
+
165
def reciprocal_rank_fusion_six(rank_list1, rank_list2, rank_list3, rank_list4, rank_list5, rank_list6, K=60, top_n=100):
    """
    Perform Reciprocal Rank Fusion (RRF) over six ranking lists.

    Args:
        rank_list1 (list): First list of ranked documents.
        rank_list2 (list): Second list of ranked documents.
        rank_list3 (list): Third list of ranked documents.
        rank_list4 (list): Fourth list of ranked documents.
        rank_list5 (list): Fifth list of ranked documents.
        rank_list6 (list): Sixth list of ranked documents.
        K (int): A constant used in the RRF formula (default is 60).
        top_n (int): Number of top results to return (default is 100).

    Returns:
        list: Combined list of rankings after applying RRF.
    """
    scores = defaultdict(float)

    # Each list contributes 1/(rank + K) per document, ranks starting at 1.
    all_rankings = (rank_list1, rank_list2, rank_list3, rank_list4, rank_list5, rank_list6)
    for ranking in all_rankings:
        for position, document in enumerate(ranking, start=1):
            scores[document] += 1.0 / (position + K)

    # Highest combined score first; truncate to the top_n documents.
    best_first = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return [document for document, _ in best_first[:top_n]]
203
+
204
+
205
def reciprocal_rank_fusion_multiple_lists(ranking_lists, K=60, top_n=100):
    """
    Perform Reciprocal Rank Fusion (RRF) over multiple ranking lists per query.

    Args:
        ranking_lists (list of list of dict): Each element is a list of
            dictionaries mapping query IDs to ranked document lists.
        K (int): A constant used in the RRF formula (default is 60).
        top_n (int): Number of top results to return for each query (default is 100).

    Returns:
        dict: A dictionary with query IDs as keys and their combined rankings as values.
    """
    # Flatten all systems' outputs into {query_id: [ranked_list, ...]}.
    per_query_lists = defaultdict(list)
    for system_output in ranking_lists:
        for query_map in system_output:
            for query_id, docs in query_map.items():
                per_query_lists[query_id].append(docs)

    fused = {}
    for query_id, doc_lists in per_query_lists.items():
        scores = defaultdict(float)
        for docs in doc_lists:
            for position, doc in enumerate(docs, start=1):
                # Doc ids are stringified so int and str ids of the same
                # document collapse into one score entry.
                scores[str(doc)] += 1.0 / (position + K)

        # Highest RRF score first; keep only the top_n documents per query.
        best_first = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        fused[query_id] = [doc for doc, _ in best_first[:top_n]]

    return fused
Ranking/RRF/__pycache__/RRF_implementation.cpython-311.pyc ADDED
Binary file (12.5 kB). View file
 
Ranking/RRF/__pycache__/RRF_implementation.cpython-312.pyc ADDED
Binary file (10.5 kB). View file
 
app.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import json
3
+
4
+ # Import your modules here
5
+ from Agents.togetherAIAgent import generate_article_from_query
6
+ from Agents.wikiAgent import get_wiki_data
7
+ from Agents.rankerAgent import rankerAgent
8
+ from Query_Modification.QueryModification import query_Modifier, getKeywords
9
+ from Ranking.RRF.RRF_implementation import reciprocal_rank_fusion_three, reciprocal_rank_fusion_six
10
+ from Retrieval.tf_idf import tf_idf_pipeline
11
+ from Retrieval.bm25 import bm25_pipeline
12
+ from Retrieval.vision import vision_pipeline
13
+ from Retrieval.openSource import open_source_pipeline
14
+ from Baseline.boolean import boolean_pipeline
15
+ from AnswerGeneration.getAnswer import generate_answer_withContext, generate_answer_zeroShot
16
+
17
# Load the mini wiki collection once at startup.
# BUG FIX: the original left the file handle open (bare open() inside
# json.load); a context manager guarantees it is closed.
with open('Datasets/mini_wiki_collection.json', 'r') as _wiki_file:
    miniWikiCollection = json.load(_wiki_file)
# Index article text by wikipedia_id; each article's 'text' field is a list
# of passages that we join into one string.
miniWikiCollectionDict = {wiki['wikipedia_id']: " ".join(wiki['text']) for wiki in miniWikiCollection}
20
+
21
def process_query(query):
    """Run *query* through every retrieval/answering pipeline and return the
    (best_model, best_answer) pair selected by the ranker agent.

    The query is answered via: two generative agents (wiki context and a
    generated article), five retrieval pipelines (boolean, tf-idf, bm25,
    vision, open-source), an RRF fusion of three of them, and a zero-shot
    baseline; rankerAgent then picks the best candidate.
    """
    # Query modification: rewrite the query for better retrieval.
    modified_query = query_Modifier(query)

    # Context Generation: LLM-written article used as agent2's context.
    article = generate_article_from_query(query)

    # Keyword Extraction and getting context from Wiki.
    keywords = getKeywords(query)
    wiki_data = get_wiki_data(keywords)

    # Retrieve rankings (each pipeline returns a ranked list of doc ids).
    boolean_ranking = boolean_pipeline(query)
    tf_idf_ranking = tf_idf_pipeline(query)
    bm25_ranking = bm25_pipeline(query)
    vision_ranking = vision_pipeline(query)
    open_source_ranking = open_source_pipeline(query)

    # Modified queries: same pipelines run on the rewritten query.
    boolean_ranking_modified = boolean_pipeline(modified_query)
    tf_idf_ranking_modified = tf_idf_pipeline(modified_query)
    bm25_ranking_modified = bm25_pipeline(modified_query)
    vision_ranking_modified = vision_pipeline(modified_query)
    open_source_ranking_modified = open_source_pipeline(modified_query)

    # RRF rankings: fuse tf-idf/bm25/open-source lists.
    # NOTE(review): the _modified and _combined fusions below are computed
    # but never used for answer generation — confirm whether intentional.
    tf_idf_bm25_open_RRF_Ranking = reciprocal_rank_fusion_three(tf_idf_ranking, bm25_ranking, open_source_ranking)
    tf_idf_bm25_open_RRF_Ranking_modified = reciprocal_rank_fusion_three(tf_idf_ranking_modified, bm25_ranking_modified, open_source_ranking_modified)
    tf_idf_bm25_open_RRF_Ranking_combined = reciprocal_rank_fusion_six(
        tf_idf_ranking, bm25_ranking, open_source_ranking,
        tf_idf_ranking_modified, bm25_ranking_modified, open_source_ranking_modified
    )

    # Retrieve contexts: top-ranked document's text for each pipeline.
    boolean_context = miniWikiCollectionDict[boolean_ranking[0]]
    tf_idf_context = miniWikiCollectionDict[tf_idf_ranking[0]]
    # NOTE(review): bm25 appears to return int ids (hence the str() cast)
    # while the other pipelines return string ids — confirm against pipelines.
    bm25_context = miniWikiCollectionDict[str(bm25_ranking[0])]
    vision_context = miniWikiCollectionDict[vision_ranking[0]]
    open_source_context = miniWikiCollectionDict[open_source_ranking[0]]

    tf_idf_bm25_open_RRF_Ranking_context = miniWikiCollectionDict[tf_idf_bm25_open_RRF_Ranking[0]]

    # Generating answers: one candidate per context, plus zero-shot.
    agent1_context = wiki_data[0]
    agent2_context = article

    agent1_answer = generate_answer_withContext(query, agent1_context)
    agent2_answer = generate_answer_withContext(query, agent2_context)
    boolean_answer = generate_answer_withContext(query, boolean_context)
    tf_idf_answer = generate_answer_withContext(query, tf_idf_context)
    bm25_answer = generate_answer_withContext(query, bm25_context)
    vision_answer = generate_answer_withContext(query, vision_context)
    open_source_answer = generate_answer_withContext(query, open_source_context)

    tf_idf_bm25_open_RRF_Ranking_answer = generate_answer_withContext(query, tf_idf_bm25_open_RRF_Ranking_context)

    zeroShot = generate_answer_zeroShot(query)

    # Ranking the best answer: the ranker agent picks among all candidates.
    rankerAgentInput = {
        "query": query,
        "agent1": agent1_answer,
        "agent2": agent2_answer,
        "boolean": boolean_answer,
        "tf_idf": tf_idf_answer,
        "bm25": bm25_answer,
        "vision": vision_answer,
        "open_source": open_source_answer,
        "tf_idf_bm25_open_RRF_Ranking": tf_idf_bm25_open_RRF_Ranking_answer,
        "zeroShot": zeroShot,
    }

    best_model, best_answer = rankerAgent(rankerAgentInput)

    return best_model, best_answer
96
+
97
# Gradio interface: a single query textbox in, two textboxes out
# (the winning pipeline's name and its answer from process_query).
interface = gr.Interface(
    fn=process_query,
    inputs=gr.Textbox(label="Enter your query"),
    outputs=[
        gr.Textbox(label="Best Model"),
        gr.Textbox(label="Best Answer"),
    ],
    title="Query Answering System",
    description="Enter a query to get the best model and the best answer using multiple retrieval models and ranking techniques.",
    allow_flagging="never"
)

# Launch the interface only when run as a script (not on import).
if __name__ == "__main__":
    interface.launch()
vision/Text_to_image/main.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # main.py
2
+ from textToPdf import create_pdf
3
+ from pdfToImage import pdf_to_image
4
+
5
def main():
    """Demo driver: build a PDF from sample text, render it to page images,
    and print the resulting file paths."""
    # Long sample text so the PDF spans multiple pages.
    sample_text = "This is a sample text that will be used to generate the PDF. " * 500

    pdf_path = create_pdf(sample_text)
    image_paths = pdf_to_image(pdf_path)

    print(f"PDF generated successfully: {pdf_path}")

    # One line per rendered page image.
    for image_path in image_paths:
        print(image_path)

if __name__ == "__main__":
    main()
vision/Text_to_image/pdfToImage.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+ import os
3
+
4
def pdf_to_image(pdf_path, zoom=2.0):
    """Render every page of *pdf_path* to a PNG under Images/.

    Args:
        pdf_path (str): Path of the PDF file to render.
        zoom (float): Scale factor applied to each page; >1 improves quality.

    Returns:
        list: Paths of the generated PNG files, one per page, in page order.
    """
    # Open the PDF file
    pdf_document = fitz.open(pdf_path)

    # Create a list to store image paths
    image_paths = []

    # Create an 'Images' directory if it doesn't exist
    os.makedirs("Images", exist_ok=True)

    try:
        # Iterate over PDF pages and convert each to an image
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)  # Load the page

            # Set zoom level to improve quality
            mat = fitz.Matrix(zoom, zoom)  # Transformation matrix with the zoom level
            pix = page.get_pixmap(matrix=mat)  # Render the page at the given zoom

            image_file = f'Images/{os.path.basename(pdf_path)}_page_{page_num}.png'
            pix.save(image_file)  # Save the image as PNG
            image_paths.append(image_file)
    finally:
        # BUG FIX: the document was never closed, leaking the open file handle
        # for every converted PDF.
        pdf_document.close()

    # Return the list containing paths of all images
    return image_paths

# Example usage
# pdf_to_image('your_pdf_file.pdf', zoom=2.0)  # Increase zoom for higher quality
28
+
29
+ # Example usage
30
+ # pdf_to_image('your_pdf_file.pdf', zoom=2.0) # Increase zoom for higher quality
vision/Text_to_image/textToPdf.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fpdf import FPDF
2
+ from datetime import datetime
3
+ import os
4
+
5
def create_pdf(input_text):
    """Render *input_text* into a timestamped PDF under PDFs/ and return its path.

    Args:
        input_text (str): The text to lay out in the PDF.

    Returns:
        str: Relative path of the generated PDF file.
    """
    # Build a single-column document in Arial 10pt.
    document = FPDF()
    document.add_page()
    document.set_font("Arial", size=10)

    # multi_cell wraps long text and spills onto extra pages automatically,
    # so arbitrarily long input is handled.
    document.multi_cell(0, 5, txt=input_text)

    # Timestamped file name keeps successive runs from clobbering each other.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = f"PDFs/Aditya_{stamp}.pdf"

    # Ensure the output directory exists before writing.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    document.output(output_path)
    return output_path