raghuv-aditya committed on
Commit
9f21f05
·
verified ·
1 Parent(s): d38e818

Upload 24 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Datasets/mini_wiki_collection.json filter=lfs diff=lfs merge=lfs -text
Agents/rankerAgent.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from together import Together
4
+
5
def rerank_best_answer(json_files, config_file='config.json', model="meta-llama/Llama-3-8b-chat-hf"):
    """Ask a Together-hosted LLM to pick the best answer per query across files.

    Each input JSON file holds a list of ``{query_id, input, response}``
    records produced by one retrieval/answering pipeline. For every query_id
    the candidate answers from all files are collected (the file name doubles
    as the "model" label) and the LLM is asked to choose the most accurate one.

    Args:
        json_files (list[str]): Paths to the per-model response JSON files.
        config_file (str): Unused; kept for backward compatibility. The API
            key is read from the TOGETHER_AI environment variable instead.
        model (str): Together AI chat model used for ranking.

    Returns:
        list[dict]: One record per query with keys
            ``query_id``, ``question``, ``best_model``, ``best_answer``.

    Raises:
        ValueError: If the TOGETHER_AI environment variable is not set.
    """
    # API key comes from the environment, not from config_file.
    together_ai_key = os.getenv("TOGETHER_AI")
    if not together_ai_key:
        raise ValueError("TOGETHER_AI environment variable not found. Please set it before running the script.")

    # Initialize Together client
    client = Together(api_key=together_ai_key)

    # Combine all JSON files into a single structure keyed by query_id.
    combined_prompts = {}
    for json_file in json_files:
        with open(json_file, 'r') as file:
            data = json.load(file)

        for item in data:
            query_id = item['query_id']
            if query_id not in combined_prompts:
                combined_prompts[query_id] = {
                    "question": item['input'],
                    "answers": {}
                }
            combined_prompts[query_id]["answers"][json_file] = item['response']

    responses = []

    for query_id, prompt in combined_prompts.items():
        # Ask the model to pick the best answer, constrained to a JSON reply.
        prompt_text = f"""Input JSON:
{json.dumps(prompt, indent=4)}

For the above question, identify which model gave the best response based on accuracy. Ensure the chosen response is an answer and not a follow-up question. Provide the output in the format:
{{
    "best_model": "<model_name>",
    "best_answer": "<answer>"
}}
Just output this JSON and nothing else.
"""

        # Generate response from Together API
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt_text}],
        )
        response_content = response.choices[0].message.content

        # Two follow-up calls extract the fields from the (possibly free-form)
        # reply instead of parsing it locally — extra API calls traded for
        # leniency toward malformed JSON.
        prompt_text_extract_bestModel = f"""Input JSON:
{json.dumps(response_content, indent=4)}

Just Output the best_model from above JSON and nothing else.
"""
        prompt_text_extract_bestAnswer = f"""Input JSON:
{json.dumps(response_content, indent=4)}

Just Output the best_answer from above JSON and nothing else.
"""
        response_bestModel = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt_text_extract_bestModel}],
        )
        response_bestAnswer = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt_text_extract_bestAnswer}],
        )

        responses.append({
            "query_id": query_id,
            "question": prompt["question"],
            "best_model": response_bestModel.choices[0].message.content,
            "best_answer": response_bestAnswer.choices[0].message.content,
        })

    return responses
80
+
81
+
82
def rankerAgent(prompt, config_file='config.json', model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"):
    """Pick the best (model, answer) pair for one combined prompt.

    Unlike rerank_best_answer, this function reads the TOGETHER_AI key from
    *config_file* rather than the environment.

    Args:
        prompt (dict): ``{"question": ..., "answers": {model_name: answer}}``
            structure for a single query (same shape as built by
            rerank_best_answer).
        config_file (str): Path to a JSON file containing a TOGETHER_AI key.
        model (str): Together AI chat model used for ranking.

    Returns:
        tuple[str, str]: Raw model output for (best_model, best_answer).

    Raises:
        ValueError: If TOGETHER_AI is missing from the config file.
    """
    # Load API key from configuration file
    with open(config_file, 'r') as file:
        config = json.load(file)

    together_ai_key = config.get("TOGETHER_AI")
    if not together_ai_key:
        raise ValueError("TOGETHER_AI key not found in the config file.")

    # Initialize Together client
    client = Together(api_key=together_ai_key)

    # Ask the model to choose the best answer, constrained to a JSON reply.
    prompt_text = f"""Input JSON:
{json.dumps(prompt, indent=4)}

For the above question, identify which model gave the best response based on accuracy. Ensure the chosen response is an answer and not a follow-up question. Provide the output in the format:
{{
    "best_model": "<model_name>",
    "best_answer": "<answer>"
}}
Just output this JSON and nothing else.
"""

    # Generate response from Together API
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt_text}],
    )
    response_content = response.choices[0].message.content
    # print(response_content)

    # Follow-up calls pull each field out of the reply via the LLM itself
    # (tolerant of malformed JSON, at the cost of two more API calls).
    prompt_text_extract_bestModel = f"""Input JSON:
{json.dumps(response_content, indent=4)}

Just Output the best_model from above JSON and nothing else.
"""
    prompt_text_extract_bestAnswer = f"""Input JSON:
{json.dumps(response_content, indent=4)}

Just Output the best_answer from above JSON and nothing else.
"""
    response_bestModel = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt_text_extract_bestModel}],
    )
    response_bestAnswer = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt_text_extract_bestAnswer}],
    )

    return response_bestModel.choices[0].message.content, response_bestAnswer.choices[0].message.content
133
+
134
+
135
+ # # Usage example
136
+ # json_files = ["../QnA_Eval/Responses/BOW_1_2_top_100_response.json",
137
+ # "../QnA_Eval/Responses/BOW_1_2_top_100_modified_response.json",
138
+ # "../QnA_Eval/Responses/tf-idf_1_2_top_100_response.json",
139
+ # "../QnA_Eval/Responses/tf-idf_1_2_top_100_modified_response.json",
140
+ # "../QnA_Eval/Responses/bm25_1_2_top_100_response.json",
141
+ # "../QnA_Eval/Responses/bm25_1_2_top_100_modified_response.json",
142
+ # "../QnA_Eval/Responses/open_source_1_2_top_100_response.json",
143
+ # "../QnA_Eval/Responses/open_source_1_2_top_100_modified_response.json",
144
+ # "../QnA_Eval/Responses/vision_1_2_top_100_response.json",
145
+ # "../QnA_Eval/Responses/vision_1_2_top_100_modified_response.json",
146
+ # "../QnA_Eval/Responses/ZeroShot_response.json",
147
+ # "../QnA_Eval/Responses/WikiAgent_response.json",
148
+ # "../QnA_Eval/Responses/WikiAgent_response_modified.json",
149
+ # "../QnA_Eval/Responses/LlamaAgent_response.json",
150
+ # "../QnA_Eval/Responses/LlamaAgent_response_modified.json",
151
+ # "../QnA_Eval/Responses/tf_idf_bm25_open_1_2_top_100_combined_response.json", "../QnA_Eval/Responses/tf_idf_bm25_open_1_2_top_100_combined_modified_response.json", "../QnA_Eval/Responses/tf_idf_bm25_open_1_2_top_100_combined_both_response.json"]
152
+
153
+ # config_file = "../config.json"
154
+
155
+ # result = rerank_best_answer(json_files, config_file)
156
+
157
+ # with open("reranked_best_answers_1_2.json", 'w') as file:
158
+ # json.dump(result, file, indent=4, ensure_ascii=False)
Agents/togetherAIAgent.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json, os
2
+ from together import Together
3
+
4
def generate_article_from_query(query, config_file='config.json', model="meta-llama/Llama-3-8b-chat-hf"):
    """
    Generates an article based on the given query using the Together API.

    Parameters:
    - query (str): The input query for generating the article.
    - config_file (str): Unused; kept for backward compatibility. The API key
      is read from the TOGETHER_AI environment variable instead.
    - model (str): The Together AI model to use. Default is "meta-llama/Llama-3-8b-chat-hf".

    Returns:
    - str: The generated article content.

    Raises:
    - ValueError: If the TOGETHER_AI environment variable is not set.
    """
    together_ai_key = os.getenv("TOGETHER_AI")
    if not together_ai_key:
        raise ValueError("TOGETHER_AI environment variable not found. Please set it before running the script.")

    # Initialize Together client
    client = Together(api_key=together_ai_key)

    # Create the prompt
    prompt = f"""Using the query provided, generate a well-researched and informative short article. The article should be detailed, accurate, and structured to cover various aspects of the topic in an engaging way. Focus on presenting key facts, historical context, notable insights, and any relevant background information that adds value to the reader’s understanding. Ensure the tone is neutral and informative. Keep the article short. Here’s the query:

Query: {query}"""

    # Generate response
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content
37
+
38
+ # # Example usage
39
+ # if __name__ == "__main__":
40
+ # query = "I feel anxious about my health and stressed at work."
41
+ # article = generate_article_from_query(query)
42
+ # print(article)
Agents/wikiAgent.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import wikipediaapi
2
+ from typing import List, Dict
3
+ import logging
4
+ from dataclasses import dataclass
5
+ from datetime import datetime
6
+
7
@dataclass
class WikiSearchResult:
    """Data class to store Wikipedia article information."""
    title: str                # Article title
    summary: str              # Lead-section summary text
    full_text: str            # Full plain-text article body
    url: str                  # Canonical article URL (page.fullurl)
    last_modified: datetime   # Parsed from the page's 'touched' timestamp
    categories: List[str]     # Titles of the categories the article belongs to
16
+
17
def initialize_wikipedia_client(language: str = 'en', user_agent: str = 'WikipediaSearcher/1.0') -> wikipediaapi.Wikipedia:
    """Create a Wikipedia API client.

    Args:
        language: Language code, e.g. 'en' for English.
        user_agent: User agent string sent with API requests.

    Returns:
        A configured wikipediaapi.Wikipedia instance.
    """
    client = wikipediaapi.Wikipedia(
        language=language,
        extract_format=wikipediaapi.ExtractFormat.WIKI,
        user_agent=user_agent,
    )
    return client
33
+
34
def process_page(page: wikipediaapi.WikipediaPage) -> WikiSearchResult:
    """Process a Wikipedia page and extract relevant information.

    Args:
        page: A Wikipedia page object; callers check page.exists() first.

    Returns:
        WikiSearchResult with title, summary, full text, URL, parsed
        last-modified timestamp and category titles.
    """
    categories = [cat.title for cat in page.categories.values()]

    return WikiSearchResult(
        title=page.title,
        summary=page.summary,
        full_text=page.text,
        url=page.fullurl,
        # 'touched' is expected in the form 2020-01-01T00:00:00Z (UTC).
        last_modified=datetime.strptime(page.touched, '%Y-%m-%dT%H:%M:%SZ'),
        categories=categories
    )
46
+
47
def search_wikipedia(client: wikipediaapi.Wikipedia, query: str, results_limit: int = 3) -> List[WikiSearchResult]:
    """
    Search Wikipedia and get detailed information for matching articles.

    The query is resolved as an exact page title; up to results_limit - 1
    additional results are taken from the first links on that page.

    Args:
        client: Wikipedia API client instance
        query: Search query string
        results_limit: Maximum number of results to return

    Returns:
        List of WikiSearchResult objects containing article information;
        empty list when no exact title match exists or on any error.
    """
    try:
        page = client.page(query)

        if not page.exists():
            logging.warning(f"No exact match found for: {query}")
            return []

        results = [process_page(page)]

        # Get related pages through links (if we want more results)
        if results_limit > 1:
            for link_title in list(page.links.keys())[:results_limit - 1]:
                link_page = client.page(link_title)
                if link_page.exists():
                    results.append(process_page(link_page))

        return results

    except Exception as e:
        # Broad catch is deliberate: API/network failures degrade to
        # "no results" with an error log instead of crashing the caller.
        logging.error(f"Error searching Wikipedia: {e}")
        return []
80
+
81
def format_result(result: WikiSearchResult, include_full_text: bool = False) -> str:
    """
    Format a search result for display.

    Args:
        result: WikiSearchResult object to format
        include_full_text: Whether to include the full article text

    Returns:
        Formatted string containing article information. At most the first
        five categories are shown, with "..." marking truncation.
    """
    # The literal's layout (including the leading newline) IS the display
    # format — do not re-indent it.
    formatted = f"""
Title: {result.title}
URL: {result.url}
Last Modified: {result.last_modified}
Categories: {', '.join(result.categories[:5])}{'...' if len(result.categories) > 5 else ''}

Summary:
{result.summary}
"""
    if include_full_text:
        formatted += f"\nFull Text:\n{result.full_text}"

    return formatted
105
+
106
def get_wiki_data(query: str, results_limit: int = 3) -> List[str]:
    """
    Get Wikipedia data for a given query. If the full query returns no
    results, retry with progressively shorter leading word prefixes of the
    query (the variable names say "n-grams", but only prefixes are tried)
    until a result is found or all attempts fail.

    Args:
        query: Search query string
        results_limit: Maximum number of results to return

    Returns:
        List of summaries from Wikipedia search results, or None if no results are found.
    """
    # NOTE(review): configuring logging inside a library function affects the
    # whole process; consider moving this to application startup.
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    client = initialize_wikipedia_client()

    def get_search_result(query):
        """Helper: return the first result's summary for *query*, or None."""
        result = search_wikipedia(client, query, results_limit)
        if result:
            return result[0].summary  # Return the first result's summary if available
        return None

    # Check the search results with the full query
    summary = get_search_result(query)
    if summary:
        return [summary]

    # If no result, retry with shorter and shorter prefixes of the query.
    n = len(query.split())  # Starting with the number of words in the query
    # NOTE(review): the i == n iteration repeats the full query already tried
    # above; effective fallbacks start at i == n - 1.
    for i in range(n, 1, -1):  # Try from n words down to 2 words
        n_grams_query = ' '.join(query.split()[:i])
        logging.info(f"Trying n-gram query: {n_grams_query}")
        summary = get_search_result(n_grams_query)
        if summary:
            return [summary]

    # If no results found after all n-gram reductions, return None
    logging.info("No results found for any query variations.")
    return None
146
+
147
+ # # Example usage
148
+ # if __name__ == "__main__":
149
+ # query = "Clash of Clans"
150
+ # results = get_wiki_data(query, results_limit=3)
151
+
152
+ # if not results:
153
+ # print(f"No results found for query: {query}")
154
+ # else:
155
+ # for idx, result in enumerate(results, 1):
156
+ # print(f"\nResult {idx}:")
157
+ # print("-" * 60)
158
+ # print(format_result(result))
AnswerGeneration/__pycache__/getAnswer.cpython-312.pyc ADDED
Binary file (2.13 kB). View file
 
AnswerGeneration/getAnswer.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from together import Together
4
+
5
def generate_answer_withContext(question, context):
    """Answer *question* in 1-2 lines grounded in *context* via Together AI.

    Args:
        question (str): The user question.
        context (str): Retrieved passage(s) the answer must be based on.

    Returns:
        str: The model's answer text.

    Raises:
        ValueError: If the TOGETHER_AI environment variable is not set.
    """
    together_ai_key = os.getenv("TOGETHER_AI")
    if not together_ai_key:
        raise ValueError("TOGETHER_AI environment variable not found. Please set it before running the script.")

    client = Together(api_key=together_ai_key)

    prompt = f"""Consider the context and generate a brief 1-2 line answer to the question. Output only the answer.

Context: {context}

Question: {question}
"""
    response = client.chat.completions.create(
        # model="meta-llama/Llama-3-8b-chat-hf",
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content
26
+
27
+
28
def generate_answer_zeroShot(question):
    """Answer *question* with no supporting context (zero-shot baseline).

    Args:
        question (str): The user question.

    Returns:
        str: The model's answer text.

    Raises:
        ValueError: If the TOGETHER_AI environment variable is not set.
    """
    together_ai_key = os.getenv("TOGETHER_AI")
    if not together_ai_key:
        raise ValueError("TOGETHER_AI environment variable not found. Please set it before running the script.")

    client = Together(api_key=together_ai_key)

    prompt = f"""Answer the following question:

Question: {question}
"""
    response = client.chat.completions.create(
        model="meta-llama/Llama-3-8b-chat-hf",
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content
Baseline/__pycache__/boolean.cpython-312.pyc ADDED
Binary file (960 Bytes). View file
 
Baseline/__pycache__/boolean_retrieval.cpython-312.pyc ADDED
Binary file (5.61 kB). View file
 
Baseline/__pycache__/data_processor.cpython-312.pyc ADDED
Binary file (2.89 kB). View file
 
Baseline/boolean.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from Baseline.data_processor import process_json_data, process_queries, merge_documents
2
+ from Baseline.boolean_retrieval import main_boolean_retrieval, retrieve_single_query
3
+ import json
4
+
5
def boolean_pipeline(query, wikipedia_data_path="Datasets/mini_wiki_collection.json", top_n=100):
    """Run boolean retrieval for a single query over the wiki collection.

    Args:
        query (str): The query text.
        wikipedia_data_path (str): Path to the wiki collection JSON
            (list of documents with "wikipedia_id" and "text" fields).
        top_n (int): Number of top document IDs to return.

    Returns:
        list: Top document IDs ranked by query-term frequency.
    """
    # Load the JSON files
    with open(wikipedia_data_path, "r") as file1:
        wikipedia_data = json.load(file1)

    # Build the {wikipedia_id: sanitized_text} mapping.
    wikipedia_dict = process_json_data(wikipedia_data)

    # retrieve_single_query loads (or builds and caches) the inverted index.
    top_results = retrieve_single_query(query, wikipedia_dict, top_n)

    return top_results
17
+
18
+ # def main():
19
+ # # Load the JSON files
20
+ # # boolean_retrieval("In the United States, why are positions like Attorney General, Secretary of State, etc. appointed by the president at the federal level but elected by the people at the state level? Had it ever been proposed to do this differently?")
21
+ # # return
22
+ # with open("../Datasets/mini_wiki_collection.json", "r") as file1: # Replace with the actual path to your file
23
+ # wikipedia_data = json.load(file1)
24
+
25
+ # with open("../Datasets/mini_wiki_collection_10000_documents.json", "r") as file1: # Replace with the actual path to your file
26
+ # additional_json_file = json.load(file1)
27
+
28
+ # with open("../Datasets/FinalDataset_WithModifiedQuery.json", "r") as file2: # Replace with the actual path to your file
29
+ # queries_data = json.load(file2)
30
+
31
+ # # Process the JSON files
32
+ # wikipedia_dict = process_json_data(wikipedia_data)
33
+ # updated_main_dict = merge_documents(wikipedia_dict, additional_json_file, limit=2000)
34
+ # queries_dict = process_queries(queries_data)
35
+
36
+ # # Print the processed data
37
+ # print("Processed Wikipedia Data:")
38
+ # print(wikipedia_dict["420538"])
39
+ # print("\nProcessed Queries Data:")
40
+ # print(queries_dict["5xvggq"])
41
+
42
+ # top_results = main_boolean_retrieval(updated_main_dict, queries_dict)
43
+
44
+ # # Print the results for a specific query
45
+ # print("\nTop results for query '5xvggq':")
46
+ # print(top_results.get("5xvggq", []))
47
+
48
+ # # Optionally, save the top results to a JSON file
49
+ # with open("boolean_retrieval_1_2_query.json", "w") as output_file:
50
+ # json.dump(top_results, output_file, indent=4)
51
+
52
+
53
+ # # if __name__ == "__main__":
54
+ # # main()
Baseline/boolean_retrieval.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import defaultdict
2
+ import re
3
+ import heapq
4
+ import joblib
5
+ import os
6
+
7
def preprocess_text(text):
    """Tokenize *text*: lowercase it and return the runs of word characters.

    Args:
        text (str): Raw input text.

    Returns:
        list[str]: Lowercased tokens (alphanumerics and underscores).
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'\w+', lowered)]
13
+
14
def create_inverted_index(wikipedia_dict):
    """Build an inverted index over the document collection.

    Args:
        wikipedia_dict (dict): Maps document IDs to document text.

    Returns:
        defaultdict: Each token maps to the set of document IDs whose text
        contains it.
    """
    index = defaultdict(set)
    for document_id, document_text in wikipedia_dict.items():
        # Deduplicate tokens per document before posting.
        for term in set(preprocess_text(document_text)):
            index[term].add(document_id)
    return index
29
+
30
def save_inverted_index(inverted_index, filepath="Baseline/inverted_index.pkl"):
    """
    Save the inverted index to a file using joblib.

    Args:
        inverted_index (dict): Term -> set of document IDs.
        filepath (str): Destination pickle path.
    """
    joblib.dump(inverted_index, filepath)
35
+
36
def load_inverted_index(filepath="Baseline/inverted_index.pkl"):
    """Load the inverted index from *filepath*, or return None if absent.

    Args:
        filepath (str): Path to the joblib-pickled index.

    Returns:
        dict | None: The loaded index, or None when the file does not exist.
    """
    if not os.path.exists(filepath):
        return None
    return joblib.load(filepath)
43
+
44
def boolean_retrieval(queries_dict, inverted_index, wikipedia_dict, top_n=100):
    """
    Perform boolean retrieval for each query.

    For every query, the union of the posting sets of its tokens is taken,
    each candidate document is scored by total query-term frequency, and the
    top_n highest-scoring documents are kept.

    Args:
        queries_dict (dict): A dictionary with query IDs as keys and query text as values.
        inverted_index (dict): The inverted index created from the document collection.
        wikipedia_dict (dict): The original document dictionary (used for scoring).
        top_n (int): The number of top documents to retrieve for each query.

    Returns:
        dict: A dictionary with query IDs as keys and a list of top document IDs as values.
    """
    from collections import Counter  # local import: file header only brings in defaultdict

    query_results = {}

    for query_id, query_text in queries_dict.items():
        query_tokens = preprocess_text(query_text)

        # Collect all document IDs that contain any of the query terms.
        relevant_docs = set()
        for token in query_tokens:
            if token in inverted_index:
                relevant_docs.update(inverted_index[token])

        # Score each candidate by total term frequency. Counting every token
        # once with Counter is a single O(len(doc)) pass, whereas the previous
        # list.count() rescanned the whole document once per query token.
        # Duplicate query tokens still contribute multiple times, as before.
        doc_scores = []
        for doc_id in relevant_docs:
            term_counts = Counter(preprocess_text(wikipedia_dict[doc_id]))
            score = sum(term_counts[token] for token in query_tokens)
            doc_scores.append((score, doc_id))

        # Keep the `top_n` best; ties break on doc_id ordering, as before.
        top_docs = heapq.nlargest(top_n, doc_scores)
        query_results[query_id] = [doc_id for _, doc_id in top_docs]

    return query_results
79
+
80
+ # Main flow
81
def main_boolean_retrieval(wikipedia_dict, queries_dict):
    """Index the documents, then run boolean retrieval for every query.

    Args:
        wikipedia_dict (dict): Document ID -> document text.
        queries_dict (dict): Query ID -> query text.

    Returns:
        dict: Query ID -> list of top document IDs.
    """
    # Step 1: build the inverted index over the collection.
    index = create_inverted_index(wikipedia_dict)
    # Step 2: rank documents for every query against that index.
    return boolean_retrieval(queries_dict, index, wikipedia_dict)
89
+
90
def retrieve_single_query(query, wikipedia_dict, top_n=100, inverted_index_path="Baseline/inverted_index.pkl"):
    """
    Retrieve documents for a single query using the inverted index.
    If the inverted index is not found on disk, it is created and saved.

    Args:
        query (str): The query text.
        wikipedia_dict (dict): The original document dictionary.
        top_n (int): The number of top documents to retrieve.
        inverted_index_path (str): Path to the saved inverted index file.

    Returns:
        list: A list of top document IDs matching the query.
    """
    from collections import Counter  # local import: file header only brings in defaultdict

    # Load or create the inverted index
    inverted_index = load_inverted_index(inverted_index_path)
    if inverted_index is None:
        print("Inverted index not found. Creating one...")
        inverted_index = create_inverted_index(wikipedia_dict)
        save_inverted_index(inverted_index, inverted_index_path)

    # Preprocess the query
    query_tokens = preprocess_text(query)

    # Collect relevant documents (union of posting sets)
    relevant_docs = set()
    for token in query_tokens:
        if token in inverted_index:
            relevant_docs.update(inverted_index[token])

    # Rank documents by total query-term frequency. One Counter pass per
    # document replaces the previous per-token list.count() rescans.
    doc_scores = []
    for doc_id in relevant_docs:
        term_counts = Counter(preprocess_text(wikipedia_dict[doc_id]))
        score = sum(term_counts[token] for token in query_tokens)
        doc_scores.append((score, doc_id))

    # Get the top `top_n` documents based on the score
    top_docs = heapq.nlargest(top_n, doc_scores)
    return [doc_id for _, doc_id in top_docs]
130
+
131
+ # Example usage:
132
+ # Assuming `wikipedia_dict` and `queries_dict` are already prepared
133
+ # top_results = main_boolean_retrieval(wikipedia_dict, queries_dict)
134
+ # print(top_results)
Baseline/data_processor.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Assuming sanitize_text is a function you've defined elsewhere
2
+
3
+ import re
4
+
5
def merge_documents(main_dict, additional_json, limit=1000):
    """Merge up to *limit* new documents into *main_dict*.

    Documents whose wikipedia_id already exists in *main_dict* are skipped
    (existing entries are never overwritten) and do not count toward the
    limit.

    Args:
        main_dict (dict): Target mapping of wikipedia_id -> sanitized text.
        additional_json (list): Extra documents with "wikipedia_id" and
            "text" (list of strings) fields.
        limit (int): Maximum number of documents to add.

    Returns:
        dict: *main_dict*, updated in place.
    """
    added = 0

    for document in additional_json:
        if added >= limit:
            break

        wiki_id = document.get("wikipedia_id")
        if wiki_id in main_dict:
            continue  # never overwrite an existing entry

        # Join the text fragments and sanitize before storing.
        raw_text = " ".join(document.get("text", []))
        main_dict[wiki_id] = sanitize_text(raw_text)
        added += 1

    print(f"{added} documents added to the main dictionary.")
    return main_dict
40
+
41
def sanitize_text(text):
    """
    Cleans and standardizes text by keeping only alphanumeric characters and spaces.

    Args:
        text (str): Text to sanitize. Non-string input is passed through.

    Returns:
        str: Sanitized text with runs of whitespace collapsed to one space.
    """
    if not isinstance(text, str):
        return text
    # Drop everything that is not alphanumeric or whitespace, then collapse
    # whitespace runs and trim the ends.
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return re.sub(r'\s+', ' ', alnum_only).strip()
55
+
56
+
57
def process_json_data(json_data):
    """Map each document's wikipedia_id to its sanitized, joined text.

    Args:
        json_data (list): Documents with "wikipedia_id" and "text"
            (list of strings) fields.

    Returns:
        dict: wikipedia_id -> sanitized full text. Later duplicates of an
        ID overwrite earlier ones, as in a plain assignment loop.
    """
    return {
        document.get("wikipedia_id"): sanitize_text(" ".join(document.get("text", [])))
        for document in json_data
    }
73
+
74
def process_queries(json_data):
    """Extract a {query_id: query_text} mapping from the queries JSON.

    Args:
        json_data (dict): Maps query IDs to objects carrying an "input" field.

    Returns:
        dict: query_id -> query text ("" when "input" is missing).
    """
    return {query_id: info.get("input", "") for query_id, info in json_data.items()}
94
+
95
+ # Example usage
96
+ # Assuming `query_json_file` contains your JSON data
97
+ # processed_queries = process_queries(query_json_file)
98
+
Baseline/inverted_index.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c47f19521041e7b2a5681da4128cfae538eba1bc653528f04c7dc9df300fbc5
3
+ size 4671080
Datasets/mini_wiki_collection.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:986eedb174550564ce95cf9b08de1207cfb1e2290646b4aeb60257c9edceb27a
3
+ size 41656963
Query_Modification/QueryModification.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json, os
2
+ import pandas as pd
3
+ import google.generativeai as genai
4
+
5
+ # Function to process text input with Gemini model
6
def query_Modifier(input_text):
    """Rewrite *input_text* into a RAG-friendly query using Gemini.

    The rewrite instructions are read from Query_Modification/prompt.txt and
    appended after the input text. All Gemini safety categories are set to
    BLOCK_NONE, i.e. content filtering is disabled for this call.

    Args:
        input_text (str): The original user query.

    Returns:
        str: The modified query text produced by the model.

    Raises:
        ValueError: If the GEMINI environment variable is not set.
    """
    gemini_key = os.getenv("GEMINI")
    if not gemini_key:
        raise ValueError("GEMINI environment variable not found. Please set it before running the script.")

    # Initialize the API key
    genai.configure(api_key=gemini_key)

    # print(gemini_key)

    # Load the prompt from file
    with open("Query_Modification/prompt.txt", 'r') as file:
        PROMPT_TEMPLATE = file.read()

    # Safety settings for Gemini model: every category is disabled.
    safe = [
        {
            "category": "HARM_CATEGORY_DANGEROUS",
            "threshold": "BLOCK_NONE",
        },
        {
            "category": "HARM_CATEGORY_HARASSMENT",
            "threshold": "BLOCK_NONE",
        },
        {
            "category": "HARM_CATEGORY_HATE_SPEECH",
            "threshold": "BLOCK_NONE",
        },
        {
            "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
            "threshold": "BLOCK_NONE",
        },
        {
            "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
            "threshold": "BLOCK_NONE",
        },
    ]

    generation_config = {
        "temperature": 1,
        "top_p": 0.95,
        "top_k": 40,
        "max_output_tokens": 8192,
        "response_mime_type": "text/plain",
    }

    # Initialize the generative model
    model = genai.GenerativeModel("gemini-1.5-flash", generation_config=generation_config)

    # The template follows the query, per the prompt file's instructions.
    full_prompt = f"{input_text}\n\n{PROMPT_TEMPLATE}"

    # Call the generative model for text input
    result = model.generate_content([full_prompt], safety_settings=safe)
    return result.text
62
+
63
+
64
def getKeywords(input_text):
    """Extract keywords from *input_text* using Gemini.

    All Gemini safety categories are set to BLOCK_NONE (filtering disabled).
    The raw model reply is post-processed by stripping a literal "Keywords:"
    prefix and removing commas.

    Args:
        input_text (str): Sentence to extract keywords from.

    Returns:
        str: Whitespace-trimmed keyword string.

    Raises:
        ValueError: If the GEMINI environment variable is not set.
    """
    gemini_key = os.getenv("GEMINI")
    if not gemini_key:
        raise ValueError("GEMINI environment variable not found. Please set it before running the script.")

    # Initialize the API key
    genai.configure(api_key=gemini_key)

    # Safety settings for Gemini model: every category is disabled.
    safe = [
        {
            "category": "HARM_CATEGORY_DANGEROUS",
            "threshold": "BLOCK_NONE",
        },
        {
            "category": "HARM_CATEGORY_HARASSMENT",
            "threshold": "BLOCK_NONE",
        },
        {
            "category": "HARM_CATEGORY_HATE_SPEECH",
            "threshold": "BLOCK_NONE",
        },
        {
            "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
            "threshold": "BLOCK_NONE",
        },
        {
            "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
            "threshold": "BLOCK_NONE",
        },
    ]

    generation_config = {
        "temperature": 1,
        "top_p": 0.95,
        "top_k": 40,
        "max_output_tokens": 8192,
        "response_mime_type": "text/plain",
    }

    # Initialize the generative model
    model = genai.GenerativeModel("gemini-1.5-flash", generation_config=generation_config)

    full_prompt = f"{input_text} \n\n Give the Keywords for the above sentence and output nothing else."

    # Call the generative model for text input
    result = model.generate_content([full_prompt], safety_settings=safe)

    response = result.text
    response = response.replace("Keywords:", "")
    # NOTE(review): this removes EVERY comma, including any inside multi-word
    # keywords — not just the separators.
    response = response.replace(",", "")

    return response.strip()
Query_Modification/__pycache__/QueryModification.cpython-311.pyc ADDED
Binary file (3.42 kB). View file
 
Query_Modification/__pycache__/QueryModification.cpython-312.pyc ADDED
Binary file (2.98 kB). View file
 
Query_Modification/prompt.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Modify the following query to improve its suitability for a Retrieval Augmented Generation (RAG) system that uses a cosine-similarity-based semantic search engine:
2
+
3
+ Original Query: [Original query here]
4
+
5
+ Guidelines:
6
+
7
+ Clarity and Specificity: Make the query more specific and focused.
8
+ Keyword Optimization: Identify and include relevant keywords that align with the dataset.
9
+ Semantic Relevance: Consider the underlying meaning and context of the query.
10
+ Question Formulation: Frame the query as a question to facilitate direct answer extraction.
11
+ Contextual Clues: If applicable, provide additional context or background information.
12
+
13
+ Example:
14
+
15
+ Original Query: "Tell me about the French Revolution"
16
+
17
+ Modified Query: "What were the main causes and effects of the French Revolution, and who were its key figures?"
18
+
19
+ Guardrail: Output only the Modified Query.
Ranking/RRF/RRF_implementation.py ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from collections import defaultdict
4
+
5
def load_and_merge_json_files(directory_path):
    """
    Load and merge JSON files from a directory into a single structure, keeping
    each list from different files separate for each query.

    Args:
        directory_path (str): Path to the directory containing the JSON files.

    Returns:
        list: Merged list of dictionaries, keeping separate lists for each query.
    """
    merged_queries = defaultdict(list)

    # sorted() makes the merge order deterministic; os.listdir order is
    # platform-dependent, which previously made the fused output unstable.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(directory_path, filename)
        try:
            with open(file_path, 'r') as f:
                json_data = json.load(f)
        except Exception as e:
            # BUG FIX: the original message printed a literal "(unknown)"
            # instead of the offending file, making failures undiagnosable.
            print(f"Error reading {file_path}: {e}")
            continue

        # For each file, add the lists to the corresponding query.
        for query_data in json_data:
            for query, rank_list in query_data.items():
                if isinstance(rank_list, list):  # Ensure rank_list is a list
                    merged_queries[query].append(rank_list)
                else:
                    print(f"Warning: Expected a list for query '{query}' but got {type(rank_list)}")

    # Convert defaultdict to a list of one-key dictionaries.
    return [{query: lists} for query, lists in merged_queries.items()]
37
+
38
def reciprocal_rank_fusion(json_input, K=60, top_n=100):
    """
    Fuse rankings from multiple IR systems for multiple queries using
    Reciprocal Rank Fusion.

    Args:
        json_input (list): A list of dictionaries where keys are queries, and
            values are ranked document lists from different systems.
        K (int): A constant used in the RRF formula (default is 60).
        top_n (int): Number of top results to return for each query.

    Returns:
        list: A list of dictionaries with each query and its fused document rankings.
    """
    fused_per_query = []

    for entry in json_input:
        for question, system_rankings in entry.items():
            # Accumulate 1/(rank + K) per document across all systems.
            scores = defaultdict(float)
            for ranking in system_rankings:
                for position, document in enumerate(ranking, start=1):
                    scores[document] += 1.0 / (position + K)

            # Highest RRF score first; keep only the top_n documents.
            best_first = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
            fused_per_query.append({question: [doc for doc, _ in best_first[:top_n]]})

    return fused_per_query
70
+
71
def save_to_json(output_data, output_file_path):
    """
    Write the RRF results to *output_file_path* as pretty-printed JSON,
    preserving the same structure as the input files.

    Args:
        output_data (list): The processed data to save.
        output_file_path (str): Path to the output JSON file.
    """
    with open(output_file_path, 'w') as handle:
        json.dump(output_data, handle, indent=2)
81
+
82
+ # # Example usage
83
+ # directory_path = "Modified_1_2" # Replace with your directory path
84
+ # output_file_path = "Modified_1_2/rrf_1_2_modified.json" # Replace with your desired output file path
85
+
86
+ # # Load and merge JSON files
87
+ # merged_input = load_and_merge_json_files(directory_path)
88
+
89
+ # print(merged_input[0]["5xvggq"])
90
+
91
+ # # Perform RRF on the merged input, keeping only the top 100 results
92
+ # combined_results = reciprocal_rank_fusion(merged_input, top_n=100)
93
+
94
+ # # Save the combined results to a JSON file
95
+ # save_to_json(combined_results, output_file_path)
96
+
97
+ # print(f"Combined results saved to {output_file_path}")
98
+
99
+
100
def reciprocal_rank_fusion_two(rank_list1, rank_list2, K=60, top_n=100):
    """
    Perform Reciprocal Rank Fusion (RRF) over two ranking lists.

    Args:
        rank_list1 (list): First list of ranked documents.
        rank_list2 (list): Second list of ranked documents.
        K (int): A constant used in the RRF formula (default is 60).
        top_n (int): Number of top results to return (default is 100).

    Returns:
        list: Combined list of rankings after applying RRF.
    """
    scores = defaultdict(float)

    # Each list contributes 1/(rank + K) per document, ranks starting at 1.
    for ranking in (rank_list1, rank_list2):
        for position, document in enumerate(ranking, start=1):
            scores[document] += 1.0 / (position + K)

    # Highest combined score first; truncate to the top_n documents.
    best_first = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return [document for document, _ in best_first[:top_n]]
128
+
129
+
130
def reciprocal_rank_fusion_three(rank_list1, rank_list2, rank_list3, K=60, top_n=100):
    """
    Perform Reciprocal Rank Fusion (RRF) over three ranking lists.

    Args:
        rank_list1 (list): First list of ranked documents.
        rank_list2 (list): Second list of ranked documents.
        rank_list3 (list): Third list of ranked documents.
        K (int): A constant used in the RRF formula (default is 60).
        top_n (int): Number of top results to return (default is 100).

    Returns:
        list: Combined list of rankings after applying RRF.
    """
    scores = defaultdict(float)

    # Each list contributes 1/(rank + K) per document, ranks starting at 1.
    for ranking in (rank_list1, rank_list2, rank_list3):
        for position, document in enumerate(ranking, start=1):
            scores[document] += 1.0 / (position + K)

    # Highest combined score first; truncate to the top_n documents.
    best_first = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return [document for document, _ in best_first[:top_n]]
163
+
164
+
165
def reciprocal_rank_fusion_six(rank_list1, rank_list2, rank_list3, rank_list4, rank_list5, rank_list6, K=60, top_n=100):
    """
    Perform Reciprocal Rank Fusion (RRF) over six ranking lists.

    Args:
        rank_list1 (list): First list of ranked documents.
        rank_list2 (list): Second list of ranked documents.
        rank_list3 (list): Third list of ranked documents.
        rank_list4 (list): Fourth list of ranked documents.
        rank_list5 (list): Fifth list of ranked documents.
        rank_list6 (list): Sixth list of ranked documents.
        K (int): A constant used in the RRF formula (default is 60).
        top_n (int): Number of top results to return (default is 100).

    Returns:
        list: Combined list of rankings after applying RRF.
    """
    scores = defaultdict(float)

    # Each list contributes 1/(rank + K) per document, ranks starting at 1.
    all_rankings = (rank_list1, rank_list2, rank_list3, rank_list4, rank_list5, rank_list6)
    for ranking in all_rankings:
        for position, document in enumerate(ranking, start=1):
            scores[document] += 1.0 / (position + K)

    # Highest combined score first; truncate to the top_n documents.
    best_first = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return [document for document, _ in best_first[:top_n]]
203
+
204
+
205
def reciprocal_rank_fusion_multiple_lists(ranking_lists, K=60, top_n=100):
    """
    Perform Reciprocal Rank Fusion (RRF) over multiple ranking lists per query.

    Args:
        ranking_lists (list of list of dict): Each element is a list of
            dictionaries mapping query IDs to ranked document lists.
        K (int): A constant used in the RRF formula (default is 60).
        top_n (int): Number of top results to return for each query (default is 100).

    Returns:
        dict: A dictionary with query IDs as keys and their combined rankings as values.
    """
    # Flatten all systems' outputs into {query_id: [ranked_list, ...]}.
    per_query_lists = defaultdict(list)
    for system_output in ranking_lists:
        for query_map in system_output:
            for query_id, docs in query_map.items():
                per_query_lists[query_id].append(docs)

    fused = {}
    for query_id, doc_lists in per_query_lists.items():
        scores = defaultdict(float)
        for docs in doc_lists:
            for position, doc in enumerate(docs, start=1):
                # Doc ids are stringified so int and str ids of the same
                # document collapse into one score entry.
                scores[str(doc)] += 1.0 / (position + K)

        # Highest RRF score first; keep only the top_n documents per query.
        best_first = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        fused[query_id] = [doc for doc, _ in best_first[:top_n]]

    return fused
Ranking/RRF/__pycache__/RRF_implementation.cpython-311.pyc ADDED
Binary file (12.5 kB). View file
 
Ranking/RRF/__pycache__/RRF_implementation.cpython-312.pyc ADDED
Binary file (10.5 kB). View file
 
app.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import json
3
+
4
+ # Import your modules here
5
+ from Agents.togetherAIAgent import generate_article_from_query
6
+ from Agents.wikiAgent import get_wiki_data
7
+ from Agents.rankerAgent import rankerAgent
8
+ from Query_Modification.QueryModification import query_Modifier, getKeywords
9
+ from Ranking.RRF.RRF_implementation import reciprocal_rank_fusion_three, reciprocal_rank_fusion_six
10
+ from Retrieval.tf_idf import tf_idf_pipeline
11
+ from Retrieval.bm25 import bm25_pipeline
12
+ from Retrieval.vision import vision_pipeline
13
+ from Retrieval.openSource import open_source_pipeline
14
+ from Baseline.boolean import boolean_pipeline
15
+ from AnswerGeneration.getAnswer import generate_answer_withContext, generate_answer_zeroShot
16
+
17
# Load the mini wiki collection once at startup.
# BUG FIX: the original left the file handle open (bare open() inside
# json.load); a context manager guarantees it is closed.
with open('Datasets/mini_wiki_collection.json', 'r') as _wiki_file:
    miniWikiCollection = json.load(_wiki_file)
# Index article text by wikipedia_id; each article's 'text' field is a list
# of passages that we join into one string.
miniWikiCollectionDict = {wiki['wikipedia_id']: " ".join(wiki['text']) for wiki in miniWikiCollection}
20
+
21
def process_query(query):
    """Run *query* through every retrieval/answering pipeline and return the
    (best_model, best_answer) pair selected by the ranker agent.

    The query is answered via: two generative agents (wiki context and a
    generated article), five retrieval pipelines (boolean, tf-idf, bm25,
    vision, open-source), an RRF fusion of three of them, and a zero-shot
    baseline; rankerAgent then picks the best candidate.
    """
    # Query modification: rewrite the query for better retrieval.
    modified_query = query_Modifier(query)

    # Context Generation: LLM-written article used as agent2's context.
    article = generate_article_from_query(query)

    # Keyword Extraction and getting context from Wiki.
    keywords = getKeywords(query)
    wiki_data = get_wiki_data(keywords)

    # Retrieve rankings (each pipeline returns a ranked list of doc ids).
    boolean_ranking = boolean_pipeline(query)
    tf_idf_ranking = tf_idf_pipeline(query)
    bm25_ranking = bm25_pipeline(query)
    vision_ranking = vision_pipeline(query)
    open_source_ranking = open_source_pipeline(query)

    # Modified queries: same pipelines run on the rewritten query.
    boolean_ranking_modified = boolean_pipeline(modified_query)
    tf_idf_ranking_modified = tf_idf_pipeline(modified_query)
    bm25_ranking_modified = bm25_pipeline(modified_query)
    vision_ranking_modified = vision_pipeline(modified_query)
    open_source_ranking_modified = open_source_pipeline(modified_query)

    # RRF rankings: fuse tf-idf/bm25/open-source lists.
    # NOTE(review): the _modified and _combined fusions below are computed
    # but never used for answer generation — confirm whether intentional.
    tf_idf_bm25_open_RRF_Ranking = reciprocal_rank_fusion_three(tf_idf_ranking, bm25_ranking, open_source_ranking)
    tf_idf_bm25_open_RRF_Ranking_modified = reciprocal_rank_fusion_three(tf_idf_ranking_modified, bm25_ranking_modified, open_source_ranking_modified)
    tf_idf_bm25_open_RRF_Ranking_combined = reciprocal_rank_fusion_six(
        tf_idf_ranking, bm25_ranking, open_source_ranking,
        tf_idf_ranking_modified, bm25_ranking_modified, open_source_ranking_modified
    )

    # Retrieve contexts: top-ranked document's text for each pipeline.
    boolean_context = miniWikiCollectionDict[boolean_ranking[0]]
    tf_idf_context = miniWikiCollectionDict[tf_idf_ranking[0]]
    # NOTE(review): bm25 appears to return int ids (hence the str() cast)
    # while the other pipelines return string ids — confirm against pipelines.
    bm25_context = miniWikiCollectionDict[str(bm25_ranking[0])]
    vision_context = miniWikiCollectionDict[vision_ranking[0]]
    open_source_context = miniWikiCollectionDict[open_source_ranking[0]]

    tf_idf_bm25_open_RRF_Ranking_context = miniWikiCollectionDict[tf_idf_bm25_open_RRF_Ranking[0]]

    # Generating answers: one candidate per context, plus zero-shot.
    agent1_context = wiki_data[0]
    agent2_context = article

    agent1_answer = generate_answer_withContext(query, agent1_context)
    agent2_answer = generate_answer_withContext(query, agent2_context)
    boolean_answer = generate_answer_withContext(query, boolean_context)
    tf_idf_answer = generate_answer_withContext(query, tf_idf_context)
    bm25_answer = generate_answer_withContext(query, bm25_context)
    vision_answer = generate_answer_withContext(query, vision_context)
    open_source_answer = generate_answer_withContext(query, open_source_context)

    tf_idf_bm25_open_RRF_Ranking_answer = generate_answer_withContext(query, tf_idf_bm25_open_RRF_Ranking_context)

    zeroShot = generate_answer_zeroShot(query)

    # Ranking the best answer: the ranker agent picks among all candidates.
    rankerAgentInput = {
        "query": query,
        "agent1": agent1_answer,
        "agent2": agent2_answer,
        "boolean": boolean_answer,
        "tf_idf": tf_idf_answer,
        "bm25": bm25_answer,
        "vision": vision_answer,
        "open_source": open_source_answer,
        "tf_idf_bm25_open_RRF_Ranking": tf_idf_bm25_open_RRF_Ranking_answer,
        "zeroShot": zeroShot,
    }

    best_model, best_answer = rankerAgent(rankerAgentInput)

    return best_model, best_answer
96
+
97
# Gradio interface: a single query textbox in, two textboxes out
# (the winning pipeline's name and its answer from process_query).
interface = gr.Interface(
    fn=process_query,
    inputs=gr.Textbox(label="Enter your query"),
    outputs=[
        gr.Textbox(label="Best Model"),
        gr.Textbox(label="Best Answer"),
    ],
    title="Query Answering System",
    description="Enter a query to get the best model and the best answer using multiple retrieval models and ranking techniques.",
    allow_flagging="never"
)

# Launch the interface only when run as a script (not on import).
if __name__ == "__main__":
    interface.launch()
vision/Text_to_image/main.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # main.py
2
+ from textToPdf import create_pdf
3
+ from pdfToImage import pdf_to_image
4
+
5
def main():
    """Demo driver: build a PDF from sample text, render it to page images,
    and print the resulting file paths."""
    # Long sample text so the PDF spans multiple pages.
    sample_text = "This is a sample text that will be used to generate the PDF. " * 500

    pdf_path = create_pdf(sample_text)
    image_paths = pdf_to_image(pdf_path)

    print(f"PDF generated successfully: {pdf_path}")

    # One line per rendered page image.
    for image_path in image_paths:
        print(image_path)

if __name__ == "__main__":
    main()
vision/Text_to_image/pdfToImage.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+ import os
3
+
4
def pdf_to_image(pdf_path, zoom=2.0):
    """Render every page of *pdf_path* to a PNG under Images/.

    Args:
        pdf_path (str): Path of the PDF file to render.
        zoom (float): Scale factor applied to each page; >1 improves quality.

    Returns:
        list: Paths of the generated PNG files, one per page, in page order.
    """
    # Open the PDF file
    pdf_document = fitz.open(pdf_path)

    # Create a list to store image paths
    image_paths = []

    # Create an 'Images' directory if it doesn't exist
    os.makedirs("Images", exist_ok=True)

    try:
        # Iterate over PDF pages and convert each to an image
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)  # Load the page

            # Set zoom level to improve quality
            mat = fitz.Matrix(zoom, zoom)  # Transformation matrix with the zoom level
            pix = page.get_pixmap(matrix=mat)  # Render the page at the given zoom

            image_file = f'Images/{os.path.basename(pdf_path)}_page_{page_num}.png'
            pix.save(image_file)  # Save the image as PNG
            image_paths.append(image_file)
    finally:
        # BUG FIX: the document was never closed, leaking the open file handle
        # for every converted PDF.
        pdf_document.close()

    # Return the list containing paths of all images
    return image_paths

# Example usage
# pdf_to_image('your_pdf_file.pdf', zoom=2.0)  # Increase zoom for higher quality
28
+
29
+ # Example usage
30
+ # pdf_to_image('your_pdf_file.pdf', zoom=2.0) # Increase zoom for higher quality
vision/Text_to_image/textToPdf.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fpdf import FPDF
2
+ from datetime import datetime
3
+ import os
4
+
5
def create_pdf(input_text):
    """Render *input_text* into a timestamped PDF under PDFs/ and return its path.

    Args:
        input_text (str): The text to lay out in the PDF.

    Returns:
        str: Relative path of the generated PDF file.
    """
    # Build a single-column document in Arial 10pt.
    document = FPDF()
    document.add_page()
    document.set_font("Arial", size=10)

    # multi_cell wraps long text and spills onto extra pages automatically,
    # so arbitrarily long input is handled.
    document.multi_cell(0, 5, txt=input_text)

    # Timestamped file name keeps successive runs from clobbering each other.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = f"PDFs/Aditya_{stamp}.pdf"

    # Ensure the output directory exists before writing.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    document.output(output_path)
    return output_path