Spaces:

MediaNetAdsRag
/

Ads_Rag

Running

App Files Files Community

Rajat.bans committed 6 days ago

Commit

0b952f2

•

1 Parent(s): 3da4cba

Updated the code with comments and type definitions

Browse files

Files changed (2) hide show

rag.ipynb +0 -0
rag.py +290 -206

rag.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

rag.py CHANGED Viewed

@@ -11,14 +11,18 @@ import time
 import os
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from dotenv import load_dotenv
-import pandas as pd
 from typing import List, Tuple, Dict, Any
 class CLUSTERING:
     def cluster_embeddings(
         self,
-        embeddings: List[List[float]],
         clustering_algo: str,
         no_of_clusters: int,
         no_of_points: int,
@@ -27,7 +31,7 @@ class CLUSTERING:
         Clusters embeddings using the specified clustering algorithm and returns the indices of points in each cluster.
         Parameters:
-            embeddings (List[List[float]]): The input embeddings to cluster.
             clustering_algo (str): The clustering algorithm to use ("kmeans-cc", "kmeans-sp", or "spectral").
             no_of_clusters (int): The number of clusters to form.
             no_of_points (int): The maximum number of points to include in each cluster.
@@ -123,7 +127,7 @@ class VECTOR_DB:
     def queryVectorDB(
         self, page_information: str, threshold: float = None
-    ) -> Tuple[List[List[Tuple]], float]:
         """
         Query the vector database and cluster the retrieved documents.
@@ -196,7 +200,7 @@ class FAISS_DB:
         metadata: List[Dict[str, Any]],
         CHUNK_SIZE: int = 2048,
         CHUNK_OVERLAP: int = 512,
-    ) -> List[Dict[str, Any]]:
         """
         Split the provided content into chunks with metadata.
@@ -207,7 +211,7 @@ class FAISS_DB:
             CHUNK_OVERLAP (int): The overlap between chunks. Default is 512.
         Returns:
-            List[Dict[str, Any]]: The split documents with metadata.
         """
         text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=CHUNK_SIZE,
@@ -218,13 +222,13 @@ class FAISS_DB:
         return split_docs
     def createDBFromDocs(
-        self, split_docs: List[Dict[str, Any]], embeddings_model: HuggingFaceEmbeddings
     ) -> FAISS:
         """
         Create a FAISS database from the provided documents and embeddings model.
         Parameters:
-            split_docs (List[Dict[str, Any]]): The split documents.
             embeddings_model (HuggingFaceEmbeddings): The embeddings model to use.
         Returns:
@@ -235,7 +239,7 @@ class FAISS_DB:
     def createAndSaveDBInChunks(
         self,
-        split_docs: List[Dict[str, Any]],
         embeddings_model: HuggingFaceEmbeddings,
         DB_FAISS_PATH: str,
         chunk_size: int = 1000,
@@ -244,7 +248,7 @@ class FAISS_DB:
         Create and save the FAISS database in chunks.
         Parameters:
-            split_docs (List[Dict[str, Any]]): The split documents.
             embeddings_model (HuggingFaceEmbeddings): The embeddings model to use.
             DB_FAISS_PATH (str): The path to save the FAISS database.
             chunk_size (int): The size of each chunk. Default is 1000.
@@ -347,7 +351,7 @@ class FAISS_DB:
                 cv = fl[6:-6]
                 ind = max(ind, int(cv))
-        all_dbs = []
         for i in range(0, ind + 1, 2):
             print(i)
             db1 = FAISS.load_local(
@@ -389,12 +393,13 @@ class FAISS_DB:
 class ADS_RAG:
     def __init__(
         self,
-        db,
-        qa_model_name,
-        relation_check_best_value_thresh,
-        bestRelationSystemPrompt,
-        bestQuestionSystemPrompt,
-    ):
         self.client = OpenAI()
         self.db = db
         self.qa_model_name = qa_model_name
@@ -402,7 +407,18 @@ class ADS_RAG:
         self.bestRelationSystemPrompt = bestRelationSystemPrompt
         self.bestQuestionSystemPrompt = bestQuestionSystemPrompt
-    def callOpenAiApi(self, messages):
         while True:
             try:
                 response = self.client.chat.completions.create(
@@ -423,12 +439,25 @@ class ADS_RAG:
     def getBestQuestionOnTheBasisOfPageInformationAndAdsData(
         self,
-        page_information,
-        adsData,
-        relationSystemPrompt,
-        questionSystemPrompt,
-        bestRetreivedAdValue,
-    ):
         if adsData == "":
             return ({"reasoning": "No ads data present", "classification": 0}, 0), (
                 {"reasoning": "", "question": "", "options": []},
@@ -454,11 +483,9 @@ class ADS_RAG:
                     }
                 ]
             )
-            tokens_used_question = 0
         else:
             relation_answer["reasoning"] = (
-                "First retreived document value less than threshold so no need to check relation"
             )
         if relation_answer["classification"] != 0:
@@ -483,7 +510,18 @@ class ADS_RAG:
             "tokens_used_question": tokens_used_question,
         }
-    def convertDocumentsClustersToStringForApiCall(self, documents_clusters):
         key_counter = count(1)
         res = json.dumps(
             {
@@ -497,14 +535,30 @@ class ADS_RAG:
         return res
     def getRagResponse(
-        self, page_information, threshold=None, RelationPrompt=None, QuestionPrompt=None
-    ):
         curr_relation_prompt = self.bestRelationSystemPrompt
-        if RelationPrompt != None and len(RelationPrompt):
             curr_relation_prompt = RelationPrompt
         curr_question_prompt = self.bestQuestionSystemPrompt
-        if QuestionPrompt != None and len(QuestionPrompt):
             curr_question_prompt = QuestionPrompt
         documents_clusters, best_value = self.db.queryVectorDB(
@@ -520,7 +574,18 @@ class ADS_RAG:
         return answer, documents_clusters
-    def changeDocumentsToPrintableString(self, documents_clusters):
         res = ""
         i = 0
         for ind, documents_cluster in enumerate(documents_clusters):
@@ -531,7 +596,19 @@ class ADS_RAG:
             res += "\n"
         return res
-    def changeResponseToPrintableString(self, response, task):
         if task == "relation":
             return f"Reasoning: {response['reasoning']}\n\nClassification: {response['classification']}\n"
         res = f"Reasoning: {response['reasoning']}\n\nQuestion: {response['question']}\n\nOptions: \n"
@@ -543,8 +620,21 @@ class ADS_RAG:
         return res
     def logResult(
-        self, curr_relation_prompt, curr_question_prompt, page_information, answer
-    ):
         print(
             "**************************************************************************************************\n",
             # curr_relation_prompt,
@@ -555,13 +645,32 @@ class ADS_RAG:
         )
     def getRagGradioResponse(
-        self, page_information, RelationPrompt, QuestionPrompt, threshold
-    ):
         answer, documents_clusters = self.getRagResponse(
             page_information, threshold, RelationPrompt, QuestionPrompt
         )
         self.logResult(RelationPrompt, QuestionPrompt, page_information, answer)
         docs_info = self.changeDocumentsToPrintableString(documents_clusters)
         relation_answer_string = self.changeResponseToPrintableString(
             answer["relation_answer"], "relation"
@@ -569,79 +678,70 @@ class ADS_RAG:
         question_answer_string = self.changeResponseToPrintableString(
             answer["question_answer"], "question"
         )
         question_tokens = answer["tokens_used_question"]
         relation_tokens = answer["tokens_used_relation"]
-        full_response = f"**ANSWER**: \n Relation answer:\n {relation_answer_string}\n Question answer:\n {question_answer_string}\n\n**RETREIVED DOCUMENTS CLUSTERS**:\n{docs_info}\n\n**TOKENS USED**:\nQuestion api call: {question_tokens}\nRelation api call: {relation_tokens}"
         return full_response
-class VARIABLE_MANAGER:
-    def __init__(self):
         load_dotenv(override=True)
         os.environ["TOKENIZERS_PARALLELISM"] = "false"
-        self.embedding_model_hf = "BAAI/bge-m3"
-        # embedding_model_hf = "sentence-transformers/all-mpnet-base-v2"
-        self.DB_FAISS_PATH = (
-            "./vectorstore/db_faiss_ads_Jun_facty_activebeat_Health_dupRemoved0.85"
-        )
-    def getRag(self):
-        # embeddings_oa = OpenAIEmbeddings(model=embedding_model_oa)
-        # embeddings_hf = HuggingFaceEmbeddings(model_name = embedding_model_hf, show_progress = True)
-        embeddings_hf = HuggingFaceEmbeddings(model_name=self.embedding_model_hf)
         vector_db = VECTOR_DB(
-            0.75, 50, "kmeans-cc", 3, 6, self.DB_FAISS_PATH, embeddings_hf
         )
         rag = ADS_RAG(
-            vector_db,
-            "gpt-3.5-turbo",
-            0.6,
-            self.getRelationSystemPrompt(),
-            self.getQuestionSystemPrompt(),
         )
         return rag
-    def QnAAdsSampleGenerationPreProcessing(self):
-        data_file_path = (
-            "./data/148_facty_activebeat_24Jun-30Jun_top1000each_urlsContent.tsv"
-        )
-        data = pd.read_csv(data_file_path, sep="\t")
-        data.dropna(axis=0, how="any", inplace=True)
-        # data.drop_duplicates(subset = ['ad_title', 'ad_desc'], inplace=True)
-        # ad_title_content = list(data["ad_title"].values)
-        def get_core_content(row):
-            url_content = row["url_content"]
-            url_title = row["url_title"]
-            return (
-                "Page Title -: "
-                + url_title
-                + "\nPage Content -: "
-                + ". ".join(url_content.split(". ")[:7])
-            )
-        data["core_content"] = data.apply(get_core_content, axis=1)
-        # for i in range(len(data)):
-        #     print(data.loc[i, 'url'])
-        #     print(data.loc[i, 'url_content'])
-        #     print(data.loc[i, 'core_content'])
-        #     print()
-        #     if(i > 10):
-        #         break
-        return data
-    def GradioRagPreProcessing(self):
-        data_file_path = (
-            "./data/149_adclick_Jun_facty_activeBeat_Health_dupRemoved0.85_campaign.tsv"
-        )
-        data = pd.read_csv(data_file_path, sep="\t")
-        # data.dropna(axis=0, how="any", inplace=True)
-        data.drop_duplicates(subset=["ad_title", "ad_desc"], inplace=True)
-        ad_title_content = list(data["ad_title"].values)
-        return ad_title_content
-    def getQuestionSystemPrompt(self):
         bestQuestionSystemPrompt = """1. You are an advertising concierge for text ads on websites. Given an INPUT and the available ad inventory (ADS_DATA), your task is to form a relevant QUESTION to ask the user visiting the webpage. This question should help identify the user's intent behind visiting the webpage and should be highly attractive.
 2. Now form a highly attractive/lucrative and diverse/mutually exclusive OPTION which should be both the answer for the QUESTION and related to ads in this cluster.
 3. Try to generate intelligent creatives for advertising and keep QUESTION within 70 characters and either 2, 3 or 4 options with each OPTION within 4 to 6 words.
@@ -699,7 +799,13 @@ The ADS_DATA provided to you is as follows:
         # """
         return bestQuestionSystemPrompt
-    def getRelationSystemPrompt(self):
         bestRelationSystemPrompt = """You are an advertising concierge for text ads on websites. Given an INPUT and the available ad inventory (ADS_DATA), your task is to determine whether there are some relevant ADS to INPUT are present in ADS_DATA. ADS WHICH DON'T MATCH USER'S INTENT SHOULD BE CONSIDERED IRRELEVANT
 ---------------------------------------
@@ -727,129 +833,82 @@ The ADS_DATA provided to you is as follows:
         return bestRelationSystemPrompt
-# *********************** DB GENERATION ******************************
-# df = pd.read_csv(data_file_path, sep="\t")
-# --------------------------------
-# WEB DATA PROCESSING
-# from urllib.parse import urlparse
-# import re
-# def get_cleaned_url(url):
-#     path = urlparse(url).path.strip()
-#     cleaned_path = re.sub(r'[^a-zA-Z0-9\s-]', ' ', path).replace('/', '')
-#     cleaned_path = re.sub(r'[^a-zA-Z0-9\s]', ' ', path).replace('-', '')
-#     return cleaned_path.strip()
-# df['cleaned_url'] = df['url'].map(get_cleaned_url)
-# df.dropna(subset=['cleaned_url', 'url_content', 'url_title'], inplace=True)
-# df['combined'] = df['cleaned_url'] + ". " + df['url_title'] + ". " + df['url_content']
-# content = df["combined"].tolist()
-# metadata = [
-#         {"title": row["url_title"], "url": row["url"]}
-#         for _, row in df.iterrows()
-#     ]
-# ------------------------------
-# ADS DATA PROCESSING
-# # df.dropna(axis=0, how='any', inplace=True)
-# df.drop_duplicates(subset = ['ad_title', 'ad_desc'], inplace=True)
-# dfRPC = df[df['RPC'] > 0]
-# dfRPC.dropna(how = 'any', inplace=True)
-# dfCampaign = df[df['type'] == 'campaign']
-# dfCampaign.fillna('', inplace=True)
-# df = pd.concat([dfRPC, dfCampaign])
-# df
-# content = (df["ad_title"] + ". " + df["ad_desc"]).tolist()
-# metadata = [
-#         {"publisher_url": row["publisher_url"], "keyword_term": row["keyword_term"], "ad_display_url": row["ad_display_url"], "revenue": row["revenue"], "ad_click_count": row["ad_click_count"], "RPC": row["RPC"], "Type": row["type"]}
-#         # {"revenue": row["revenue"], "ad_click_count": row["ad_click_count"]}
-#         for _, row in df.iterrows()
-#     ]
-# --------------------------------
-# faiss_db = FAISS_DB()
-# db = faiss_db.createDBFromDocs(content, metadata)
-# faiss_db.saveDB(db, '.')
-# ************************************************************************
-# PARALLELY CREATING DB - BACKUP FOR FUTURE USE
-# import time
-# import threading
-# import os
-# one_db_docs_size = 1000
-# starting_i = 0
-# parallel_processes = 3
-# def split_list(lst, n):
-#     k, m = divmod(len(lst), n)
-#     return (lst[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))
-# def createDBForIndexes(inds):
-#     for i in inds:
-#         ctime = time.time()
-#         print(f"Processing {i}")
-#         if not os.path.exists(DB_FAISS_PATH + "/index_{int(i/one_db_docs_size)}.faiss"):
-#             db = FAISS.from_documents(split_docs[i:i+one_db_docs_size], embeddings_hf)
-#             db.save_local(DB_FAISS_PATH, index_name = f"index_{int(i/one_db_docs_size)}")
-#         ctime = time.time() - ctime
-#         print(f"{i})Time taken", ctime)
-# indexes = split_list(range(starting_i, len(split_docs), one_db_docs_size), parallel_processes)
-# threads = []
-# for i, one_process_indexes in enumerate(indexes):
-#     thread = threading.Thread(target=createDBForIndexes, args=(one_process_indexes,))
-#     thread.start()
-#     threads.append(thread)
-# for thread in threads:
-#     thread.join()
-# print("All threads completed.")
-# ************************************************************************
-if __name__ == "__main__":
-    import pandas as pd
-    import gradio as gr
-    import random
-    vm = VARIABLE_MANAGER()
-    rag = vm.getRag()
-    ad_title_content = vm.GradioRagPreProcessing()
-    with gr.Blocks() as demo:
-        gr.Markdown("# RAG on ads data")
-        with gr.Row():
-            RelationPrompt = gr.Textbox(
-                vm.getRelationSystemPrompt(),
-                lines=1,
-                placeholder="Enter the relation system prompt for relation check",
-                label="Relation System prompt",
-            )
-            QuestionPrompt = gr.Textbox(
-                vm.getQuestionSystemPrompt(),
-                lines=1,
-                placeholder="Enter the question system prompt for question formulation",
-                label="Question System prompt",
-            )
-            page_information = gr.Textbox(
-                lines=1,
-                placeholder="Enter the page information",
-                label="Page Information",
-            )
-            threshold = gr.Number(
-                value=rag.db.default_threshold, label="Threshold", interactive=True
-            )
         output = gr.Textbox(label="Output")
         submit_btn = gr.Button("Submit")
         submit_btn.click(
-            rag.getRagGradioResponse,
             inputs=[page_information, RelationPrompt, QuestionPrompt, threshold],
             outputs=[output],
         )
         page_information.submit(
-            rag.getRagGradioResponse,
             inputs=[page_information, RelationPrompt, QuestionPrompt, threshold],
             outputs=[output],
         )
         with gr.Accordion("Ad Titles", open=False):
             ad_titles = gr.Markdown()
         demo.load(
             lambda: "<br>".join(
                 random.sample(
@@ -861,5 +920,30 @@ if __name__ == "__main__":
             ad_titles,
         )
-    gr.close_all()
-    demo.launch()

 import os
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from dotenv import load_dotenv
 from typing import List, Tuple, Dict, Any
+from numpy import ndarray
+from langchain_core.documents import Document
+import gradio as gr
+import random
+import pandas as pd
 class CLUSTERING:
     def cluster_embeddings(
         self,
+        embeddings: ndarray,
         clustering_algo: str,
         no_of_clusters: int,
         no_of_points: int,
         Clusters embeddings using the specified clustering algorithm and returns the indices of points in each cluster.
         Parameters:
+            embeddings (ndarray): The input embeddings to cluster.
             clustering_algo (str): The clustering algorithm to use ("kmeans-cc", "kmeans-sp", or "spectral").
             no_of_clusters (int): The number of clusters to form.
             no_of_points (int): The maximum number of points to include in each cluster.
     def queryVectorDB(
         self, page_information: str, threshold: float = None
+    ) -> Tuple[List[List[Tuple[Document, float]]], float]:
         """
         Query the vector database and cluster the retrieved documents.
         metadata: List[Dict[str, Any]],
         CHUNK_SIZE: int = 2048,
         CHUNK_OVERLAP: int = 512,
+    ) -> List[Document]:
         """
         Split the provided content into chunks with metadata.
             CHUNK_OVERLAP (int): The overlap between chunks. Default is 512.
         Returns:
+            List[Document]: The split documents with metadata.
         """
         text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=CHUNK_SIZE,
         return split_docs
     def createDBFromDocs(
+        self, split_docs: List[Document], embeddings_model: HuggingFaceEmbeddings
     ) -> FAISS:
         """
         Create a FAISS database from the provided documents and embeddings model.
         Parameters:
+            split_docs (List[Document]): The split documents.
             embeddings_model (HuggingFaceEmbeddings): The embeddings model to use.
         Returns:
     def createAndSaveDBInChunks(
         self,
+        split_docs: List[Document],
         embeddings_model: HuggingFaceEmbeddings,
         DB_FAISS_PATH: str,
         chunk_size: int = 1000,
         Create and save the FAISS database in chunks.
         Parameters:
+            split_docs (List[Document]): The split documents.
             embeddings_model (HuggingFaceEmbeddings): The embeddings model to use.
             DB_FAISS_PATH (str): The path to save the FAISS database.
             chunk_size (int): The size of each chunk. Default is 1000.
                 cv = fl[6:-6]
                 ind = max(ind, int(cv))
+        all_dbs: List[FAISS] = []
         for i in range(0, ind + 1, 2):
             print(i)
             db1 = FAISS.load_local(
 class ADS_RAG:
     def __init__(
         self,
+        db: VECTOR_DB,
+        qa_model_name: str,
+        relation_check_best_value_thresh: float,
+        bestRelationSystemPrompt: str,
+        bestQuestionSystemPrompt: str,
+    ) -> None:
+        """Initialize the ADS_RAG class with the given parameters."""
         self.client = OpenAI()
         self.db = db
         self.qa_model_name = qa_model_name
         self.bestRelationSystemPrompt = bestRelationSystemPrompt
         self.bestQuestionSystemPrompt = bestQuestionSystemPrompt
+    def callOpenAiApi(
+        self, messages: List[Dict[str, str]]
+    ) -> Tuple[Dict[str, Any], int]:
+        """
+        Call the OpenAI API with the given messages and return the response.
+        Parameters:
+            messages (List[Dict[str, str]]): The messages to send to the OpenAI API.
+        Returns:
+            Tuple[Dict[str, Any], int]: The response from the OpenAI API and the number of tokens used.
+        """
         while True:
             try:
                 response = self.client.chat.completions.create(
     def getBestQuestionOnTheBasisOfPageInformationAndAdsData(
         self,
+        page_information: str,
+        adsData: str,
+        relationSystemPrompt: str,
+        questionSystemPrompt: str,
+        bestRetreivedAdValue: float,
+    ) -> Dict[str, Any]:
+        """
+        Get the best question based on page information and ads data.
+        Parameters:
+            page_information (str): The information about the page.
+            adsData (str): The data about the ads.
+            relationSystemPrompt (str): The system prompt for relation checking.
+            questionSystemPrompt (str): The system prompt for question generation.
+            bestRetreivedAdValue (float): The best retrieved ad value.
+        Returns:
+            Dict[str, Any]: The relation and question answers along with token usage information.
+        """
         if adsData == "":
             return ({"reasoning": "No ads data present", "classification": 0}, 0), (
                 {"reasoning": "", "question": "", "options": []},
                     }
                 ]
             )
         else:
             relation_answer["reasoning"] = (
+                "First retrieved document value less than threshold so no need to check relation"
             )
         if relation_answer["classification"] != 0:
             "tokens_used_question": tokens_used_question,
         }
+    def convertDocumentsClustersToStringForApiCall(
+        self, documents_clusters: List[List[Tuple[Document, float]]]
+    ) -> str:
+        """
+        Convert document clusters to a string format suitable for API calls.
+        Parameters:
+            documents_clusters (List[List[Tuple[Document, float]]]): The document clusters.
+        Returns:
+            str: The document clusters converted to a string.
+        """
         key_counter = count(1)
         res = json.dumps(
             {
         return res
     def getRagResponse(
+        self,
+        page_information: str,
+        threshold: float = None,
+        RelationPrompt: str = None,
+        QuestionPrompt: str = None,
+    ) -> Tuple[Dict[str, Any], List[List[Tuple[Document, float]]]]:
+        """
+        Get the RAG response based on the page information and optional prompts.
+        Parameters:
+            page_information (str): The information about the page.
+            threshold (float): The threshold for querying the database. Default is None.
+            RelationPrompt (str): The prompt for relation checking. Default is None.
+            QuestionPrompt (str): The prompt for question generation. Default is None.
+        Returns:
+            Tuple[Dict[str, Any], List[List[Tuple[Document, float]]]]: The RAG response and the document clusters.
+        """
         curr_relation_prompt = self.bestRelationSystemPrompt
+        if RelationPrompt is not None and len(RelationPrompt):
             curr_relation_prompt = RelationPrompt
         curr_question_prompt = self.bestQuestionSystemPrompt
+        if QuestionPrompt is not None and len(QuestionPrompt):
             curr_question_prompt = QuestionPrompt
         documents_clusters, best_value = self.db.queryVectorDB(
         return answer, documents_clusters
+    def changeDocumentsToPrintableString(
+        self, documents_clusters: List[List[Tuple[Document, float]]]
+    ) -> str:
+        """
+        Convert document clusters to a printable string format.
+        Parameters:
+            documents_clusters (List[List[Tuple[Document, float]]]): The document clusters.
+        Returns:
+            str: The document clusters converted to a printable string.
+        """
         res = ""
         i = 0
         for ind, documents_cluster in enumerate(documents_clusters):
             res += "\n"
         return res
+    def changeResponseToPrintableString(
+        self, response: Dict[str, Any], task: str
+    ) -> str:
+        """
+        Convert the response to a printable string format.
+        Parameters:
+            response (Dict[str, Any]): The response to convert.
+            task (str): The task type ('relation' or 'question').
+        Returns:
+            str: The response converted to a printable string.
+        """
         if task == "relation":
             return f"Reasoning: {response['reasoning']}\n\nClassification: {response['classification']}\n"
         res = f"Reasoning: {response['reasoning']}\n\nQuestion: {response['question']}\n\nOptions: \n"
         return res
     def logResult(
+        self,
+        curr_relation_prompt: str,
+        curr_question_prompt: str,
+        page_information: str,
+        answer: Dict[str, Any],
+    ) -> None:
+        """
+        Log the result of the RAG response.
+        Parameters:
+            curr_relation_prompt (str): The current relation prompt.
+            curr_question_prompt (str): The current question prompt.
+            page_information (str): The information about the page.
+            answer (Dict[str, Any]): The RAG response.
+        """
         print(
             "**************************************************************************************************\n",
             # curr_relation_prompt,
         )
     def getRagGradioResponse(
+        self,
+        page_information: str,
+        RelationPrompt: str,
+        QuestionPrompt: str,
+        threshold: float,
+    ) -> str:
+        """
+        Get the RAG response in a format suitable for Gradio.
+        Parameters:
+            page_information (str): The information about the page.
+            RelationPrompt (str): The prompt for relation checking.
+            QuestionPrompt (str): The prompt for question generation.
+            threshold (float): The threshold for querying the database.
+        Returns:
+            str: The full response formatted for Gradio.
+        """
+        # Get the RAG response and document clusters
         answer, documents_clusters = self.getRagResponse(
             page_information, threshold, RelationPrompt, QuestionPrompt
         )
+        # Log the result
         self.logResult(RelationPrompt, QuestionPrompt, page_information, answer)
+        # Convert documents and responses to printable strings
         docs_info = self.changeDocumentsToPrintableString(documents_clusters)
         relation_answer_string = self.changeResponseToPrintableString(
             answer["relation_answer"], "relation"
         question_answer_string = self.changeResponseToPrintableString(
             answer["question_answer"], "question"
         )
+        # Get token usage information
         question_tokens = answer["tokens_used_question"]
         relation_tokens = answer["tokens_used_relation"]
+        # Format the full response
+        full_response = (
+            f"**ANSWER**: \n Relation answer:\n {relation_answer_string}\n "
+            f"Question answer:\n {question_answer_string}\n\n"
+            f"**RETRIEVED DOCUMENTS CLUSTERS**:\n{docs_info}\n\n"
+            f"**TOKENS USED**:\nQuestion api call: {question_tokens}\n"
+            f"Relation api call: {relation_tokens}"
+        )
         return full_response
+class Helper:
+    def __init__(self, DB_FAISS_PATH: str) -> None:
+        """Initialize the Helper class and set environment variables."""
         load_dotenv(override=True)
         os.environ["TOKENIZERS_PARALLELISM"] = "false"
+        self.DB_FAISS_PATH = DB_FAISS_PATH
+    def getRag(self) -> ADS_RAG:
+        """
+        Create and return an instance of the ADS_RAG class.
+        Returns:
+            ADS_RAG: An instance of the ADS_RAG class.
+        """
+        # Initialize embeddings using HuggingFace
+        embeddings_hf = HuggingFaceEmbeddings(
+            model_name="BAAI/bge-m3"
+        )  # alternative embedding model: "sentence-transformers/all-mpnet-base-v2"
+        # Create a VECTOR_DB instance
         vector_db = VECTOR_DB(
+            default_threshold=0.75,
+            number_of_ads_to_fetch_from_db=50,
+            clustering_algo="kmeans-cc",
+            no_of_clusters=3,
+            no_of_ads_in_each_cluster=6,
+            DB_FAISS_PATH=self.DB_FAISS_PATH,
+            embeddings_hf=embeddings_hf,
         )
+        # Create and return an ADS_RAG instance
         rag = ADS_RAG(
+            db=vector_db,
+            qa_model_name="gpt-3.5-turbo",
+            relation_check_best_value_thresh=0.6,
+            bestRelationSystemPrompt=self.getRelationSystemPrompt(),
+            bestQuestionSystemPrompt=self.getQuestionSystemPrompt(),
         )
         return rag
+    def getQuestionSystemPrompt(self) -> str:
+        """
+        Return the system prompt for question generation.
+        Returns:
+            str: The question system prompt.
+        """
         bestQuestionSystemPrompt = """1. You are an advertising concierge for text ads on websites. Given an INPUT and the available ad inventory (ADS_DATA), your task is to form a relevant QUESTION to ask the user visiting the webpage. This question should help identify the user's intent behind visiting the webpage and should be highly attractive.
 2. Now form a highly attractive/lucrative and diverse/mutually exclusive OPTION which should be both the answer for the QUESTION and related to ads in this cluster.
 3. Try to generate intelligent creatives for advertising and keep QUESTION within 70 characters and either 2, 3 or 4 options with each OPTION within 4 to 6 words.
         # """
         return bestQuestionSystemPrompt
+    def getRelationSystemPrompt(self) -> str:
+        """
+        Return the system prompt for relation checking.
+        Returns:
+            str: The relation system prompt.
+        """
         bestRelationSystemPrompt = """You are an advertising concierge for text ads on websites. Given an INPUT and the available ad inventory (ADS_DATA), your task is to determine whether there are some relevant ADS to INPUT are present in ADS_DATA. ADS WHICH DON'T MATCH USER'S INTENT SHOULD BE CONSIDERED IRRELEVANT
 ---------------------------------------
         return bestRelationSystemPrompt
+class RAGGradioApp:
+    def __init__(self, helper: Helper) -> None:
+        """
+        Initialize the RAGGradioApp from a Helper instance.
+        Args:
+            helper (Helper): An instance of Helper used to build the ADS_RAG object (via getRag) and to supply the relation and question system prompts.
+        """
+        self.rag = helper.getRag()
+        self.relationSystemPrompt = helper.getRelationSystemPrompt()
+        self.questionSystempPrompt = helper.getQuestionSystemPrompt()
+    def get_interface(self, ad_title_content: List[str]) -> gr.Blocks:
+        """
+        Construct the Gradio interface for RAG functionality.
+        Returns:
+            gr.Blocks: Gradio Blocks object containing the constructed interface.
+        """
+        # Textbox for Relation System prompt
+        RelationPrompt = gr.Textbox(
+            self.getRelationSystemPrompt(),
+            lines=1,
+            placeholder="Enter the relation system prompt for relation check",
+            label="Relation System prompt",
+        )
+        # Textbox for Question System prompt
+        QuestionPrompt = gr.Textbox(
+            self.getQuestionSystemPrompt(),
+            lines=1,
+            placeholder="Enter the question system prompt for question formulation",
+            label="Question System prompt",
+        )
+        # Textbox for Page Information input
+        page_information = gr.Textbox(
+            lines=1,
+            placeholder="Enter the page information",
+            label="Page Information",
+        )
+        # Number input for Threshold
+        threshold = gr.Number(
+            value=self.rag.db.default_threshold, label="Threshold", interactive=True
+        )
+        # Textbox for displaying output
         output = gr.Textbox(label="Output")
+        # Button for submitting the form
         submit_btn = gr.Button("Submit")
+        # Define behavior on button click
         submit_btn.click(
+            self.rag.getRagGradioResponse,
             inputs=[page_information, RelationPrompt, QuestionPrompt, threshold],
             outputs=[output],
         )
+        # Define behavior on form submission by pressing enter
         page_information.submit(
+            self.rag.getRagGradioResponse,
             inputs=[page_information, RelationPrompt, QuestionPrompt, threshold],
             outputs=[output],
         )
+        # Accordion to display Ad Titles
         with gr.Accordion("Ad Titles", open=False):
             ad_titles = gr.Markdown()
+        # Create a Gradio Blocks object for structured layout
+        demo = gr.Blocks()
+        # Load ad titles into the accordion
         demo.load(
             lambda: "<br>".join(
                 random.sample(
             ad_titles,
         )
+        return demo
+    def launch(self, example_content: List) -> None:
+        """
+        Launch the Gradio interface for RAG functionality.
+        """
+        gr.close_all()  # Close any existing Gradio instances
+        interface = self.get_interface(example_content)  # Get the constructed interface
+        interface.launch()  # Launch the Gradio interface
+if __name__ == "__main__":
+    helper = Helper(
+        "./vectorstore/db_faiss_ads_Jun_facty_activebeat_Health_dupRemoved0.85"
+    )
+    rag_gradio_app = RAGGradioApp(helper)
+    data = pd.read_csv(
+        "./data/149_adclick_Jun_facty_activeBeat_Health_dupRemoved0.85_campaign.tsv",
+        sep="\t",
+    )
+    # data.dropna(axis=0, how="any", inplace=True)
+    ad_title_content = list(
+        data.drop_duplicates(subset=["ad_title", "ad_desc"])["ad_title"].values
+    )
+    rag_gradio_app.launch(ad_title_content)