nourkchaou committed on
Commit
0da318d
1 Parent(s): da20205

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +116 -66
app.py CHANGED
@@ -10,26 +10,26 @@ from dotenv import load_dotenv
10
  from pinecone import Pinecone, ServerlessSpec
11
 
12
 
13
-
14
  load_dotenv()
15
  HF_TOKEN = os.environ["HF_TOKEN"]
16
  PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
17
 
18
 
19
- #Initialisation de Pinecone et du Modèle d'Embeddings
20
 
21
  # index_name = "db"
22
- pc = Pinecone(api_key=PINECONE_API_KEY) #initialise une instance de Pinecone avec la clé API.
 
 
23
 
24
- embedder = HuggingFaceInferenceAPIEmbeddings( #initialise un modèle d'embeddings
25
  api_key=HF_TOKEN,
26
- model_name="intfloat/multilingual-e5-large-instruct",
27
  )
28
 
29
  index = "db"
30
 
31
 
32
-
33
  # users = {
34
  # "aymen": "admin",
35
  # "amin": "root",
@@ -44,39 +44,49 @@ def load_data(url=None, description=None, pdf=None):
44
  data = []
45
  if url != None:
46
  try:
47
- loader = WebBaseLoader(url, encoding="utf-8") # WebBaseLoader: charge et extrait le contenu textuel d'une page web
 
 
48
  loaded = loader.load()
49
- data.append(loaded[0].page_content)
 
50
  except Exception as e:
51
  print("An error occurred while loading data from the URL:", e)
52
 
53
  if description != None:
54
  data.append(description)
55
  if pdf != None:
56
- loader = PyPDFLoader(pdf) #PyPDFLoader: charge et divise un fichier PDF en pages
 
 
57
  pages = loader.load_and_split()
58
  for page in pages:
59
  data.append(page.page_content)
60
  return data
61
 
62
 
63
-
64
  # function to Split the loaded data
65
- def split_data(data): #divise les données en segments plus petits pour faciliter l'analyse et l'indexation
66
- #data = "\n".join(data)
 
 
67
  # Create a RecursiveCharacterTextSplitter instance
68
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=60) #divise le texte en morceaux de taille spécifiée avec un chevauchement entre les morceaux pour éviter la perte de contexte
 
 
69
 
70
  # Split the text document into smaller chunks
71
  texts = text_splitter.create_documents(data)
72
  return texts
73
-
74
 
75
  # crée un index Pinecone pour un utilisateur s'il n'existe pas déjà
76
  def create_user_index(index_name):
77
  """Creates a Pinecone index with the username, validating the name first."""
78
 
79
- existing_indexes = [index.name for index in pc.list_indexes()] #liste les index existants.
 
 
80
  if index_name in existing_indexes:
81
  # L'index existe déjà, ne le recréez pas
82
  return
@@ -89,44 +99,54 @@ def create_user_index(index_name):
89
  spec=ServerlessSpec(cloud="aws", region="us-east-1"),
90
  )
91
 
 
92
  # embed: crée des embeddings pour les documents divisés et les stocke dans un magasin de vecteurs Pinecone.
93
  def embed(splited_docs, username):
94
  # Créez ou vérifiez l'index pour l'utilisateur
95
  create_user_index(index)
96
 
97
  # Créez une base de données vectorielle Pinecone à partir des documents divisés
98
- PineconeVectorStore.from_documents( # PineconeVectorStore.from_documents: crée et stocke des vecteurs pour les documents fournis.
99
  documents=splited_docs,
100
  index_name=index,
101
  embedding=embedder,
102
  namespace=username,
103
  )
104
 
105
-
106
-
107
-
108
-
109
  # Créez un retrieveur
110
  # retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 3})
111
  # return retriever
112
 
 
113
  # Récupération des Documents
114
  # retrieve documents from the dataset
115
  def retrieve(prompt, username):
116
- vectorstore = PineconeVectorStore.from_existing_index( #from_existing_index: initialise un magasin de vecteurs à partir d'un index existant.
117
  index_name=index, embedding=embedder, namespace=username
118
  )
119
- retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 2}) #as_retriever: crée un retrieveur pour interroger le magasin de vecteurs.
 
 
120
  retrieved_docs = retriever.invoke(prompt)
121
  return retrieved_docs
122
 
123
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
- def format_prompt(prompt, retrieved_documents, tone, marketing_technique,social_media):
126
- prompt = f"you are a digital marketing expert in social media post , reply to the following prompt :\n{prompt}\n using a {tone} tone and implicitly use the {marketing_technique} marketing thechnique in your reply for a {social_media} post. use the following context when generating :\n"
127
  for document in retrieved_documents:
128
- prompt += f"{document.page_content}\n" #an attribute of each document that contains the text content.
129
- prompt += """If you don't know the answer, just say "I do not know."Don't make up an answer."""
130
  return prompt
131
 
132
 
@@ -136,17 +156,15 @@ def format_prompt(prompt, retrieved_documents, tone, marketing_technique,social_
136
  # Paris est non seulement la capitale de la France, mais aussi la plus grande ville du pays.
137
 
138
 
139
-
140
  # based on the following context
141
  # basé sur le contexte suivant
142
  # If you don't know the answer, just say "I do not know."Don't make up an answer.
143
 
 
144
  def clear_history(history):
145
  return []
146
 
147
 
148
-
149
-
150
  # function to Use a mistral llm via api hugging face space
151
  def ask_mistral(prompt):
152
  client = Client("hysts/mistral-7b")
@@ -159,15 +177,13 @@ def ask_mistral(prompt):
159
  repetition_penalty=1.2,
160
  api_name="/chat"
161
  )
162
-
163
  return result
164
 
165
 
166
-
167
  def inject_history(final_prompt, history):
168
  if len(history) > 0:
169
  final_prompt = (
170
- final_prompt + "\n\n and the following history of the conversation : \n "
171
  )
172
  for user, assistant in history:
173
  final_prompt = final_prompt + "USER : " + user + "\n"
@@ -175,15 +191,15 @@ def inject_history(final_prompt, history):
175
  return final_prompt
176
  else:
177
  return final_prompt
178
-
 
179
  # what is my name based on the following context .
180
  # context:
181
  # retreived documents:
182
- # and the following history of the conversation :
183
  # USER : my name is nour
184
  # ASSISTANT : hi nour
185
  # USER : what is my name ?
186
-
187
 
188
 
189
  def upload_user_data(username, url=None, description=None, pdf_file=None):
@@ -194,19 +210,21 @@ def upload_user_data(username, url=None, description=None, pdf_file=None):
194
  return message
195
 
196
 
197
-
198
-
199
- def user_retrieve_and_generate(username, tone, marketing_technique, prompt, history ,social_media):
200
  # retrieve data from vector store
201
  retrieved_documents = retrieve(prompt, username)
202
  # format prompt
203
- formatted_prompt = format_prompt(prompt, retrieved_documents,tone,marketing_technique,social_media)
204
- #inject history
205
- #final_prompt=inject_history(formatted_prompt, history)
 
 
206
  # ask mistral
207
  result = ask_mistral(formatted_prompt)
208
- #history.append([prompt,result])
209
- new_history= history+ [(prompt,result)]
210
  return new_history
211
 
212
 
@@ -230,43 +248,75 @@ upload_data = gr.Interface(
230
  )
231
 
232
 
 
 
 
 
233
  with gr.Blocks() as user_interface:
234
- gr.Markdown(value="""user interface to retreive and genarate text based on uploaded data.""",
235
- label=None)
 
 
236
  username = gr.Textbox(label="username")
237
- tone=gr.Dropdown(["neutral","funny","serious","formal"], value="neutral",info="tone of voice used in the replies")
238
- marketing_technique=gr.Radio(["Retargeting","AIDA","Promotion","Testimonial","FOMO","Before and after", "Problem and solution"], value="Retargeting",info="marketing technique to be used in the replies")
239
- social_media=gr.Radio(["instagram","facebook","twitter"],value="facebook")
240
- chatbot=gr.Chatbot(height=450, label="Gradio ChatInterface")
241
- prompt=gr.Textbox(label="prompt")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
  with gr.Row():
243
- clear=gr.Button("🗑️Clear",variant="secondary")
244
- submit=gr.Button("✅Submit",variant="primary")
245
 
 
246
  submit.click(
247
  fn=user_retrieve_and_generate,
248
- inputs=[username, tone, marketing_technique, prompt, chatbot,social_media],
249
  outputs=[chatbot],
250
- api_name="generate"
251
- )
252
- clear.click(fn=clear_history,inputs=chatbot,outputs=chatbot, show_api=False)
 
253
  prompt.submit(
254
  fn=user_retrieve_and_generate,
255
- inputs=[username, tone, marketing_technique, prompt, chatbot,social_media],
256
  outputs=[chatbot],
257
- api_name=False
258
- )
259
-
260
-
261
-
262
-
263
 
 
264
 
265
  demo = gr.TabbedInterface(
266
- [upload_data, user_interface], ["upload", "generate"], theme="upsatwal/mlsc_tiet" #upsatwal/mlsc_tiet
 
 
267
  )
268
 
269
 
270
  if __name__ == "__main__":
271
- demo.launch(debug=True #,auth= custom_auth ,auth_message="Enter your username and password"
272
- )
 
 
10
  from pinecone import Pinecone, ServerlessSpec
11
 
12
 
 
13
  load_dotenv()
14
  HF_TOKEN = os.environ["HF_TOKEN"]
15
  PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
16
 
17
 
18
+ # Initialisation de Pinecone et du Modèle d'Embeddings
19
 
20
  # index_name = "db"
21
+ pc = Pinecone(
22
+ api_key=PINECONE_API_KEY
23
+ ) # initialise une instance de Pinecone avec la clé API.
24
 
25
+ embedder = HuggingFaceInferenceAPIEmbeddings( # initialise un modèle d'embeddings
26
  api_key=HF_TOKEN,
27
+ model_name="mixedbread-ai/mxbai-embed-large-v1",
28
  )
29
 
30
  index = "db"
31
 
32
 
 
33
  # users = {
34
  # "aymen": "admin",
35
  # "amin": "root",
 
44
  data = []
45
  if url != None:
46
  try:
47
+ loader = WebBaseLoader(
48
+ url, encoding="utf-8"
49
+ ) # WebBaseLoader: charge et extrait le contenu textuel d'une page web
50
  loaded = loader.load()
51
+ for page in loaded :
52
+ data.append(page.page_content)
53
  except Exception as e:
54
  print("An error occurred while loading data from the URL:", e)
55
 
56
  if description != None:
57
  data.append(description)
58
  if pdf != None:
59
+ loader = PyPDFLoader(
60
+ pdf
61
+ ) # PyPDFLoader: charge et divise un fichier PDF en pages
62
  pages = loader.load_and_split()
63
  for page in pages:
64
  data.append(page.page_content)
65
  return data
66
 
67
 
 
68
# Split the loaded data into chunks suitable for embedding/indexing.
def split_data(data):
    """Split a list of raw text strings into overlapping chunks.

    Uses a RecursiveCharacterTextSplitter with 512-character chunks and a
    60-character overlap so neighbouring chunks keep shared context.

    Parameters
    ----------
    data : list[str]
        Raw text segments (web pages, PDF pages, descriptions).

    Returns
    -------
    list
        LangChain Document objects produced by `create_documents`.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=60)
    return splitter.create_documents(data)
81
+
82
 
83
  # crée un index Pinecone pour un utilisateur s'il n'existe pas déjà
84
  def create_user_index(index_name):
85
  """Creates a Pinecone index with the username, validating the name first."""
86
 
87
+ existing_indexes = [
88
+ index.name for index in pc.list_indexes()
89
+ ] # liste les index existants.
90
  if index_name in existing_indexes:
91
  # L'index existe déjà, ne le recréez pas
92
  return
 
99
  spec=ServerlessSpec(cloud="aws", region="us-east-1"),
100
  )
101
 
102
+
103
# Embed the split documents and store the vectors in Pinecone.
def embed(splited_docs, username):
    """Embed `splited_docs` and persist them under the user's namespace.

    First makes sure the shared Pinecone index exists, then writes one
    vector per document into the `username` namespace of that index.
    """
    # Create (or verify) the index before writing into it.
    create_user_index(index)

    # Build and store the vectors for the pre-split documents.
    PineconeVectorStore.from_documents(
        documents=splited_docs,
        index_name=index,
        embedding=embedder,
        namespace=username,
    )
115
 
 
 
 
 
116
  # Créez un retrieveur
117
  # retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 3})
118
  # return retriever
119
 
120
+
121
# Document retrieval: query the user's namespace of the vector store.
def retrieve(prompt, username):
    """Return the documents most relevant to `prompt` for this user.

    Connects to the existing Pinecone index scoped to the user's
    namespace and runs an MMR search that keeps the top 2 documents.
    """
    store = PineconeVectorStore.from_existing_index(
        index_name=index, embedding=embedder, namespace=username
    )
    mmr_retriever = store.as_retriever(search_type="mmr", search_kwargs={"k": 2})
    return mmr_retriever.invoke(prompt)
132
 
133
 
134
def format_prompt(prompt, retrieved_documents, tone, marketing_technique, social_media):
    """Assemble the final LLM prompt from the question and retrieved context.

    Parameters
    ----------
    prompt : str
        The user's question.
    retrieved_documents : list
        Objects exposing a `page_content` string attribute (LangChain docs).
    tone, marketing_technique, social_media : str
        Optional style controls; the sentinel value "Default" omits the line.

    Returns
    -------
    str
        The formatted prompt, ending with the concatenated context.

    Fixes vs. the previous version: the triple-quoted f-string leaked its
    source indentation into the prompt (dedented here), "Tone {tone} " lacked
    the colon used by the other option lines, and "plateform" is now
    spelled "platform".
    """
    parts = [
        "You are an assistant for digital marketing\n"
        "You are given the extracted parts of a long document and a question. "
        "Provide a conversational answer.\n"
        "If you don't know the answer, just ignore the context.\n"
        f"Question: \n{prompt}\n"
    ]
    if tone != "Default":
        parts.append(f"Tone: {tone}\n")
    if marketing_technique != "Default":
        parts.append(f"Marketing technique: {marketing_technique}\n")
    if social_media != "Default":
        parts.append(f"Social media platform: {social_media}\n")

    parts.append("Context:\n")
    for document in retrieved_documents:
        parts.append(f"{document.page_content}\n")
    return "".join(parts)
151
 
152
 
 
156
  # Paris est non seulement la capitale de la France, mais aussi la plus grande ville du pays.
157
 
158
 
 
159
  # based on the following context
160
  # basé sur le contexte suivant
161
  # If you don't know the answer, just say "I do not know."Don't make up an answer.
162
 
163
+
164
def clear_history(history):
    """Reset the chat history; the incoming value is intentionally ignored."""
    del history  # unused — Gradio passes the current chatbot state by signature
    return []
166
 
167
 
 
 
168
  # function to Use a mistral llm via api hugging face space
169
  def ask_mistral(prompt):
170
  client = Client("hysts/mistral-7b")
 
177
  repetition_penalty=1.2,
178
  api_name="/chat"
179
  )
 
180
  return result
181
 
182
 
 
183
  def inject_history(final_prompt, history):
184
  if len(history) > 0:
185
  final_prompt = (
186
+ final_prompt + "\n\nHistory : \n "
187
  )
188
  for user, assistant in history:
189
  final_prompt = final_prompt + "USER : " + user + "\n"
 
191
  return final_prompt
192
  else:
193
  return final_prompt
194
+
195
+
196
  # what is my name based on the following context .
197
  # context:
198
  # retreived documents:
199
+ # and the following history of the conversation :
200
  # USER : my name is nour
201
  # ASSISTANT : hi nour
202
  # USER : what is my name ?
 
203
 
204
 
205
  def upload_user_data(username, url=None, description=None, pdf_file=None):
 
210
  return message
211
 
212
 
213
def user_retrieve_and_generate(
    username, tone, marketing_technique, prompt, history, social_media
):
    """Answer `prompt` with user-scoped context and return the updated history.

    Pipeline: retrieve relevant documents from the user's Pinecone
    namespace, fold them into a single formatted prompt, then query the
    Mistral Space. The (prompt, answer) pair is appended to a copy of
    `history` so the Gradio chatbot state is never mutated in place.
    """
    context_docs = retrieve(prompt, username)
    llm_prompt = format_prompt(
        prompt, context_docs, tone, marketing_technique, social_media
    )
    answer = ask_mistral(llm_prompt)
    return history + [(prompt, answer)]
229
 
230
 
 
248
  )
249
 
250
 
251
def clear_prompt(prompt):
    """Return an empty string so the prompt textbox is cleared after submit."""
    del prompt  # value unused; the parameter only satisfies the event wiring
    return ""
253
+
254
+
255
# Chat tab: retrieve user-scoped context and generate marketing replies.
# Fix: the user-facing Markdown read "retreive and genarate" (typos).
with gr.Blocks() as user_interface:
    gr.Markdown(
        value="""user interface to retrieve and generate text based on uploaded data.""",
        label=None,
    )
    username = gr.Textbox(label="username")
    # Optional generation controls, collapsed by default.
    with gr.Accordion("Extra options ⚙️", open=False):
        tone = gr.Dropdown(
            ["Default", "neutral", "funny", "serious", "formal"],
            value="Default",
            label="tone of voice used in the replies",
        )
        marketing_technique = gr.Radio(
            [
                "Default",
                "Retargeting",
                "AIDA",
                "Promotion",
                "Testimonial",
                "FOMO",
                "Before and after",
                "Problem and solution",
            ],
            value="Default",
            label="marketing technique to be used in the replies",
        )
        social_media = gr.Radio(
            ["Default", "instagram", "facebook", "twitter"],
            value="Default",
            label="social media platform to be used in the replies",
        )
    # NOTE(review): indentation reconstructed from a diff view — confirm the
    # chatbot/prompt widgets sit outside the accordion, as laid out here.
    chatbot = gr.Chatbot(
        height=450, label="Gradio ChatInterface", show_copy_button=True
    )
    prompt = gr.Textbox(label="prompt")
    with gr.Row():
        clear = gr.Button("🗑️Clear", variant="secondary")
        submit = gr.Button("✅Submit", variant="primary")

    # Generate on button click, then empty the prompt box.
    submit.click(
        fn=user_retrieve_and_generate,
        inputs=[username, tone, marketing_technique, prompt, chatbot, social_media],
        outputs=[chatbot],
        api_name="generate",
    ).then(clear_prompt, inputs=prompt, outputs=prompt, show_api=False)

    # Same flow when Enter is pressed in the textbox (hidden from the API).
    prompt.submit(
        fn=user_retrieve_and_generate,
        inputs=[username, tone, marketing_technique, prompt, chatbot, social_media],
        outputs=[chatbot],
        api_name=False,
    ).then(clear_prompt, inputs=prompt, outputs=prompt, show_api=False)

    clear.click(fn=clear_history, inputs=chatbot, outputs=chatbot, show_api=False)
311
 
312
# Two tabs: one interface to upload data, one to chat/generate.
demo = gr.TabbedInterface(
    [upload_data, user_interface],
    ["upload", "generate"],
    theme="upsatwal/mlsc_tiet",
)


if __name__ == "__main__":
    # Authentication is currently disabled; re-enable by passing the
    # commented auth arguments back to launch().
    demo.launch(
        debug=True  # ,auth= custom_auth ,auth_message="Enter your username and password"
    )