nourkchaou committed on
Commit
0da318d
1 Parent(s): da20205

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +116 -66
app.py CHANGED
@@ -10,26 +10,26 @@ from dotenv import load_dotenv
10
  from pinecone import Pinecone, ServerlessSpec
11
 
12
 
13
-
14
  load_dotenv()
15
  HF_TOKEN = os.environ["HF_TOKEN"]
16
  PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
17
 
18
 
19
- #Initialisation de Pinecone et du Modèle d'Embeddings
20
 
21
  # index_name = "db"
22
- pc = Pinecone(api_key=PINECONE_API_KEY) #initialise une instance de Pinecone avec la clé API.
 
 
23
 
24
- embedder = HuggingFaceInferenceAPIEmbeddings( #initialise un modèle d'embeddings
25
  api_key=HF_TOKEN,
26
- model_name="intfloat/multilingual-e5-large-instruct",
27
  )
28
 
29
  index = "db"
30
 
31
 
32
-
33
  # users = {
34
  # "aymen": "admin",
35
  # "amin": "root",
@@ -44,39 +44,49 @@ def load_data(url=None, description=None, pdf=None):
44
  data = []
45
  if url != None:
46
  try:
47
- loader = WebBaseLoader(url, encoding="utf-8") # WebBaseLoader: charge et extrait le contenu textuel d'une page web
 
 
48
  loaded = loader.load()
49
- data.append(loaded[0].page_content)
 
50
  except Exception as e:
51
  print("An error occurred while loading data from the URL:", e)
52
 
53
  if description != None:
54
  data.append(description)
55
  if pdf != None:
56
- loader = PyPDFLoader(pdf) #PyPDFLoader: charge et divise un fichier PDF en pages
 
 
57
  pages = loader.load_and_split()
58
  for page in pages:
59
  data.append(page.page_content)
60
  return data
61
 
62
 
63
-
64
  # function to Split the loaded data
65
- def split_data(data): #divise les données en segments plus petits pour faciliter l'analyse et l'indexation
66
- #data = "\n".join(data)
 
 
67
  # Create a RecursiveCharacterTextSplitter instance
68
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=60) #divise le texte en morceaux de taille spécifiée avec un chevauchement entre les morceaux pour éviter la perte de contexte
 
 
69
 
70
  # Split the text document into smaller chunks
71
  texts = text_splitter.create_documents(data)
72
  return texts
73
-
74
 
75
  # crée un index Pinecone pour un utilisateur s'il n'existe pas déjà
76
  def create_user_index(index_name):
77
  """Creates a Pinecone index with the username, validating the name first."""
78
 
79
- existing_indexes = [index.name for index in pc.list_indexes()] #liste les index existants.
 
 
80
  if index_name in existing_indexes:
81
  # L'index existe déjà, ne le recréez pas
82
  return
@@ -89,44 +99,54 @@ def create_user_index(index_name):
89
  spec=ServerlessSpec(cloud="aws", region="us-east-1"),
90
  )
91
 
 
92
  # embed: crée des embeddings pour les documents divisés et les stocke dans un magasin de vecteurs Pinecone.
93
  def embed(splited_docs, username):
94
  # Créez ou vérifiez l'index pour l'utilisateur
95
  create_user_index(index)
96
 
97
  # Créez une base de données vectorielle Pinecone à partir des documents divisés
98
- PineconeVectorStore.from_documents( # PineconeVectorStore.from_documents: crée et stocke des vecteurs pour les documents fournis.
99
  documents=splited_docs,
100
  index_name=index,
101
  embedding=embedder,
102
  namespace=username,
103
  )
104
 
105
-
106
-
107
-
108
-
109
  # Créez un retrieveur
110
  # retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 3})
111
  # return retriever
112
 
 
113
  # Récupération des Documents
114
  # retrieve documents from the dataset
115
  def retrieve(prompt, username):
116
- vectorstore = PineconeVectorStore.from_existing_index( #from_existing_index: initialise un magasin de vecteurs à partir d'un index existant.
117
  index_name=index, embedding=embedder, namespace=username
118
  )
119
- retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 2}) #as_retriever: crée un retrieveur pour interroger le magasin de vecteurs.
 
 
120
  retrieved_docs = retriever.invoke(prompt)
121
  return retrieved_docs
122
 
123
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
- def format_prompt(prompt, retrieved_documents, tone, marketing_technique,social_media):
126
- prompt = f"you are a digital marketing expert in social media post , reply to the following prompt :\n{prompt}\n using a {tone} tone and implicitly use the {marketing_technique} marketing thechnique in your reply for a {social_media} post. use the following context when generating :\n"
127
  for document in retrieved_documents:
128
- prompt += f"{document.page_content}\n" #an attribute of each document that contains the text content.
129
- prompt += """If you don't know the answer, just say "I do not know."Don't make up an answer."""
130
  return prompt
131
 
132
 
@@ -136,17 +156,15 @@ def format_prompt(prompt, retrieved_documents, tone, marketing_technique,social_
136
  # Paris est non seulement la capitale de la France, mais aussi la plus grande ville du pays.
137
 
138
 
139
-
140
  # based on the following context
141
  # basé sur le contexte suivant
142
  # If you don't know the answer, just say "I do not know."Don't make up an answer.
143
 
 
144
  def clear_history(history):
145
  return []
146
 
147
 
148
-
149
-
150
  # function to Use a mistral llm via api hugging face space
151
  def ask_mistral(prompt):
152
  client = Client("hysts/mistral-7b")
@@ -159,15 +177,13 @@ def ask_mistral(prompt):
159
  repetition_penalty=1.2,
160
  api_name="/chat"
161
  )
162
-
163
  return result
164
 
165
 
166
-
167
  def inject_history(final_prompt, history):
168
  if len(history) > 0:
169
  final_prompt = (
170
- final_prompt + "\n\n and the following history of the conversation : \n "
171
  )
172
  for user, assistant in history:
173
  final_prompt = final_prompt + "USER : " + user + "\n"
@@ -175,15 +191,15 @@ def inject_history(final_prompt, history):
175
  return final_prompt
176
  else:
177
  return final_prompt
178
-
 
179
  # what is my name based on the following context .
180
  # context:
181
  # retreived documents:
182
- # and the following history of the conversation :
183
  # USER : my name is nour
184
  # ASSISTANT : hi nour
185
  # USER : what is my name ?
186
-
187
 
188
 
189
  def upload_user_data(username, url=None, description=None, pdf_file=None):
@@ -194,19 +210,21 @@ def upload_user_data(username, url=None, description=None, pdf_file=None):
194
  return message
195
 
196
 
197
-
198
-
199
- def user_retrieve_and_generate(username, tone, marketing_technique, prompt, history ,social_media):
200
  # retrieve data from vector store
201
  retrieved_documents = retrieve(prompt, username)
202
  # format prompt
203
- formatted_prompt = format_prompt(prompt, retrieved_documents,tone,marketing_technique,social_media)
204
- #inject history
205
- #final_prompt=inject_history(formatted_prompt, history)
 
 
206
  # ask mistral
207
  result = ask_mistral(formatted_prompt)
208
- #history.append([prompt,result])
209
- new_history= history+ [(prompt,result)]
210
  return new_history
211
 
212
 
@@ -230,43 +248,75 @@ upload_data = gr.Interface(
230
  )
231
 
232
 
 
 
 
 
233
  with gr.Blocks() as user_interface:
234
- gr.Markdown(value="""user interface to retreive and genarate text based on uploaded data.""",
235
- label=None)
 
 
236
  username = gr.Textbox(label="username")
237
- tone=gr.Dropdown(["neutral","funny","serious","formal"], value="neutral",info="tone of voice used in the replies")
238
- marketing_technique=gr.Radio(["Retargeting","AIDA","Promotion","Testimonial","FOMO","Before and after", "Problem and solution"], value="Retargeting",info="marketing technique to be used in the replies")
239
- social_media=gr.Radio(["instagram","facebook","twitter"],value="facebook")
240
- chatbot=gr.Chatbot(height=450, label="Gradio ChatInterface")
241
- prompt=gr.Textbox(label="prompt")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
  with gr.Row():
243
- clear=gr.Button("🗑️Clear",variant="secondary")
244
- submit=gr.Button("✅Submit",variant="primary")
245
 
 
246
  submit.click(
247
  fn=user_retrieve_and_generate,
248
- inputs=[username, tone, marketing_technique, prompt, chatbot,social_media],
249
  outputs=[chatbot],
250
- api_name="generate"
251
- )
252
- clear.click(fn=clear_history,inputs=chatbot,outputs=chatbot, show_api=False)
 
253
  prompt.submit(
254
  fn=user_retrieve_and_generate,
255
- inputs=[username, tone, marketing_technique, prompt, chatbot,social_media],
256
  outputs=[chatbot],
257
- api_name=False
258
- )
259
-
260
-
261
-
262
-
263
 
 
264
 
265
  demo = gr.TabbedInterface(
266
- [upload_data, user_interface], ["upload", "generate"], theme="upsatwal/mlsc_tiet" #upsatwal/mlsc_tiet
 
 
267
  )
268
 
269
 
270
  if __name__ == "__main__":
271
- demo.launch(debug=True #,auth= custom_auth ,auth_message="Enter your username and password"
272
- )
 
 
10
  from pinecone import Pinecone, ServerlessSpec
11
 
12
 
 
13
  load_dotenv()
14
  HF_TOKEN = os.environ["HF_TOKEN"]
15
  PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
16
 
17
 
18
+ # Initialisation de Pinecone et du Modèle d'Embeddings
19
 
20
  # index_name = "db"
21
+ pc = Pinecone(
22
+ api_key=PINECONE_API_KEY
23
+ ) # initialise une instance de Pinecone avec la clé API.
24
 
25
+ embedder = HuggingFaceInferenceAPIEmbeddings( # initialise un modèle d'embeddings
26
  api_key=HF_TOKEN,
27
+ model_name="mixedbread-ai/mxbai-embed-large-v1",
28
  )
29
 
30
  index = "db"
31
 
32
 
 
33
  # users = {
34
  # "aymen": "admin",
35
  # "amin": "root",
 
44
  data = []
45
  if url != None:
46
  try:
47
+ loader = WebBaseLoader(
48
+ url, encoding="utf-8"
49
+ ) # WebBaseLoader: charge et extrait le contenu textuel d'une page web
50
  loaded = loader.load()
51
+ for page in loaded :
52
+ data.append(page.page_content)
53
  except Exception as e:
54
  print("An error occurred while loading data from the URL:", e)
55
 
56
  if description != None:
57
  data.append(description)
58
  if pdf != None:
59
+ loader = PyPDFLoader(
60
+ pdf
61
+ ) # PyPDFLoader: charge et divise un fichier PDF en pages
62
  pages = loader.load_and_split()
63
  for page in pages:
64
  data.append(page.page_content)
65
  return data
66
 
67
 
 
68
# Split the loaded data into chunks suitable for embedding/indexing.
def split_data(data):
    """Split a list of raw text strings into overlapping chunks.

    Uses a RecursiveCharacterTextSplitter with 512-character chunks and a
    60-character overlap so neighbouring chunks keep shared context.

    Parameters
    ----------
    data : list[str]
        Raw text segments (web pages, PDF pages, descriptions).

    Returns
    -------
    list
        LangChain Document objects produced by `create_documents`.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=60)
    return splitter.create_documents(data)
81
+
82
 
83
  # crée un index Pinecone pour un utilisateur s'il n'existe pas déjà
84
  def create_user_index(index_name):
85
  """Creates a Pinecone index with the username, validating the name first."""
86
 
87
+ existing_indexes = [
88
+ index.name for index in pc.list_indexes()
89
+ ] # liste les index existants.
90
  if index_name in existing_indexes:
91
  # L'index existe déjà, ne le recréez pas
92
  return
 
99
  spec=ServerlessSpec(cloud="aws", region="us-east-1"),
100
  )
101
 
102
+
103
# Embed the split documents and store the vectors in Pinecone.
def embed(splited_docs, username):
    """Embed `splited_docs` and persist them under the user's namespace.

    First makes sure the shared Pinecone index exists, then writes one
    vector per document into the `username` namespace of that index.
    """
    # Create (or verify) the index before writing into it.
    create_user_index(index)

    # Build and store the vectors for the pre-split documents.
    PineconeVectorStore.from_documents(
        documents=splited_docs,
        index_name=index,
        embedding=embedder,
        namespace=username,
    )
115
 
 
 
 
 
116
  # Créez un retrieveur
117
  # retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 3})
118
  # return retriever
119
 
120
+
121
# Document retrieval: query the user's namespace of the vector store.
def retrieve(prompt, username):
    """Return the documents most relevant to `prompt` for this user.

    Connects to the existing Pinecone index scoped to the user's
    namespace and runs an MMR search that keeps the top 2 documents.
    """
    store = PineconeVectorStore.from_existing_index(
        index_name=index, embedding=embedder, namespace=username
    )
    mmr_retriever = store.as_retriever(search_type="mmr", search_kwargs={"k": 2})
    return mmr_retriever.invoke(prompt)
132
 
133
 
134
def format_prompt(prompt, retrieved_documents, tone, marketing_technique, social_media):
    """Assemble the final LLM prompt from the question and retrieved context.

    Parameters
    ----------
    prompt : str
        The user's question.
    retrieved_documents : list
        Objects exposing a `page_content` string attribute (LangChain docs).
    tone, marketing_technique, social_media : str
        Optional style controls; the sentinel value "Default" omits the line.

    Returns
    -------
    str
        The formatted prompt, ending with the concatenated context.

    Fixes vs. the previous version: the triple-quoted f-string leaked its
    source indentation into the prompt (dedented here), "Tone {tone} " lacked
    the colon used by the other option lines, and "plateform" is now
    spelled "platform".
    """
    parts = [
        "You are an assistant for digital marketing\n"
        "You are given the extracted parts of a long document and a question. "
        "Provide a conversational answer.\n"
        "If you don't know the answer, just ignore the context.\n"
        f"Question: \n{prompt}\n"
    ]
    if tone != "Default":
        parts.append(f"Tone: {tone}\n")
    if marketing_technique != "Default":
        parts.append(f"Marketing technique: {marketing_technique}\n")
    if social_media != "Default":
        parts.append(f"Social media platform: {social_media}\n")

    parts.append("Context:\n")
    for document in retrieved_documents:
        parts.append(f"{document.page_content}\n")
    return "".join(parts)
151
 
152
 
 
156
  # Paris est non seulement la capitale de la France, mais aussi la plus grande ville du pays.
157
 
158
 
 
159
  # based on the following context
160
  # basé sur le contexte suivant
161
  # If you don't know the answer, just say "I do not know."Don't make up an answer.
162
 
163
+
164
def clear_history(history):
    """Reset the chat history; the incoming value is intentionally ignored."""
    del history  # unused — Gradio passes the current chatbot state by signature
    return []
166
 
167
 
 
 
168
  # function to Use a mistral llm via api hugging face space
169
  def ask_mistral(prompt):
170
  client = Client("hysts/mistral-7b")
 
177
  repetition_penalty=1.2,
178
  api_name="/chat"
179
  )
 
180
  return result
181
 
182
 
 
183
  def inject_history(final_prompt, history):
184
  if len(history) > 0:
185
  final_prompt = (
186
+ final_prompt + "\n\nHistory : \n "
187
  )
188
  for user, assistant in history:
189
  final_prompt = final_prompt + "USER : " + user + "\n"
 
191
  return final_prompt
192
  else:
193
  return final_prompt
194
+
195
+
196
  # what is my name based on the following context .
197
  # context:
198
  # retreived documents:
199
+ # and the following history of the conversation :
200
  # USER : my name is nour
201
  # ASSISTANT : hi nour
202
  # USER : what is my name ?
 
203
 
204
 
205
  def upload_user_data(username, url=None, description=None, pdf_file=None):
 
210
  return message
211
 
212
 
213
def user_retrieve_and_generate(
    username, tone, marketing_technique, prompt, history, social_media
):
    """Answer `prompt` with user-scoped context and return the updated history.

    Pipeline: retrieve relevant documents from the user's Pinecone
    namespace, fold them into a single formatted prompt, then query the
    Mistral Space. The (prompt, answer) pair is appended to a copy of
    `history` so the Gradio chatbot state is never mutated in place.
    """
    context_docs = retrieve(prompt, username)
    llm_prompt = format_prompt(
        prompt, context_docs, tone, marketing_technique, social_media
    )
    answer = ask_mistral(llm_prompt)
    return history + [(prompt, answer)]
229
 
230
 
 
248
  )
249
 
250
 
251
def clear_prompt(prompt):
    """Return an empty string so the prompt textbox is cleared after submit."""
    del prompt  # value unused; the parameter only satisfies the event wiring
    return ""
253
+
254
+
255
# Chat tab: retrieve user-scoped context and generate marketing replies.
# Fix: the user-facing Markdown read "retreive and genarate" (typos).
with gr.Blocks() as user_interface:
    gr.Markdown(
        value="""user interface to retrieve and generate text based on uploaded data.""",
        label=None,
    )
    username = gr.Textbox(label="username")
    # Optional generation controls, collapsed by default.
    with gr.Accordion("Extra options ⚙️", open=False):
        tone = gr.Dropdown(
            ["Default", "neutral", "funny", "serious", "formal"],
            value="Default",
            label="tone of voice used in the replies",
        )
        marketing_technique = gr.Radio(
            [
                "Default",
                "Retargeting",
                "AIDA",
                "Promotion",
                "Testimonial",
                "FOMO",
                "Before and after",
                "Problem and solution",
            ],
            value="Default",
            label="marketing technique to be used in the replies",
        )
        social_media = gr.Radio(
            ["Default", "instagram", "facebook", "twitter"],
            value="Default",
            label="social media platform to be used in the replies",
        )
    # NOTE(review): indentation reconstructed from a diff view — confirm the
    # chatbot/prompt widgets sit outside the accordion, as laid out here.
    chatbot = gr.Chatbot(
        height=450, label="Gradio ChatInterface", show_copy_button=True
    )
    prompt = gr.Textbox(label="prompt")
    with gr.Row():
        clear = gr.Button("🗑️Clear", variant="secondary")
        submit = gr.Button("✅Submit", variant="primary")

    # Generate on button click, then empty the prompt box.
    submit.click(
        fn=user_retrieve_and_generate,
        inputs=[username, tone, marketing_technique, prompt, chatbot, social_media],
        outputs=[chatbot],
        api_name="generate",
    ).then(clear_prompt, inputs=prompt, outputs=prompt, show_api=False)

    # Same flow when Enter is pressed in the textbox (hidden from the API).
    prompt.submit(
        fn=user_retrieve_and_generate,
        inputs=[username, tone, marketing_technique, prompt, chatbot, social_media],
        outputs=[chatbot],
        api_name=False,
    ).then(clear_prompt, inputs=prompt, outputs=prompt, show_api=False)

    clear.click(fn=clear_history, inputs=chatbot, outputs=chatbot, show_api=False)
311
 
312
# Two tabs: one interface to upload data, one to chat/generate.
demo = gr.TabbedInterface(
    [upload_data, user_interface],
    ["upload", "generate"],
    theme="upsatwal/mlsc_tiet",
)


if __name__ == "__main__":
    # Authentication is currently disabled; re-enable by passing the
    # commented auth arguments back to launch().
    demo.launch(
        debug=True  # ,auth= custom_auth ,auth_message="Enter your username and password"
    )