Spaces:
Sleeping
Sleeping
nourkchaou
committed on
Commit
•
0da318d
1
Parent(s):
da20205
Update app.py
Browse files
app.py
CHANGED
@@ -10,26 +10,26 @@ from dotenv import load_dotenv
|
|
10 |
from pinecone import Pinecone, ServerlessSpec
|
11 |
|
12 |
|
13 |
-
|
14 |
load_dotenv()
|
15 |
HF_TOKEN = os.environ["HF_TOKEN"]
|
16 |
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
|
17 |
|
18 |
|
19 |
-
#Initialisation de Pinecone et du Modèle d'Embeddings
|
20 |
|
21 |
# index_name = "db"
|
22 |
-
pc = Pinecone(
|
|
|
|
|
23 |
|
24 |
-
embedder = HuggingFaceInferenceAPIEmbeddings(
|
25 |
api_key=HF_TOKEN,
|
26 |
-
model_name="
|
27 |
)
|
28 |
|
29 |
index = "db"
|
30 |
|
31 |
|
32 |
-
|
33 |
# users = {
|
34 |
# "aymen": "admin",
|
35 |
# "amin": "root",
|
@@ -44,39 +44,49 @@ def load_data(url=None, description=None, pdf=None):
|
|
44 |
data = []
|
45 |
if url != None:
|
46 |
try:
|
47 |
-
loader = WebBaseLoader(
|
|
|
|
|
48 |
loaded = loader.load()
|
49 |
-
|
|
|
50 |
except Exception as e:
|
51 |
print("An error occurred while loading data from the URL:", e)
|
52 |
|
53 |
if description != None:
|
54 |
data.append(description)
|
55 |
if pdf != None:
|
56 |
-
loader = PyPDFLoader(
|
|
|
|
|
57 |
pages = loader.load_and_split()
|
58 |
for page in pages:
|
59 |
data.append(page.page_content)
|
60 |
return data
|
61 |
|
62 |
|
63 |
-
|
64 |
# function to Split the loaded data
|
65 |
-
def split_data(
|
66 |
-
|
|
|
|
|
67 |
# Create a RecursiveCharacterTextSplitter instance
|
68 |
-
text_splitter = RecursiveCharacterTextSplitter(
|
|
|
|
|
69 |
|
70 |
# Split the text document into smaller chunks
|
71 |
texts = text_splitter.create_documents(data)
|
72 |
return texts
|
73 |
-
|
74 |
|
75 |
# crée un index Pinecone pour un utilisateur s'il n'existe pas déjà
|
76 |
def create_user_index(index_name):
|
77 |
"""Creates a Pinecone index with the username, validating the name first."""
|
78 |
|
79 |
-
existing_indexes = [
|
|
|
|
|
80 |
if index_name in existing_indexes:
|
81 |
# L'index existe déjà, ne le recréez pas
|
82 |
return
|
@@ -89,44 +99,54 @@ def create_user_index(index_name):
|
|
89 |
spec=ServerlessSpec(cloud="aws", region="us-east-1"),
|
90 |
)
|
91 |
|
|
|
92 |
# embed: crée des embeddings pour les documents divisés et les stocke dans un magasin de vecteurs Pinecone.
|
93 |
def embed(splited_docs, username):
|
94 |
# Créez ou vérifiez l'index pour l'utilisateur
|
95 |
create_user_index(index)
|
96 |
|
97 |
# Créez une base de données vectorielle Pinecone à partir des documents divisés
|
98 |
-
PineconeVectorStore.from_documents(
|
99 |
documents=splited_docs,
|
100 |
index_name=index,
|
101 |
embedding=embedder,
|
102 |
namespace=username,
|
103 |
)
|
104 |
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
# Créez un retrieveur
|
110 |
# retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 3})
|
111 |
# return retriever
|
112 |
|
|
|
113 |
# Récupération des Documents
|
114 |
# retrieve documents from the dataset
|
115 |
def retrieve(prompt, username):
|
116 |
-
vectorstore = PineconeVectorStore.from_existing_index(
|
117 |
index_name=index, embedding=embedder, namespace=username
|
118 |
)
|
119 |
-
retriever = vectorstore.as_retriever(
|
|
|
|
|
120 |
retrieved_docs = retriever.invoke(prompt)
|
121 |
return retrieved_docs
|
122 |
|
123 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
124 |
|
125 |
-
|
126 |
-
prompt = f"you are a digital marketing expert in social media post , reply to the following prompt :\n{prompt}\n using a {tone} tone and implicitly use the {marketing_technique} marketing thechnique in your reply for a {social_media} post. use the following context when generating :\n"
|
127 |
for document in retrieved_documents:
|
128 |
-
prompt += f"{document.page_content}\n"
|
129 |
-
prompt += """If you don't know the answer, just say "I do not know."Don't make up an answer."""
|
130 |
return prompt
|
131 |
|
132 |
|
@@ -136,17 +156,15 @@ def format_prompt(prompt, retrieved_documents, tone, marketing_technique,social_
|
|
136 |
# Paris est non seulement la capitale de la France, mais aussi la plus grande ville du pays.
|
137 |
|
138 |
|
139 |
-
|
140 |
# based on the following context
|
141 |
# basé sur le contexte suivant
|
142 |
# If you don't know the answer, just say "I do not know."Don't make up an answer.
|
143 |
|
|
|
144 |
def clear_history(history):
|
145 |
return []
|
146 |
|
147 |
|
148 |
-
|
149 |
-
|
150 |
# function to Use a mistral llm via api hugging face space
|
151 |
def ask_mistral(prompt):
|
152 |
client = Client("hysts/mistral-7b")
|
@@ -159,15 +177,13 @@ def ask_mistral(prompt):
|
|
159 |
repetition_penalty=1.2,
|
160 |
api_name="/chat"
|
161 |
)
|
162 |
-
|
163 |
return result
|
164 |
|
165 |
|
166 |
-
|
167 |
def inject_history(final_prompt, history):
|
168 |
if len(history) > 0:
|
169 |
final_prompt = (
|
170 |
-
final_prompt + "\n\
|
171 |
)
|
172 |
for user, assistant in history:
|
173 |
final_prompt = final_prompt + "USER : " + user + "\n"
|
@@ -175,15 +191,15 @@ def inject_history(final_prompt, history):
|
|
175 |
return final_prompt
|
176 |
else:
|
177 |
return final_prompt
|
178 |
-
|
|
|
179 |
# what is my name based on the following context .
|
180 |
# context:
|
181 |
# retrieved documents:
|
182 |
-
# and the following history of the conversation :
|
183 |
# USER : my name is nour
|
184 |
# ASSISTANT : hi nour
|
185 |
# USER : what is my name ?
|
186 |
-
|
187 |
|
188 |
|
189 |
def upload_user_data(username, url=None, description=None, pdf_file=None):
|
@@ -194,19 +210,21 @@ def upload_user_data(username, url=None, description=None, pdf_file=None):
|
|
194 |
return message
|
195 |
|
196 |
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
# retrieve data from vector store
|
201 |
retrieved_documents = retrieve(prompt, username)
|
202 |
# format prompt
|
203 |
-
formatted_prompt = format_prompt(
|
204 |
-
|
205 |
-
|
|
|
|
|
206 |
# ask mistral
|
207 |
result = ask_mistral(formatted_prompt)
|
208 |
-
#history.append([prompt,result])
|
209 |
-
new_history= history+ [(prompt,result)]
|
210 |
return new_history
|
211 |
|
212 |
|
@@ -230,43 +248,75 @@ upload_data = gr.Interface(
|
|
230 |
)
|
231 |
|
232 |
|
|
|
|
|
|
|
|
|
233 |
with gr.Blocks() as user_interface:
|
234 |
-
gr.Markdown(
|
235 |
-
|
|
|
|
|
236 |
username = gr.Textbox(label="username")
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
242 |
with gr.Row():
|
243 |
-
clear=gr.Button("🗑️Clear",variant="secondary")
|
244 |
-
submit=gr.Button("✅Submit",variant="primary")
|
245 |
|
|
|
246 |
submit.click(
|
247 |
fn=user_retrieve_and_generate,
|
248 |
-
inputs=[username, tone, marketing_technique, prompt, chatbot,social_media],
|
249 |
outputs=[chatbot],
|
250 |
-
api_name="generate"
|
251 |
-
)
|
252 |
-
|
|
|
253 |
prompt.submit(
|
254 |
fn=user_retrieve_and_generate,
|
255 |
-
inputs=[username, tone, marketing_technique, prompt, chatbot,social_media],
|
256 |
outputs=[chatbot],
|
257 |
-
api_name=False
|
258 |
-
)
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
|
|
|
264 |
|
265 |
demo = gr.TabbedInterface(
|
266 |
-
[upload_data, user_interface],
|
|
|
|
|
267 |
)
|
268 |
|
269 |
|
270 |
if __name__ == "__main__":
|
271 |
-
demo.launch(
|
272 |
-
|
|
|
|
10 |
from pinecone import Pinecone, ServerlessSpec
|
11 |
|
12 |
|
|
|
13 |
# Read variables from a .env file, then fetch the two required secrets.
# os.environ[...] raises KeyError when a key is missing.
load_dotenv()
HF_TOKEN = os.environ["HF_TOKEN"]
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]

# Pinecone client, created with the API key.
pc = Pinecone(api_key=PINECONE_API_KEY)

# Embedding model served through the Hugging Face Inference API.
embedder = HuggingFaceInferenceAPIEmbeddings(
    api_key=HF_TOKEN,
    model_name="mixedbread-ai/mxbai-embed-large-v1",
)

# Name of the Pinecone index used by the functions in this module.
index = "db"
|
31 |
|
32 |
|
|
|
33 |
# users = {
|
34 |
# "aymen": "admin",
|
35 |
# "amin": "root",
|
|
|
44 |
data = []
|
45 |
if url != None:
|
46 |
try:
|
47 |
+
loader = WebBaseLoader(
|
48 |
+
url, encoding="utf-8"
|
49 |
+
) # WebBaseLoader: charge et extrait le contenu textuel d'une page web
|
50 |
loaded = loader.load()
|
51 |
+
for page in loaded :
|
52 |
+
data.append(page.page_content)
|
53 |
except Exception as e:
|
54 |
print("An error occurred while loading data from the URL:", e)
|
55 |
|
56 |
if description != None:
|
57 |
data.append(description)
|
58 |
if pdf != None:
|
59 |
+
loader = PyPDFLoader(
|
60 |
+
pdf
|
61 |
+
) # PyPDFLoader: charge et divise un fichier PDF en pages
|
62 |
pages = loader.load_and_split()
|
63 |
for page in pages:
|
64 |
data.append(page.page_content)
|
65 |
return data
|
66 |
|
67 |
|
|
|
68 |
# function to Split the loaded data
|
69 |
+
def split_data(data, chunk_size=512, chunk_overlap=60):
    """Split raw text strings into smaller, overlapping chunks.

    Args:
        data: List of text strings to split.
        chunk_size: Target size of each chunk, passed to
            ``RecursiveCharacterTextSplitter`` (defaults to 512).
        chunk_overlap: Overlap between consecutive chunks, passed to
            ``RecursiveCharacterTextSplitter`` (defaults to 60). The overlap
            keeps some shared context across chunk boundaries.

    Returns:
        The documents returned by ``text_splitter.create_documents(data)``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.create_documents(data)
|
81 |
+
|
82 |
|
83 |
# crée un index Pinecone pour un utilisateur s'il n'existe pas déjà
|
84 |
def create_user_index(index_name):
|
85 |
"""Creates a Pinecone index with the username, validating the name first."""
|
86 |
|
87 |
+
existing_indexes = [
|
88 |
+
index.name for index in pc.list_indexes()
|
89 |
+
] # liste les index existants.
|
90 |
if index_name in existing_indexes:
|
91 |
# L'index existe déjà, ne le recréez pas
|
92 |
return
|
|
|
99 |
spec=ServerlessSpec(cloud="aws", region="us-east-1"),
|
100 |
)
|
101 |
|
102 |
+
|
103 |
# embed: crée des embeddings pour les documents divisés et les stocke dans un magasin de vecteurs Pinecone.
|
104 |
def embed(splited_docs, username):
    """Embed the split documents and store them in a Pinecone vector store.

    Args:
        splited_docs: Documents to embed (presumably the output of
            ``split_data`` - confirm against the caller).
        username: Passed as the Pinecone ``namespace`` for the stored vectors.
    """
    # Create the index first if it does not exist yet.
    create_user_index(index)

    store_kwargs = {
        "documents": splited_docs,
        "index_name": index,
        "embedding": embedder,
        "namespace": username,
    }
    PineconeVectorStore.from_documents(**store_kwargs)
|
115 |
|
|
|
|
|
|
|
|
|
116 |
# Créez un retrieveur
|
117 |
# retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 3})
|
118 |
# return retriever
|
119 |
|
120 |
+
|
121 |
# Récupération des Documents
|
122 |
# retrieve documents from the dataset
|
123 |
def retrieve(prompt, username):
    """Fetch the stored documents that best match ``prompt``.

    Args:
        prompt: Query text handed to the retriever.
        username: Pinecone namespace to search in.

    Returns:
        Whatever ``retriever.invoke(prompt)`` returns for an MMR search
        configured with ``k=2``.
    """
    store = PineconeVectorStore.from_existing_index(
        index_name=index,
        embedding=embedder,
        namespace=username,
    )
    mmr_retriever = store.as_retriever(
        search_type="mmr",
        search_kwargs={"k": 2},
    )
    return mmr_retriever.invoke(prompt)
|
132 |
|
133 |
|
134 |
+
def format_prompt(prompt, retrieved_documents, tone, marketing_technique, social_media):
    """Build the final LLM prompt from the question, options and context.

    Args:
        prompt: The user's question.
        retrieved_documents: Iterable of objects exposing ``page_content``;
            each one is appended to the "Context" section on its own line.
        tone: Tone of voice, or ``"Default"`` to leave it out.
        marketing_technique: Marketing technique, or ``"Default"`` to leave
            it out.
        social_media: Target platform, or ``"Default"`` to leave it out.

    Returns:
        The assembled prompt string.
    """
    # The header is built from explicit pieces so its text does not depend on
    # how a multi-line literal happens to be indented in the source file.
    sections = [
        "You are an assistant for digital marketing\n"
        "You are given the extracted parts of a long document and a question. "
        "Provide a conversational answer.\n"
        "If you don't know the answer, just ignore the context.\n"
        f"Question: \n{prompt}\n"
    ]

    # Every optional field uses the same "Label: value" form.
    optional_fields = (
        ("Tone", tone),
        ("Marketing technique", marketing_technique),
        ("Social media platform", social_media),
    )
    sections.extend(
        f"{label}: {value}\n"
        for label, value in optional_fields
        if value != "Default"
    )

    sections.append("Context:\n")
    sections.extend(
        f"{document.page_content}\n" for document in retrieved_documents
    )
    return "".join(sections)
|
151 |
|
152 |
|
|
|
156 |
# Paris est non seulement la capitale de la France, mais aussi la plus grande ville du pays.
|
157 |
|
158 |
|
|
|
159 |
# based on the following context
|
160 |
# basé sur le contexte suivant
|
161 |
# If you don't know the answer, just say "I do not know."Don't make up an answer.
|
162 |
|
163 |
+
|
164 |
def clear_history(history):
    """Return a new empty chat history; the ``history`` argument is ignored."""
    emptied = []
    return emptied
|
166 |
|
167 |
|
|
|
|
|
168 |
# function to Use a mistral llm via api hugging face space
|
169 |
def ask_mistral(prompt):
|
170 |
client = Client("hysts/mistral-7b")
|
|
|
177 |
repetition_penalty=1.2,
|
178 |
api_name="/chat"
|
179 |
)
|
|
|
180 |
return result
|
181 |
|
182 |
|
|
|
183 |
def inject_history(final_prompt, history):
|
184 |
if len(history) > 0:
|
185 |
final_prompt = (
|
186 |
+
final_prompt + "\n\nHistory : \n "
|
187 |
)
|
188 |
for user, assistant in history:
|
189 |
final_prompt = final_prompt + "USER : " + user + "\n"
|
|
|
191 |
return final_prompt
|
192 |
else:
|
193 |
return final_prompt
|
194 |
+
|
195 |
+
|
196 |
# what is my name based on the following context .
|
197 |
# context:
|
198 |
# retrieved documents:
|
199 |
+
# and the following history of the conversation :
|
200 |
# USER : my name is nour
|
201 |
# ASSISTANT : hi nour
|
202 |
# USER : what is my name ?
|
|
|
203 |
|
204 |
|
205 |
def upload_user_data(username, url=None, description=None, pdf_file=None):
|
|
|
210 |
return message
|
211 |
|
212 |
|
213 |
+
def user_retrieve_and_generate(
    username, tone, marketing_technique, prompt, history, social_media
):
    """Answer ``prompt`` using the user's stored documents and extend the chat.

    Args:
        username: Namespace whose documents are searched.
        tone: Tone option forwarded to ``format_prompt``.
        marketing_technique: Technique option forwarded to ``format_prompt``.
        prompt: The user's question.
        history: Current chat history as a list of (user, assistant) pairs.
        social_media: Platform option forwarded to ``format_prompt``.

    Returns:
        A new history list with ``(prompt, result)`` appended; the incoming
        list is not mutated.
    """
    # Look up context documents for this user.
    context_docs = retrieve(prompt, username)

    # Assemble the text sent to the model. The chat history is not injected
    # into it (the inject_history call is disabled).
    llm_prompt = format_prompt(
        prompt, context_docs, tone, marketing_technique, social_media
    )

    answer = ask_mistral(llm_prompt)
    return history + [(prompt, answer)]
|
229 |
|
230 |
|
|
|
248 |
)
|
249 |
|
250 |
|
251 |
+
def clear_prompt(prompt):
    """Return an empty string; the ``prompt`` argument is ignored."""
    blank = ""
    return blank
|
253 |
+
|
254 |
+
|
255 |
# Chat tab: the user picks a username and optional generation settings, then
# asks questions that are answered by user_retrieve_and_generate.
# NOTE(review): widget nesting (what sits inside the accordion and the row) was
# reconstructed from the component order - confirm against the rendered UI.
with gr.Blocks() as user_interface:
    gr.Markdown(
        value="""user interface to retrieve and generate text based on uploaded data.""",
        label=None,
    )
    username = gr.Textbox(label="username")

    # Optional settings; "Default" is the initial value of each one.
    with gr.Accordion("Extra options ⚙️", open=False):
        tone = gr.Dropdown(
            ["Default", "neutral", "funny", "serious", "formal"],
            value="Default",
            label="tone of voice used in the replies",
        )
        marketing_technique = gr.Radio(
            [
                "Default",
                "Retargeting",
                "AIDA",
                "Promotion",
                "Testimonial",
                "FOMO",
                "Before and after",
                "Problem and solution",
            ],
            value="Default",
            label="marketing technique to be used in the replies",
        )
        social_media = gr.Radio(
            ["Default", "instagram", "facebook", "twitter"],
            value="Default",
            label="social media platform to be used in the replies",
        )

    chatbot = gr.Chatbot(
        height=450, label="Gradio ChatInterface", show_copy_button=True
    )
    prompt = gr.Textbox(label="prompt")
    with gr.Row():
        clear = gr.Button("🗑️Clear", variant="secondary")
        submit = gr.Button("✅Submit", variant="primary")

    # Both triggers (button click and Enter in the textbox) send the same
    # inputs, in the positional order user_retrieve_and_generate expects.
    generation_inputs = [
        username,
        tone,
        marketing_technique,
        prompt,
        chatbot,
        social_media,
    ]

    # After each generation, empty the prompt textbox.
    submit.click(
        fn=user_retrieve_and_generate,
        inputs=generation_inputs,
        outputs=[chatbot],
        api_name="generate",
    ).then(clear_prompt, inputs=prompt, outputs=prompt, show_api=False)

    prompt.submit(
        fn=user_retrieve_and_generate,
        inputs=generation_inputs,
        outputs=[chatbot],
        api_name=False,
    ).then(clear_prompt, inputs=prompt, outputs=prompt, show_api=False)

    clear.click(fn=clear_history, inputs=chatbot, outputs=chatbot, show_api=False)
|
311 |
|
312 |
# Two-tab application: the upload form and the chat interface.
demo = gr.TabbedInterface(
    interface_list=[upload_data, user_interface],
    tab_names=["upload", "generate"],
    theme="upsatwal/mlsc_tiet",
)


if __name__ == "__main__":
    # Authentication options (auth=custom_auth, auth_message=...) are left
    # commented out in the original call, so the app launches without login.
    demo.launch(debug=True)
|