Spaces:
Sleeping
Sleeping
Rajat.bans
commited on
Commit
•
537373b
1
Parent(s):
ead6614
Added all changes - responses are better than before
Browse files
rag.py
CHANGED
@@ -22,7 +22,9 @@ embeddings_hf = HuggingFaceEmbeddings(model_name=embedding_model_hf)
|
|
22 |
|
23 |
class CLUSTERING:
|
24 |
def __init__(self):
|
25 |
-
self.clustering_algo =
|
|
|
|
|
26 |
|
27 |
def cluster_embeddings(self, embeddings, no_of_clusters, no_of_points):
|
28 |
if self.clustering_algo in {"kmeans-cc", "kmeans-sp"}:
|
@@ -45,10 +47,14 @@ class CLUSTERING:
|
|
45 |
for i, label in enumerate(labels):
|
46 |
if len(clusters_indices[label]) < no_of_points:
|
47 |
clusters_indices[label].append(i)
|
48 |
-
if all(
|
|
|
|
|
49 |
break
|
50 |
elif self.clustering_algo == "spectral":
|
51 |
-
spectral_clustering = SpectralClustering(
|
|
|
|
|
52 |
labels = spectral_clustering.fit_predict(embeddings)
|
53 |
|
54 |
clusters_indices = [[] for _ in range(no_of_clusters)]
|
@@ -62,6 +68,7 @@ class CLUSTERING:
|
|
62 |
for i in range(no_of_clusters)
|
63 |
]
|
64 |
|
|
|
65 |
class VECTOR_DB:
|
66 |
def __init__(self):
|
67 |
self.DB_FAISS_PATH = "./vectorstore/db_faiss_ads_20May_20Jun_webmd_healthline_Health_dupRemoved0.8"
|
@@ -89,20 +96,25 @@ class VECTOR_DB:
|
|
89 |
retreived_documents[i][0].page_content = remove_html_tags(
|
90 |
retreived_documents[i][0].page_content
|
91 |
)
|
92 |
-
|
|
|
|
|
|
|
|
|
|
|
93 |
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
|
102 |
-
best_value = 1
|
103 |
-
if len(retreived_documents):
|
104 |
best_value = retreived_documents[0][1]
|
105 |
-
|
|
|
|
|
106 |
|
107 |
class ADS_RAG:
|
108 |
def __init__(self):
|
@@ -110,11 +122,11 @@ class ADS_RAG:
|
|
110 |
self.db = VECTOR_DB()
|
111 |
self.qa_model_name = "gpt-3.5-turbo"
|
112 |
self.relation_check_best_value_thresh = 0.6
|
113 |
-
self.bestRelationSystemPrompt = """You are an advertising concierge for text ads on websites. Given an INPUT
|
114 |
|
115 |
---------------------------------------
|
116 |
|
117 |
-
**Sample INPUT
|
118 |
|
119 |
Expected json output :
|
120 |
{
|
@@ -123,42 +135,40 @@ Expected json output :
|
|
123 |
}
|
124 |
------------------------------------------------
|
125 |
|
126 |
-
**Sample INPUT
|
127 |
|
128 |
Expected json output :
|
129 |
{
|
130 |
-
"reasoning" : "Given the user's search for 'The Effects of Aging on Skin,' it is clear that they are seeking information related to skin aging. Therefore, the ads that are relevant to skin effects should be considered. Ads 1 and 2 focus on wrinkle treatment and anti-aging solutions, making them pertinent to the user's intent. Ad 3 targets vitiligo and not general skin aging but it is related to skin effect. So it is also relevant. Ads 4 and 5 are about advanced lung cancer, which do not address the interest in skin. Ads 1 and 2, 3 are most relevant to the user's search. So ADS_DATA is relevant to INPUT
|
131 |
"classification": 1
|
132 |
}
|
133 |
---------------------------------------
|
134 |
|
135 |
The ADS_DATA provided to you is as follows:
|
|
|
136 |
|
137 |
-
"""
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
3. FROM REMAINING ADS in each ads cluster form an OPTION which should be both the answer for the QUESTION and related to ads in this cluster.
|
142 |
-
4. Try to generate intelligent creatives for advertising and keep QUESTION within 70 characters and each OPTION with either 4, 5, or 6 words.
|
143 |
-
5. Provide your REASONING behind choosing the QUESTION and the OPTIONS. Now provide the QUESTION and the OPTIONS. Along with each OPTION, provide the ads from ADS_DATA that you associated with it.
|
144 |
|
145 |
---------------------------------------
|
146 |
|
147 |
-
<Sample INPUT
|
148 |
The Effects of Aging on Skin
|
149 |
|
150 |
<Sample ADS_DATA>
|
151 |
-
|
152 |
|
153 |
<Expected json output>
|
154 |
{
|
155 |
-
"reasoning" : "Among the seven ads in **Sample ADS_DATA**, Ads 3 and 6 are irrelevant to the INPUT, so they should be discarded. Ad 1, 2
|
156 |
-
"question": "
|
157 |
-
"options": {"1.
|
158 |
}
|
159 |
-----------------------------------------------
|
160 |
|
161 |
-
<Sample INPUT
|
162 |
Got A Rosemary Bush? Here’re 20 Brilliant & Unusual Ways To Use All That Rosemary
|
163 |
|
164 |
<Sample ADS_DATA>
|
@@ -173,25 +183,25 @@ Got A Rosemary Bush? Here’re 20 Brilliant & Unusual Ways To Use All That Rosem
|
|
173 |
-----------------------------------------------
|
174 |
|
175 |
The ADS_DATA provided to you is as follows:
|
176 |
-
|
177 |
|
178 |
old_system_prompt_additional_example = """
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
"""
|
196 |
|
197 |
def callOpenAiApi(self, messages):
|
@@ -202,14 +212,15 @@ The ADS_DATA provided to you is as follows:
|
|
202 |
messages=messages,
|
203 |
temperature=0,
|
204 |
seed=42,
|
205 |
-
max_tokens=
|
206 |
response_format={"type": "json_object"},
|
207 |
)
|
208 |
tokens_used = response.usage.total_tokens
|
209 |
answer = json.loads(response.choices[0].message.content)
|
210 |
return answer, tokens_used
|
211 |
except Exception as e:
|
212 |
-
print(
|
|
|
213 |
print("Trying Again")
|
214 |
|
215 |
def getBestQuestionOnTheBasisOfPageInformationAndAdsData(
|
@@ -253,7 +264,8 @@ The ADS_DATA provided to you is as follows:
|
|
253 |
)
|
254 |
|
255 |
if relation_answer["classification"] != 0:
|
256 |
-
question_answer, tokens_used_question = self.callOpenAiApi(
|
|
|
257 |
{
|
258 |
"role": "system",
|
259 |
"content": questionSystemPrompt + adsData,
|
@@ -264,7 +276,8 @@ The ADS_DATA provided to you is as follows:
|
|
264 |
"role": "user",
|
265 |
"content": page_information + "\nThe JSON response: ",
|
266 |
}
|
267 |
-
]
|
|
|
268 |
return (relation_answer, tokens_used_relation), (
|
269 |
question_answer,
|
270 |
tokens_used_question,
|
@@ -272,17 +285,22 @@ The ADS_DATA provided to you is as follows:
|
|
272 |
|
273 |
def convertDocumentsClustersToStringForApiCall(self, documents_clusters):
|
274 |
key_counter = count(1)
|
275 |
-
res = json.dumps(
|
276 |
-
{
|
277 |
-
|
278 |
-
|
|
|
|
|
|
|
|
|
|
|
279 |
return res
|
280 |
|
281 |
def changeDocumentsToPrintableString(self, documents_clusters):
|
282 |
res = ""
|
283 |
i = 0
|
284 |
for ind, documents_cluster in enumerate(documents_clusters):
|
285 |
-
res += f"
|
286 |
for document in documents_cluster:
|
287 |
i += 1
|
288 |
res += f"[Ad {i}] Content: {document[0].page_content}\nRevenue: {document[0].metadata['revenue']}\nAd Click Count: {document[0].metadata['ad_click_count']}\nValue: {document[1]}\n"
|
@@ -300,8 +318,23 @@ The ADS_DATA provided to you is as follows:
|
|
300 |
res += "\n"
|
301 |
return res
|
302 |
|
303 |
-
def logResult(
|
304 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
305 |
|
306 |
def getRagResponse(
|
307 |
self, RelationPrompt, QuestionPrompt, threshold, page_information
|
@@ -314,7 +347,9 @@ The ADS_DATA provided to you is as follows:
|
|
314 |
if QuestionPrompt != None or len(QuestionPrompt):
|
315 |
curr_question_prompt = QuestionPrompt
|
316 |
|
317 |
-
documents_clusters, best_value = self.db.queryVectorDB(
|
|
|
|
|
318 |
relation_answer, question_answer = (
|
319 |
self.getBestQuestionOnTheBasisOfPageInformationAndAdsData(
|
320 |
page_information,
|
@@ -324,7 +359,13 @@ The ADS_DATA provided to you is as follows:
|
|
324 |
best_value,
|
325 |
)
|
326 |
)
|
327 |
-
self.logResult(
|
|
|
|
|
|
|
|
|
|
|
|
|
328 |
|
329 |
docs_info = self.changeDocumentsToPrintableString(documents_clusters)
|
330 |
relation_answer_string = self.changeResponseToPrintableString(
|
|
|
22 |
|
23 |
class CLUSTERING:
|
24 |
def __init__(self):
    """Select the clustering algorithm used by ``cluster_embeddings``."""
    # Values matched by cluster_embeddings: 'kmeans-cc', 'kmeans-sp', 'spectral'.
    self.clustering_algo = "kmeans-cc"
|
28 |
|
29 |
def cluster_embeddings(self, embeddings, no_of_clusters, no_of_points):
|
30 |
if self.clustering_algo in {"kmeans-cc", "kmeans-sp"}:
|
|
|
47 |
for i, label in enumerate(labels):
|
48 |
if len(clusters_indices[label]) < no_of_points:
|
49 |
clusters_indices[label].append(i)
|
50 |
+
if all(
|
51 |
+
len(cluster) == no_of_points for cluster in clusters_indices
|
52 |
+
):
|
53 |
break
|
54 |
elif self.clustering_algo == "spectral":
|
55 |
+
spectral_clustering = SpectralClustering(
|
56 |
+
n_clusters=no_of_clusters, affinity="nearest_neighbors", random_state=42
|
57 |
+
)
|
58 |
labels = spectral_clustering.fit_predict(embeddings)
|
59 |
|
60 |
clusters_indices = [[] for _ in range(no_of_clusters)]
|
|
|
68 |
for i in range(no_of_clusters)
|
69 |
]
|
70 |
|
71 |
+
|
72 |
class VECTOR_DB:
|
73 |
def __init__(self):
|
74 |
self.DB_FAISS_PATH = "./vectorstore/db_faiss_ads_20May_20Jun_webmd_healthline_Health_dupRemoved0.8"
|
|
|
96 |
retreived_documents[i][0].page_content = remove_html_tags(
|
97 |
retreived_documents[i][0].page_content
|
98 |
)
|
99 |
+
if len(retreived_documents):
|
100 |
+
embeddings = np.array(
|
101 |
+
embeddings_hf.embed_documents(
|
102 |
+
[doc[0].page_content for doc in retreived_documents]
|
103 |
+
)
|
104 |
+
)
|
105 |
|
106 |
+
clustered_indices = CLUSTERING().cluster_embeddings(
|
107 |
+
embeddings, self.no_of_clusters, self.no_of_ads_in_each_cluster
|
108 |
+
)
|
109 |
+
documents_clusters = [
|
110 |
+
[retreived_documents[ind] for ind in cluster_indices]
|
111 |
+
for cluster_indices in clustered_indices
|
112 |
+
]
|
113 |
|
|
|
|
|
114 |
best_value = retreived_documents[0][1]
|
115 |
+
return documents_clusters, best_value
|
116 |
+
return [], 1
|
117 |
+
|
118 |
|
119 |
class ADS_RAG:
|
120 |
def __init__(self):
|
|
|
122 |
self.db = VECTOR_DB()
|
123 |
self.qa_model_name = "gpt-3.5-turbo"
|
124 |
self.relation_check_best_value_thresh = 0.6
|
125 |
+
self.bestRelationSystemPrompt = """You are an advertising concierge for text ads on websites. Given an INPUT and the available ad inventory (ADS_DATA), your task is to determine whether there are some relevant ADS to INPUT are present in ADS_DATA. ADS WHICH DON'T MATCH USER'S INTENT SHOULD BE CONSIDERED IRRELEVANT
|
126 |
|
127 |
---------------------------------------
|
128 |
|
129 |
+
**Sample INPUT***: What Causes Bright-Yellow Urine and Other Changes in Color?
|
130 |
|
131 |
Expected json output :
|
132 |
{
|
|
|
135 |
}
|
136 |
------------------------------------------------
|
137 |
|
138 |
+
**Sample INPUT**: The Effects of Aging on Skin
|
139 |
|
140 |
Expected json output :
|
141 |
{
|
142 |
+
"reasoning" : "Given the user's search for 'The Effects of Aging on Skin,' it is clear that they are seeking information related to skin aging. Therefore, the ads that are relevant to skin effects should be considered. Ads 1 and 2 focus on wrinkle treatment and anti-aging solutions, making them pertinent to the user's intent. Ad 3 targets vitiligo and not general skin aging but it is related to skin effect. So it is also relevant. Ads 4 and 5 are about advanced lung cancer, which do not address the interest in skin. Ads 1 and 2, 3 are most relevant to the user's search. So ADS_DATA is relevant to INPUT. ",
|
143 |
"classification": 1
|
144 |
}
|
145 |
---------------------------------------
|
146 |
|
147 |
The ADS_DATA provided to you is as follows:
|
148 |
+
"""
|
149 |
|
150 |
+
self.bestQuestionSystemPrompt = """1. You are an advertising concierge for text ads on websites. Given an INPUT and the available ad inventory (ADS_DATA), your task is to form a relevant QUESTION to ask the user visiting the webpage. This question should help identify the user's intent behind visiting the webpage and should be highly attractive.
|
151 |
+
2. Now form a highly attractive/lucrative and diverse/mutually exclusive OPTION which should be both the answer for the QUESTION and related to ads in this cluster.
|
152 |
+
3. Try to generate intelligent creatives for advertising and keep QUESTION within 70 characters and either 2, 3 or 4 options with each OPTION within 4 to 6 words.
|
153 |
+
4. Provide your REASONING behind choosing the QUESTION and the OPTIONS. Now provide the QUESTION and the OPTIONS. Along with each OPTION, provide the ads from ADS_DATA that you associated with it.
|
|
|
|
|
|
|
154 |
|
155 |
---------------------------------------
|
156 |
|
157 |
+
<Sample INPUT>
|
158 |
The Effects of Aging on Skin
|
159 |
|
160 |
<Sample ADS_DATA>
|
161 |
+
{"Cluster 1 Ads": {"Ad 1": "Forget Retinol, Use This Household Item To Fill In Wrinkles - Celebrities Are Ditching Pricey Facelifts For This."}, "Cluster 2 Ads": {"Ad 2": "Stop Covering Your Wrinkles with Make Up - Do This Instead.", "Ad 3": "Living With Migraines? - Discover A Treatment Option. Learn about a type of prescription migraine treatment called CGRP receptor antagonists. Discover a range of resources that may help people dealing with migraines"}, "Cluster 3 Ads": {"Ad 4": "What is Advanced Skin Cancer? - Find Disease Information Here.Find Facts About Advanced Skin Cancer and a Potential Treatment Option.", "Ad 5": "Learn About Advanced Melanoma - Find Disease Information Here.Find Facts About Advanced Melanoma and a Potential Treatment Option.", "Ad 6": "Treatment For CKD - Reduce Risk Of Progressing CKD. Ask About A Treatment That Can Help Reduce Your Risk Of Kidney Failure", "Ad 7": "Are You Living With Vitiligo? - For Patients & Caregivers.Discover An FDA-Approved Topical Cream That May Help With Nonsegmental Vitiligo Repigmentation. Learn About A Copay Savings Card For Eligible Patients With Vitiligo."}]
|
162 |
|
163 |
<Expected json output>
|
164 |
{
|
165 |
+
"reasoning" : "Among the seven ads in **Sample ADS_DATA**, Ads 3 and 6 are irrelevant to the INPUT, so they should be discarded. Ad 1, 2 closely aligns with the user's intent. Ads 4, 5, and 7 are also relevant to INPUT. The question will be formed in a way to connect the PAGE content with the goals of these five relevant ads, making sure they appeal to both specific and general user interests, with the OPTIONS being the answer for QUESTION(it is ensured that no irrelevant options are formed)",
|
166 |
+
"question": "Interested in methods to combat aging skin?",
|
167 |
+
"options": {"1. Retinol Alternatives for Wrinkle Treatment." : ["Ad 1: Forget Retinol, Use This Household Item To Fill In Wrinkles - Celebrities Are Ditching Pricey Facelifts For This."], "2. Reduce Wrinkles without Makeup.": ["Ad 2: Stop Covering Your Wrinkles with Make Up - Do This Instead."], "3. Information on Skin Diseases": ["Ad 3: What is Advanced Skin Cancer? - Find Disease Information Here.Find Facts About Advanced Skin Cancer and a Potential Treatment Option.", "Ad 4: Learn About Advanced Melanoma - Find Disease Information Here.Find Facts About Advanced Melanoma and a Potential Treatment Option.", "Ad 5: Are You Living With Vitiligo? - For Patients & Caregivers.Discover An FDA-Approved Topical Cream That May Help With Nonsegmental Vitiligo Repigmentation. Learn About A Copay Savings Card For Eligible Patients With Vitiligo."]}
|
168 |
}
|
169 |
-----------------------------------------------
|
170 |
|
171 |
+
<Sample INPUT>
|
172 |
Got A Rosemary Bush? Here’re 20 Brilliant & Unusual Ways To Use All That Rosemary
|
173 |
|
174 |
<Sample ADS_DATA>
|
|
|
183 |
-----------------------------------------------
|
184 |
|
185 |
The ADS_DATA provided to you is as follows:
|
186 |
+
"""
|
187 |
|
188 |
old_system_prompt_additional_example = """
|
189 |
+
-----------------------------------------------
|
190 |
+
<Sample INPUT>
|
191 |
+
7 Signs and Symptoms of Magnesium Deficiency
|
192 |
+
|
193 |
+
<Sample ADS_DATA>
|
194 |
+
Ad 1: 4 Warning Signs Of Dementia - Fight Dementia and Memory Loss. 100% Natural Program To Prevent Cognitive Decline. Developed By Dr. Will Mitchell. Read The Reviews-Get a Special Offer. Doctor Recommended. High Quality Standards. 60-Day Refund.
|
195 |
+
Ad 2: About Hyperkalemia - Learn About The Symptoms. High Potassium Can Be A Serious Condition. Learn More About Hyperkalemia Today.
|
196 |
+
Ad 3: Weak or Paralyzed Muscles? - A Common Symptom of Cataplexy. About 70% of People With Narcolepsy Are Believed to Have Cataplexy Symptoms. Learn More. Download the Doctor Discussion Guide to Have a Informed Conversation About Your Health.
|
197 |
+
|
198 |
+
<Expected json output>
|
199 |
+
{
|
200 |
+
"reasoning" : "Given the input '7 Signs and Symptoms of Magnesium Deficiency,' it is evident that the user is looking for information specifically about magnesium deficiency. Ads 1, 2, and 3 discuss topics such as dementia, hyperkalemia, weak muscles, which are not related to magnesium deficiency in any way. Therefore, all the ads in the ADS_DATA are not suitable for the user's query and will be discarded.",
|
201 |
+
"question": "No related ads available to form question and options.",
|
202 |
+
"options": []
|
203 |
+
}
|
204 |
+
------------------------------------------------
|
205 |
"""
|
206 |
|
207 |
def callOpenAiApi(self, messages):
|
|
|
212 |
messages=messages,
|
213 |
temperature=0,
|
214 |
seed=42,
|
215 |
+
max_tokens=1200,
|
216 |
response_format={"type": "json_object"},
|
217 |
)
|
218 |
tokens_used = response.usage.total_tokens
|
219 |
answer = json.loads(response.choices[0].message.content)
|
220 |
return answer, tokens_used
|
221 |
except Exception as e:
|
222 |
+
print(response.choices[0].message.content)
|
223 |
+
print("Error-: ", e)
|
224 |
print("Trying Again")
|
225 |
|
226 |
def getBestQuestionOnTheBasisOfPageInformationAndAdsData(
|
|
|
264 |
)
|
265 |
|
266 |
if relation_answer["classification"] != 0:
|
267 |
+
question_answer, tokens_used_question = self.callOpenAiApi(
|
268 |
+
[
|
269 |
{
|
270 |
"role": "system",
|
271 |
"content": questionSystemPrompt + adsData,
|
|
|
276 |
"role": "user",
|
277 |
"content": page_information + "\nThe JSON response: ",
|
278 |
}
|
279 |
+
]
|
280 |
+
)
|
281 |
return (relation_answer, tokens_used_relation), (
|
282 |
question_answer,
|
283 |
tokens_used_question,
|
|
|
285 |
|
286 |
def convertDocumentsClustersToStringForApiCall(self, documents_clusters):
    """Serialize clustered ads into a JSON object string.

    Args:
        documents_clusters: Iterable of clusters; each cluster is an
            iterable of items whose element ``[0]`` exposes a
            ``page_content`` attribute (the ad text).

    Returns:
        A JSON string with one ``"Option <n> Ads"`` entry per cluster
        (``n`` starts at 1), each mapping ``"Ad <k>"`` to the ad text.
        ``k`` keeps increasing across clusters, so every ad gets a
        unique number.
    """
    # One shared counter so ad numbering continues across clusters.
    ad_numbers = count(1)
    return json.dumps(
        {
            f"Option {cluster_no} Ads": {
                f"Ad {next(ad_numbers)}": document[0].page_content
                for document in documents_cluster
            }
            for cluster_no, documents_cluster in enumerate(
                documents_clusters, start=1
            )
        }
    )
|
298 |
|
299 |
def changeDocumentsToPrintableString(self, documents_clusters):
|
300 |
res = ""
|
301 |
i = 0
|
302 |
for ind, documents_cluster in enumerate(documents_clusters):
|
303 |
+
res += f"Option {ind+1} Ads-:\n"
|
304 |
for document in documents_cluster:
|
305 |
i += 1
|
306 |
res += f"[Ad {i}] Content: {document[0].page_content}\nRevenue: {document[0].metadata['revenue']}\nAd Click Count: {document[0].metadata['ad_click_count']}\nValue: {document[1]}\n"
|
|
|
318 |
res += "\n"
|
319 |
return res
|
320 |
|
321 |
+
def logResult(
    self,
    curr_relation_prompt,
    curr_question_prompt,
    page_information,
    relation_answer,
    question_answer,
):
    """Print the page information and both model answers to stdout.

    Args:
        curr_relation_prompt: Accepted so callers can pass it; not printed.
        curr_question_prompt: Accepted so callers can pass it; not printed.
        page_information: Value printed as-is between the separators.
        relation_answer: JSON-serializable object, printed with indent 4.
        question_answer: JSON-serializable object, printed with indent 4.
    """
    print(
        "**************************************************************************************************\n",
        page_information,
        json.dumps(relation_answer, indent=4),
        json.dumps(question_answer, indent=4),
        "\n************************************************************************************************\n\n",
    )
|
338 |
|
339 |
def getRagResponse(
|
340 |
self, RelationPrompt, QuestionPrompt, threshold, page_information
|
|
|
347 |
if QuestionPrompt != None or len(QuestionPrompt):
|
348 |
curr_question_prompt = QuestionPrompt
|
349 |
|
350 |
+
documents_clusters, best_value = self.db.queryVectorDB(
|
351 |
+
page_information, threshold
|
352 |
+
)
|
353 |
relation_answer, question_answer = (
|
354 |
self.getBestQuestionOnTheBasisOfPageInformationAndAdsData(
|
355 |
page_information,
|
|
|
359 |
best_value,
|
360 |
)
|
361 |
)
|
362 |
+
self.logResult(
|
363 |
+
curr_relation_prompt,
|
364 |
+
curr_relation_prompt,
|
365 |
+
page_information,
|
366 |
+
relation_answer,
|
367 |
+
question_answer,
|
368 |
+
)
|
369 |
|
370 |
docs_info = self.changeDocumentsToPrintableString(documents_clusters)
|
371 |
relation_answer_string = self.changeResponseToPrintableString(
|