mohbay committed on
Commit 61a6c42 · verified · 1 Parent(s): 892da5a

Update app.py

Files changed (1)
  1. app.py +69 -188
app.py CHANGED
@@ -3,19 +3,13 @@ import pandas as pd
 from sentence_transformers import SentenceTransformer, util
 import gradio as gr
 import re
-import numpy as np
-from collections import Counter

-# Load models
 model = SentenceTransformer("distilbert-base-multilingual-cased")
 modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
-
-# Load data
 df = pd.read_csv("cleaned1.csv")
 df2 = pd.read_csv("cleaned2.csv")
 df3 = pd.read_csv("cleaned3.csv")

-# Load embeddings
 embeddings = torch.load("embeddings1_1.pt")
 embeddings2 = torch.load("embeddings2_1.pt")
 embeddings3 = torch.load("embeddings3_1.pt")
@@ -24,7 +18,6 @@ embeddingsa = torch.load("embeddings1.pt")
 embeddingsa2 = torch.load("embeddings2.pt")
 embeddingsa3 = torch.load("embeddings3.pt")

-# Extract data arrays
 df_questions = df["question"].values
 df_links = df["link"].values
 df2_questions = df2["question"].values
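
Note: the commit never shows how the six .pt files are produced. From the cosine pairing in predict() below, embeddings{N}_1.pt line up with the DistilBERT encoder and embeddings{N}.pt with the MiniLM encoder. A minimal offline precomputation sketch under that assumption (this script is hypothetical, not part of the repo):

    import pandas as pd
    import torch
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("distilbert-base-multilingual-cased")
    modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

    for i, csv in enumerate(["cleaned1.csv", "cleaned2.csv", "cleaned3.csv"], start=1):
        questions = pd.read_csv(csv)["question"].fillna("").tolist()
        # One tensor per (model, dataset) pair, matching the filenames app.py loads.
        torch.save(model.encode(questions, convert_to_tensor=True), f"embeddings{i}_1.pt")
        torch.save(modela.encode(questions, convert_to_tensor=True), f"embeddings{i}.pt")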
@@ -33,17 +26,13 @@ df3_questions = df3["question"].values
 df3_links = df3["url"].values

 def arabic_word_tokenize(text):
-    """Improved tokenization with better handling of Arabic text"""
     if not isinstance(text, str):
         return []
-    # Remove diacritics and normalize
-    text = re.sub(r'[\u064B-\u065F\u0670\u06D6-\u06ED]', '', text)
-    # Extract words (Arabic, English, and numbers)
-    words = re.findall(r'[\u0600-\u06FF\u0750-\u077F\w]+', text.lower())
-    return words
+    # Remove diacritics for better matching
+    text = re.sub(r'[\u064B-\u065F\u0670]', '', text)
+    return re.findall(r'[\u0600-\u06FF\w]+', text.lower())

-def compute_enhanced_word_overlap(query, questions):
-    """Enhanced word overlap with better scoring"""
+def compute_word_overlap(query, questions):
     query_words = set(arabic_word_tokenize(query))
     if len(query_words) == 0:
         return [0.0] * len(questions)
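
The rewritten tokenizer strips the Arabic diacritics U+064B-U+065F and U+0670 before matching, so vocalized and unvocalized spellings of a word yield the same tokens; the old version additionally stripped U+06D6-U+06ED and matched the wider U+0750-U+077F block. A quick illustration of the new behavior (the sample strings are mine):

    import re

    def arabic_word_tokenize(text):
        if not isinstance(text, str):
            return []
        # Remove diacritics for better matching
        text = re.sub(r'[\u064B-\u065F\u0670]', '', text)
        return re.findall(r'[\u0600-\u06FF\w]+', text.lower())

    print(arabic_word_tokenize("كَتَبَ الوَلَدُ"))  # ['كتب', 'الولد'] - diacritics removed
    print(arabic_word_tokenize("كتب الولد"))        # ['كتب', 'الولد'] - identical tokens, so the two match
    print(arabic_word_tokenize("Python 3.9"))       # ['python', '3', '9'] - \w covers Latin letters and digits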
@@ -55,83 +44,28 @@ def compute_enhanced_word_overlap(query, questions):
             overlaps.append(0.0)
             continue

-        # Jaccard similarity (intersection over union)
+        # Use Jaccard similarity (intersection over union) instead of just coverage
         intersection = len(query_words & q_words)
         union = len(query_words | q_words)
         jaccard = intersection / union if union > 0 else 0.0

-        # Word coverage (how much of query is covered)
+        # Also compute coverage (how much of query is matched)
         coverage = intersection / len(query_words)

-        # Combine both metrics
-        combined_overlap = 0.6 * jaccard + 0.4 * coverage
-        overlaps.append(combined_overlap)
+        # Combine both: prioritize coverage but consider similarity
+        overlap_score = 0.7 * coverage + 0.3 * jaccard
+        overlaps.append(overlap_score)

     return overlaps

-def compute_fuzzy_matches(query, questions):
-    """Compute fuzzy string matching scores"""
-    query_words = arabic_word_tokenize(query)
-    if len(query_words) == 0:
-        return [0.0] * len(questions)
-
-    fuzzy_scores = []
-    for q in questions:
-        q_words = arabic_word_tokenize(q)
-        if len(q_words) == 0:
-            fuzzy_scores.append(0.0)
-            continue
-
-        # Find partial matches (substrings)
-        matches = 0
-        for q_word in query_words:
-            for doc_word in q_words:
-                if len(q_word) >= 3 and len(doc_word) >= 3:
-                    if q_word in doc_word or doc_word in q_word:
-                        matches += 1
-                        break
-
-        fuzzy_score = matches / len(query_words) if len(query_words) > 0 else 0.0
-        fuzzy_scores.append(fuzzy_score)
-
-    return fuzzy_scores
-
-def compute_length_penalty(query, questions):
-    """Penalize very long or very short results relative to query"""
-    query_len = len(arabic_word_tokenize(query))
-    penalties = []
-
-    for q in questions:
-        q_len = len(arabic_word_tokenize(q))
-        if q_len == 0:
-            penalties.append(0.0)
-            continue
-
-        # Optimal length ratio (prefer similar lengths)
-        ratio = min(query_len, q_len) / max(query_len, q_len)
-        # Penalty for very short results
-        if q_len < 3:
-            ratio *= 0.5
-        penalties.append(ratio)
-
-    return penalties
-
-def normalize_scores(scores):
-    """Normalize scores to 0-1 range"""
-    scores = np.array(scores)
-    if scores.max() - scores.min() == 0:
-        return scores
-    return (scores - scores.min()) / (scores.max() - scores.min())
-
 def predict(text):
     if not text or text.strip() == "":
         return "No query provided"

-    # Encode query with both models
     query_embedding = model.encode(text, convert_to_tensor=True)
     query_embeddinga = modela.encode(text, convert_to_tensor=True)

-    # Compute semantic similarities
+    # Cosine similarities
     sim_scores1 = (util.pytorch_cos_sim(query_embedding, embeddings)[0] +
                    util.pytorch_cos_sim(query_embeddinga, embeddingsa)[0]) / 2
     sim_scores2 = (util.pytorch_cos_sim(query_embedding, embeddings2)[0] +
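
The overlap formula flips from 0.6 * jaccard + 0.4 * coverage to 0.7 * coverage + 0.3 * jaccard, which favors stored questions that contain all the query words even when they are much longer than the query. A worked example on hypothetical token sets:

    query_words = {"حكم", "صيام", "السفر"}                       # 3-word query
    q_words = {"حكم", "صيام", "السفر", "في", "رمضان", "للمريض"}  # longer stored question

    intersection = len(query_words & q_words)   # 3
    union = len(query_words | q_words)          # 6
    jaccard = intersection / union              # 0.5
    coverage = intersection / len(query_words)  # 1.0 - every query word is present

    old_score = 0.6 * jaccard + 0.4 * coverage  # 0.70
    new_score = 0.7 * coverage + 0.3 * jaccard  # 0.85 - the long question is penalized less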
@@ -139,159 +73,106 @@ def predict(text):
     sim_scores3 = (util.pytorch_cos_sim(query_embedding, embeddings3)[0] +
                    util.pytorch_cos_sim(query_embeddinga, embeddingsa3)[0]) / 2

-    # Compute enhanced word overlaps
-    word_overlap1 = compute_enhanced_word_overlap(text, df_questions)
-    word_overlap2 = compute_enhanced_word_overlap(text, df2_questions)
-    word_overlap3 = compute_enhanced_word_overlap(text, df3_questions)
-
-    # Compute fuzzy matches
-    fuzzy_scores1 = compute_fuzzy_matches(text, df_questions)
-    fuzzy_scores2 = compute_fuzzy_matches(text, df2_questions)
-    fuzzy_scores3 = compute_fuzzy_matches(text, df3_questions)
+    # Enhanced word overlaps
+    word_overlap1 = compute_word_overlap(text, df_questions)
+    word_overlap2 = compute_word_overlap(text, df2_questions)
+    word_overlap3 = compute_word_overlap(text, df3_questions)

-    # Compute length penalties
-    length_penalties1 = compute_length_penalty(text, df_questions)
-    length_penalties2 = compute_length_penalty(text, df2_questions)
-    length_penalties3 = compute_length_penalty(text, df3_questions)
-
-    # Normalize all scores
-    sem_scores1 = normalize_scores([float(x.cpu().item()) for x in sim_scores1])
-    sem_scores2 = normalize_scores([float(x.cpu().item()) for x in sim_scores2])
-    sem_scores3 = normalize_scores([float(x.cpu().item()) for x in sim_scores3])
-
-    word_scores1 = normalize_scores(word_overlap1)
-    word_scores2 = normalize_scores(word_overlap2)
-    word_scores3 = normalize_scores(word_overlap3)
-
-    fuzzy_scores1_norm = normalize_scores(fuzzy_scores1)
-    fuzzy_scores2_norm = normalize_scores(fuzzy_scores2)
-    fuzzy_scores3_norm = normalize_scores(fuzzy_scores3)
-
-    # Adaptive weights based on query characteristics
+    # Adaptive weighting based on query length
     query_words = arabic_word_tokenize(text)
     if len(query_words) <= 2:
-        # Short queries: prioritize exact matches
-        semantic_weight = 0.3
-        word_weight = 0.5
-        fuzzy_weight = 0.2
+        # Short queries: prioritize exact word matches
+        weight = 0.6
     elif len(query_words) <= 5:
-        # Medium queries: balanced approach
-        semantic_weight = 0.4
-        word_weight = 0.4
-        fuzzy_weight = 0.2
+        # Medium queries: balanced
+        weight = 0.4
     else:
         # Long queries: prioritize semantic similarity
-        semantic_weight = 0.5
-        word_weight = 0.3
-        fuzzy_weight = 0.2
+        weight = 0.25

-    # Collect results for dataset 1
+    # Collect top1 with better scoring
     combined1 = []
     for i in range(len(df_questions)):
-        combined_score = (
-            semantic_weight * sem_scores1[i] +
-            word_weight * word_scores1[i] +
-            fuzzy_weight * fuzzy_scores1_norm[i]
-        ) * length_penalties1[i]
+        semantic_score = float(sim_scores1[i].cpu().item())
+        word_score = float(word_overlap1[i])
+
+        # Boost results that have both good semantic AND word overlap
+        if semantic_score > 0.5 and word_score > 0.3:
+            boost = 0.1
+        else:
+            boost = 0.0
+
+        combined_score = semantic_score + weight * word_score + boost

         combined1.append({
             "question": df_questions[i],
             "link": df_links[i],
-            "cosine_score": float(sim_scores1[i].cpu().item()),
-            "word_overlap_score": float(word_overlap1[i]),
-            "fuzzy_score": float(fuzzy_scores1[i]),
-            "length_penalty": float(length_penalties1[i]),
-            "combined_score": float(combined_score)
+            "cosine_score": semantic_score,
+            "word_overlap_score": word_score,
+            "combined_score": combined_score
         })

-    # Collect results for dataset 2
+    # Collect top2 with better scoring
     combined2 = []
     for i in range(len(df2_questions)):
-        combined_score = (
-            semantic_weight * sem_scores2[i] +
-            word_weight * word_scores2[i] +
-            fuzzy_weight * fuzzy_scores2_norm[i]
-        ) * length_penalties2[i]
+        semantic_score = float(sim_scores2[i].cpu().item())
+        word_score = float(word_overlap2[i])
+
+        if semantic_score > 0.5 and word_score > 0.3:
+            boost = 0.1
+        else:
+            boost = 0.0
+
+        combined_score = semantic_score + weight * word_score + boost

         combined2.append({
             "question": df2_questions[i],
             "link": df2_links[i],
-            "cosine_score": float(sim_scores2[i].cpu().item()),
-            "word_overlap_score": float(word_overlap2[i]),
-            "fuzzy_score": float(fuzzy_scores2[i]),
-            "length_penalty": float(length_penalties2[i]),
-            "combined_score": float(combined_score)
+            "cosine_score": semantic_score,
+            "word_overlap_score": word_score,
+            "combined_score": combined_score
         })

-    # Collect results for dataset 3
+    # Collect top3 with better scoring
     combined3 = []
     for i in range(len(df3_questions)):
-        combined_score = (
-            semantic_weight * sem_scores3[i] +
-            word_weight * word_scores3[i] +
-            fuzzy_weight * fuzzy_scores3_norm[i]
-        ) * length_penalties3[i]
+        semantic_score = float(sim_scores3[i].cpu().item())
+        word_score = float(word_overlap3[i])
+
+        if semantic_score > 0.5 and word_score > 0.3:
+            boost = 0.1
+        else:
+            boost = 0.0
+
+        combined_score = semantic_score + weight * word_score + boost

         combined3.append({
             "question": df3_questions[i],
             "link": df3_links[i],
-            "cosine_score": float(sim_scores3[i].cpu().item()),
-            "word_overlap_score": float(word_overlap3[i]),
-            "fuzzy_score": float(fuzzy_scores3[i]),
-            "length_penalty": float(length_penalties3[i]),
-            "combined_score": float(combined_score)
+            "cosine_score": semantic_score,
+            "word_overlap_score": word_score,
+            "combined_score": combined_score
         })

-    # Get top results with diversity filtering
-    def get_diverse_top_results(results, top_k=5):
-        """Get top results while avoiding too similar ones"""
-        sorted_results = sorted(results, key=lambda x: x["combined_score"], reverse=True)
-
-        diverse_results = []
-        for result in sorted_results:
-            if len(diverse_results) >= top_k:
-                break
-
-            # Check if this result is too similar to already selected ones
-            is_diverse = True
-            for selected in diverse_results:
-                # Simple diversity check based on word overlap
-                overlap = compute_enhanced_word_overlap(result["question"], [selected["question"]])[0]
-                if overlap > 0.8:  # Too similar
-                    is_diverse = False
-                    break
-
-            if is_diverse:
-                diverse_results.append(result)
-
-        return diverse_results
-
-    top1 = get_diverse_top_results(combined1, 3)
-    top2 = get_diverse_top_results(combined2, 3)
-    top3 = get_diverse_top_results(combined3, 3)
+    # Get top results - consider more candidates then filter
+    top1 = sorted(combined1, key=lambda x: x["combined_score"], reverse=True)[:5]
+    top2 = sorted(combined2, key=lambda x: x["combined_score"], reverse=True)[:5]
+    top3 = sorted(combined3, key=lambda x: x["combined_score"], reverse=True)[:5]

     results = {
-        "top1": top1,
+
         "top2": top2,
         "top3": top3,
-        "query_analysis": {
-            "word_count": len(query_words),
-            "semantic_weight": semantic_weight,
-            "word_weight": word_weight,
-            "fuzzy_weight": fuzzy_weight
-        }
+        "top1": top1,
     }

     return results

-title = "Enhanced Search CSV"
+title = "Search CSV"
 iface = gr.Interface(
     fn=predict,
-    inputs=[gr.Textbox(label="Search Query", lines=3, placeholder="Enter your search query here...")],
+    inputs=[gr.Textbox(label="text", lines=3)],
     outputs='json',
     title=title,
-    description="Enhanced semantic search with improved matching algorithms"
 )
-
-if __name__ == "__main__":
-    iface.launch()
+iface.launch()
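
Taken together, the new predict() replaces score normalization, fuzzy matching, length penalties, and the diversity filter with a single pass per dataset: the raw cosine similarity is the base, word overlap is added with a query-length-dependent weight, and a flat 0.1 boost rewards candidates where both signals are strong. A condensed restatement of that scoring rule on made-up inputs:

    def combined(semantic_score, word_score, n_query_words):
        # Adaptive weight: short queries lean harder on exact word matches.
        if n_query_words <= 2:
            weight = 0.6
        elif n_query_words <= 5:
            weight = 0.4
        else:
            weight = 0.25
        # Flat boost when both signals agree (thresholds taken from the diff).
        boost = 0.1 if semantic_score > 0.5 and word_score > 0.3 else 0.0
        return semantic_score + weight * word_score + boost

    print(combined(0.62, 0.85, 3))  # 0.62 + 0.4*0.85 + 0.1 = 1.06
    print(combined(0.62, 0.10, 3))  # 0.62 + 0.4*0.10 + 0.0 = 0.66

Each combinedN list is then sorted by combined_score and cut to five entries, and the JSON response carries top1, top2, and top3 side by side. Since the score is an unnormalized cosine plus a weighted overlap, it can exceed 1.0; that is harmless for ranking within a single dataset.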