mohbay committed on
Commit 61a6c42 · verified · 1 Parent(s): 892da5a

Update app.py

Files changed (1)
  1. app.py +69 -188
app.py CHANGED
@@ -3,19 +3,13 @@ import pandas as pd
 from sentence_transformers import SentenceTransformer, util
 import gradio as gr
 import re
-import numpy as np
-from collections import Counter

-# Load models
 model = SentenceTransformer("distilbert-base-multilingual-cased")
 modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
-
-# Load data
 df = pd.read_csv("cleaned1.csv")
 df2 = pd.read_csv("cleaned2.csv")
 df3 = pd.read_csv("cleaned3.csv")

-# Load embeddings
 embeddings = torch.load("embeddings1_1.pt")
 embeddings2 = torch.load("embeddings2_1.pt")
 embeddings3 = torch.load("embeddings3_1.pt")
@@ -24,7 +18,6 @@ embeddingsa = torch.load("embeddings1.pt")
 embeddingsa2 = torch.load("embeddings2.pt")
 embeddingsa3 = torch.load("embeddings3.pt")

-# Extract data arrays
 df_questions = df["question"].values
 df_links = df["link"].values
 df2_questions = df2["question"].values
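
Note: the commit never shows how the six .pt files are produced. From the cosine pairing in predict() below, embeddings{N}_1.pt line up with the DistilBERT encoder and embeddings{N}.pt with the MiniLM encoder. A minimal offline precomputation sketch under that assumption (this script is hypothetical, not part of the repo):

    import pandas as pd
    import torch
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("distilbert-base-multilingual-cased")
    modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

    for i, csv in enumerate(["cleaned1.csv", "cleaned2.csv", "cleaned3.csv"], start=1):
        questions = pd.read_csv(csv)["question"].fillna("").tolist()
        # One tensor per (model, dataset) pair, matching the filenames app.py loads.
        torch.save(model.encode(questions, convert_to_tensor=True), f"embeddings{i}_1.pt")
        torch.save(modela.encode(questions, convert_to_tensor=True), f"embeddings{i}.pt")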
@@ -33,17 +26,13 @@ df3_questions = df3["question"].values
 df3_links = df3["url"].values

 def arabic_word_tokenize(text):
-    """Improved tokenization with better handling of Arabic text"""
     if not isinstance(text, str):
         return []
-    # Remove diacritics and normalize
-    text = re.sub(r'[\u064B-\u065F\u0670\u06D6-\u06ED]', '', text)
-    # Extract words (Arabic, English, and numbers)
-    words = re.findall(r'[\u0600-\u06FF\u0750-\u077F\w]+', text.lower())
-    return words
+    # Remove diacritics for better matching
+    text = re.sub(r'[\u064B-\u065F\u0670]', '', text)
+    return re.findall(r'[\u0600-\u06FF\w]+', text.lower())

-def compute_enhanced_word_overlap(query, questions):
-    """Enhanced word overlap with better scoring"""
+def compute_word_overlap(query, questions):
     query_words = set(arabic_word_tokenize(query))
     if len(query_words) == 0:
         return [0.0] * len(questions)
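
The rewritten tokenizer strips the Arabic diacritics U+064B-U+065F and U+0670 before matching, so vocalized and unvocalized spellings of a word yield the same tokens; the old version additionally stripped U+06D6-U+06ED and matched the wider U+0750-U+077F block. A quick illustration of the new behavior (the sample strings are mine):

    import re

    def arabic_word_tokenize(text):
        if not isinstance(text, str):
            return []
        # Remove diacritics for better matching
        text = re.sub(r'[\u064B-\u065F\u0670]', '', text)
        return re.findall(r'[\u0600-\u06FF\w]+', text.lower())

    print(arabic_word_tokenize("كَتَبَ الوَلَدُ"))  # ['كتب', 'الولد'] - diacritics removed
    print(arabic_word_tokenize("كتب الولد"))        # ['كتب', 'الولد'] - identical tokens, so the two match
    print(arabic_word_tokenize("Python 3.9"))       # ['python', '3', '9'] - \w covers Latin letters and digits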
@@ -55,83 +44,28 @@ def compute_enhanced_word_overlap(query, questions):
             overlaps.append(0.0)
             continue

-        # Jaccard similarity (intersection over union)
+        # Use Jaccard similarity (intersection over union) instead of just coverage
         intersection = len(query_words & q_words)
         union = len(query_words | q_words)
         jaccard = intersection / union if union > 0 else 0.0

-        # Word coverage (how much of query is covered)
+        # Also compute coverage (how much of query is matched)
         coverage = intersection / len(query_words)

-        # Combine both metrics
-        combined_overlap = 0.6 * jaccard + 0.4 * coverage
-        overlaps.append(combined_overlap)
+        # Combine both: prioritize coverage but consider similarity
+        overlap_score = 0.7 * coverage + 0.3 * jaccard
+        overlaps.append(overlap_score)

     return overlaps

-def compute_fuzzy_matches(query, questions):
-    """Compute fuzzy string matching scores"""
-    query_words = arabic_word_tokenize(query)
-    if len(query_words) == 0:
-        return [0.0] * len(questions)
-
-    fuzzy_scores = []
-    for q in questions:
-        q_words = arabic_word_tokenize(q)
-        if len(q_words) == 0:
-            fuzzy_scores.append(0.0)
-            continue
-
-        # Find partial matches (substrings)
-        matches = 0
-        for q_word in query_words:
-            for doc_word in q_words:
-                if len(q_word) >= 3 and len(doc_word) >= 3:
-                    if q_word in doc_word or doc_word in q_word:
-                        matches += 1
-                        break
-
-        fuzzy_score = matches / len(query_words) if len(query_words) > 0 else 0.0
-        fuzzy_scores.append(fuzzy_score)
-
-    return fuzzy_scores
-
-def compute_length_penalty(query, questions):
-    """Penalize very long or very short results relative to query"""
-    query_len = len(arabic_word_tokenize(query))
-    penalties = []
-
-    for q in questions:
-        q_len = len(arabic_word_tokenize(q))
-        if q_len == 0:
-            penalties.append(0.0)
-            continue
-
-        # Optimal length ratio (prefer similar lengths)
-        ratio = min(query_len, q_len) / max(query_len, q_len)
-        # Penalty for very short results
-        if q_len < 3:
-            ratio *= 0.5
-        penalties.append(ratio)
-
-    return penalties
-
-def normalize_scores(scores):
-    """Normalize scores to 0-1 range"""
-    scores = np.array(scores)
-    if scores.max() - scores.min() == 0:
-        return scores
-    return (scores - scores.min()) / (scores.max() - scores.min())
-
 def predict(text):
     if not text or text.strip() == "":
         return "No query provided"

-    # Encode query with both models
     query_embedding = model.encode(text, convert_to_tensor=True)
     query_embeddinga = modela.encode(text, convert_to_tensor=True)

-    # Compute semantic similarities
+    # Cosine similarities
     sim_scores1 = (util.pytorch_cos_sim(query_embedding, embeddings)[0] +
                    util.pytorch_cos_sim(query_embeddinga, embeddingsa)[0]) / 2
     sim_scores2 = (util.pytorch_cos_sim(query_embedding, embeddings2)[0] +
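
The overlap formula flips from 0.6 * jaccard + 0.4 * coverage to 0.7 * coverage + 0.3 * jaccard, which favors stored questions that contain all the query words even when they are much longer than the query. A worked example on hypothetical token sets:

    query_words = {"حكم", "صيام", "السفر"}                       # 3-word query
    q_words = {"حكم", "صيام", "السفر", "في", "رمضان", "للمريض"}  # longer stored question

    intersection = len(query_words & q_words)   # 3
    union = len(query_words | q_words)          # 6
    jaccard = intersection / union              # 0.5
    coverage = intersection / len(query_words)  # 1.0 - every query word is present

    old_score = 0.6 * jaccard + 0.4 * coverage  # 0.70
    new_score = 0.7 * coverage + 0.3 * jaccard  # 0.85 - the long question is penalized less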
@@ -139,159 +73,106 @@ def predict(text):
     sim_scores3 = (util.pytorch_cos_sim(query_embedding, embeddings3)[0] +
                    util.pytorch_cos_sim(query_embeddinga, embeddingsa3)[0]) / 2

-    # Compute enhanced word overlaps
-    word_overlap1 = compute_enhanced_word_overlap(text, df_questions)
-    word_overlap2 = compute_enhanced_word_overlap(text, df2_questions)
-    word_overlap3 = compute_enhanced_word_overlap(text, df3_questions)
-
-    # Compute fuzzy matches
-    fuzzy_scores1 = compute_fuzzy_matches(text, df_questions)
-    fuzzy_scores2 = compute_fuzzy_matches(text, df2_questions)
-    fuzzy_scores3 = compute_fuzzy_matches(text, df3_questions)
+    # Enhanced word overlaps
+    word_overlap1 = compute_word_overlap(text, df_questions)
+    word_overlap2 = compute_word_overlap(text, df2_questions)
+    word_overlap3 = compute_word_overlap(text, df3_questions)

-    # Compute length penalties
-    length_penalties1 = compute_length_penalty(text, df_questions)
-    length_penalties2 = compute_length_penalty(text, df2_questions)
-    length_penalties3 = compute_length_penalty(text, df3_questions)
-
-    # Normalize all scores
-    sem_scores1 = normalize_scores([float(x.cpu().item()) for x in sim_scores1])
-    sem_scores2 = normalize_scores([float(x.cpu().item()) for x in sim_scores2])
-    sem_scores3 = normalize_scores([float(x.cpu().item()) for x in sim_scores3])
-
-    word_scores1 = normalize_scores(word_overlap1)
-    word_scores2 = normalize_scores(word_overlap2)
-    word_scores3 = normalize_scores(word_overlap3)
-
-    fuzzy_scores1_norm = normalize_scores(fuzzy_scores1)
-    fuzzy_scores2_norm = normalize_scores(fuzzy_scores2)
-    fuzzy_scores3_norm = normalize_scores(fuzzy_scores3)
-
-    # Adaptive weights based on query characteristics
+    # Adaptive weighting based on query length
     query_words = arabic_word_tokenize(text)
     if len(query_words) <= 2:
-        # Short queries: prioritize exact matches
-        semantic_weight = 0.3
-        word_weight = 0.5
-        fuzzy_weight = 0.2
+        # Short queries: prioritize exact word matches
+        weight = 0.6
     elif len(query_words) <= 5:
-        # Medium queries: balanced approach
-        semantic_weight = 0.4
-        word_weight = 0.4
-        fuzzy_weight = 0.2
+        # Medium queries: balanced
+        weight = 0.4
     else:
         # Long queries: prioritize semantic similarity
-        semantic_weight = 0.5
-        word_weight = 0.3
-        fuzzy_weight = 0.2
+        weight = 0.25

-    # Collect results for dataset 1
+    # Collect top1 with better scoring
     combined1 = []
     for i in range(len(df_questions)):
-        combined_score = (
-            semantic_weight * sem_scores1[i] +
-            word_weight * word_scores1[i] +
-            fuzzy_weight * fuzzy_scores1_norm[i]
-        ) * length_penalties1[i]
+        semantic_score = float(sim_scores1[i].cpu().item())
+        word_score = float(word_overlap1[i])
+
+        # Boost results that have both good semantic AND word overlap
+        if semantic_score > 0.5 and word_score > 0.3:
+            boost = 0.1
+        else:
+            boost = 0.0
+
+        combined_score = semantic_score + weight * word_score + boost

         combined1.append({
             "question": df_questions[i],
             "link": df_links[i],
-            "cosine_score": float(sim_scores1[i].cpu().item()),
-            "word_overlap_score": float(word_overlap1[i]),
-            "fuzzy_score": float(fuzzy_scores1[i]),
-            "length_penalty": float(length_penalties1[i]),
-            "combined_score": float(combined_score)
+            "cosine_score": semantic_score,
+            "word_overlap_score": word_score,
+            "combined_score": combined_score
         })

-    # Collect results for dataset 2
+    # Collect top2 with better scoring
     combined2 = []
     for i in range(len(df2_questions)):
-        combined_score = (
-            semantic_weight * sem_scores2[i] +
-            word_weight * word_scores2[i] +
-            fuzzy_weight * fuzzy_scores2_norm[i]
-        ) * length_penalties2[i]
+        semantic_score = float(sim_scores2[i].cpu().item())
+        word_score = float(word_overlap2[i])
+
+        if semantic_score > 0.5 and word_score > 0.3:
+            boost = 0.1
+        else:
+            boost = 0.0
+
+        combined_score = semantic_score + weight * word_score + boost

         combined2.append({
             "question": df2_questions[i],
             "link": df2_links[i],
-            "cosine_score": float(sim_scores2[i].cpu().item()),
-            "word_overlap_score": float(word_overlap2[i]),
-            "fuzzy_score": float(fuzzy_scores2[i]),
-            "length_penalty": float(length_penalties2[i]),
-            "combined_score": float(combined_score)
+            "cosine_score": semantic_score,
+            "word_overlap_score": word_score,
+            "combined_score": combined_score
         })

-    # Collect results for dataset 3
+    # Collect top3 with better scoring
     combined3 = []
     for i in range(len(df3_questions)):
-        combined_score = (
-            semantic_weight * sem_scores3[i] +
-            word_weight * word_scores3[i] +
-            fuzzy_weight * fuzzy_scores3_norm[i]
-        ) * length_penalties3[i]
+        semantic_score = float(sim_scores3[i].cpu().item())
+        word_score = float(word_overlap3[i])
+
+        if semantic_score > 0.5 and word_score > 0.3:
+            boost = 0.1
+        else:
+            boost = 0.0
+
+        combined_score = semantic_score + weight * word_score + boost

         combined3.append({
             "question": df3_questions[i],
             "link": df3_links[i],
-            "cosine_score": float(sim_scores3[i].cpu().item()),
-            "word_overlap_score": float(word_overlap3[i]),
-            "fuzzy_score": float(fuzzy_scores3[i]),
-            "length_penalty": float(length_penalties3[i]),
-            "combined_score": float(combined_score)
+            "cosine_score": semantic_score,
+            "word_overlap_score": word_score,
+            "combined_score": combined_score
         })

-    # Get top results with diversity filtering
-    def get_diverse_top_results(results, top_k=5):
-        """Get top results while avoiding too similar ones"""
-        sorted_results = sorted(results, key=lambda x: x["combined_score"], reverse=True)
-
-        diverse_results = []
-        for result in sorted_results:
-            if len(diverse_results) >= top_k:
-                break
-
-            # Check if this result is too similar to already selected ones
-            is_diverse = True
-            for selected in diverse_results:
-                # Simple diversity check based on word overlap
-                overlap = compute_enhanced_word_overlap(result["question"], [selected["question"]])[0]
-                if overlap > 0.8:  # Too similar
-                    is_diverse = False
-                    break
-
-            if is_diverse:
-                diverse_results.append(result)
-
-        return diverse_results
-
-    top1 = get_diverse_top_results(combined1, 3)
-    top2 = get_diverse_top_results(combined2, 3)
-    top3 = get_diverse_top_results(combined3, 3)
+    # Get top results - consider more candidates then filter
+    top1 = sorted(combined1, key=lambda x: x["combined_score"], reverse=True)[:5]
+    top2 = sorted(combined2, key=lambda x: x["combined_score"], reverse=True)[:5]
+    top3 = sorted(combined3, key=lambda x: x["combined_score"], reverse=True)[:5]

     results = {
-        "top1": top1,
+
         "top2": top2,
         "top3": top3,
-        "query_analysis": {
-            "word_count": len(query_words),
-            "semantic_weight": semantic_weight,
-            "word_weight": word_weight,
-            "fuzzy_weight": fuzzy_weight
-        }
+        "top1": top1,
     }

     return results

-title = "Enhanced Search CSV"
+title = "Search CSV"
 iface = gr.Interface(
     fn=predict,
-    inputs=[gr.Textbox(label="Search Query", lines=3, placeholder="Enter your search query here...")],
+    inputs=[gr.Textbox(label="text", lines=3)],
     outputs='json',
     title=title,
-    description="Enhanced semantic search with improved matching algorithms"
 )
-
-if __name__ == "__main__":
-    iface.launch()
+iface.launch()
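
Taken together, the new predict() replaces score normalization, fuzzy matching, length penalties, and the diversity filter with a single pass per dataset: the raw cosine similarity is the base, word overlap is added with a query-length-dependent weight, and a flat 0.1 boost rewards candidates where both signals are strong. A condensed restatement of that scoring rule on made-up inputs:

    def combined(semantic_score, word_score, n_query_words):
        # Adaptive weight: short queries lean harder on exact word matches.
        if n_query_words <= 2:
            weight = 0.6
        elif n_query_words <= 5:
            weight = 0.4
        else:
            weight = 0.25
        # Flat boost when both signals agree (thresholds taken from the diff).
        boost = 0.1 if semantic_score > 0.5 and word_score > 0.3 else 0.0
        return semantic_score + weight * word_score + boost

    print(combined(0.62, 0.85, 3))  # 0.62 + 0.4*0.85 + 0.1 = 1.06
    print(combined(0.62, 0.10, 3))  # 0.62 + 0.4*0.10 + 0.0 = 0.66

Each combinedN list is then sorted by combined_score and cut to five entries, and the JSON response carries top1, top2, and top3 side by side. Since the score is an unnormalized cosine plus a weighted overlap, it can exceed 1.0; that is harmless for ranking within a single dataset.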