Spaces:

Omartificial-Intelligence-Space
/

Matroyshka_eval_retrieval_ar

Running on Zero

App Files Files Community

Omartificial-Intelligence-Space committed on Oct 14, 2024

Commit

61dd04e

verified ·

1 Parent(s): 6f8d49a

update app.py

Browse files

Files changed (1) hide show

app.py +34 -35

app.py CHANGED Viewed

@@ -2,7 +2,7 @@ import gradio as gr
 import spaces
 import torch
 import pandas as pd
-import matplotlib.pyplot as plt
 from datasets import load_dataset
 from sentence_transformers import SentenceTransformer
 from sentence_transformers.evaluation import InformationRetrievalEvaluator, SequentialEvaluator
@@ -13,7 +13,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 zero = torch.Tensor([0]).to(device)
 print(f"Device being used: {zero.device}")
-@spaces.GPU(duration=120)
 def evaluate_model(model_id):
     model = SentenceTransformer(model_id, device=device)
     matryoshka_dimensions = [768, 512, 256, 128, 64]
@@ -21,7 +21,7 @@ def evaluate_model(model_id):
     # Prepare datasets
     datasets_info = [
         {
-            "name": "Arabic Financial Dataset (Financial Evaluation)",
             "dataset_id": "Omartificial-Intelligence-Space/Arabic-finanical-rag-embedding-dataset",
             "split": "train",
             "size": 7000,
@@ -29,7 +29,7 @@ def evaluate_model(model_id):
             "sample_size": 500
         },
         {
-            "name": "MLQA Arabic (Long Context Evaluation)",
             "dataset_id": "google/xtreme",
             "subset": "MLQA.ar.ar",
             "split": "validation",
@@ -38,7 +38,7 @@ def evaluate_model(model_id):
             "sample_size": 500
         },
         {
-            "name": "ARCD (Short Context Evaluation)",
             "dataset_id": "hsseinmz/arcd",
             "split": "train",
             "size": None,
@@ -105,24 +105,32 @@ def evaluate_model(model_id):
             })
             scores.append(score)
-        # Store scores by dataset for bar chart creation
         scores_by_dataset[dataset_info["name"]] = scores
     # Convert results to DataFrame for display
     result_df = pd.DataFrame(evaluation_results)
-    # Generate bar charts for each dataset
     charts = []
-    colors = ['#FF5733', '#33FF57', '#3357FF', '#FF33C4', '#F3FF33']  # Creative color palette
     for dataset_name, scores in scores_by_dataset.items():
-        fig, ax = plt.subplots()
-        ax.bar([str(dim) for dim in matryoshka_dimensions], scores, color=colors)
-        ax.set_title(f"{dataset_name} Evaluation Scores", fontsize=16, color='darkblue')
-        ax.set_xlabel("Embedding Dimension", fontsize=12)
-        ax.set_ylabel("NDCG@10 Score", fontsize=12)
-        ax.spines['top'].set_visible(False)
-        ax.spines['right'].set_visible(False)
-        plt.tight_layout()
         charts.append(fig)
     return result_df, charts[0], charts[1], charts[2]
@@ -134,32 +142,23 @@ def display_results(model_name):
 demo = gr.Interface(
     fn=display_results,
-    inputs=gr.Textbox(label="Enter Your Embedding Model ID", placeholder="e.g., Omartificial-Intelligence-Space/GATE-AraBert-v1"),
     outputs=[
         gr.Dataframe(label="Evaluation Results"),
-        gr.Plot(label="Arabic Financial Dataset (Financial Evaluation)"),
-        gr.Plot(label="MLQA Arabic (Long Context Evaluation)"),
-        gr.Plot(label="ARCD (Short Context Evaluation)")
     ],
-    title="Evaluation of Arabic Matryoshka Embedding Models on Retrieval Tasks ",
     description=(
-        "Evaluate your Sentence Transformer model's performance on **context and question retrieval** for Arabic datasets for enhancing Arabic RAG.\n"
-        "- **ARCD** evaluates short context retrieval performance.\n"
-        "- **MLQA Arabic** evaluates long context retrieval performance.\n"
-        "- **Arabic Financial Dataset** focuses on financial context retrieval.\n\n"
-        "**Evaluation Metric:**\n"
-        "The evaluation uses **NDCG@10** (Normalized Discounted Cumulative Gain), which measures how well the retrieved documents (contexts) match the query relevance.\n"
-        "Higher scores indicate better performance. Embedding dimensions are reduced from 768 to 64, evaluating how well the model performs with fewer dimensions."
     ),
     theme="default",
     live=False,
     css="footer {visibility: hidden;}"
 )
-demo.launch(share=True)
-demo.launch(share=True)
-# Add the footer
-print("\nCreated by Omar Najar | Omartificial Intelligence Space")

 import spaces
 import torch
 import pandas as pd
+import plotly.graph_objects as go
 from datasets import load_dataset
 from sentence_transformers import SentenceTransformer
 from sentence_transformers.evaluation import InformationRetrievalEvaluator, SequentialEvaluator
 zero = torch.Tensor([0]).to(device)
 print(f"Device being used: {zero.device}")
+@spaces.GPU
 def evaluate_model(model_id):
     model = SentenceTransformer(model_id, device=device)
     matryoshka_dimensions = [768, 512, 256, 128, 64]
     # Prepare datasets
     datasets_info = [
         {
+            "name": "Financial",
             "dataset_id": "Omartificial-Intelligence-Space/Arabic-finanical-rag-embedding-dataset",
             "split": "train",
             "size": 7000,
             "sample_size": 500
         },
         {
+            "name": "MLQA",
             "dataset_id": "google/xtreme",
             "subset": "MLQA.ar.ar",
             "split": "validation",
             "sample_size": 500
         },
         {
+            "name": "ARCD",
             "dataset_id": "hsseinmz/arcd",
             "split": "train",
             "size": None,
             })
             scores.append(score)
+        # Store scores by dataset for plot creation
         scores_by_dataset[dataset_info["name"]] = scores
     # Convert results to DataFrame for display
     result_df = pd.DataFrame(evaluation_results)
+    # Generate bar charts for each dataset using Plotly
     charts = []
+    color_scale = ['#003f5c', '#2f4b7c', '#665191', '#a05195', '#d45087']
     for dataset_name, scores in scores_by_dataset.items():
+        fig = go.Figure()
+        fig.add_trace(go.Bar(
+            x=[str(dim) for dim in matryoshka_dimensions],
+            y=scores,
+            marker_color=color_scale,
+            text=[f"{score:.3f}" if score else "N/A" for score in scores],
+            textposition='auto'
+        ))
+        fig.update_layout(
+            title=f"{dataset_name} Evaluation",
+            xaxis_title="Embedding Dimension",
+            yaxis_title="NDCG@10 Score",
+            template="plotly_white"
+        )
         charts.append(fig)
     return result_df, charts[0], charts[1], charts[2]
 demo = gr.Interface(
     fn=display_results,
+    inputs=gr.Textbox(label="Enter a Hugging Face Model ID", placeholder="e.g., sentence-transformers/all-MiniLM-L6-v2"),
     outputs=[
         gr.Dataframe(label="Evaluation Results"),
+        gr.Plot(label="Financial Dataset"),
+        gr.Plot(label="MLQA Dataset"),
+        gr.Plot(label="ARCD Dataset")
     ],
+    title="Arabic Embedding Evaluation",
     description=(
+        "Evaluate your Sentence Transformer model on **Arabic retrieval tasks** using Matryoshka embeddings. "
+        "Compare performance across financial, long-context, and short-context datasets.\n\n"
+        "The evaluation uses **NDCG@10** to measure how well the model retrieves relevant contexts. "
+        "Embedding dimensions are reduced from 768 to 64."
     ),
     theme="default",
     live=False,
     css="footer {visibility: hidden;}"
 )
+demo.launch(share=True)