Omar Elshehy committed on
Commit
3516960
1 Parent(s): 2402c39

add mrr@10 metric

Browse files
Files changed (1) hide show
  1. app.py +52 -25
app.py CHANGED
@@ -13,6 +13,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
13
  zero = torch.Tensor([0]).to(device)
14
  print(f"Device being used: {zero.device}")
15
 
 
16
  @spaces.GPU
17
  def evaluate_model(model_id, num_questions):
18
  model = SentenceTransformer(model_id, device=device)
@@ -44,7 +45,7 @@ def evaluate_model(model_id, num_questions):
44
  "last_rows": True # Take the last num_questions rows
45
  }
46
  ]
47
-
48
  evaluation_results = []
49
  scores_by_dataset = {}
50
 
@@ -57,25 +58,26 @@ def evaluate_model(model_id, num_questions):
57
 
58
  # Select the required number of rows
59
  if dataset_info.get("last_rows"):
60
- dataset = dataset.select(range(len(dataset) - dataset_info["sample_size"], len(dataset))) # Take last n rows
 
61
  else:
62
  dataset = dataset.select(range(min(dataset_info["sample_size"], len(dataset)))) # Take first n rows
63
-
64
  # Rename columns to 'anchor' and 'positive'
65
  dataset = dataset.rename_column(dataset_info["columns"][0], "anchor")
66
  dataset = dataset.rename_column(dataset_info["columns"][1], "positive")
67
-
68
  # Check if "id" column already exists before adding it
69
  if "id" not in dataset.column_names:
70
  dataset = dataset.add_column("id", range(len(dataset)))
71
-
72
  # Prepare queries and corpus
73
  corpus = dict(zip(dataset["id"], dataset["positive"]))
74
  queries = dict(zip(dataset["id"], dataset["anchor"]))
75
-
76
  # Create a mapping of relevant documents (1 in our case) for each query
77
  relevant_docs = {q_id: [q_id] for q_id in queries}
78
-
79
  matryoshka_evaluators = []
80
  for dim in matryoshka_dimensions:
81
  ir_evaluator = InformationRetrievalEvaluator(
@@ -84,66 +86,91 @@ def evaluate_model(model_id, num_questions):
84
  relevant_docs=relevant_docs,
85
  name=f"dim_{dim}",
86
  truncate_dim=dim,
87
- score_functions={"cosine": cos_sim},
88
  )
89
  matryoshka_evaluators.append(ir_evaluator)
90
 
91
  evaluator = SequentialEvaluator(matryoshka_evaluators)
92
  results = evaluator(model)
93
-
94
- scores = []
 
95
  for dim in matryoshka_dimensions:
96
- key = f"dim_{dim}_cosine_ndcg@10"
97
- score = results[key] if key in results else None
 
 
98
  evaluation_results.append({
99
  "Dataset": dataset_info["name"],
100
  "Dimension": dim,
101
- "Score": score
 
102
  })
103
- scores.append(score)
 
104
 
105
  # Store scores by dataset for plot creation
106
- scores_by_dataset[dataset_info["name"]] = scores
 
 
 
107
 
108
  # Convert results to DataFrame for display
109
  result_df = pd.DataFrame(evaluation_results)
110
 
111
  # Generate bar charts for each dataset using Plotly
112
  charts = []
113
- color_scale = ['#003f5c', '#2f4b7c', '#665191', '#a05195', '#d45087']
 
114
 
115
  for dataset_name, scores in scores_by_dataset.items():
116
  fig = go.Figure()
 
117
  fig.add_trace(go.Bar(
118
  x=[str(dim) for dim in matryoshka_dimensions],
119
- y=scores,
120
- marker_color=color_scale,
121
- text=[f"{score:.3f}" if score else "N/A" for score in scores],
 
 
 
 
 
 
 
 
 
 
 
122
  textposition='auto'
123
  ))
124
 
125
  fig.update_layout(
126
  title=f"{dataset_name} Evaluation",
127
  xaxis_title="Embedding Dimension",
128
- yaxis_title="NDCG@10 Score",
 
129
  template="plotly_white"
130
  )
131
  charts.append(fig)
132
 
133
  return result_df, charts[0], charts[1], charts[2]
134
 
 
135
  # Define the Gradio interface
136
  def display_results(model_name, num_questions):
137
  result_df, chart1, chart2, chart3 = evaluate_model(model_name, num_questions)
138
  return result_df, chart1, chart2, chart3
139
 
 
140
  # Gradio interface with a slider to choose the number of questions (1 to 500)
141
  demo = gr.Interface(
142
- fn=display_results,
143
  inputs=[
144
- gr.Textbox(label="Enter a Hugging Face Model ID", placeholder="e.g., Omartificial-Intelligence-Space/GATE-AraBert-v1"),
 
145
  gr.Slider(label="Number of Questions", minimum=1, maximum=500, step=1, value=500)
146
- ],
147
  outputs=[
148
  gr.Dataframe(label="Evaluation Results"),
149
  gr.Plot(label="Financial Dataset"),
@@ -156,8 +183,8 @@ demo = gr.Interface(
156
  "- **ARCD** evaluates short context retrieval performance.\n"
157
  "- **MLQA Arabic** evaluates long context retrieval performance.\n"
158
  "- **Arabic Financial Dataset** focuses on financial context retrieval.\n\n"
159
- "**Evaluation Metric:**\n"
160
- "The evaluation uses **NDCG@10** (Normalized Discounted Cumulative Gain), which measures how well the retrieved documents (contexts) match the query relevance.\n"
161
  "Higher scores indicate better performance. Embedding dimensions are reduced from 768 to 64, evaluating how well the model performs with fewer dimensions."
162
  ),
163
  theme="default",
 
13
  zero = torch.Tensor([0]).to(device)
14
  print(f"Device being used: {zero.device}")
15
 
16
+
17
  @spaces.GPU
18
  def evaluate_model(model_id, num_questions):
19
  model = SentenceTransformer(model_id, device=device)
 
45
  "last_rows": True # Take the last num_questions rows
46
  }
47
  ]
48
+
49
  evaluation_results = []
50
  scores_by_dataset = {}
51
 
 
58
 
59
  # Select the required number of rows
60
  if dataset_info.get("last_rows"):
61
+ dataset = dataset.select(
62
+ range(len(dataset) - dataset_info["sample_size"], len(dataset))) # Take last n rows
63
  else:
64
  dataset = dataset.select(range(min(dataset_info["sample_size"], len(dataset)))) # Take first n rows
65
+
66
  # Rename columns to 'anchor' and 'positive'
67
  dataset = dataset.rename_column(dataset_info["columns"][0], "anchor")
68
  dataset = dataset.rename_column(dataset_info["columns"][1], "positive")
69
+
70
  # Check if "id" column already exists before adding it
71
  if "id" not in dataset.column_names:
72
  dataset = dataset.add_column("id", range(len(dataset)))
73
+
74
  # Prepare queries and corpus
75
  corpus = dict(zip(dataset["id"], dataset["positive"]))
76
  queries = dict(zip(dataset["id"], dataset["anchor"]))
77
+
78
  # Create a mapping of relevant documents (1 in our case) for each query
79
  relevant_docs = {q_id: [q_id] for q_id in queries}
80
+
81
  matryoshka_evaluators = []
82
  for dim in matryoshka_dimensions:
83
  ir_evaluator = InformationRetrievalEvaluator(
 
86
  relevant_docs=relevant_docs,
87
  name=f"dim_{dim}",
88
  truncate_dim=dim,
89
+ score_functions={"cosine": cos_sim}
90
  )
91
  matryoshka_evaluators.append(ir_evaluator)
92
 
93
  evaluator = SequentialEvaluator(matryoshka_evaluators)
94
  results = evaluator(model)
95
+
96
+ scores_ndcg = []
97
+ scores_mrr = []
98
  for dim in matryoshka_dimensions:
99
+ ndcg_key = f"dim_{dim}_cosine_ndcg@10"
100
+ mrr_key = f"dim_{dim}_cosine_mrr@10"
101
+ ndcg_score = results[ndcg_key] if ndcg_key in results else None
102
+ mrr_score = results[mrr_key] if mrr_key in results else None
103
  evaluation_results.append({
104
  "Dataset": dataset_info["name"],
105
  "Dimension": dim,
106
+ "NDCG@10": ndcg_score,
107
+ "MRR@10": mrr_score
108
  })
109
+ scores_ndcg.append(ndcg_score)
110
+ scores_mrr.append(mrr_score)
111
 
112
  # Store scores by dataset for plot creation
113
+ scores_by_dataset[dataset_info["name"]] = {
114
+ "NDCG@10": scores_ndcg,
115
+ "MRR@10": scores_mrr
116
+ }
117
 
118
  # Convert results to DataFrame for display
119
  result_df = pd.DataFrame(evaluation_results)
120
 
121
  # Generate bar charts for each dataset using Plotly
122
  charts = []
123
+ color_scale_ndcg = '#a05195'
124
+ color_scale_mrr = '#2f4b7c'
125
 
126
  for dataset_name, scores in scores_by_dataset.items():
127
  fig = go.Figure()
128
+ # NDCG@10 bars
129
  fig.add_trace(go.Bar(
130
  x=[str(dim) for dim in matryoshka_dimensions],
131
+ y=scores["NDCG@10"],
132
+ name="NDCG@10",
133
+ marker_color=color_scale_ndcg,
134
+ text=[f"{score:.3f}" if score else "N/A" for score in scores["NDCG@10"]],
135
+ textposition='auto'
136
+ ))
137
+
138
+ # MRR@10 bars
139
+ fig.add_trace(go.Bar(
140
+ x=[str(dim) for dim in matryoshka_dimensions],
141
+ y=scores["MRR@10"],
142
+ name="MRR@10",
143
+ marker_color=color_scale_mrr,
144
+ text=[f"{score:.3f}" if score else "N/A" for score in scores["MRR@10"]],
145
  textposition='auto'
146
  ))
147
 
148
  fig.update_layout(
149
  title=f"{dataset_name} Evaluation",
150
  xaxis_title="Embedding Dimension",
151
+ yaxis_title="Score",
152
+ barmode='group', # Group bars
153
  template="plotly_white"
154
  )
155
  charts.append(fig)
156
 
157
  return result_df, charts[0], charts[1], charts[2]
158
 
159
+
160
  # Define the Gradio interface
161
  def display_results(model_name, num_questions):
162
  result_df, chart1, chart2, chart3 = evaluate_model(model_name, num_questions)
163
  return result_df, chart1, chart2, chart3
164
 
165
+
166
  # Gradio interface with a slider to choose the number of questions (1 to 500)
167
  demo = gr.Interface(
168
+ fn=display_results,
169
  inputs=[
170
+ gr.Textbox(label="Enter a Hugging Face Model ID",
171
+ placeholder="e.g., Omartificial-Intelligence-Space/GATE-AraBert-v1"),
172
  gr.Slider(label="Number of Questions", minimum=1, maximum=500, step=1, value=500)
173
+ ],
174
  outputs=[
175
  gr.Dataframe(label="Evaluation Results"),
176
  gr.Plot(label="Financial Dataset"),
 
183
  "- **ARCD** evaluates short context retrieval performance.\n"
184
  "- **MLQA Arabic** evaluates long context retrieval performance.\n"
185
  "- **Arabic Financial Dataset** focuses on financial context retrieval.\n\n"
186
+ "**Evaluation Metrics:**\n"
187
+ "The evaluation uses **NDCG@10** and **MRR@10**, which measure how well the retrieved documents (contexts) match the query relevance.\n"
188
  "Higher scores indicate better performance. Embedding dimensions are reduced from 768 to 64, evaluating how well the model performs with fewer dimensions."
189
  ),
190
  theme="default",