Sarthak committed on
Commit
eb5363b
·
1 Parent(s): b82c1c9

feat: added MTEB evaluation scripts

Browse files
Files changed (29) hide show
  1. MTEB_evaluate.py +268 -275
  2. README.md +50 -0
  3. analyze_mteb_results.py +311 -0
  4. mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/AmazonCounterfactualClassification.json +1 -1
  5. mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/AmazonReviewsClassification.json +73 -0
  6. mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/AppsRetrieval.json +159 -0
  7. mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/ArguAna.json +158 -0
  8. mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/AskUbuntuDupQuestions.json +26 -0
  9. mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/BIOSSES.json +26 -0
  10. mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/Banking77Classification.json +0 -73
  11. mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/BiorxivClusteringS2S.json +32 -0
  12. mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/COIRCodeSearchNetRetrieval.json +8 -0
  13. mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/CQADupstackProgrammersRetrieval.json +0 -158
  14. mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/CodeFeedbackMT.json +158 -0
  15. mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/CodeSearchNetCCRetrieval.json +8 -0
  16. mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/CodeTransOceanContest.json +159 -0
  17. mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/CodeTransOceanDL.json +158 -0
  18. mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/CosQA.json +159 -0
  19. mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/STSBenchmark.json +0 -26
  20. mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/SprintDuplicateQuestions.json +0 -58
  21. mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/StackExchangeClustering.json +0 -47
  22. mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/StackOverflowQA.json +158 -0
  23. mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/SyntheticText2SQL.json +159 -0
  24. mteb_results/mteb_parsed_results.json +0 -3
  25. mteb_results/mteb_raw_results.json +0 -7
  26. mteb_results/mteb_report.txt +0 -21
  27. mteb_results/mteb_summary.json +0 -20
  28. pyproject.toml +1 -0
  29. uv.lock +2 -0
MTEB_evaluate.py CHANGED
@@ -1,349 +1,342 @@
1
  #!/usr/bin/env python
2
  """
3
- MTEB Evaluation Script for Distilled Model - Code-Focused Tasks.
4
 
5
- This script evaluates the distilled gte-Qwen2-7B-instruct model using MTEB
6
- (Massive Text Embedding Benchmark) with a focus on tasks relevant for code:
7
-
8
- - Classification: Tests ability to distinguish between different categories (e.g., programming languages)
9
- - Clustering: Tests ability to group similar code by functionality
10
- - STS: Tests semantic similarity understanding between code snippets
11
- - Retrieval: Tests code search and duplicate detection capabilities
12
 
13
  Features:
14
- - Incremental evaluation: Skips tasks that already have results in mteb_results/
15
- - Combines existing and new results automatically
16
- - Saves results in multiple formats for analysis
17
-
18
- Usage:
19
- python MTEB_evaluate.py
20
-
21
- Configuration:
22
- - Set EVAL_ALL_TASKS = False to use only CODE_SPECIFIC_TASKS
23
- - Modify CODE_SPECIFIC_TASKS for granular task selection
24
  """
25
 
 
26
  import json
27
  import logging
 
28
  import sys
 
29
  import time
30
  from pathlib import Path
31
 
32
- import mteb
33
- from model2vec import StaticModel
34
- from mteb import ModelMeta
35
-
36
- from evaluation import (
37
- CustomMTEB,
38
- get_tasks,
39
- make_leaderboard,
40
- parse_mteb_results,
41
- summarize_results,
42
- )
43
 
44
  # =============================================================================
45
- # CONFIGURATION CONSTANTS
46
  # =============================================================================
47
 
48
- # Model Configuration
49
- MODEL_PATH = "." # Path to the distilled model directory
50
- MODEL_NAME = "gte-Qwen2-7B-instruct-M2V-Distilled" # Name for the model in results
51
-
52
- # Evaluation Configuration
53
- OUTPUT_DIR = "mteb_results" # Directory to save evaluation results
54
-
55
- EVAL_ALL_TASKS = True
56
-
57
- # Specific tasks most relevant for code evaluation (focused selection)
58
- CODE_SPECIFIC_TASKS = [
59
- # Classification - Programming language/category classification
60
- "Banking77Classification", # Fine-grained classification (77 classes)
61
- # Clustering - Code grouping by functionality
62
- "StackExchangeClustering.v2", # Technical Q&A clustering (most relevant)
63
- # STS - Code similarity understanding
64
- "STSBenchmark", # Standard semantic similarity benchmark
65
- # Retrieval - Code search capabilities
66
- "CQADupstackProgrammersRetrieval", # Programming Q&A retrieval
67
- # PairClassification - Duplicate/similar code detection
68
- "SprintDuplicateQuestions", # Duplicate question detection
69
- ]
70
-
71
- # Evaluation settings
72
- EVAL_SPLITS = ["test"] # Dataset splits to evaluate on
73
- VERBOSITY = 2 # MTEB verbosity level
74
 
75
- # =============================================================================
 
76
 
77
  # Configure logging
78
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
79
  logger = logging.getLogger(__name__)
80
 
 
 
 
81
 
82
def check_existing_results(output_path: Path, tasks: list, revision: str = "distilled") -> list:
    """Return the tasks that still need to be evaluated.

    A task is treated as completed when a ``<task name>.json`` result file
    already exists for ``MODEL_NAME``. Result files in this repository live in
    a per-revision sub-directory (``<output>/<MODEL_NAME>/distilled/``), so
    that location is checked first; the flat ``<output>/<MODEL_NAME>/`` layout
    is still accepted for backward compatibility.

    Args:
        output_path: Root directory that holds the MTEB result folders.
        tasks: Task objects exposing ``metadata.name``.
        revision: Name of the revision sub-directory. Defaults to
            ``"distilled"``, the revision assigned to the model metadata.

    Returns:
        The tasks (in their original order) that have no result file yet.
    """
    model_dir = output_path / MODEL_NAME
    remaining_tasks = []
    completed_tasks = []

    for task in tasks:
        task_name = task.metadata.name
        candidates = (
            model_dir / revision / f"{task_name}.json",
            model_dir / f"{task_name}.json",  # legacy flat layout
        )
        if any(candidate.exists() for candidate in candidates):
            completed_tasks.append(task_name)
            logger.info(f"Skipping {task_name} - results already exist")
        else:
            remaining_tasks.append(task)

    if completed_tasks:
        logger.info(f"Found existing results for {len(completed_tasks)} tasks: {completed_tasks}")

    return remaining_tasks
 
 
 
 
 
102
 
103
 
104
def load_existing_parsed_results(output_path: Path) -> dict:
    """Read ``mteb_parsed_results.json`` from ``output_path``.

    Returns the decoded JSON content, or an empty dict when the file is
    missing or cannot be read/decoded (a warning is logged in that case).
    """
    cached_file = output_path / "mteb_parsed_results.json"
    if not cached_file.exists():
        return {}

    try:
        with cached_file.open("r") as handle:
            return json.load(handle)
    except (json.JSONDecodeError, OSError) as e:
        logger.warning(f"Could not load existing parsed results: {e}")
    return {}
114
-
115
 
116
def load_and_display_existing_results(output_path: Path) -> None:
    """Log the headline statistics stored in ``mteb_summary.json``.

    When the summary file is absent only an informational message is logged.
    JSON/IO errors are not handled here and propagate to the caller.
    """
    summary_file = output_path / "mteb_summary.json"
    if not summary_file.exists():
        logger.info("No existing summary found. Individual task results may still exist.")
        return

    with summary_file.open("r") as handle:
        summary = json.load(handle)

    rule = "=" * 80
    logger.info(rule)
    logger.info("EXISTING MTEB EVALUATION RESULTS:")
    logger.info(rule)

    # The statistics section is optional; skip the figures when it is missing or empty.
    stats = summary.get("summary_stats")
    if stats:
        logger.info(f"Total Datasets: {stats.get('total_datasets', 'N/A')}")
        logger.info(f"Average Score: {stats.get('average_score', 0):.4f}")
        logger.info(f"Median Score: {stats.get('median_score', 0):.4f}")

    logger.info(rule)
 
 
 
136
 
 
 
 
137
 
138
def run_mteb_evaluation() -> None:
    """Evaluate the distilled model on the configured MTEB tasks.

    Steps: load the model from ``MODEL_PATH``, select tasks (all tasks from
    ``get_tasks()`` when ``EVAL_ALL_TASKS`` is set, otherwise
    ``CODE_SPECIFIC_TASKS``), drop tasks that already have results, run the
    rest through ``CustomMTEB``, merge with previously parsed results, then
    save and log the outcome.
    """
    output_path = Path(OUTPUT_DIR)
    output_path.mkdir(parents=True, exist_ok=True)

    logger.info(f"Loading model from {MODEL_PATH}")
    model = StaticModel.from_pretrained(MODEL_PATH)
    logger.info("Model loaded successfully")

    # Set up model metadata for MTEB
    model.mteb_model_meta = ModelMeta(  # type: ignore[attr-defined]
        name=MODEL_NAME, revision="distilled", release_date=None, languages=["eng"]
    )

    # Only announce the focused task list when it is actually the one in use.
    if EVAL_ALL_TASKS:
        logger.info("Getting all MTEB tasks")
        all_tasks = get_tasks()
    else:
        logger.info("Getting focused code-relevant MTEB tasks")
        logger.info(f"Selected specific tasks: {CODE_SPECIFIC_TASKS}")
        all_tasks = [mteb.get_task(task_name, languages=["eng"]) for task_name in CODE_SPECIFIC_TASKS]

    logger.info(f"Found {len(all_tasks)} total tasks")

    # Check for existing results and filter out completed tasks
    tasks = check_existing_results(output_path, all_tasks)
    logger.info(f"Will evaluate {len(tasks)} remaining tasks")

    if not tasks:
        logger.info("No new tasks to evaluate - all tasks already completed!")
        logger.info("Loading existing results...")
        try:
            load_and_display_existing_results(output_path)
        except (json.JSONDecodeError, OSError, KeyError) as e:
            logger.warning(f"Could not load existing results: {e}")
        return

    evaluation = CustomMTEB(tasks=tasks)

    logger.info("Starting MTEB evaluation...")
    start_time = time.time()
    results = evaluation.run(model, eval_splits=EVAL_SPLITS, output_folder=str(output_path), verbosity=VERBOSITY)
    evaluation_time = time.time() - start_time
    logger.info(f"Evaluation completed in {evaluation_time:.2f} seconds")

    logger.info("Parsing and summarizing results...")
    parsed_results = parse_mteb_results(mteb_results=results, model_name=MODEL_NAME)

    existing_results = load_existing_parsed_results(output_path)
    if existing_results:
        logger.info("Combining with existing results...")
        if hasattr(parsed_results, "items"):
            merged = dict(parsed_results)
        else:
            # Historical behaviour: non-mapping results cannot be merged and are
            # replaced by the stored ones. Make that visible instead of silent.
            logger.warning("Freshly parsed results are not a mapping; keeping only the stored results")
            merged = {}
        # Fresh results win on key collisions; stored entries only fill the gaps.
        for key, value in existing_results.items():
            merged.setdefault(key, value)
        parsed_results = merged

    task_scores = summarize_results(parsed_results)

    save_results(output_path, results, parsed_results, task_scores, evaluation_time)

    logger.info("MTEB Evaluation Results:")
    logger.info("=" * 80)
    leaderboard = make_leaderboard(task_scores)  # type: ignore[arg-type]
    logger.info(leaderboard.to_string(index=False))
    logger.info("=" * 80)

    logger.info(f"Evaluation completed successfully. Results saved to {OUTPUT_DIR}")
 
 
220
 
 
 
221
 
222
def save_results(
    output_path: Path, raw_results: list, parsed_results: dict, task_scores: dict, evaluation_time: float
) -> None:
    """Persist the evaluation output as raw JSON, parsed JSON, a summary and a text report.

    All files are written into ``output_path``; values that are not JSON
    serializable are stringified via ``default=str``.
    """

    def _dump(target: Path, payload: object) -> Path:
        # Shared writer so every JSON artefact uses the same formatting.
        with target.open("w") as handle:
            json.dump(payload, handle, indent=2, default=str)
        return target

    raw_results_file = _dump(output_path / "mteb_raw_results.json", raw_results)
    logger.info(f"Raw results saved to {raw_results_file}")

    parsed_results_file = _dump(output_path / "mteb_parsed_results.json", parsed_results)
    logger.info(f"Parsed results saved to {parsed_results_file}")

    summary_stats = generate_summary_stats(task_scores)
    summary_file = _dump(
        output_path / "mteb_summary.json",
        {
            "model_name": MODEL_NAME,
            "evaluation_time_seconds": evaluation_time,
            "task_scores": task_scores,
            "summary_stats": summary_stats,
        },
    )
    logger.info(f"Summary saved to {summary_file}")

    # The report writer derives the same path itself; it is recomputed here only for the log line.
    generate_report(output_path, task_scores, summary_stats, evaluation_time)
    report_file = output_path / "mteb_report.txt"
    logger.info(f"Report saved to {report_file}")
258
 
 
 
259
 
260
def generate_summary_stats(task_scores: dict) -> dict:
    """Aggregate every per-dataset score into count/mean/median/std/min/max.

    Only entries shaped like ``{"dataset_scores": {name: number}}`` contribute;
    non-numeric values and NaN are ignored. Returns an empty dict when there
    is nothing to aggregate.
    """
    if not task_scores:
        return {}

    def _usable(value: object) -> bool:
        return isinstance(value, (int, float)) and str(value).lower() != "nan"

    all_scores = []
    for model_data in task_scores.values():
        if not (isinstance(model_data, dict) and "dataset_scores" in model_data):
            continue
        per_dataset = model_data["dataset_scores"]
        if not isinstance(per_dataset, dict):
            continue
        all_scores.extend(float(value) for value in per_dataset.values() if _usable(value))

    if not all_scores:
        return {}

    import numpy as np

    reducers = {
        "average_score": np.mean,
        "median_score": np.median,
        "std_dev": np.std,
        "min_score": np.min,
        "max_score": np.max,
    }
    stats = {"total_datasets": len(all_scores)}
    stats.update({label: float(reduce(all_scores)) for label, reduce in reducers.items()})
    return stats
292
 
 
 
 
 
 
 
 
 
 
293
 
294
def generate_report(output_path: Path, task_scores: dict, summary_stats: dict, evaluation_time: float) -> None:
    """Write the plain-text report ``mteb_report.txt`` into ``output_path``.

    The report has a header (model, path, duration), the summary statistics
    when available, and the leaderboard table built from ``task_scores``.
    """
    rule = "=" * 80
    report_file = output_path / "mteb_report.txt"

    with report_file.open("w") as out:
        out.writelines(
            [
                rule + "\n",
                "MTEB Evaluation Report\n",
                rule + "\n\n",
                f"Model: {MODEL_NAME}\n",
                f"Model Path: {MODEL_PATH}\n",
                f"Evaluation Time: {evaluation_time:.2f} seconds\n",
            ]
        )

        if not summary_stats:
            out.write("Summary Statistics: No valid results found\n\n")
        else:
            out.writelines(
                [
                    f"Total Datasets: {summary_stats['total_datasets']}\n\n",
                    "Summary Statistics:\n",
                    f" Average Score: {summary_stats['average_score']:.4f}\n",
                    f" Median Score: {summary_stats['median_score']:.4f}\n",
                    f" Standard Deviation: {summary_stats['std_dev']:.4f}\n",
                    f" Score Range: {summary_stats['min_score']:.4f} - {summary_stats['max_score']:.4f}\n\n",
                ]
            )

        out.write("Detailed Results:\n")
        out.write("-" * 50 + "\n")
        if not task_scores:
            out.write("No results available\n")
        else:
            leaderboard = make_leaderboard(task_scores)  # type: ignore[arg-type]
            out.write(leaderboard.to_string(index=False))

        out.write("\n\n" + rule + "\n")
 
327
 
 
 
328
 
329
def main() -> None:
    """Script entry point: log the run configuration, then run the evaluation.

    Any exception raised by the evaluation is logged with its traceback and
    the process exits with status 1.
    """
    banner = (
        f"Starting MTEB evaluation for {MODEL_NAME}",
        f"Model path: {MODEL_PATH}",
        f"Output directory: {OUTPUT_DIR}",
        "Running focused MTEB evaluation on code-relevant tasks:",
        " - Classification: Programming language classification",
        " - Clustering: Code clustering by functionality",
        " - STS: Semantic similarity between code snippets",
        " - Retrieval: Code search and retrieval",
    )
    for message in banner:
        logger.info(message)

    try:
        run_mteb_evaluation()
        logger.info("Evaluation pipeline completed successfully!")
    except Exception:
        logger.exception("Evaluation failed")
        sys.exit(1)
347
 
348
 
349
  if __name__ == "__main__":
 
1
  #!/usr/bin/env python
2
  """
3
+ MTEB Evaluation Script with Subprocess Isolation (Code Information Retrieval Tasks).
4
 
5
+ This script evaluates models using MTEB with subprocess isolation to prevent
6
+ memory issues and process killing.
 
 
 
 
 
7
 
8
  Features:
9
+ - Each task runs in a separate subprocess to isolate memory
10
+ - 30-second timeout per task (see TASK_TIMEOUT)
11
+ - No retries - if task fails or times out, move to next one
12
+ - Memory monitoring and cleanup
13
+
14
+ Note: Multi-threading is NOT used here because:
15
+ 1. Memory is the main bottleneck, not CPU
16
+ 2. Running multiple tasks simultaneously would increase memory pressure
17
+ 3. Many tasks are being killed (return code -9) due to OOM conditions
18
+ 4. Sequential processing with subprocess isolation is more stable
19
  """
20
 
21
+ import contextlib
22
  import json
23
  import logging
24
+ import subprocess
25
  import sys
26
+ import tempfile
27
  import time
28
  from pathlib import Path
29
 
30
+ import psutil
 
 
 
 
 
 
 
 
 
 
31
 
32
  # =============================================================================
33
+ # CONFIGURATION
34
  # =============================================================================
35
 
36
+ MODEL_PATH = "."
37
+ MODEL_NAME = "gte-Qwen2-7B-instruct-M2V-Distilled"
38
+ OUTPUT_DIR = "mteb_results"
39
+ TASK_TIMEOUT = 30 # 30 seconds timeout per task
40
+ MAX_RETRIES = 0 # No retries - move to next task if failed/timeout
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
+ # Constants
43
+ SIGKILL_RETURN_CODE = -9 # Process killed by SIGKILL (usually OOM)
44
 
45
  # Configure logging
46
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
47
  logger = logging.getLogger(__name__)
48
 
49
+ # =============================================================================
50
+ # SINGLE TASK RUNNER SCRIPT
51
+ # =============================================================================
52
 
53
# Source of the helper script executed in a child interpreter for one task.
# Protocol with the parent process:
#   argv: <model_path> <task_name> <output_dir> <model_name>
#   success: prints "RESULT_FILE:<path>" on stdout and exits with status 0
#   failure: prints "ERROR: ..." / "TRACEBACK: ..." on stderr and exits with status 1
TASK_RUNNER_SCRIPT = """
import json
import sys
import tempfile
import traceback

# Add current directory to path so the project-local `evaluation` package resolves
sys.path.insert(0, ".")

try:
    import mteb
    from model2vec import StaticModel
    from mteb import ModelMeta
    from evaluation import CustomMTEB

    def run_single_task():
        # Get arguments
        model_path = sys.argv[1]
        task_name = sys.argv[2]
        output_dir = sys.argv[3]
        model_name = sys.argv[4]

        # Load model
        model = StaticModel.from_pretrained(model_path)
        model.mteb_model_meta = ModelMeta(
            name=model_name, revision="distilled", release_date=None, languages=["eng"]
        )

        # Get and run task
        task = mteb.get_task(task_name, languages=["eng"])
        evaluation = CustomMTEB(tasks=[task])

        results = evaluation.run(
            model,
            eval_splits=["test"],
            output_folder=output_dir,
            verbosity=0
        )

        # Save results to temp file for parent process.
        # default=str keeps the hand-off from failing if the result objects
        # are not plain JSON types.
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as f:
            json.dump({
                "success": True,
                "task_name": task_name,
                "results": results
            }, f, default=str)
            temp_file = f.name

        print(f"RESULT_FILE:{temp_file}")
        return 0

    if __name__ == "__main__":
        sys.exit(run_single_task())

except Exception as e:
    # Report on stderr: that is the stream the parent logs for failed tasks.
    print(f"ERROR: {str(e)}", file=sys.stderr)
    print(f"TRACEBACK: {traceback.format_exc()}", file=sys.stderr)
    sys.exit(1)
"""
114
 
 
 
 
 
 
115
 
116
def get_available_tasks() -> list[str]:
    """Return the task names of the CoIR (code information retrieval) benchmark.

    Names are taken from ``task.metadata.name`` because the same string is
    later passed to ``mteb.get_task`` and used as the result file name;
    ``str(task)`` is only a fallback for objects without metadata.

    Returns:
        The task names, or an empty list when ``mteb`` cannot be imported or
        the benchmark cannot be resolved (the error is logged).
    """
    try:
        import mteb
        import mteb.benchmarks

        benchmark = mteb.benchmarks.CoIR
        names = []
        for task in benchmark.tasks:
            metadata = getattr(task, "metadata", None)
            names.append(getattr(metadata, "name", None) or str(task))
        return names
    except Exception:
        logger.exception("Failed to get tasks")
        return []
128
 
129
 
130
def check_existing_results(output_path: Path, task_names: list[str]) -> list[str]:
    """Filter ``task_names`` down to the tasks without a stored result.

    A task is considered done when
    ``<output_path>/<MODEL_NAME>/distilled/<task name>.json`` exists; each
    skipped task is logged.
    """
    results_dir = output_path / MODEL_NAME / "distilled"
    pending: list[str] = []

    for task_name in task_names:
        if (results_dir / f"{task_name}.json").exists():
            logger.info(f"Skipping {task_name} - results already exist")
            continue
        pending.append(task_name)

    return pending
 
 
142
 
 
 
 
 
 
143
 
144
def _consume_result_file(stdout: str) -> None:
    """Validate and delete the temp file announced by the child via ``RESULT_FILE:<path>``.

    The file is removed even when it cannot be parsed, so failed hand-offs do
    not leave temp files behind. Only the first marker line is processed.
    """
    for line in stdout.splitlines():
        if not line.startswith("RESULT_FILE:"):
            continue
        result_file = Path(line.split(":", 1)[1].strip())
        try:
            with result_file.open() as f:
                json.load(f)
        except (json.JSONDecodeError, OSError):
            logger.exception("Failed to read result file")
        finally:
            with contextlib.suppress(OSError):
                result_file.unlink()
        return


def run_task_subprocess(task_name: str, output_dir: str) -> tuple[bool, str, float]:
    """Run one task in a child interpreter with a wall-clock timeout.

    ``TASK_RUNNER_SCRIPT`` is written to a temporary file and executed with
    the current interpreter. No memory limit is enforced here; isolation only
    means that a killed child does not take the parent down with it.

    Args:
        task_name: Name of the task handed to the runner script.
        output_dir: Directory the runner script writes MTEB results to.

    Returns:
        ``(success, task_name, duration_seconds)``. On timeout the duration is
        exactly ``TASK_TIMEOUT`` (callers use that value to recognise
        timeouts); when the subprocess cannot be run at all it is ``0.0``.
    """
    # Create temporary script file
    with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
        f.write(TASK_RUNNER_SCRIPT)
        script_path = f.name

    try:
        logger.info(f"Running task: {task_name}")
        start_time = time.time()

        # subprocess security: We control all inputs (script path and known arguments)
        cmd = [sys.executable, script_path, MODEL_PATH, task_name, output_dir, MODEL_NAME]
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)  # noqa: S603

        try:
            stdout, stderr = process.communicate(timeout=TASK_TIMEOUT)
        except subprocess.TimeoutExpired:
            logger.warning(f"⏱ Task {task_name} timed out after {TASK_TIMEOUT}s")
            process.kill()
            # Finish communication after the kill so the pipes are drained and
            # closed and the child is reaped.
            process.communicate()
            return False, task_name, TASK_TIMEOUT

        duration = time.time() - start_time

        if process.returncode == 0:
            _consume_result_file(stdout)
            logger.info(f"✓ Completed {task_name} in {duration:.2f}s")
            return True, task_name, duration

        if process.returncode == SIGKILL_RETURN_CODE:
            logger.error(f"✗ Task {task_name} killed (OOM) - return code {process.returncode}")
        else:
            logger.error(f"✗ Task {task_name} failed with return code {process.returncode}")
        if stderr:
            logger.error(f"Error output: {stderr}")
        if stdout and stdout.strip():
            # The runner script may report its exception on stdout.
            logger.error(f"Task output: {stdout.strip()}")
        return False, task_name, duration

    except Exception:
        logger.exception(f" Failed to run task {task_name}")
        return False, task_name, 0.0

    finally:
        # Clean up script file
        with contextlib.suppress(Exception):
            Path(script_path).unlink()
203
 
 
204
 
205
def collect_results(output_path: Path) -> dict:
    """Load every per-task result JSON stored for ``MODEL_NAME``.

    Reads ``<output_path>/<MODEL_NAME>/distilled/*.json`` (skipping
    ``model_meta.json``) and returns a mapping of file stem to decoded
    content. Files that cannot be read or decoded are logged and skipped.
    """
    results_dir = output_path / MODEL_NAME / "distilled"
    task_results: dict = {}
    if not results_dir.exists():
        return task_results

    for result_file in results_dir.glob("*.json"):
        if result_file.name == "model_meta.json":
            continue
        try:
            with result_file.open() as handle:
                payload = json.load(handle)
        except (json.JSONDecodeError, OSError) as e:
            logger.warning(f"Could not load {result_file}: {e}")
            continue
        task_results[result_file.stem] = payload

    return task_results
225
 
 
 
 
226
 
227
def save_summary(output_path: Path, results: dict, stats: dict) -> None:
    """Write ``mteb_summary.json`` with run statistics and all task results.

    The file records the model name, the current Unix timestamp, the
    per-task timeout, ``stats`` and ``results``; values that are not JSON
    serializable are stringified.
    """
    summary_file = output_path / "mteb_summary.json"
    payload = {
        "model_name": MODEL_NAME,
        "timestamp": time.time(),
        "task_timeout": TASK_TIMEOUT,
        "stats": stats,
        "task_results": results,
    }

    with summary_file.open("w") as handle:
        json.dump(payload, handle, indent=2, default=str)

    logger.info(f"Summary saved to {summary_file}")
242
 
 
 
243
 
244
def main() -> None:
    """Evaluate all pending tasks one after another, each in its own subprocess.

    Flow: log the configuration and total system memory, resolve the task
    list, drop tasks that already have results, run the remainder
    sequentially (one attempt each, one second pause after every task), log
    a summary and write ``mteb_summary.json``. Returns early, without writing
    a summary, when no tasks are found or nothing is left to run.
    """
    logger.info(f"Starting MTEB evaluation for {MODEL_NAME}")
    logger.info(f"Task timeout: {TASK_TIMEOUT}s (no retries)")
    logger.info("Memory isolation: Each task runs in separate subprocess")

    # Log system info
    total_bytes = psutil.virtual_memory().total
    logger.info(f"System memory: {total_bytes / (1024**3):.1f} GB total")

    output_path = Path(OUTPUT_DIR)
    output_path.mkdir(parents=True, exist_ok=True)

    all_tasks = get_available_tasks()
    if not all_tasks:
        logger.error("No tasks found!")
        return
    logger.info(f"Found {len(all_tasks)} tasks")

    remaining_tasks = check_existing_results(output_path, all_tasks)
    total = len(remaining_tasks)
    logger.info(f"Will evaluate {total} remaining tasks")
    if not remaining_tasks:
        logger.info("All tasks already completed!")
        return

    started = time.time()
    successful_tasks = []
    failed_tasks = []
    timed_out_tasks = []

    for position, task_name in enumerate(remaining_tasks, start=1):
        logger.info(f"[{position}/{total}] Processing: {task_name}")

        # Single attempt per task; a duration equal to TASK_TIMEOUT marks a timeout.
        success, name, duration = run_task_subprocess(task_name, str(output_path))
        if success:
            successful_tasks.append((name, duration))
        elif duration == TASK_TIMEOUT:
            timed_out_tasks.append(name)
        else:
            failed_tasks.append(name)

        progress = (position / total) * 100
        logger.info(f"Progress: {position}/{total} ({progress:.1f}%)")

        # Brief pause between tasks
        time.sleep(1)

    total_time = time.time() - started

    rule = "=" * 80
    logger.info(rule)
    logger.info("EVALUATION SUMMARY")
    logger.info(rule)
    logger.info(f"Total tasks: {total}")
    logger.info(f"Successful: {len(successful_tasks)}")
    logger.info(f"Failed: {len(failed_tasks)}")
    logger.info(f"Timed out: {len(timed_out_tasks)}")
    logger.info(f"Total time: {total_time:.2f}s")

    avg_time = 0
    if successful_tasks:
        avg_time = sum(duration for _, duration in successful_tasks) / len(successful_tasks)
        logger.info(f"Average successful task time: {avg_time:.2f}s")

    if failed_tasks:
        logger.warning(f"Failed tasks: {failed_tasks}")

    if timed_out_tasks:
        logger.warning(f"Timed out tasks: {timed_out_tasks}")

    logger.info(rule)

    # Collect and save results
    all_results = collect_results(output_path)
    stats = {
        "total_tasks": total,
        "successful": len(successful_tasks),
        "failed": len(failed_tasks),
        "timed_out": len(timed_out_tasks),
        "total_time": total_time,
        "avg_time": avg_time,
        "successful_task_details": successful_tasks,
        "failed_tasks": failed_tasks,
        "timed_out_tasks": timed_out_tasks,
    }

    save_summary(output_path, all_results, stats)
    logger.info("Evaluation completed!")
 
340
 
341
 
342
  if __name__ == "__main__":
README.md CHANGED
@@ -134,6 +134,56 @@ Detailed evaluation results, including similarity plots and performance metrics,
134
  - `trained_code_classifier/` - Directory containing trained classification model
135
  - `mteb_results/` - Directory containing MTEB evaluation results
136
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  ## Acknowledgments
138
 
139
  This project is built upon the following technologies:
 
134
  - `trained_code_classifier/` - Directory containing trained classification model
135
  - `mteb_results/` - Directory containing MTEB evaluation results
136
 
137
+ ## MTEB Benchmark Results (Partial)
138
+
139
+ **Overall Average Score: 0.1962**
140
+
141
+ | Category | Task | Score |
142
+ |----------|------|-------|
143
+ | **Classification** | **Average** | **0.4164** |
144
+ | | AmazonCounterfactualClassification | 0.5690 |
145
+ | | AmazonReviewsClassification | 0.2637 |
146
+ | | | |
147
+ | **Clustering** | **Average** | **0.0775** |
148
+ | | BiorxivClusteringS2S | 0.0775 |
149
+ | | | |
150
+ | **Reranking** | **Average** | **0.4643** |
151
+ | | AskUbuntuDupQuestions | 0.4643 |
152
+ | | | |
153
+ | **Retrieval** | **Average** | **0.1509** |
154
+ | | ArguAna | 0.1509 |
155
+ | | | |
156
+ | **CodeRetrieval** | **Average** | **0.1034** |
157
+ | | AppsRetrieval | 0.0008 |
158
+ | | COIRCodeSearchNetRetrieval | Failed |
159
+ | | CodeFeedbackMT | 0.1594 |
160
+ | | CodeSearchNetCCRetrieval | Failed |
161
+ | | CodeTransOceanContest | 0.0951 |
162
+ | | CodeTransOceanDL | 0.2780 |
163
+ | | CosQA | 0.0097 |
164
+ | | StackOverflowQA | 0.1762 |
165
+ | | SyntheticText2SQL | 0.0049 |
166
+ | | | |
167
+ | **STS** | **Average** | **0.3016** |
168
+ | | BIOSSES | 0.3016 |
169
+ | | | |
170
+
171
+ ### Summary Statistics
172
+
173
+ - **Total Tasks**: 15
174
+ - **Successful Tasks**: 13
175
+ - **Failed Tasks**: 2
176
+ - **Overall Average**: 0.1962
177
+
178
+ ### Category Averages
179
+
180
+ - **Classification**: 0.4164 (2 tasks)
181
+ - **Clustering**: 0.0775 (1 task)
182
+ - **Reranking**: 0.4643 (1 task)
183
+ - **Retrieval**: 0.1509 (1 task)
184
+ - **CodeRetrieval**: 0.1034 (7 tasks)
185
+ - **STS**: 0.3016 (1 task)
186
+
187
  ## Acknowledgments
188
 
189
  This project is built upon the following technologies:
analyze_mteb_results.py ADDED
@@ -0,0 +1,311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ MTEB Results Analysis Script.
4
+
5
+ This script analyzes MTEB benchmark results from the results directory,
6
+ categorizes tasks, calculates averages, and updates the README.md with
7
+ a comprehensive results table.
8
+ """
9
+
10
import json
import logging
import re
from pathlib import Path
13
+
14
# Task category mappings based on MTEB benchmark structure.
#
# Maps an exact MTEB task name to the category label it is reported under.
# categorize_tasks() looks names up here first and only falls back to
# substring-based inference for names that are missing from this table.
TASK_CATEGORIES = {
    # Classification tasks
    "AmazonCounterfactualClassification": "Classification",
    "AmazonReviewsClassification": "Classification",
    "Banking77Classification": "Classification",
    "EmotionClassification": "Classification",
    "ImdbClassification": "Classification",
    "MassiveIntentClassification": "Classification",
    "MassiveScenarioClassification": "Classification",
    "MTOPDomainClassification": "Classification",
    "MTOPIntentClassification": "Classification",
    "ToxicConversationsClassification": "Classification",
    "TweetSentimentExtractionClassification": "Classification",
    # Clustering tasks
    "ArxivClusteringP2P": "Clustering",
    "ArxivClusteringS2S": "Clustering",
    "BiorxivClusteringP2P": "Clustering",
    "BiorxivClusteringS2S": "Clustering",
    "MedrxivClusteringP2P": "Clustering",
    "MedrxivClusteringS2S": "Clustering",
    "RedditClustering": "Clustering",
    "RedditClusteringP2P": "Clustering",
    "StackExchangeClustering": "Clustering",
    "StackExchangeClusteringP2P": "Clustering",
    "TwentyNewsgroupsClustering": "Clustering",
    # Pair Classification tasks
    "SprintDuplicateQuestions": "PairClassification",
    "TwitterSemEval2015": "PairClassification",
    "TwitterURLCorpus": "PairClassification",
    # Reranking tasks
    "AskUbuntuDupQuestions": "Reranking",
    "MindSmallReranking": "Reranking",
    "SciDocsRR": "Reranking",
    "StackOverflowDupQuestions": "Reranking",
    # Retrieval tasks
    "ArguAna": "Retrieval",
    "ClimateFEVER": "Retrieval",
    "CQADupstackRetrieval": "Retrieval",
    "DBPedia": "Retrieval",
    "FEVER": "Retrieval",
    "FiQA2018": "Retrieval",
    "HotpotQA": "Retrieval",
    "MSMARCO": "Retrieval",
    "NFCorpus": "Retrieval",
    "NQ": "Retrieval",
    "QuoraRetrieval": "Retrieval",
    "SCIDOCS": "Retrieval",
    "SciFact": "Retrieval",
    "Touche2020": "Retrieval",
    "TRECCOVID": "Retrieval",
    # Code retrieval tasks
    "CodeSearchNetCCRetrieval": "CodeRetrieval",
    "COIRCodeSearchNetRetrieval": "CodeRetrieval",
    "StackOverflowQA": "CodeRetrieval",
    "AppsRetrieval": "CodeRetrieval",
    "CodeTransOceanContest": "CodeRetrieval",
    "CodeTransOceanDL": "CodeRetrieval",
    "CodeFeedbackMT": "CodeRetrieval",
    "SyntheticText2SQL": "CodeRetrieval",
    "CosQA": "CodeRetrieval",
    # STS (Semantic Textual Similarity) tasks
    "BIOSSES": "STS",
    "SICK-R": "STS",
    "STS12": "STS",
    "STS13": "STS",
    "STS14": "STS",
    "STS15": "STS",
    "STS16": "STS",
    "STS17": "STS",
    "STS22": "STS",
    "STSBenchmark": "STS",
    # NOTE(review): MTEB appears to list SummEval as a Summarization task; it is
    # grouped with STS here - confirm this is intentional before relying on the
    # STS average.
    "SummEval": "STS",
}
88
+
89
+
90
def load_mteb_results(results_dir: Path) -> dict[str, dict]:
    """Load every per-task MTEB result file found directly in *results_dir*.

    ``model_meta.json`` is ignored. A file that cannot be read, is not valid
    JSON, or whose top-level value is not a JSON object is skipped with a
    logged warning instead of being dropped silently or crashing the run.

    Args:
        results_dir: Directory containing one ``*.json`` result file per task.

    Returns:
        Mapping of task name to the parsed result payload. The task name is
        the payload's ``task_name`` field, falling back to the file stem.
    """
    logger = logging.getLogger(__name__)
    results: dict[str, dict] = {}

    # Sort so the load order (and therefore which file wins when two files
    # declare the same task name) does not depend on filesystem ordering.
    for json_file in sorted(results_dir.glob("*.json")):
        if json_file.name == "model_meta.json":
            continue

        try:
            with json_file.open(encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as exc:
            # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
            logger.warning("Skipping unreadable result file %s: %s", json_file, exc)
            continue

        if not isinstance(data, dict):
            # A list/str/number payload has no .get(); treat it as malformed.
            logger.warning("Skipping %s: top-level JSON value is not an object", json_file)
            continue

        results[data.get("task_name", json_file.stem)] = data

    return results
107
+
108
+
109
def extract_main_score(result_data: dict) -> float:
    """Extract the main score from a task result.

    The first entry of the ``"test"`` split is preferred. When that split is
    missing or empty, the ``"dev"`` and ``"validation"`` splits and then any
    remaining split are tried, so a task evaluated on a non-test split is not
    misreported as failed.

    Args:
        result_data: Parsed result payload with a ``"scores"`` mapping of
            split name to a list of per-subset score dictionaries.

    Returns:
        The ``main_score`` of the first usable entry as a float, or ``0.0``
        when no numeric main score can be found (callers treat 0 as failed).
    """
    scores = result_data.get("scores") if isinstance(result_data, dict) else None
    if not isinstance(scores, dict):
        return 0.0

    preferred = ("test", "dev", "validation")
    split_order = [name for name in preferred if name in scores]
    split_order += [name for name in scores if name not in preferred]

    for split in split_order:
        entries = scores[split]
        if not isinstance(entries, list) or not entries:
            continue
        first = entries[0]
        value = first.get("main_score") if isinstance(first, dict) else None
        # bool is an int subclass; a True/False "score" is malformed data.
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            return float(value)

    return 0.0
116
+
117
+
118
def categorize_tasks(results: dict[str, dict]) -> dict[str, list[tuple[str, float]]]:
    """Group task results by category and attach each task's main score.

    The category comes from ``TASK_CATEGORIES`` when the task is listed there;
    otherwise it is inferred from substrings of the task name, defaulting to
    ``"Other"``.

    Args:
        results: Mapping of task name to parsed result payload.

    Returns:
        Mapping of category name to ``(task_name, score)`` pairs, sorted by
        task name within each category.
    """
    categories: dict[str, list[tuple[str, float]]] = {}

    for task_name, result_data in results.items():
        category = TASK_CATEGORIES.get(task_name)

        if not category:
            # Infer the category from the task name. Code/SQL is tested before
            # the generic Retrieval/QA check: most code tasks also contain
            # "Retrieval" in their name and would otherwise never be
            # recognised as code retrieval.
            if "Classification" in task_name:
                category = "Classification"
            elif "Clustering" in task_name:
                category = "Clustering"
            elif "Reranking" in task_name:
                category = "Reranking"
            elif "Code" in task_name or "SQL" in task_name:
                category = "CodeRetrieval"
            elif "Retrieval" in task_name or "QA" in task_name:
                category = "Retrieval"
            elif "STS" in task_name or "SICK" in task_name or "BIOSSES" in task_name:
                category = "STS"
            else:
                category = "Other"

        score = extract_main_score(result_data)
        categories.setdefault(category, []).append((task_name, score))

    # Sort tasks within each category by name for stable output.
    for category_tasks in categories.values():
        category_tasks.sort(key=lambda item: item[0])

    return categories
152
+
153
+
154
def calculate_averages(categories: dict[str, list[tuple[str, float]]]) -> dict[str, float]:
    """Return the mean score of the successful tasks in every category.

    Tasks whose score is not strictly positive are treated as failed and are
    left out of the mean; a category with no successful task averages 0.0.
    """

    def mean_of_successes(entries: list[tuple[str, float]]) -> float:
        passed = [value for _name, value in entries if value > 0]
        return sum(passed) / len(passed) if passed else 0.0

    return {label: mean_of_successes(entries) for label, entries in categories.items()}
166
+
167
+
168
def generate_results_table(categories: dict[str, list[tuple[str, float]]], averages: dict[str, float]) -> str:
    """Generate the markdown results section (table plus summary lists).

    Tasks with a score that is not strictly positive are rendered as
    ``Failed`` and excluded from every average and from the success counts.

    Args:
        categories: Mapping of category name to ``(task_name, score)`` pairs.
        averages: Mapping of category name to its average score. A category
            missing from this mapping has its average computed from its own
            successful tasks.

    Returns:
        The complete markdown section as a single string, without a trailing
        newline.
    """

    def successful(tasks: list[tuple[str, float]]) -> list[float]:
        return [score for _, score in tasks if score > 0]

    def category_average(category: str) -> float:
        if category in averages:
            return averages[category]
        passed = successful(categories[category])
        return sum(passed) / len(passed) if passed else 0.0

    # Overall average over every successful task, regardless of category.
    all_scores = [score for tasks in categories.values() for score in successful(tasks)]
    overall_avg = sum(all_scores) / len(all_scores) if all_scores else 0.0
    total_tasks = sum(len(tasks) for tasks in categories.values())

    table_lines = [
        "## MTEB Benchmark Results",
        "",
        f"**Overall Average Score: {overall_avg:.4f}**",
        "",
        "| Category | Task | Score |",
        "|----------|------|-------|",
    ]

    # Fixed ordering for the known categories.
    preferred_order = [
        "Classification",
        "Clustering",
        "PairClassification",
        "Reranking",
        "Retrieval",
        "CodeRetrieval",
        "STS",
        "Other",
    ]
    # Categories outside the fixed ordering are appended alphabetically so that
    # every task counted in the totals is also visible in the table.
    category_order = preferred_order + sorted(name for name in categories if name not in preferred_order)
    # Only categories that actually contain tasks are rendered.
    rendered = [name for name in category_order if categories.get(name)]

    for category in rendered:
        # Category average row, followed by one row per task.
        table_lines.append(f"| **{category}** | **Average** | **{category_average(category):.4f}** |")

        for task_name, score in categories[category]:
            if score > 0:
                table_lines.append(f"| | {task_name} | {score:.4f} |")
            else:
                table_lines.append(f"| | {task_name} | Failed |")

        table_lines.append("| | | |")  # Empty row for spacing

    table_lines.extend(
        [
            "",
            "### Summary Statistics",
            "",
            f"- **Total Tasks**: {total_tasks}",
            f"- **Successful Tasks**: {len(all_scores)}",
            f"- **Failed Tasks**: {total_tasks - len(all_scores)}",
            f"- **Overall Average**: {overall_avg:.4f}",
            "",
            "### Category Averages",
            "",
        ]
    )

    for category in rendered:
        # The count covers successful tasks only, matching what the average uses.
        task_count = len(successful(categories[category]))
        noun = "task" if task_count == 1 else "tasks"
        table_lines.append(f"- **{category}**: {category_average(category):.4f} ({task_count} {noun})")

    return "\n".join(table_lines)
243
+
244
+
245
def update_readme(results_table: str, readme_path: Path = Path("README.md")) -> None:
    """Insert or refresh the MTEB results section in the README.

    An existing ``## MTEB Benchmark Results`` section is replaced in place.
    Otherwise the section is inserted before the first ``## Acknowledgments``
    heading, else before the first ``## License`` heading, else appended at
    the end of the file. Running the function twice with the same table
    leaves the file unchanged. Nothing happens if the README does not exist.

    Args:
        results_table: Markdown for the complete results section.
        readme_path: README file to update.
    """
    if not readme_path.exists():
        return

    content = readme_path.read_text(encoding="utf-8")
    section = results_table.rstrip("\n")

    # The section runs from its heading up to (not including) the newline that
    # precedes the next level-1/level-2 heading, or to the end of the file.
    # The heading is anchored to a line start so "### MTEB ..." is not matched.
    mteb_pattern = re.compile(
        r"^## MTEB Benchmark Results.*?(?=\n## |\n# |\Z)",
        re.DOTALL | re.MULTILINE,
    )

    if mteb_pattern.search(content):
        # A function replacement keeps backslashes in the table literal (a
        # string replacement would be parsed as a regex template). The trailing
        # newline preserves the blank line before the following heading, which
        # the match may or may not have consumed.
        new_content = mteb_pattern.sub(lambda _match: f"{section}\n", content)
    else:
        for heading in ("## Acknowledgments", "## License"):
            anchor = re.search(rf"^{re.escape(heading)}", content, re.MULTILINE)
            if anchor:
                # Insert before the first matching heading only.
                new_content = f"{content[: anchor.start()]}{section}\n\n{content[anchor.start():]}"
                break
        else:
            # No known anchor: append at the end, separated by one blank line.
            prefix = content.rstrip("\n")
            new_content = f"{prefix}\n\n{section}\n" if prefix else f"{section}\n"

    readme_path.write_text(new_content, encoding="utf-8")
272
+
273
+
274
def main() -> None:
    """Analyze the MTEB results, update README.md and log a short summary.

    Reads the per-task result files from the hard-coded results directory,
    rewrites the results section of ``README.md`` in the current working
    directory, and logs overall and per-category statistics. Logs an error
    and returns early when the directory is missing or holds no results.
    """
    logging.basicConfig(level=logging.INFO, format="%(message)s")
    logger = logging.getLogger(__name__)

    results_dir = Path("mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled")

    if not results_dir.exists():
        logger.error("Results directory not found: %s", results_dir)
        return

    results = load_mteb_results(results_dir)

    if not results:
        logger.error("No MTEB result files found in %s", results_dir)
        return

    categories = categorize_tasks(results)
    averages = calculate_averages(categories)
    results_table = generate_results_table(categories, averages)
    update_readme(results_table)

    # Summary for the console; a score of 0 or below marks a failed task,
    # matching the convention used when the table is generated.
    successes = {category: [s for _, s in tasks if s > 0] for category, tasks in categories.items()}
    total_tasks = sum(len(tasks) for tasks in categories.values())
    all_scores = [score for scores in successes.values() for score in scores]

    logger.info(
        "Analyzed %d tasks: %d successful, %d failed",
        total_tasks,
        len(all_scores),
        total_tasks - len(all_scores),
    )
    if all_scores:
        logger.info("Overall average: %.4f", sum(all_scores) / len(all_scores))

    for category, scores in successes.items():
        logger.info("  %s: %.4f (%d successful)", category, averages.get(category, 0.0), len(scores))


if __name__ == "__main__":
    main()
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/AmazonCounterfactualClassification.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "dataset_revision": "e8379541af4e31359cca9fbcf4b00f2671dba205",
3
- "evaluation_time": 8.737873554229736,
4
  "kg_co2_emissions": null,
5
  "mteb_version": "1.14.15",
6
  "scores": {
 
1
  {
2
  "dataset_revision": "e8379541af4e31359cca9fbcf4b00f2671dba205",
3
+ "evaluation_time": 7.698482990264893,
4
  "kg_co2_emissions": null,
5
  "mteb_version": "1.14.15",
6
  "scores": {
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/AmazonReviewsClassification.json ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "1399c76144fd37290681b995c656ef9b2e06e26d",
3
+ "evaluation_time": 5.071816444396973,
4
+ "kg_co2_emissions": null,
5
+ "mteb_version": "1.14.15",
6
+ "scores": {
7
+ "test": [
8
+ {
9
+ "accuracy": 0.26374,
10
+ "f1": 0.25472288926645315,
11
+ "f1_weighted": 0.25472288926645315,
12
+ "hf_subset": "en",
13
+ "languages": [
14
+ "eng-Latn"
15
+ ],
16
+ "main_score": 0.26374,
17
+ "scores_per_experiment": [
18
+ {
19
+ "accuracy": 0.29,
20
+ "f1": 0.2830487996396496,
21
+ "f1_weighted": 0.2830487996396495
22
+ },
23
+ {
24
+ "accuracy": 0.276,
25
+ "f1": 0.26621916451801775,
26
+ "f1_weighted": 0.2662191645180177
27
+ },
28
+ {
29
+ "accuracy": 0.2682,
30
+ "f1": 0.24934092172665734,
31
+ "f1_weighted": 0.24934092172665728
32
+ },
33
+ {
34
+ "accuracy": 0.297,
35
+ "f1": 0.29141160920496506,
36
+ "f1_weighted": 0.29141160920496506
37
+ },
38
+ {
39
+ "accuracy": 0.268,
40
+ "f1": 0.2528895121087961,
41
+ "f1_weighted": 0.2528895121087961
42
+ },
43
+ {
44
+ "accuracy": 0.2548,
45
+ "f1": 0.25158219767608686,
46
+ "f1_weighted": 0.2515821976760869
47
+ },
48
+ {
49
+ "accuracy": 0.2192,
50
+ "f1": 0.21535453372408658,
51
+ "f1_weighted": 0.21535453372408656
52
+ },
53
+ {
54
+ "accuracy": 0.264,
55
+ "f1": 0.2493331111938578,
56
+ "f1_weighted": 0.24933311119385781
57
+ },
58
+ {
59
+ "accuracy": 0.2694,
60
+ "f1": 0.2569449221084947,
61
+ "f1_weighted": 0.2569449221084947
62
+ },
63
+ {
64
+ "accuracy": 0.2308,
65
+ "f1": 0.23110412076392003,
66
+ "f1_weighted": 0.23110412076392003
67
+ }
68
+ ]
69
+ }
70
+ ]
71
+ },
72
+ "task_name": "AmazonReviewsClassification"
73
+ }
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/AppsRetrieval.json ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "f22508f96b7a36c2415181ed8bb76f76e04ae2d5",
3
+ "evaluation_time": 7.666281223297119,
4
+ "kg_co2_emissions": null,
5
+ "mteb_version": "1.14.15",
6
+ "scores": {
7
+ "test": [
8
+ {
9
+ "hf_subset": "default",
10
+ "languages": [
11
+ "eng-Latn",
12
+ "python-Code"
13
+ ],
14
+ "main_score": 0.00085,
15
+ "map_at_1": 0.00053,
16
+ "map_at_10": 0.00071,
17
+ "map_at_100": 0.00084,
18
+ "map_at_1000": 0.00102,
19
+ "map_at_20": 0.00078,
20
+ "map_at_3": 0.00062,
21
+ "map_at_5": 0.00062,
22
+ "mrr_at_1": 0.0005312084993359894,
23
+ "mrr_at_10": 0.0007082779991146524,
24
+ "mrr_at_100": 0.0008420281651420131,
25
+ "mrr_at_1000": 0.0010194100996475369,
26
+ "mrr_at_20": 0.000778427190426253,
27
+ "mrr_at_3": 0.000619743249225321,
28
+ "mrr_at_5": 0.000619743249225321,
29
+ "nauc_map_at_1000_diff1": 0.17541432452987074,
30
+ "nauc_map_at_1000_max": -0.4154388332336005,
31
+ "nauc_map_at_1000_std": -0.3819043407981619,
32
+ "nauc_map_at_100_diff1": 0.20679746840207688,
33
+ "nauc_map_at_100_max": -0.4833278286603358,
34
+ "nauc_map_at_100_std": -0.4458942676390952,
35
+ "nauc_map_at_10_diff1": 0.24647952852769084,
36
+ "nauc_map_at_10_max": -0.5373695050821257,
37
+ "nauc_map_at_10_std": -0.5236953500263536,
38
+ "nauc_map_at_1_diff1": 0.29581300303762115,
39
+ "nauc_map_at_1_max": -0.5373695050821257,
40
+ "nauc_map_at_1_std": -0.5373695050821257,
41
+ "nauc_map_at_20_diff1": 0.21533097655723032,
42
+ "nauc_map_at_20_max": -0.5291249767848446,
43
+ "nauc_map_at_20_std": -0.4922264006934063,
44
+ "nauc_map_at_3_diff1": 0.26762244617480385,
45
+ "nauc_map_at_3_max": -0.5320020257576935,
46
+ "nauc_map_at_3_std": -0.542736984406558,
47
+ "nauc_map_at_5_diff1": 0.26762244617480385,
48
+ "nauc_map_at_5_max": -0.5320020257576935,
49
+ "nauc_map_at_5_std": -0.542736984406558,
50
+ "nauc_mrr_at_1000_diff1": 0.17541443100790596,
51
+ "nauc_mrr_at_1000_max": -0.4154385500206541,
52
+ "nauc_mrr_at_1000_std": -0.38190393644049175,
53
+ "nauc_mrr_at_100_diff1": 0.20679746840207688,
54
+ "nauc_mrr_at_100_max": -0.4833278286603358,
55
+ "nauc_mrr_at_100_std": -0.4458942676390952,
56
+ "nauc_mrr_at_10_diff1": 0.24647952852769084,
57
+ "nauc_mrr_at_10_max": -0.5373695050821257,
58
+ "nauc_mrr_at_10_std": -0.5236953500263536,
59
+ "nauc_mrr_at_1_diff1": 0.29581300303762115,
60
+ "nauc_mrr_at_1_max": -0.5373695050821257,
61
+ "nauc_mrr_at_1_std": -0.5373695050821257,
62
+ "nauc_mrr_at_20_diff1": 0.21533097655723032,
63
+ "nauc_mrr_at_20_max": -0.5291249767848446,
64
+ "nauc_mrr_at_20_std": -0.4922264006934063,
65
+ "nauc_mrr_at_3_diff1": 0.26762244617480385,
66
+ "nauc_mrr_at_3_max": -0.5320020257576935,
67
+ "nauc_mrr_at_3_std": -0.542736984406558,
68
+ "nauc_mrr_at_5_diff1": 0.26762244617480385,
69
+ "nauc_mrr_at_5_max": -0.5320020257576935,
70
+ "nauc_mrr_at_5_std": -0.542736984406558,
71
+ "nauc_ndcg_at_1000_diff1": 0.03711794407808404,
72
+ "nauc_ndcg_at_1000_max": -0.10620944898582887,
73
+ "nauc_ndcg_at_1000_std": -0.07214854599247035,
74
+ "nauc_ndcg_at_100_diff1": 0.1478165352946149,
75
+ "nauc_ndcg_at_100_max": -0.3266890379270042,
76
+ "nauc_ndcg_at_100_std": -0.24237793463929755,
77
+ "nauc_ndcg_at_10_diff1": 0.22133616828561992,
78
+ "nauc_ndcg_at_10_max": -0.5398539007431512,
79
+ "nauc_ndcg_at_10_std": -0.5106250645273135,
80
+ "nauc_ndcg_at_1_diff1": 0.29581300303762115,
81
+ "nauc_ndcg_at_1_max": -0.5373695050821257,
82
+ "nauc_ndcg_at_1_std": -0.5373695050821257,
83
+ "nauc_ndcg_at_20_diff1": 0.15496100992740594,
84
+ "nauc_ndcg_at_20_max": -0.5151090921512047,
85
+ "nauc_ndcg_at_20_std": -0.42331797746940425,
86
+ "nauc_ndcg_at_3_diff1": 0.25634622342967694,
87
+ "nauc_ndcg_at_3_max": -0.5298550340279207,
88
+ "nauc_ndcg_at_3_std": -0.5448839761363309,
89
+ "nauc_ndcg_at_5_diff1": 0.25634622342967694,
90
+ "nauc_ndcg_at_5_max": -0.5298550340279207,
91
+ "nauc_ndcg_at_5_std": -0.5448839761363309,
92
+ "nauc_precision_at_1000_diff1": 0.020777159599148352,
93
+ "nauc_precision_at_1000_max": -0.06655316040754289,
94
+ "nauc_precision_at_1000_std": -0.035219149425472995,
95
+ "nauc_precision_at_100_diff1": 0.11476528495195117,
96
+ "nauc_precision_at_100_max": -0.1810698361522713,
97
+ "nauc_precision_at_100_std": -0.0631149349365322,
98
+ "nauc_precision_at_10_diff1": 0.17741266421378812,
99
+ "nauc_precision_at_10_max": -0.5448839761363309,
100
+ "nauc_precision_at_10_std": -0.48609773784945,
101
+ "nauc_precision_at_1_diff1": 0.29581300303762115,
102
+ "nauc_precision_at_1_max": -0.5373695050821257,
103
+ "nauc_precision_at_1_std": -0.5373695050821257,
104
+ "nauc_precision_at_20_diff1": 0.0803859336130361,
105
+ "nauc_precision_at_20_max": -0.4938580214848722,
106
+ "nauc_precision_at_20_std": -0.32580885579158325,
107
+ "nauc_precision_at_3_diff1": 0.23003503702438047,
108
+ "nauc_precision_at_3_max": -0.5248453866584504,
109
+ "nauc_precision_at_3_std": -0.549893623505801,
110
+ "nauc_precision_at_5_diff1": 0.2300350370243805,
111
+ "nauc_precision_at_5_max": -0.5248453866584505,
112
+ "nauc_precision_at_5_std": -0.5498936235058011,
113
+ "nauc_recall_at_1000_diff1": 0.020777159599148387,
114
+ "nauc_recall_at_1000_max": -0.06655316040754268,
115
+ "nauc_recall_at_1000_std": -0.03521914942547286,
116
+ "nauc_recall_at_100_diff1": 0.11476528495195135,
117
+ "nauc_recall_at_100_max": -0.18106983615227104,
118
+ "nauc_recall_at_100_std": -0.06311493493653193,
119
+ "nauc_recall_at_10_diff1": 0.17741266421378812,
120
+ "nauc_recall_at_10_max": -0.5448839761363309,
121
+ "nauc_recall_at_10_std": -0.48609773784945015,
122
+ "nauc_recall_at_1_diff1": 0.29581300303762115,
123
+ "nauc_recall_at_1_max": -0.5373695050821257,
124
+ "nauc_recall_at_1_std": -0.5373695050821257,
125
+ "nauc_recall_at_20_diff1": 0.08038593361303595,
126
+ "nauc_recall_at_20_max": -0.4938580214848721,
127
+ "nauc_recall_at_20_std": -0.3258088557915833,
128
+ "nauc_recall_at_3_diff1": 0.23003503702438058,
129
+ "nauc_recall_at_3_max": -0.5248453866584504,
130
+ "nauc_recall_at_3_std": -0.549893623505801,
131
+ "nauc_recall_at_5_diff1": 0.23003503702438058,
132
+ "nauc_recall_at_5_max": -0.5248453866584504,
133
+ "nauc_recall_at_5_std": -0.549893623505801,
134
+ "ndcg_at_1": 0.00053,
135
+ "ndcg_at_10": 0.00085,
136
+ "ndcg_at_100": 0.00164,
137
+ "ndcg_at_1000": 0.01024,
138
+ "ndcg_at_20": 0.00112,
139
+ "ndcg_at_3": 0.00066,
140
+ "ndcg_at_5": 0.00066,
141
+ "precision_at_1": 0.00053,
142
+ "precision_at_10": 0.00013,
143
+ "precision_at_100": 5e-05,
144
+ "precision_at_1000": 8e-05,
145
+ "precision_at_20": 0.00012,
146
+ "precision_at_3": 0.00027,
147
+ "precision_at_5": 0.00016,
148
+ "recall_at_1": 0.00053,
149
+ "recall_at_10": 0.00133,
150
+ "recall_at_100": 0.00531,
151
+ "recall_at_1000": 0.08234,
152
+ "recall_at_20": 0.00239,
153
+ "recall_at_3": 0.0008,
154
+ "recall_at_5": 0.0008
155
+ }
156
+ ]
157
+ },
158
+ "task_name": "AppsRetrieval"
159
+ }
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/ArguAna.json ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "c22ab2a51041ffd869aaddef7af8d8215647e41a",
3
+ "evaluation_time": 2.6541521549224854,
4
+ "kg_co2_emissions": null,
5
+ "mteb_version": "1.14.15",
6
+ "scores": {
7
+ "test": [
8
+ {
9
+ "hf_subset": "default",
10
+ "languages": [
11
+ "eng-Latn"
12
+ ],
13
+ "main_score": 0.1509,
14
+ "map_at_1": 0.0761,
15
+ "map_at_10": 0.12154,
16
+ "map_at_100": 0.12944,
17
+ "map_at_1000": 0.13039,
18
+ "map_at_20": 0.12583,
19
+ "map_at_3": 0.10218,
20
+ "map_at_5": 0.11381,
21
+ "mrr_at_1": 0.07681365576102418,
22
+ "mrr_at_10": 0.12187591727065411,
23
+ "mrr_at_100": 0.1297644188004288,
24
+ "mrr_at_1000": 0.13071256214837015,
25
+ "mrr_at_20": 0.12612116393992717,
26
+ "mrr_at_3": 0.10229966808914177,
27
+ "mrr_at_5": 0.11389284020862968,
28
+ "nauc_map_at_1000_diff1": 0.16146733687467354,
29
+ "nauc_map_at_1000_max": 0.06481036694891384,
30
+ "nauc_map_at_1000_std": 0.06943873238380074,
31
+ "nauc_map_at_100_diff1": 0.16139928093588565,
32
+ "nauc_map_at_100_max": 0.0644505695492588,
33
+ "nauc_map_at_100_std": 0.06974933502492409,
34
+ "nauc_map_at_10_diff1": 0.16658684660598214,
35
+ "nauc_map_at_10_max": 0.06224588862021987,
36
+ "nauc_map_at_10_std": 0.06880011677576231,
37
+ "nauc_map_at_1_diff1": 0.19579759176541026,
38
+ "nauc_map_at_1_max": 0.02603892226990134,
39
+ "nauc_map_at_1_std": 0.04620265141724082,
40
+ "nauc_map_at_20_diff1": 0.16382252282054222,
41
+ "nauc_map_at_20_max": 0.06529226434404913,
42
+ "nauc_map_at_20_std": 0.06849826441400649,
43
+ "nauc_map_at_3_diff1": 0.16402956096741358,
44
+ "nauc_map_at_3_max": 0.0419122332975646,
45
+ "nauc_map_at_3_std": 0.05925639235658917,
46
+ "nauc_map_at_5_diff1": 0.16894147524916653,
47
+ "nauc_map_at_5_max": 0.052414170749768195,
48
+ "nauc_map_at_5_std": 0.07005093386964208,
49
+ "nauc_mrr_at_1000_diff1": 0.15689213701351912,
50
+ "nauc_mrr_at_1000_max": 0.06318320049791439,
51
+ "nauc_mrr_at_1000_std": 0.06796437033671639,
52
+ "nauc_mrr_at_100_diff1": 0.1568336618890221,
53
+ "nauc_mrr_at_100_max": 0.06282844172684152,
54
+ "nauc_mrr_at_100_std": 0.06827651320612166,
55
+ "nauc_mrr_at_10_diff1": 0.1613861747091082,
56
+ "nauc_mrr_at_10_max": 0.06048932175951958,
57
+ "nauc_mrr_at_10_std": 0.06744723486463321,
58
+ "nauc_mrr_at_1_diff1": 0.1886592359414356,
59
+ "nauc_mrr_at_1_max": 0.025143721566481553,
60
+ "nauc_mrr_at_1_std": 0.04192879681303956,
61
+ "nauc_mrr_at_20_diff1": 0.15895555488261146,
62
+ "nauc_mrr_at_20_max": 0.06337596031238824,
63
+ "nauc_mrr_at_20_std": 0.06705300695703223,
64
+ "nauc_mrr_at_3_diff1": 0.15808017173425612,
65
+ "nauc_mrr_at_3_max": 0.03873273590791373,
66
+ "nauc_mrr_at_3_std": 0.05873440646581739,
67
+ "nauc_mrr_at_5_diff1": 0.1623674451736993,
68
+ "nauc_mrr_at_5_max": 0.048599887137470155,
69
+ "nauc_mrr_at_5_std": 0.06946191051556377,
70
+ "nauc_ndcg_at_1000_diff1": 0.14057148135021394,
71
+ "nauc_ndcg_at_1000_max": 0.09398561431514359,
72
+ "nauc_ndcg_at_1000_std": 0.06871748094502036,
73
+ "nauc_ndcg_at_100_diff1": 0.14160219742898073,
74
+ "nauc_ndcg_at_100_max": 0.08793842988004247,
75
+ "nauc_ndcg_at_100_std": 0.08107847041025427,
76
+ "nauc_ndcg_at_10_diff1": 0.16112940300466752,
77
+ "nauc_ndcg_at_10_max": 0.08282286934634887,
78
+ "nauc_ndcg_at_10_std": 0.07481333025577913,
79
+ "nauc_ndcg_at_1_diff1": 0.19579759176541026,
80
+ "nauc_ndcg_at_1_max": 0.02603892226990134,
81
+ "nauc_ndcg_at_1_std": 0.04620265141724082,
82
+ "nauc_ndcg_at_20_diff1": 0.15445394581844324,
83
+ "nauc_ndcg_at_20_max": 0.09290741055177616,
84
+ "nauc_ndcg_at_20_std": 0.0739310085946421,
85
+ "nauc_ndcg_at_3_diff1": 0.1574151504354397,
86
+ "nauc_ndcg_at_3_max": 0.04636430630581481,
87
+ "nauc_ndcg_at_3_std": 0.06191664189704533,
88
+ "nauc_ndcg_at_5_diff1": 0.1658753822856203,
89
+ "nauc_ndcg_at_5_max": 0.06313482448309465,
90
+ "nauc_ndcg_at_5_std": 0.07904072628627579,
91
+ "nauc_precision_at_1000_diff1": 0.0609588525314078,
92
+ "nauc_precision_at_1000_max": 0.1870041318251064,
93
+ "nauc_precision_at_1000_std": 0.019658161418599534,
94
+ "nauc_precision_at_100_diff1": 0.09473113411767209,
95
+ "nauc_precision_at_100_max": 0.1309396613797298,
96
+ "nauc_precision_at_100_std": 0.10623324275765494,
97
+ "nauc_precision_at_10_diff1": 0.15121181172955667,
98
+ "nauc_precision_at_10_max": 0.12477733598184097,
99
+ "nauc_precision_at_10_std": 0.08475912589528253,
100
+ "nauc_precision_at_1_diff1": 0.19579759176541026,
101
+ "nauc_precision_at_1_max": 0.02603892226990134,
102
+ "nauc_precision_at_1_std": 0.04620265141724082,
103
+ "nauc_precision_at_20_diff1": 0.1370251724378167,
104
+ "nauc_precision_at_20_max": 0.14912154538482067,
105
+ "nauc_precision_at_20_std": 0.08184312031151385,
106
+ "nauc_precision_at_3_diff1": 0.14253682467701162,
107
+ "nauc_precision_at_3_max": 0.05671718495423438,
108
+ "nauc_precision_at_3_std": 0.06788353997677292,
109
+ "nauc_precision_at_5_diff1": 0.16082986625463053,
110
+ "nauc_precision_at_5_max": 0.08573137277943063,
111
+ "nauc_precision_at_5_std": 0.09793524405071982,
112
+ "nauc_recall_at_1000_diff1": 0.06095885253140698,
113
+ "nauc_recall_at_1000_max": 0.1870041318251063,
114
+ "nauc_recall_at_1000_std": 0.019658161418598927,
115
+ "nauc_recall_at_100_diff1": 0.09473113411767223,
116
+ "nauc_recall_at_100_max": 0.1309396613797295,
117
+ "nauc_recall_at_100_std": 0.10623324275765476,
118
+ "nauc_recall_at_10_diff1": 0.1512118117295564,
119
+ "nauc_recall_at_10_max": 0.12477733598184074,
120
+ "nauc_recall_at_10_std": 0.08475912589528235,
121
+ "nauc_recall_at_1_diff1": 0.19579759176541026,
122
+ "nauc_recall_at_1_max": 0.02603892226990134,
123
+ "nauc_recall_at_1_std": 0.04620265141724082,
124
+ "nauc_recall_at_20_diff1": 0.13702517243781645,
125
+ "nauc_recall_at_20_max": 0.14912154538482042,
126
+ "nauc_recall_at_20_std": 0.08184312031151388,
127
+ "nauc_recall_at_3_diff1": 0.14253682467701162,
128
+ "nauc_recall_at_3_max": 0.056717184954234404,
129
+ "nauc_recall_at_3_std": 0.06788353997677302,
130
+ "nauc_recall_at_5_diff1": 0.16082986625463033,
131
+ "nauc_recall_at_5_max": 0.08573137277943055,
132
+ "nauc_recall_at_5_std": 0.0979352440507195,
133
+ "ndcg_at_1": 0.0761,
134
+ "ndcg_at_10": 0.1509,
135
+ "ndcg_at_100": 0.19506,
136
+ "ndcg_at_1000": 0.22612,
137
+ "ndcg_at_20": 0.16665,
138
+ "ndcg_at_3": 0.11065,
139
+ "ndcg_at_5": 0.13182,
140
+ "precision_at_1": 0.0761,
141
+ "precision_at_10": 0.02468,
142
+ "precision_at_100": 0.00467,
143
+ "precision_at_1000": 0.00072,
144
+ "precision_at_20": 0.01547,
145
+ "precision_at_3": 0.04505,
146
+ "precision_at_5": 0.03741,
147
+ "recall_at_1": 0.0761,
148
+ "recall_at_10": 0.2468,
149
+ "recall_at_100": 0.46728,
150
+ "recall_at_1000": 0.72475,
151
+ "recall_at_20": 0.30939,
152
+ "recall_at_3": 0.13514,
153
+ "recall_at_5": 0.18706
154
+ }
155
+ ]
156
+ },
157
+ "task_name": "ArguAna"
158
+ }
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/AskUbuntuDupQuestions.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "2000358ca161889fa9c082cb41daa8dcfb161a54",
3
+ "evaluation_time": 0.4332466125488281,
4
+ "kg_co2_emissions": null,
5
+ "mteb_version": "1.14.15",
6
+ "scores": {
7
+ "test": [
8
+ {
9
+ "hf_subset": "default",
10
+ "languages": [
11
+ "eng-Latn"
12
+ ],
13
+ "main_score": 0.46429757499703045,
14
+ "map": 0.46429757499703045,
15
+ "mrr": 0.601056368992657,
16
+ "nAUC_map_diff1": 0.16940332025233937,
17
+ "nAUC_map_max": 0.15925499774951668,
18
+ "nAUC_map_std": 0.05294826509824163,
19
+ "nAUC_mrr_diff1": 0.19481488519394907,
20
+ "nAUC_mrr_max": 0.21250668851129054,
21
+ "nAUC_mrr_std": 0.022766508692728404
22
+ }
23
+ ]
24
+ },
25
+ "task_name": "AskUbuntuDupQuestions"
26
+ }
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/BIOSSES.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "d3fb88f8f02e40887cd149695127462bbcf29b4a",
3
+ "evaluation_time": 0.0452265739440918,
4
+ "kg_co2_emissions": null,
5
+ "mteb_version": "1.14.15",
6
+ "scores": {
7
+ "test": [
8
+ {
9
+ "cosine_pearson": 0.2956207137346794,
10
+ "cosine_spearman": 0.30161530624430144,
11
+ "euclidean_pearson": 0.2995531537590785,
12
+ "euclidean_spearman": 0.30161530624430144,
13
+ "hf_subset": "default",
14
+ "languages": [
15
+ "eng-Latn"
16
+ ],
17
+ "main_score": 0.30161530624430144,
18
+ "manhattan_pearson": 0.33453615581396934,
19
+ "manhattan_spearman": 0.3532610613411196,
20
+ "pearson": 0.2956207137346794,
21
+ "spearman": 0.30161530624430144
22
+ }
23
+ ]
24
+ },
25
+ "task_name": "BIOSSES"
26
+ }
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/Banking77Classification.json DELETED
@@ -1,73 +0,0 @@
1
- {
2
- "dataset_revision": "0fd18e25b25c072e09e0d92ab615fda904d66300",
3
- "evaluation_time": 6.451777696609497,
4
- "kg_co2_emissions": null,
5
- "mteb_version": "1.14.15",
6
- "scores": {
7
- "test": [
8
- {
9
- "accuracy": 0.4396103896103896,
10
- "f1": 0.4142711532114576,
11
- "f1_weighted": 0.4142711532114576,
12
- "hf_subset": "default",
13
- "languages": [
14
- "eng-Latn"
15
- ],
16
- "main_score": 0.4396103896103896,
17
- "scores_per_experiment": [
18
- {
19
- "accuracy": 0.4279220779220779,
20
- "f1": 0.4030476288783657,
21
- "f1_weighted": 0.4030476288783656
22
- },
23
- {
24
- "accuracy": 0.4211038961038961,
25
- "f1": 0.39776168133611584,
26
- "f1_weighted": 0.39776168133611584
27
- },
28
- {
29
- "accuracy": 0.45064935064935063,
30
- "f1": 0.42872843564828145,
31
- "f1_weighted": 0.42872843564828145
32
- },
33
- {
34
- "accuracy": 0.4448051948051948,
35
- "f1": 0.420756828398419,
36
- "f1_weighted": 0.42075682839841905
37
- },
38
- {
39
- "accuracy": 0.44675324675324674,
40
- "f1": 0.42100682221185654,
41
- "f1_weighted": 0.42100682221185654
42
- },
43
- {
44
- "accuracy": 0.45324675324675323,
45
- "f1": 0.4392342490231314,
46
- "f1_weighted": 0.4392342490231314
47
- },
48
- {
49
- "accuracy": 0.437012987012987,
50
- "f1": 0.4056017558988273,
51
- "f1_weighted": 0.40560175589882724
52
- },
53
- {
54
- "accuracy": 0.42337662337662335,
55
- "f1": 0.39123709562594644,
56
- "f1_weighted": 0.39123709562594655
57
- },
58
- {
59
- "accuracy": 0.44512987012987015,
60
- "f1": 0.41578171494860966,
61
- "f1_weighted": 0.41578171494860966
62
- },
63
- {
64
- "accuracy": 0.4461038961038961,
65
- "f1": 0.4195553201450221,
66
- "f1_weighted": 0.419555320145022
67
- }
68
- ]
69
- }
70
- ]
71
- },
72
- "task_name": "Banking77Classification"
73
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/BiorxivClusteringS2S.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "258694dd0231531bc1fd9de6ceb52a0853c6d908",
3
+ "evaluation_time": 6.352599620819092,
4
+ "kg_co2_emissions": null,
5
+ "mteb_version": "1.14.15",
6
+ "scores": {
7
+ "test": [
8
+ {
9
+ "hf_subset": "default",
10
+ "languages": [
11
+ "eng-Latn"
12
+ ],
13
+ "main_score": 0.07745778878625219,
14
+ "v_measure": 0.07745778878625219,
15
+ "v_measure_std": 0.006515604585361752,
16
+ "v_measures": [
17
+ 0.07151497621642194,
18
+ 0.07152886858477273,
19
+ 0.07533936305694591,
20
+ 0.07390923787342664,
21
+ 0.07147679207450276,
22
+ 0.07213600223586297,
23
+ 0.08611746483041241,
24
+ 0.08170353591216682,
25
+ 0.08028322075745065,
26
+ 0.09056842632055917
27
+ ]
28
+ }
29
+ ]
30
+ },
31
+ "task_name": "BiorxivClusteringS2S"
32
+ }
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/COIRCodeSearchNetRetrieval.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "4adc7bc41202b5c13543c9c886a25f340634dab3",
3
+ "evaluation_time": 0.001447916030883789,
4
+ "kg_co2_emissions": null,
5
+ "mteb_version": "1.14.15",
6
+ "scores": {},
7
+ "task_name": "COIRCodeSearchNetRetrieval"
8
+ }
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/CQADupstackProgrammersRetrieval.json DELETED
@@ -1,158 +0,0 @@
1
- {
2
- "dataset_revision": "6184bc1440d2dbc7612be22b50686b8826d22b32",
3
- "evaluation_time": 99.69791841506958,
4
- "kg_co2_emissions": null,
5
- "mteb_version": "1.14.15",
6
- "scores": {
7
- "test": [
8
- {
9
- "hf_subset": "default",
10
- "languages": [
11
- "eng-Latn"
12
- ],
13
- "main_score": 0.0501,
14
- "map_at_1": 0.02467,
15
- "map_at_10": 0.03898,
16
- "map_at_100": 0.04261,
17
- "map_at_1000": 0.04333,
18
- "map_at_20": 0.04068,
19
- "map_at_3": 0.03388,
20
- "map_at_5": 0.03693,
21
- "mrr_at_1": 0.030821917808219176,
22
- "mrr_at_10": 0.04904462926723201,
23
- "mrr_at_100": 0.05339942610218758,
24
- "mrr_at_1000": 0.05413492750157237,
25
- "mrr_at_20": 0.05126402659708249,
26
- "mrr_at_3": 0.04280821917808219,
27
- "mrr_at_5": 0.04634703196347032,
28
- "nauc_map_at_1000_diff1": 0.03644747951501248,
29
- "nauc_map_at_1000_max": 0.2240572170754659,
30
- "nauc_map_at_1000_std": -0.17708810912472517,
31
- "nauc_map_at_100_diff1": 0.03759221625144172,
32
- "nauc_map_at_100_max": 0.22324901446317413,
33
- "nauc_map_at_100_std": -0.17630470695891512,
34
- "nauc_map_at_10_diff1": 0.03906418656483989,
35
- "nauc_map_at_10_max": 0.22061594321968936,
36
- "nauc_map_at_10_std": -0.17777470317814356,
37
- "nauc_map_at_1_diff1": 0.1731091343679673,
38
- "nauc_map_at_1_max": 0.33459947679728974,
39
- "nauc_map_at_1_std": -0.23115450977179597,
40
- "nauc_map_at_20_diff1": 0.03795725531499195,
41
- "nauc_map_at_20_max": 0.22396003211648763,
42
- "nauc_map_at_20_std": -0.17867373725662639,
43
- "nauc_map_at_3_diff1": 0.06042780588964212,
44
- "nauc_map_at_3_max": 0.2486807528974488,
45
- "nauc_map_at_3_std": -0.18512855007450404,
46
- "nauc_map_at_5_diff1": 0.04407217741234605,
47
- "nauc_map_at_5_max": 0.22647048266105405,
48
- "nauc_map_at_5_std": -0.18107585673560017,
49
- "nauc_mrr_at_1000_diff1": 0.033601872249839834,
50
- "nauc_mrr_at_1000_max": 0.2523936325136619,
51
- "nauc_mrr_at_1000_std": -0.19078164353963076,
52
- "nauc_mrr_at_100_diff1": 0.03435870935950355,
53
- "nauc_mrr_at_100_max": 0.2523932973431928,
54
- "nauc_mrr_at_100_std": -0.1900913512193067,
55
- "nauc_mrr_at_10_diff1": 0.03361519179733555,
56
- "nauc_mrr_at_10_max": 0.25392922716866984,
57
- "nauc_mrr_at_10_std": -0.1935061134919541,
58
- "nauc_mrr_at_1_diff1": 0.1772995319079407,
59
- "nauc_mrr_at_1_max": 0.35182174117717013,
60
- "nauc_mrr_at_1_std": -0.24426280067522707,
61
- "nauc_mrr_at_20_diff1": 0.03479828151019169,
62
- "nauc_mrr_at_20_max": 0.25624951214228564,
63
- "nauc_mrr_at_20_std": -0.19212268093923462,
64
- "nauc_mrr_at_3_diff1": 0.06173430027850725,
65
- "nauc_mrr_at_3_max": 0.26889485727748363,
66
- "nauc_mrr_at_3_std": -0.19153801111553947,
67
- "nauc_mrr_at_5_diff1": 0.036743759763164886,
68
- "nauc_mrr_at_5_max": 0.253857849052297,
69
- "nauc_mrr_at_5_std": -0.19604549670316734,
70
- "nauc_ndcg_at_1000_diff1": -0.010372586628261796,
71
- "nauc_ndcg_at_1000_max": 0.20925878430027478,
72
- "nauc_ndcg_at_1000_std": -0.1717044268161809,
73
- "nauc_ndcg_at_100_diff1": 0.0023309149151885546,
74
- "nauc_ndcg_at_100_max": 0.20125970115134734,
75
- "nauc_ndcg_at_100_std": -0.15865628929382014,
76
- "nauc_ndcg_at_10_diff1": 0.0026192804576363727,
77
- "nauc_ndcg_at_10_max": 0.19884193622357532,
78
- "nauc_ndcg_at_10_std": -0.16919003671988075,
79
- "nauc_ndcg_at_1_diff1": 0.1772995319079407,
80
- "nauc_ndcg_at_1_max": 0.35182174117717013,
81
- "nauc_ndcg_at_1_std": -0.24426280067522707,
82
- "nauc_ndcg_at_20_diff1": 0.0031543394811079034,
83
- "nauc_ndcg_at_20_max": 0.20925361343315524,
84
- "nauc_ndcg_at_20_std": -0.17106125631597793,
85
- "nauc_ndcg_at_3_diff1": 0.03670154146101528,
86
- "nauc_ndcg_at_3_max": 0.23212930749840155,
87
- "nauc_ndcg_at_3_std": -0.1728371812831961,
88
- "nauc_ndcg_at_5_diff1": 0.0107566708693031,
89
- "nauc_ndcg_at_5_max": 0.20474332948099355,
90
- "nauc_ndcg_at_5_std": -0.1734952739301359,
91
- "nauc_precision_at_1000_diff1": -0.07195606207962846,
92
- "nauc_precision_at_1000_max": 0.2542912736794115,
93
- "nauc_precision_at_1000_std": -0.1881459402790264,
94
- "nauc_precision_at_100_diff1": -0.04518222914182943,
95
- "nauc_precision_at_100_max": 0.22138981394024387,
96
- "nauc_precision_at_100_std": -0.13384472263037697,
97
- "nauc_precision_at_10_diff1": -0.052513811685878764,
98
- "nauc_precision_at_10_max": 0.18962064467698705,
99
- "nauc_precision_at_10_std": -0.14827004787357115,
100
- "nauc_precision_at_1_diff1": 0.1772995319079407,
101
- "nauc_precision_at_1_max": 0.35182174117717013,
102
- "nauc_precision_at_1_std": -0.24426280067522707,
103
- "nauc_precision_at_20_diff1": -0.040789324913047875,
104
- "nauc_precision_at_20_max": 0.22086458009752882,
105
- "nauc_precision_at_20_std": -0.14430508663959002,
106
- "nauc_precision_at_3_diff1": -0.013044619440245884,
107
- "nauc_precision_at_3_max": 0.21285488271783465,
108
- "nauc_precision_at_3_std": -0.1483164417030193,
109
- "nauc_precision_at_5_diff1": -0.05113181393685194,
110
- "nauc_precision_at_5_max": 0.1756649379589832,
111
- "nauc_precision_at_5_std": -0.15632134056178232,
112
- "nauc_recall_at_1000_diff1": -0.047075752528689695,
113
- "nauc_recall_at_1000_max": 0.16414155669676642,
114
- "nauc_recall_at_1000_std": -0.1513320281746568,
115
- "nauc_recall_at_100_diff1": -0.023004658252697183,
116
- "nauc_recall_at_100_max": 0.14861973646512244,
117
- "nauc_recall_at_100_std": -0.12240747671934184,
118
- "nauc_recall_at_10_diff1": -0.051375323084735164,
119
- "nauc_recall_at_10_max": 0.1384336247044034,
120
- "nauc_recall_at_10_std": -0.14737738059263306,
121
- "nauc_recall_at_1_diff1": 0.1731091343679673,
122
- "nauc_recall_at_1_max": 0.33459947679728974,
123
- "nauc_recall_at_1_std": -0.23115450977179597,
124
- "nauc_recall_at_20_diff1": -0.03578815918976938,
125
- "nauc_recall_at_20_max": 0.16386688869593355,
126
- "nauc_recall_at_20_std": -0.1528456365862212,
127
- "nauc_recall_at_3_diff1": -0.021696811828998432,
128
- "nauc_recall_at_3_max": 0.1864107664448688,
129
- "nauc_recall_at_3_std": -0.14586036842324565,
130
- "nauc_recall_at_5_diff1": -0.0538517948884412,
131
- "nauc_recall_at_5_max": 0.1453135254521713,
132
- "nauc_recall_at_5_std": -0.1531619473747777,
133
- "ndcg_at_1": 0.03082,
134
- "ndcg_at_10": 0.0501,
135
- "ndcg_at_100": 0.07072,
136
- "ndcg_at_1000": 0.09327,
137
- "ndcg_at_20": 0.05662,
138
- "ndcg_at_3": 0.03989,
139
- "ndcg_at_5": 0.04484,
140
- "precision_at_1": 0.03082,
141
- "precision_at_10": 0.00993,
142
- "precision_at_100": 0.00241,
143
- "precision_at_1000": 0.00052,
144
- "precision_at_20": 0.00685,
145
- "precision_at_3": 0.02017,
146
- "precision_at_5": 0.0153,
147
- "recall_at_1": 0.02467,
148
- "recall_at_10": 0.07499,
149
- "recall_at_100": 0.16969,
150
- "recall_at_1000": 0.33718,
151
- "recall_at_20": 0.09901,
152
- "recall_at_3": 0.04648,
153
- "recall_at_5": 0.05869
154
- }
155
- ]
156
- },
157
- "task_name": "CQADupstackProgrammersRetrieval"
158
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/CodeFeedbackMT.json ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "b0f12fa0c0dd67f59c95a5c33d02aeeb4c398c5f",
3
+ "evaluation_time": 86.56418371200562,
4
+ "kg_co2_emissions": null,
5
+ "mteb_version": "1.14.15",
6
+ "scores": {
7
+ "test": [
8
+ {
9
+ "hf_subset": "default",
10
+ "languages": [
11
+ "eng-Latn"
12
+ ],
13
+ "main_score": 0.15938,
14
+ "map_at_1": 0.12171,
15
+ "map_at_10": 0.14602,
16
+ "map_at_100": 0.14933,
17
+ "map_at_1000": 0.14984,
18
+ "map_at_20": 0.14772,
19
+ "map_at_3": 0.13875,
20
+ "map_at_5": 0.14277,
21
+ "mrr_at_1": 0.12171424267530316,
22
+ "mrr_at_10": 0.14602286924159336,
23
+ "mrr_at_100": 0.14933480468311353,
24
+ "mrr_at_1000": 0.14984386023850896,
25
+ "mrr_at_20": 0.14771560552079413,
26
+ "mrr_at_3": 0.1387487133137506,
27
+ "mrr_at_5": 0.14277070623383797,
28
+ "nauc_map_at_1000_diff1": 0.49472962132811066,
29
+ "nauc_map_at_1000_max": 0.1288096788830561,
30
+ "nauc_map_at_1000_std": 0.09402130699097373,
31
+ "nauc_map_at_100_diff1": 0.49521525169736585,
32
+ "nauc_map_at_100_max": 0.12889826157883777,
33
+ "nauc_map_at_100_std": 0.09387665760881421,
34
+ "nauc_map_at_10_diff1": 0.5007792220812604,
35
+ "nauc_map_at_10_max": 0.13096854092843976,
36
+ "nauc_map_at_10_std": 0.09297542921420311,
37
+ "nauc_map_at_1_diff1": 0.576490513877843,
38
+ "nauc_map_at_1_max": 0.155866059169816,
39
+ "nauc_map_at_1_std": 0.09440442396510458,
40
+ "nauc_map_at_20_diff1": 0.4980762076056278,
41
+ "nauc_map_at_20_max": 0.12991844827572516,
42
+ "nauc_map_at_20_std": 0.09346830652976015,
43
+ "nauc_map_at_3_diff1": 0.5220433951797554,
44
+ "nauc_map_at_3_max": 0.1391271534357672,
45
+ "nauc_map_at_3_std": 0.09400942293158544,
46
+ "nauc_map_at_5_diff1": 0.5107250849461592,
47
+ "nauc_map_at_5_max": 0.13530094789210456,
48
+ "nauc_map_at_5_std": 0.09342049003345741,
49
+ "nauc_mrr_at_1000_diff1": 0.49472962954673433,
50
+ "nauc_mrr_at_1000_max": 0.12880968585736355,
51
+ "nauc_mrr_at_1000_std": 0.0940213068870858,
52
+ "nauc_mrr_at_100_diff1": 0.49521525169736585,
53
+ "nauc_mrr_at_100_max": 0.12889826157883777,
54
+ "nauc_mrr_at_100_std": 0.09387665760881421,
55
+ "nauc_mrr_at_10_diff1": 0.5007792220812604,
56
+ "nauc_mrr_at_10_max": 0.13096854092843976,
57
+ "nauc_mrr_at_10_std": 0.09297542921420311,
58
+ "nauc_mrr_at_1_diff1": 0.576490513877843,
59
+ "nauc_mrr_at_1_max": 0.155866059169816,
60
+ "nauc_mrr_at_1_std": 0.09440442396510458,
61
+ "nauc_mrr_at_20_diff1": 0.4980762076056278,
62
+ "nauc_mrr_at_20_max": 0.12991844827572516,
63
+ "nauc_mrr_at_20_std": 0.09346830652976015,
64
+ "nauc_mrr_at_3_diff1": 0.5220433951797554,
65
+ "nauc_mrr_at_3_max": 0.1391271534357672,
66
+ "nauc_mrr_at_3_std": 0.09400942293158544,
67
+ "nauc_mrr_at_5_diff1": 0.5107250849461592,
68
+ "nauc_mrr_at_5_max": 0.13530094789210456,
69
+ "nauc_mrr_at_5_std": 0.09342049003345741,
70
+ "nauc_ndcg_at_1000_diff1": 0.42556848285754595,
71
+ "nauc_ndcg_at_1000_max": 0.1074330906576106,
72
+ "nauc_ndcg_at_1000_std": 0.09931415214354576,
73
+ "nauc_ndcg_at_100_diff1": 0.4389633172139021,
74
+ "nauc_ndcg_at_100_max": 0.10912358012253182,
75
+ "nauc_ndcg_at_100_std": 0.09591996585185938,
76
+ "nauc_ndcg_at_10_diff1": 0.4656271351032459,
77
+ "nauc_ndcg_at_10_max": 0.11811051132398084,
78
+ "nauc_ndcg_at_10_std": 0.09195643910816585,
79
+ "nauc_ndcg_at_1_diff1": 0.576490513877843,
80
+ "nauc_ndcg_at_1_max": 0.155866059169816,
81
+ "nauc_ndcg_at_1_std": 0.09440442396510458,
82
+ "nauc_ndcg_at_20_diff1": 0.45697106335736143,
83
+ "nauc_ndcg_at_20_max": 0.115023380566875,
84
+ "nauc_ndcg_at_20_std": 0.09369132873791501,
85
+ "nauc_ndcg_at_3_diff1": 0.5061759461194467,
86
+ "nauc_ndcg_at_3_max": 0.13434966943537516,
87
+ "nauc_ndcg_at_3_std": 0.09382725647213368,
88
+ "nauc_ndcg_at_5_diff1": 0.48712512841939637,
89
+ "nauc_ndcg_at_5_max": 0.12776188612692832,
90
+ "nauc_ndcg_at_5_std": 0.09280417774911971,
91
+ "nauc_precision_at_1000_diff1": 0.22171401911333807,
92
+ "nauc_precision_at_1000_max": 0.05180228755438657,
93
+ "nauc_precision_at_1000_std": 0.121478173960711,
94
+ "nauc_precision_at_100_diff1": 0.2930513840339096,
95
+ "nauc_precision_at_100_max": 0.058457996208423325,
96
+ "nauc_precision_at_100_std": 0.10329586184541412,
97
+ "nauc_precision_at_10_diff1": 0.37748222270492887,
98
+ "nauc_precision_at_10_max": 0.08516307019678841,
99
+ "nauc_precision_at_10_std": 0.08936548083478481,
100
+ "nauc_precision_at_1_diff1": 0.576490513877843,
101
+ "nauc_precision_at_1_max": 0.155866059169816,
102
+ "nauc_precision_at_1_std": 0.09440442396510458,
103
+ "nauc_precision_at_20_diff1": 0.35370118406718887,
104
+ "nauc_precision_at_20_max": 0.07720501737285508,
105
+ "nauc_precision_at_20_std": 0.09512670518828382,
106
+ "nauc_precision_at_3_diff1": 0.4648455680777127,
107
+ "nauc_precision_at_3_max": 0.12193379632419739,
108
+ "nauc_precision_at_3_std": 0.09333400762182767,
109
+ "nauc_precision_at_5_diff1": 0.42689240448557475,
110
+ "nauc_precision_at_5_max": 0.10840841308271118,
111
+ "nauc_precision_at_5_std": 0.09114478125877269,
112
+ "nauc_recall_at_1000_diff1": 0.22171401911333835,
113
+ "nauc_recall_at_1000_max": 0.05180228755438666,
114
+ "nauc_recall_at_1000_std": 0.12147817396071107,
115
+ "nauc_recall_at_100_diff1": 0.2930513840339097,
116
+ "nauc_recall_at_100_max": 0.05845799620842323,
117
+ "nauc_recall_at_100_std": 0.103295861845414,
118
+ "nauc_recall_at_10_diff1": 0.37748222270492904,
119
+ "nauc_recall_at_10_max": 0.08516307019678845,
120
+ "nauc_recall_at_10_std": 0.0893654808347849,
121
+ "nauc_recall_at_1_diff1": 0.576490513877843,
122
+ "nauc_recall_at_1_max": 0.155866059169816,
123
+ "nauc_recall_at_1_std": 0.09440442396510458,
124
+ "nauc_recall_at_20_diff1": 0.353701184067189,
125
+ "nauc_recall_at_20_max": 0.07720501737285505,
126
+ "nauc_recall_at_20_std": 0.09512670518828369,
127
+ "nauc_recall_at_3_diff1": 0.46484556807771255,
128
+ "nauc_recall_at_3_max": 0.12193379632419747,
129
+ "nauc_recall_at_3_std": 0.09333400762182767,
130
+ "nauc_recall_at_5_diff1": 0.4268924044855751,
131
+ "nauc_recall_at_5_max": 0.10840841308271101,
132
+ "nauc_recall_at_5_std": 0.09114478125877268,
133
+ "ndcg_at_1": 0.12171,
134
+ "ndcg_at_10": 0.15938,
135
+ "ndcg_at_100": 0.17773,
136
+ "ndcg_at_1000": 0.19422,
137
+ "ndcg_at_20": 0.16545,
138
+ "ndcg_at_3": 0.14423,
139
+ "ndcg_at_5": 0.1515,
140
+ "precision_at_1": 0.12171,
141
+ "precision_at_10": 0.02022,
142
+ "precision_at_100": 0.00293,
143
+ "precision_at_1000": 0.00043,
144
+ "precision_at_20": 0.0113,
145
+ "precision_at_3": 0.05335,
146
+ "precision_at_5": 0.03555,
147
+ "recall_at_1": 0.12171,
148
+ "recall_at_10": 0.20215,
149
+ "recall_at_100": 0.29329,
150
+ "recall_at_1000": 0.42977,
151
+ "recall_at_20": 0.22595,
152
+ "recall_at_3": 0.16005,
153
+ "recall_at_5": 0.17775
154
+ }
155
+ ]
156
+ },
157
+ "task_name": "CodeFeedbackMT"
158
+ }
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/CodeSearchNetCCRetrieval.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "6e1effa2c03723c5fde48ee912b5ee08d4f211e8",
3
+ "evaluation_time": 0.0003421306610107422,
4
+ "kg_co2_emissions": null,
5
+ "mteb_version": "1.14.15",
6
+ "scores": {},
7
+ "task_name": "CodeSearchNetCCRetrieval"
8
+ }
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/CodeTransOceanContest.json ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "20da4eb20a4b17300c0986ee148c90867a7f2a4d",
3
+ "evaluation_time": 0.8471865653991699,
4
+ "kg_co2_emissions": null,
5
+ "mteb_version": "1.14.15",
6
+ "scores": {
7
+ "test": [
8
+ {
9
+ "hf_subset": "default",
10
+ "languages": [
11
+ "python-Code",
12
+ "c++-Code"
13
+ ],
14
+ "main_score": 0.09511,
15
+ "map_at_1": 0.06787,
16
+ "map_at_10": 0.08418,
17
+ "map_at_100": 0.08966,
18
+ "map_at_1000": 0.09203,
19
+ "map_at_20": 0.08539,
20
+ "map_at_3": 0.07994,
21
+ "map_at_5": 0.08107,
22
+ "mrr_at_1": 0.06787330316742081,
23
+ "mrr_at_10": 0.08418085182791066,
24
+ "mrr_at_100": 0.08966102694364947,
25
+ "mrr_at_1000": 0.09203289825977753,
26
+ "mrr_at_20": 0.08539287509875745,
27
+ "mrr_at_3": 0.0799396681749623,
28
+ "mrr_at_5": 0.08107088989441931,
29
+ "nauc_map_at_1000_diff1": 0.4002206493865976,
30
+ "nauc_map_at_1000_max": 0.05565781891778103,
31
+ "nauc_map_at_1000_std": -0.004874041232219024,
32
+ "nauc_map_at_100_diff1": 0.39935983795466523,
33
+ "nauc_map_at_100_max": 0.05300217863316174,
34
+ "nauc_map_at_100_std": -0.003035576698055301,
35
+ "nauc_map_at_10_diff1": 0.41045827644014476,
36
+ "nauc_map_at_10_max": 0.06092815179963274,
37
+ "nauc_map_at_10_std": -0.005952146225472054,
38
+ "nauc_map_at_1_diff1": 0.5123648006404755,
39
+ "nauc_map_at_1_max": 0.06018186942687983,
40
+ "nauc_map_at_1_std": 0.003062256335957277,
41
+ "nauc_map_at_20_diff1": 0.41026580715260286,
42
+ "nauc_map_at_20_max": 0.05961500732393842,
43
+ "nauc_map_at_20_std": -0.008812971012975637,
44
+ "nauc_map_at_3_diff1": 0.43328416235104994,
45
+ "nauc_map_at_3_max": 0.061612024091789064,
46
+ "nauc_map_at_3_std": 0.0009811686045742218,
47
+ "nauc_map_at_5_diff1": 0.42638117055302016,
48
+ "nauc_map_at_5_max": 0.062409134446330596,
49
+ "nauc_map_at_5_std": -3.141760809509476e-05,
50
+ "nauc_mrr_at_1000_diff1": 0.4002206493865976,
51
+ "nauc_mrr_at_1000_max": 0.05565781891778103,
52
+ "nauc_mrr_at_1000_std": -0.004874041232219024,
53
+ "nauc_mrr_at_100_diff1": 0.39935983795466523,
54
+ "nauc_mrr_at_100_max": 0.05300217863316174,
55
+ "nauc_mrr_at_100_std": -0.003035576698055301,
56
+ "nauc_mrr_at_10_diff1": 0.41045827644014476,
57
+ "nauc_mrr_at_10_max": 0.06092815179963274,
58
+ "nauc_mrr_at_10_std": -0.005952146225472054,
59
+ "nauc_mrr_at_1_diff1": 0.5123648006404755,
60
+ "nauc_mrr_at_1_max": 0.06018186942687983,
61
+ "nauc_mrr_at_1_std": 0.003062256335957277,
62
+ "nauc_mrr_at_20_diff1": 0.41026580715260286,
63
+ "nauc_mrr_at_20_max": 0.05961500732393842,
64
+ "nauc_mrr_at_20_std": -0.008812971012975637,
65
+ "nauc_mrr_at_3_diff1": 0.43328416235104994,
66
+ "nauc_mrr_at_3_max": 0.061612024091789064,
67
+ "nauc_mrr_at_3_std": 0.0009811686045742218,
68
+ "nauc_mrr_at_5_diff1": 0.42638117055302016,
69
+ "nauc_mrr_at_5_max": 0.062409134446330596,
70
+ "nauc_mrr_at_5_std": -3.141760809509476e-05,
71
+ "nauc_ndcg_at_1000_diff1": 0.3401849332107565,
72
+ "nauc_ndcg_at_1000_max": 0.05887650595047429,
73
+ "nauc_ndcg_at_1000_std": 0.004274830251501765,
74
+ "nauc_ndcg_at_100_diff1": 0.3017142674492828,
75
+ "nauc_ndcg_at_100_max": 0.01657746093566299,
76
+ "nauc_ndcg_at_100_std": 0.020445323924594527,
77
+ "nauc_ndcg_at_10_diff1": 0.3606925243087163,
78
+ "nauc_ndcg_at_10_max": 0.05993698215407892,
79
+ "nauc_ndcg_at_10_std": -0.012383471019315629,
80
+ "nauc_ndcg_at_1_diff1": 0.5123648006404755,
81
+ "nauc_ndcg_at_1_max": 0.06018186942687983,
82
+ "nauc_ndcg_at_1_std": 0.003062256335957277,
83
+ "nauc_ndcg_at_20_diff1": 0.3627658572653584,
84
+ "nauc_ndcg_at_20_max": 0.05503924863968874,
85
+ "nauc_ndcg_at_20_std": -0.022353744095367632,
86
+ "nauc_ndcg_at_3_diff1": 0.40774589816759704,
87
+ "nauc_ndcg_at_3_max": 0.06078295183380332,
88
+ "nauc_ndcg_at_3_std": 0.002631991326812176,
89
+ "nauc_ndcg_at_5_diff1": 0.39699453568762005,
90
+ "nauc_ndcg_at_5_max": 0.06208096521525048,
91
+ "nauc_ndcg_at_5_std": 0.0009741567889838872,
92
+ "nauc_precision_at_1000_diff1": -0.332421505946305,
93
+ "nauc_precision_at_1000_max": 1.0,
94
+ "nauc_precision_at_1000_std": 0.9564489112227755,
95
+ "nauc_precision_at_100_diff1": 0.12129385857557387,
96
+ "nauc_precision_at_100_max": -0.0634555570739123,
97
+ "nauc_precision_at_100_std": 0.08437119311025783,
98
+ "nauc_precision_at_10_diff1": 0.2477538993229102,
99
+ "nauc_precision_at_10_max": 0.058120653790512844,
100
+ "nauc_precision_at_10_std": -0.028666404671314694,
101
+ "nauc_precision_at_1_diff1": 0.5123648006404755,
102
+ "nauc_precision_at_1_max": 0.06018186942687983,
103
+ "nauc_precision_at_1_std": 0.003062256335957277,
104
+ "nauc_precision_at_20_diff1": 0.2655368456031618,
105
+ "nauc_precision_at_20_max": 0.04343249021784076,
106
+ "nauc_precision_at_20_std": -0.05672812486089926,
107
+ "nauc_precision_at_3_diff1": 0.3471986913496342,
108
+ "nauc_precision_at_3_max": 0.05854588807862574,
109
+ "nauc_precision_at_3_std": 0.007034303620076266,
110
+ "nauc_precision_at_5_diff1": 0.3279845328741994,
111
+ "nauc_precision_at_5_max": 0.06114433941272132,
112
+ "nauc_precision_at_5_std": 0.003670428141042012,
113
+ "nauc_recall_at_1000_diff1": -0.33242150594628805,
114
+ "nauc_recall_at_1000_max": 1.0,
115
+ "nauc_recall_at_1000_std": 0.9564489112227793,
116
+ "nauc_recall_at_100_diff1": 0.1212938585755736,
117
+ "nauc_recall_at_100_max": -0.06345555707391222,
118
+ "nauc_recall_at_100_std": 0.08437119311025772,
119
+ "nauc_recall_at_10_diff1": 0.24775389932291023,
120
+ "nauc_recall_at_10_max": 0.05812065379051293,
121
+ "nauc_recall_at_10_std": -0.02866640467131462,
122
+ "nauc_recall_at_1_diff1": 0.5123648006404755,
123
+ "nauc_recall_at_1_max": 0.06018186942687983,
124
+ "nauc_recall_at_1_std": 0.003062256335957277,
125
+ "nauc_recall_at_20_diff1": 0.2655368456031617,
126
+ "nauc_recall_at_20_max": 0.04343249021784082,
127
+ "nauc_recall_at_20_std": -0.05672812486089907,
128
+ "nauc_recall_at_3_diff1": 0.34719869134963405,
129
+ "nauc_recall_at_3_max": 0.058545888078625666,
130
+ "nauc_recall_at_3_std": 0.007034303620076171,
131
+ "nauc_recall_at_5_diff1": 0.3279845328741993,
132
+ "nauc_recall_at_5_max": 0.06114433941272142,
133
+ "nauc_recall_at_5_std": 0.0036704281410421266,
134
+ "ndcg_at_1": 0.06787,
135
+ "ndcg_at_10": 0.09511,
136
+ "ndcg_at_100": 0.13674,
137
+ "ndcg_at_1000": 0.21258,
138
+ "ndcg_at_20": 0.09964,
139
+ "ndcg_at_3": 0.0849,
140
+ "ndcg_at_5": 0.08684,
141
+ "precision_at_1": 0.06787,
142
+ "precision_at_10": 0.01312,
143
+ "precision_at_100": 0.00362,
144
+ "precision_at_1000": 0.00099,
145
+ "precision_at_20": 0.00747,
146
+ "precision_at_3": 0.03318,
147
+ "precision_at_5": 0.02081,
148
+ "recall_at_1": 0.06787,
149
+ "recall_at_10": 0.13122,
150
+ "recall_at_100": 0.36199,
151
+ "recall_at_1000": 0.98643,
152
+ "recall_at_20": 0.14932,
153
+ "recall_at_3": 0.09955,
154
+ "recall_at_5": 0.10407
155
+ }
156
+ ]
157
+ },
158
+ "task_name": "CodeTransOceanContest"
159
+ }
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/CodeTransOceanDL.json ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "281562cb8a1265ab5c0824bfa6ddcd9b0a15618f",
3
+ "evaluation_time": 0.3599967956542969,
4
+ "kg_co2_emissions": null,
5
+ "mteb_version": "1.14.15",
6
+ "scores": {
7
+ "test": [
8
+ {
9
+ "hf_subset": "default",
10
+ "languages": [
11
+ "python-Code"
12
+ ],
13
+ "main_score": 0.27797,
14
+ "map_at_1": 0.06667,
15
+ "map_at_10": 0.16857,
16
+ "map_at_100": 0.18958,
17
+ "map_at_1000": 0.18973,
18
+ "map_at_20": 0.18736,
19
+ "map_at_3": 0.08704,
20
+ "map_at_5": 0.12065,
21
+ "mrr_at_1": 0.022222222222222223,
22
+ "mrr_at_10": 0.14406305114638449,
23
+ "mrr_at_100": 0.16441129312841948,
24
+ "mrr_at_1000": 0.1645598395211377,
25
+ "mrr_at_20": 0.16211789469464907,
26
+ "mrr_at_3": 0.057407407407407414,
27
+ "mrr_at_5": 0.09407407407407407,
28
+ "nauc_map_at_1000_diff1": 0.02784086538826234,
29
+ "nauc_map_at_1000_max": -0.287573307732991,
30
+ "nauc_map_at_1000_std": -0.034745406422382066,
31
+ "nauc_map_at_100_diff1": 0.027731715108106954,
32
+ "nauc_map_at_100_max": -0.28771336576146495,
33
+ "nauc_map_at_100_std": -0.03383814176825187,
34
+ "nauc_map_at_10_diff1": 0.03945986957137019,
35
+ "nauc_map_at_10_max": -0.25226936866501254,
36
+ "nauc_map_at_10_std": -0.03190349240293486,
37
+ "nauc_map_at_1_diff1": -0.0031079509882836883,
38
+ "nauc_map_at_1_max": -0.38109292549861384,
39
+ "nauc_map_at_1_std": 0.01117967981397013,
40
+ "nauc_map_at_20_diff1": 0.02588133080683189,
41
+ "nauc_map_at_20_max": -0.2867971359129569,
42
+ "nauc_map_at_20_std": -0.04076734332555616,
43
+ "nauc_map_at_3_diff1": -0.041422641218933104,
44
+ "nauc_map_at_3_max": -0.3935239742048571,
45
+ "nauc_map_at_3_std": -0.016444699737666612,
46
+ "nauc_map_at_5_diff1": 0.09205138060696524,
47
+ "nauc_map_at_5_max": -0.33277332112682373,
48
+ "nauc_map_at_5_std": -0.03392255678772473,
49
+ "nauc_mrr_at_1000_diff1": -0.28331752487610157,
50
+ "nauc_mrr_at_1000_max": -0.2111897323809926,
51
+ "nauc_mrr_at_1000_std": -0.16180758984470822,
52
+ "nauc_mrr_at_100_diff1": -0.2830509416012681,
53
+ "nauc_mrr_at_100_max": -0.21149355358382807,
54
+ "nauc_mrr_at_100_std": -0.16037144078976506,
55
+ "nauc_mrr_at_10_diff1": -0.24908906607383133,
56
+ "nauc_mrr_at_10_max": -0.16222471422585077,
57
+ "nauc_mrr_at_10_std": -0.13732552303818502,
58
+ "nauc_mrr_at_1_diff1": -0.5747249798765764,
59
+ "nauc_mrr_at_1_max": -0.4543198282801182,
60
+ "nauc_mrr_at_1_std": -0.35008049369466065,
61
+ "nauc_mrr_at_20_diff1": -0.2788790690994075,
62
+ "nauc_mrr_at_20_max": -0.21245333313324236,
63
+ "nauc_mrr_at_20_std": -0.16798426695097868,
64
+ "nauc_mrr_at_3_diff1": -0.2978755408478371,
65
+ "nauc_mrr_at_3_max": -0.3738950777316688,
66
+ "nauc_mrr_at_3_std": -0.16400629993717764,
67
+ "nauc_mrr_at_5_diff1": -0.2553974200779292,
68
+ "nauc_mrr_at_5_max": -0.2566333148288954,
69
+ "nauc_mrr_at_5_std": -0.1662715837606456,
70
+ "nauc_ndcg_at_1000_diff1": 0.018095130261789123,
71
+ "nauc_ndcg_at_1000_max": -0.25210611817093725,
72
+ "nauc_ndcg_at_1000_std": -0.045639669938462205,
73
+ "nauc_ndcg_at_100_diff1": 0.014294076823156266,
74
+ "nauc_ndcg_at_100_max": -0.2523941368276548,
75
+ "nauc_ndcg_at_100_std": -0.024740265353583573,
76
+ "nauc_ndcg_at_10_diff1": 0.028517877606712184,
77
+ "nauc_ndcg_at_10_max": -0.1379350447346928,
78
+ "nauc_ndcg_at_10_std": -0.04890416556969064,
79
+ "nauc_ndcg_at_1_diff1": -0.0031079509882836883,
80
+ "nauc_ndcg_at_1_max": -0.38109292549861384,
81
+ "nauc_ndcg_at_1_std": 0.01117967981397013,
82
+ "nauc_ndcg_at_20_diff1": -0.01798223055051044,
83
+ "nauc_ndcg_at_20_max": -0.21587479592623202,
84
+ "nauc_ndcg_at_20_std": -0.08674791082336787,
85
+ "nauc_ndcg_at_3_diff1": -0.055225744089572794,
86
+ "nauc_ndcg_at_3_max": -0.3980023359780902,
87
+ "nauc_ndcg_at_3_std": -0.026396552418542944,
88
+ "nauc_ndcg_at_5_diff1": 0.1484750076478242,
89
+ "nauc_ndcg_at_5_max": -0.3149749102906245,
90
+ "nauc_ndcg_at_5_std": -0.0507138930089742,
91
+ "nauc_precision_at_1000_diff1": 1.0,
92
+ "nauc_precision_at_1000_max": 1.0,
93
+ "nauc_precision_at_1000_std": 1.0,
94
+ "nauc_precision_at_100_diff1": -0.17098506069093805,
95
+ "nauc_precision_at_100_max": -0.22292250233425662,
96
+ "nauc_precision_at_100_std": 0.8978758169934575,
97
+ "nauc_precision_at_10_diff1": -0.017575447383332443,
98
+ "nauc_precision_at_10_max": 0.1212641892262422,
99
+ "nauc_precision_at_10_std": -0.08237519878626094,
100
+ "nauc_precision_at_1_diff1": -0.0031079509882836883,
101
+ "nauc_precision_at_1_max": -0.38109292549861384,
102
+ "nauc_precision_at_1_std": 0.01117967981397013,
103
+ "nauc_precision_at_20_diff1": -0.3151420415930391,
104
+ "nauc_precision_at_20_max": 0.12029359793394034,
105
+ "nauc_precision_at_20_std": -0.33358252911059727,
106
+ "nauc_precision_at_3_diff1": -0.08457460873068837,
107
+ "nauc_precision_at_3_max": -0.4075244582667053,
108
+ "nauc_precision_at_3_std": -0.04755669155513028,
109
+ "nauc_precision_at_5_diff1": 0.2490820314035684,
110
+ "nauc_precision_at_5_max": -0.2860130677842162,
111
+ "nauc_precision_at_5_std": -0.07902785749667773,
112
+ "nauc_recall_at_1000_diff1": NaN,
113
+ "nauc_recall_at_1000_max": NaN,
114
+ "nauc_recall_at_1000_std": NaN,
115
+ "nauc_recall_at_100_diff1": -0.1709850606909425,
116
+ "nauc_recall_at_100_max": -0.2229225023342648,
117
+ "nauc_recall_at_100_std": 0.8978758169934654,
118
+ "nauc_recall_at_10_diff1": -0.017575447383333363,
119
+ "nauc_recall_at_10_max": 0.12126418922624167,
120
+ "nauc_recall_at_10_std": -0.08237519878626104,
121
+ "nauc_recall_at_1_diff1": -0.0031079509882836883,
122
+ "nauc_recall_at_1_max": -0.38109292549861384,
123
+ "nauc_recall_at_1_std": 0.01117967981397013,
124
+ "nauc_recall_at_20_diff1": -0.3151420415930406,
125
+ "nauc_recall_at_20_max": 0.12029359793393885,
126
+ "nauc_recall_at_20_std": -0.33358252911059794,
127
+ "nauc_recall_at_3_diff1": -0.08457460873068838,
128
+ "nauc_recall_at_3_max": -0.4075244582667053,
129
+ "nauc_recall_at_3_std": -0.047556691555130225,
130
+ "nauc_recall_at_5_diff1": 0.24908203140356805,
131
+ "nauc_recall_at_5_max": -0.2860130677842164,
132
+ "nauc_recall_at_5_std": -0.07902785749667794,
133
+ "ndcg_at_1": 0.06667,
134
+ "ndcg_at_10": 0.27797,
135
+ "ndcg_at_100": 0.35629,
136
+ "ndcg_at_1000": 0.35936,
137
+ "ndcg_at_20": 0.33924,
138
+ "ndcg_at_3": 0.09722,
139
+ "ndcg_at_5": 0.15724,
140
+ "precision_at_1": 0.06667,
141
+ "precision_at_10": 0.06556,
142
+ "precision_at_100": 0.00978,
143
+ "precision_at_1000": 0.001,
144
+ "precision_at_20": 0.04417,
145
+ "precision_at_3": 0.04259,
146
+ "precision_at_5": 0.05444,
147
+ "recall_at_1": 0.06667,
148
+ "recall_at_10": 0.65556,
149
+ "recall_at_100": 0.97778,
150
+ "recall_at_1000": 1.0,
151
+ "recall_at_20": 0.88333,
152
+ "recall_at_3": 0.12778,
153
+ "recall_at_5": 0.27222
154
+ }
155
+ ]
156
+ },
157
+ "task_name": "CodeTransOceanDL"
158
+ }
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/CosQA.json ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "bc5efb7e9d437246ce393ed19d772e08e4a79535",
3
+ "evaluation_time": 20.75157332420349,
4
+ "kg_co2_emissions": null,
5
+ "mteb_version": "1.14.15",
6
+ "scores": {
7
+ "test": [
8
+ {
9
+ "hf_subset": "default",
10
+ "languages": [
11
+ "eng-Latn",
12
+ "python-Code"
13
+ ],
14
+ "main_score": 0.00971,
15
+ "map_at_1": 0.004,
16
+ "map_at_10": 0.00722,
17
+ "map_at_100": 0.01022,
18
+ "map_at_1000": 0.01074,
19
+ "map_at_20": 0.00905,
20
+ "map_at_3": 0.005,
21
+ "map_at_5": 0.0065,
22
+ "mrr_at_1": 0.002,
23
+ "mrr_at_10": 0.004341269841269841,
24
+ "mrr_at_100": 0.007548299135165892,
25
+ "mrr_at_1000": 0.008049580536282804,
26
+ "mrr_at_20": 0.0062800911077226865,
27
+ "mrr_at_3": 0.003,
28
+ "mrr_at_5": 0.003,
29
+ "nauc_map_at_1000_diff1": -0.3294146795633615,
30
+ "nauc_map_at_1000_max": -0.24406048510714173,
31
+ "nauc_map_at_1000_std": -0.1753914769148377,
32
+ "nauc_map_at_100_diff1": -0.3401799456138604,
33
+ "nauc_map_at_100_max": -0.25047032754654014,
34
+ "nauc_map_at_100_std": -0.17739627719079182,
35
+ "nauc_map_at_10_diff1": -0.4378248126973643,
36
+ "nauc_map_at_10_max": -0.30162019359765524,
37
+ "nauc_map_at_10_std": -0.21082330602051547,
38
+ "nauc_map_at_1_diff1": -0.5747249798765764,
39
+ "nauc_map_at_1_max": -0.20056345586262406,
40
+ "nauc_map_at_1_std": -0.24282264555943125,
41
+ "nauc_map_at_20_diff1": -0.3606250545571463,
42
+ "nauc_map_at_20_max": -0.24877416436848493,
43
+ "nauc_map_at_20_std": -0.1867708065128874,
44
+ "nauc_map_at_3_diff1": -0.5596994902066005,
45
+ "nauc_map_at_3_max": -0.27539576066541455,
46
+ "nauc_map_at_3_std": -0.2772739468741615,
47
+ "nauc_map_at_5_diff1": -0.4422612536376958,
48
+ "nauc_map_at_5_max": -0.3124806505541681,
49
+ "nauc_map_at_5_std": -0.21923180120121363,
50
+ "nauc_mrr_at_1000_diff1": -0.301017062901502,
51
+ "nauc_mrr_at_1000_max": -0.30574852118541557,
52
+ "nauc_mrr_at_1000_std": -0.12721072093737698,
53
+ "nauc_mrr_at_100_diff1": -0.31271510738527736,
54
+ "nauc_mrr_at_100_max": -0.3177547103171918,
55
+ "nauc_mrr_at_100_std": -0.12828398711778335,
56
+ "nauc_mrr_at_10_diff1": -0.4292943091791753,
57
+ "nauc_mrr_at_10_max": -0.45746906437484464,
58
+ "nauc_mrr_at_10_std": -0.1527977322458919,
59
+ "nauc_mrr_at_1_diff1": -0.5747249798765764,
60
+ "nauc_mrr_at_1_max": -0.4995975315266972,
61
+ "nauc_mrr_at_1_std": -0.0705661389857795,
62
+ "nauc_mrr_at_20_diff1": -0.3407197064284155,
63
+ "nauc_mrr_at_20_max": -0.33440320216781877,
64
+ "nauc_mrr_at_20_std": -0.12890471230942302,
65
+ "nauc_mrr_at_3_diff1": -0.5496824970932833,
66
+ "nauc_mrr_at_3_max": -0.5246400143099902,
67
+ "nauc_mrr_at_3_std": -0.18540381003488063,
68
+ "nauc_mrr_at_5_diff1": -0.5496824970932833,
69
+ "nauc_mrr_at_5_max": -0.5246400143099902,
70
+ "nauc_mrr_at_5_std": -0.18540381003488063,
71
+ "nauc_ndcg_at_1000_diff1": -0.1726642525674944,
72
+ "nauc_ndcg_at_1000_max": -0.1519851274416735,
73
+ "nauc_ndcg_at_1000_std": -0.152970784901727,
74
+ "nauc_ndcg_at_100_diff1": -0.22648229459252223,
75
+ "nauc_ndcg_at_100_max": -0.20905633164487697,
76
+ "nauc_ndcg_at_100_std": -0.15127742985051915,
77
+ "nauc_ndcg_at_10_diff1": -0.3920183074503633,
78
+ "nauc_ndcg_at_10_max": -0.31340312237742524,
79
+ "nauc_ndcg_at_10_std": -0.18755048697604484,
80
+ "nauc_ndcg_at_1_diff1": -0.5747249798765764,
81
+ "nauc_ndcg_at_1_max": -0.20056345586262406,
82
+ "nauc_ndcg_at_1_std": -0.24282264555943125,
83
+ "nauc_ndcg_at_20_diff1": -0.24984338312909435,
84
+ "nauc_ndcg_at_20_max": -0.19884254695674725,
85
+ "nauc_ndcg_at_20_std": -0.13845214629934277,
86
+ "nauc_ndcg_at_3_diff1": -0.556708481180822,
87
+ "nauc_ndcg_at_3_max": -0.2902920538313011,
88
+ "nauc_ndcg_at_3_std": -0.28413190328326815,
89
+ "nauc_ndcg_at_5_diff1": -0.3900872909119857,
90
+ "nauc_ndcg_at_5_max": -0.338313007450872,
91
+ "nauc_ndcg_at_5_std": -0.19903625569609631,
92
+ "nauc_precision_at_1000_diff1": -0.11872279831421322,
93
+ "nauc_precision_at_1000_max": -0.10043235608226829,
94
+ "nauc_precision_at_1000_std": -0.1582434744663016,
95
+ "nauc_precision_at_100_diff1": -0.1709441093780988,
96
+ "nauc_precision_at_100_max": -0.18360728409607915,
97
+ "nauc_precision_at_100_std": -0.14930413784253577,
98
+ "nauc_precision_at_10_diff1": -0.3340488328414273,
99
+ "nauc_precision_at_10_max": -0.32435977700265317,
100
+ "nauc_precision_at_10_std": -0.155978892764511,
101
+ "nauc_precision_at_1_diff1": -0.5747249798765764,
102
+ "nauc_precision_at_1_max": -0.20056345586262406,
103
+ "nauc_precision_at_1_std": -0.24282264555943125,
104
+ "nauc_precision_at_20_diff1": -0.1562553357562748,
105
+ "nauc_precision_at_20_max": -0.1475961655730907,
106
+ "nauc_precision_at_20_std": -0.09494597165646255,
107
+ "nauc_precision_at_3_diff1": -0.5496824970932834,
108
+ "nauc_precision_at_3_max": -0.32528396386727493,
109
+ "nauc_precision_at_3_std": -0.3002414810839818,
110
+ "nauc_precision_at_5_diff1": -0.30024148108398174,
111
+ "nauc_precision_at_5_max": -0.3806904570253108,
112
+ "nauc_precision_at_5_std": -0.1629997316876844,
113
+ "nauc_recall_at_1000_diff1": -0.11872279831421319,
114
+ "nauc_recall_at_1000_max": -0.10043235608226833,
115
+ "nauc_recall_at_1000_std": -0.1582434744663014,
116
+ "nauc_recall_at_100_diff1": -0.17094410937809862,
117
+ "nauc_recall_at_100_max": -0.1836072840960791,
118
+ "nauc_recall_at_100_std": -0.14930413784253557,
119
+ "nauc_recall_at_10_diff1": -0.33404883284142745,
120
+ "nauc_recall_at_10_max": -0.3243597770026533,
121
+ "nauc_recall_at_10_std": -0.15597889276451127,
122
+ "nauc_recall_at_1_diff1": -0.5747249798765764,
123
+ "nauc_recall_at_1_max": -0.20056345586262406,
124
+ "nauc_recall_at_1_std": -0.24282264555943125,
125
+ "nauc_recall_at_20_diff1": -0.15625533575627482,
126
+ "nauc_recall_at_20_max": -0.14759616557309066,
127
+ "nauc_recall_at_20_std": -0.09494597165646257,
128
+ "nauc_recall_at_3_diff1": -0.5496824970932833,
129
+ "nauc_recall_at_3_max": -0.3252839638672748,
130
+ "nauc_recall_at_3_std": -0.3002414810839818,
131
+ "nauc_recall_at_5_diff1": -0.30024148108398174,
132
+ "nauc_recall_at_5_max": -0.38069045702531085,
133
+ "nauc_recall_at_5_std": -0.1629997316876844,
134
+ "ndcg_at_1": 0.004,
135
+ "ndcg_at_10": 0.00971,
136
+ "ndcg_at_100": 0.02508,
137
+ "ndcg_at_1000": 0.04516,
138
+ "ndcg_at_20": 0.01632,
139
+ "ndcg_at_3": 0.00526,
140
+ "ndcg_at_5": 0.00785,
141
+ "precision_at_1": 0.004,
142
+ "precision_at_10": 0.0018,
143
+ "precision_at_100": 0.00092,
144
+ "precision_at_1000": 0.00026,
145
+ "precision_at_20": 0.0022,
146
+ "precision_at_3": 0.002,
147
+ "precision_at_5": 0.0024,
148
+ "recall_at_1": 0.004,
149
+ "recall_at_10": 0.018,
150
+ "recall_at_100": 0.092,
151
+ "recall_at_1000": 0.264,
152
+ "recall_at_20": 0.044,
153
+ "recall_at_3": 0.006,
154
+ "recall_at_5": 0.012
155
+ }
156
+ ]
157
+ },
158
+ "task_name": "CosQA"
159
+ }
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/STSBenchmark.json DELETED
@@ -1,26 +0,0 @@
1
- {
2
- "dataset_revision": "b0fddb56ed78048fa8b90373c8a3cfc37b684831",
3
- "evaluation_time": 0.12331175804138184,
4
- "kg_co2_emissions": null,
5
- "mteb_version": "1.14.15",
6
- "scores": {
7
- "test": [
8
- {
9
- "cosine_pearson": 0.34632056143460516,
10
- "cosine_spearman": 0.42973159111999676,
11
- "euclidean_pearson": 0.4043313982401531,
12
- "euclidean_spearman": 0.42973159111999676,
13
- "hf_subset": "default",
14
- "languages": [
15
- "eng-Latn"
16
- ],
17
- "main_score": 0.42973159111999676,
18
- "manhattan_pearson": 0.511950240807258,
19
- "manhattan_spearman": 0.5019330550880601,
20
- "pearson": 0.34632056143460516,
21
- "spearman": 0.42973159111999676
22
- }
23
- ]
24
- },
25
- "task_name": "STSBenchmark"
26
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/SprintDuplicateQuestions.json DELETED
@@ -1,58 +0,0 @@
1
- {
2
- "dataset_revision": "d66bd1f72af766a5cc4b0ca5e00c162f89e8cc46",
3
- "evaluation_time": 1.9629368782043457,
4
- "kg_co2_emissions": null,
5
- "mteb_version": "1.14.15",
6
- "scores": {
7
- "test": [
8
- {
9
- "cosine_accuracy": 0.9926237623762376,
10
- "cosine_accuracy_threshold": 0.9106360077857971,
11
- "cosine_ap": 0.4700755863552174,
12
- "cosine_f1": 0.4925187032418952,
13
- "cosine_f1_threshold": 0.8986777067184448,
14
- "cosine_precision": 0.6539735099337748,
15
- "cosine_recall": 0.395,
16
- "dot_accuracy": 0.9926237623762376,
17
- "dot_accuracy_threshold": 0.9106361269950867,
18
- "dot_ap": 0.47007548398718707,
19
- "dot_f1": 0.4925187032418952,
20
- "dot_f1_threshold": 0.8986777663230896,
21
- "dot_precision": 0.6539735099337748,
22
- "dot_recall": 0.395,
23
- "euclidean_accuracy": 0.9926237623762376,
24
- "euclidean_accuracy_threshold": 0.42276236414909363,
25
- "euclidean_ap": 0.47007558217981027,
26
- "euclidean_f1": 0.4925187032418952,
27
- "euclidean_f1_threshold": 0.4501606225967407,
28
- "euclidean_precision": 0.6539735099337748,
29
- "euclidean_recall": 0.395,
30
- "hf_subset": "default",
31
- "languages": [
32
- "eng-Latn"
33
- ],
34
- "main_score": 0.6386707007383838,
35
- "manhattan_accuracy": 0.9939207920792079,
36
- "manhattan_accuracy_threshold": 4.824772834777832,
37
- "manhattan_ap": 0.6386707007383838,
38
- "manhattan_f1": 0.6293103448275862,
39
- "manhattan_f1_threshold": 5.194998741149902,
40
- "manhattan_precision": 0.6822429906542056,
41
- "manhattan_recall": 0.584,
42
- "max_accuracy": 0.9939207920792079,
43
- "max_ap": 0.6386707007383838,
44
- "max_f1": 0.6293103448275862,
45
- "max_precision": 0.6822429906542056,
46
- "max_recall": 0.584,
47
- "similarity_accuracy": 0.9926237623762376,
48
- "similarity_accuracy_threshold": 0.9106360077857971,
49
- "similarity_ap": 0.4700755863552174,
50
- "similarity_f1": 0.4925187032418952,
51
- "similarity_f1_threshold": 0.8986777067184448,
52
- "similarity_precision": 0.6539735099337748,
53
- "similarity_recall": 0.395
54
- }
55
- ]
56
- },
57
- "task_name": "SprintDuplicateQuestions"
58
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/StackExchangeClustering.json DELETED
@@ -1,47 +0,0 @@
1
- {
2
- "dataset_revision": "6cbc1f7b2bc0622f2e39d2c77fa502909748c259",
3
- "evaluation_time": 1075.5739603042603,
4
- "kg_co2_emissions": null,
5
- "mteb_version": "1.14.15",
6
- "scores": {
7
- "test": [
8
- {
9
- "hf_subset": "default",
10
- "languages": [
11
- "eng-Latn"
12
- ],
13
- "main_score": 0.2747977935355363,
14
- "v_measure": 0.2747977935355363,
15
- "v_measure_std": 0.04408138950391278,
16
- "v_measures": [
17
- 0.2671568735697825,
18
- 0.35324106044655595,
19
- 0.2134334295678833,
20
- 0.26069561242914296,
21
- 0.2360037867112385,
22
- 0.18352010080864292,
23
- 0.21227539957559294,
24
- 0.22564157353303899,
25
- 0.31014309699664405,
26
- 0.2792317143409387,
27
- 0.30736400840236383,
28
- 0.33654065468328326,
29
- 0.3375811203083562,
30
- 0.23635769205347795,
31
- 0.2889733490218442,
32
- 0.2628972368553193,
33
- 0.2892573063858698,
34
- 0.3093369539018476,
35
- 0.2778955236652676,
36
- 0.29489160764728006,
37
- 0.3092126928451642,
38
- 0.22100223054084894,
39
- 0.23711645754707986,
40
- 0.3264131545037563,
41
- 0.2937622020471872
42
- ]
43
- }
44
- ]
45
- },
46
- "task_name": "StackExchangeClustering"
47
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/StackOverflowQA.json ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "db8f169f3894c14a00251061f957b2063eef2bd5",
3
+ "evaluation_time": 21.146663904190063,
4
+ "kg_co2_emissions": null,
5
+ "mteb_version": "1.14.15",
6
+ "scores": {
7
+ "test": [
8
+ {
9
+ "hf_subset": "default",
10
+ "languages": [
11
+ "eng-Latn"
12
+ ],
13
+ "main_score": 0.17615,
14
+ "map_at_1": 0.14142,
15
+ "map_at_10": 0.16367,
16
+ "map_at_100": 0.16807,
17
+ "map_at_1000": 0.16867,
18
+ "map_at_20": 0.16588,
19
+ "map_at_3": 0.1568,
20
+ "map_at_5": 0.16034,
21
+ "mrr_at_1": 0.14142427281845538,
22
+ "mrr_at_10": 0.1636685851204407,
23
+ "mrr_at_100": 0.16806598010525844,
24
+ "mrr_at_1000": 0.16867443260066448,
25
+ "mrr_at_20": 0.16587850269947257,
26
+ "mrr_at_3": 0.15680374456703444,
27
+ "mrr_at_5": 0.1603393513874958,
28
+ "nauc_map_at_1000_diff1": 0.5441830305562326,
29
+ "nauc_map_at_1000_max": 0.2166816774885428,
30
+ "nauc_map_at_1000_std": 0.14505555737829307,
31
+ "nauc_map_at_100_diff1": 0.5446431157527537,
32
+ "nauc_map_at_100_max": 0.21689938576550866,
33
+ "nauc_map_at_100_std": 0.14493387106545103,
34
+ "nauc_map_at_10_diff1": 0.5511736320008027,
35
+ "nauc_map_at_10_max": 0.21922402299128418,
36
+ "nauc_map_at_10_std": 0.14589505600247163,
37
+ "nauc_map_at_1_diff1": 0.6335084714813365,
38
+ "nauc_map_at_1_max": 0.2416674532732567,
39
+ "nauc_map_at_1_std": 0.15189301837631614,
40
+ "nauc_map_at_20_diff1": 0.5481004538160913,
41
+ "nauc_map_at_20_max": 0.21754392477744908,
42
+ "nauc_map_at_20_std": 0.14574143361267317,
43
+ "nauc_map_at_3_diff1": 0.569602881272386,
44
+ "nauc_map_at_3_max": 0.22657597605102178,
45
+ "nauc_map_at_3_std": 0.14362624083093203,
46
+ "nauc_map_at_5_diff1": 0.5601655545127238,
47
+ "nauc_map_at_5_max": 0.22021980923815318,
48
+ "nauc_map_at_5_std": 0.145190486252428,
49
+ "nauc_mrr_at_1000_diff1": 0.5441830305562326,
50
+ "nauc_mrr_at_1000_max": 0.2166816774885428,
51
+ "nauc_mrr_at_1000_std": 0.14505555737829307,
52
+ "nauc_mrr_at_100_diff1": 0.5446431157527537,
53
+ "nauc_mrr_at_100_max": 0.21689938576550866,
54
+ "nauc_mrr_at_100_std": 0.14493387106545103,
55
+ "nauc_mrr_at_10_diff1": 0.5511736320008027,
56
+ "nauc_mrr_at_10_max": 0.21922402299128418,
57
+ "nauc_mrr_at_10_std": 0.14589505600247163,
58
+ "nauc_mrr_at_1_diff1": 0.6335084714813365,
59
+ "nauc_mrr_at_1_max": 0.2416674532732567,
60
+ "nauc_mrr_at_1_std": 0.15189301837631614,
61
+ "nauc_mrr_at_20_diff1": 0.5481004538160913,
62
+ "nauc_mrr_at_20_max": 0.21754392477744908,
63
+ "nauc_mrr_at_20_std": 0.14574143361267317,
64
+ "nauc_mrr_at_3_diff1": 0.569602881272386,
65
+ "nauc_mrr_at_3_max": 0.22657597605102178,
66
+ "nauc_mrr_at_3_std": 0.14362624083093203,
67
+ "nauc_mrr_at_5_diff1": 0.5601655545127238,
68
+ "nauc_mrr_at_5_max": 0.22021980923815318,
69
+ "nauc_mrr_at_5_std": 0.145190486252428,
70
+ "nauc_ndcg_at_1000_diff1": 0.4728678699567455,
71
+ "nauc_ndcg_at_1000_max": 0.18937253079534216,
72
+ "nauc_ndcg_at_1000_std": 0.14596120873695492,
73
+ "nauc_ndcg_at_100_diff1": 0.4829489403420902,
74
+ "nauc_ndcg_at_100_max": 0.19711295138806267,
75
+ "nauc_ndcg_at_100_std": 0.14004483265553003,
76
+ "nauc_ndcg_at_10_diff1": 0.5147356366280121,
77
+ "nauc_ndcg_at_10_max": 0.20936478000130024,
78
+ "nauc_ndcg_at_10_std": 0.14480134602662714,
79
+ "nauc_ndcg_at_1_diff1": 0.6335084714813365,
80
+ "nauc_ndcg_at_1_max": 0.2416674532732567,
81
+ "nauc_ndcg_at_1_std": 0.15189301837631614,
82
+ "nauc_ndcg_at_20_diff1": 0.5045372953308567,
83
+ "nauc_ndcg_at_20_max": 0.20390468798029948,
84
+ "nauc_ndcg_at_20_std": 0.14429100965430774,
85
+ "nauc_ndcg_at_3_diff1": 0.5501813298382772,
86
+ "nauc_ndcg_at_3_max": 0.22229855178363508,
87
+ "nauc_ndcg_at_3_std": 0.1399986570615583,
88
+ "nauc_ndcg_at_5_diff1": 0.5343279242377332,
89
+ "nauc_ndcg_at_5_max": 0.21164562906788129,
90
+ "nauc_ndcg_at_5_std": 0.14278785553527687,
91
+ "nauc_precision_at_1000_diff1": 0.2504046219335285,
92
+ "nauc_precision_at_1000_max": 0.08591924265428995,
93
+ "nauc_precision_at_1000_std": 0.1677320203837767,
94
+ "nauc_precision_at_100_diff1": 0.31425670977915415,
95
+ "nauc_precision_at_100_max": 0.1387542114851391,
96
+ "nauc_precision_at_100_std": 0.1261904558936239,
97
+ "nauc_precision_at_10_diff1": 0.41968706662348626,
98
+ "nauc_precision_at_10_max": 0.18390157987927358,
99
+ "nauc_precision_at_10_std": 0.14312672622707642,
100
+ "nauc_precision_at_1_diff1": 0.6335084714813365,
101
+ "nauc_precision_at_1_max": 0.2416674532732567,
102
+ "nauc_precision_at_1_std": 0.15189301837631614,
103
+ "nauc_precision_at_20_diff1": 0.39118835707188254,
104
+ "nauc_precision_at_20_max": 0.16759815130477784,
105
+ "nauc_precision_at_20_std": 0.14154312425469426,
106
+ "nauc_precision_at_3_diff1": 0.4986851913309839,
107
+ "nauc_precision_at_3_max": 0.2110426423927967,
108
+ "nauc_precision_at_3_std": 0.13007101364000376,
109
+ "nauc_precision_at_5_diff1": 0.4672079991177685,
110
+ "nauc_precision_at_5_max": 0.18897950891809692,
111
+ "nauc_precision_at_5_std": 0.13674491342908243,
112
+ "nauc_recall_at_1000_diff1": 0.25040462193352936,
113
+ "nauc_recall_at_1000_max": 0.08591924265429102,
114
+ "nauc_recall_at_1000_std": 0.1677320203837774,
115
+ "nauc_recall_at_100_diff1": 0.3142567097791538,
116
+ "nauc_recall_at_100_max": 0.1387542114851391,
117
+ "nauc_recall_at_100_std": 0.12619045589362404,
118
+ "nauc_recall_at_10_diff1": 0.41968706662348615,
119
+ "nauc_recall_at_10_max": 0.18390157987927366,
120
+ "nauc_recall_at_10_std": 0.1431267262270766,
121
+ "nauc_recall_at_1_diff1": 0.6335084714813365,
122
+ "nauc_recall_at_1_max": 0.2416674532732567,
123
+ "nauc_recall_at_1_std": 0.15189301837631614,
124
+ "nauc_recall_at_20_diff1": 0.3911883570718826,
125
+ "nauc_recall_at_20_max": 0.16759815130477776,
126
+ "nauc_recall_at_20_std": 0.1415431242546944,
127
+ "nauc_recall_at_3_diff1": 0.49868519133098393,
128
+ "nauc_recall_at_3_max": 0.21104264239279674,
129
+ "nauc_recall_at_3_std": 0.13007101364000365,
130
+ "nauc_recall_at_5_diff1": 0.4672079991177685,
131
+ "nauc_recall_at_5_max": 0.18897950891809673,
132
+ "nauc_recall_at_5_std": 0.13674491342908224,
133
+ "ndcg_at_1": 0.14142,
134
+ "ndcg_at_10": 0.17615,
135
+ "ndcg_at_100": 0.20104,
136
+ "ndcg_at_1000": 0.22165,
137
+ "ndcg_at_20": 0.18433,
138
+ "ndcg_at_3": 0.16187,
139
+ "ndcg_at_5": 0.16825,
140
+ "precision_at_1": 0.14142,
141
+ "precision_at_10": 0.02161,
142
+ "precision_at_100": 0.00341,
143
+ "precision_at_1000": 0.00051,
144
+ "precision_at_20": 0.01244,
145
+ "precision_at_3": 0.05884,
146
+ "precision_at_5": 0.03842,
147
+ "recall_at_1": 0.14142,
148
+ "recall_at_10": 0.21615,
149
+ "recall_at_100": 0.34052,
150
+ "recall_at_1000": 0.51254,
151
+ "recall_at_20": 0.24875,
152
+ "recall_at_3": 0.17653,
153
+ "recall_at_5": 0.19208
154
+ }
155
+ ]
156
+ },
157
+ "task_name": "StackOverflowQA"
158
+ }
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/SyntheticText2SQL.json ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "686b87296c3a0191b5d9415a00526c62db9fce09",
3
+ "evaluation_time": 92.1711049079895,
4
+ "kg_co2_emissions": null,
5
+ "mteb_version": "1.14.15",
6
+ "scores": {
7
+ "test": [
8
+ {
9
+ "hf_subset": "default",
10
+ "languages": [
11
+ "eng-Latn",
12
+ "sql-Code"
13
+ ],
14
+ "main_score": 0.00489,
15
+ "map_at_1": 0.00034,
16
+ "map_at_10": 0.00331,
17
+ "map_at_100": 0.00393,
18
+ "map_at_1000": 0.00414,
19
+ "map_at_20": 0.00359,
20
+ "map_at_3": 0.00251,
21
+ "map_at_5": 0.00291,
22
+ "mrr_at_1": 0.002221842420099128,
23
+ "mrr_at_10": 0.004530496754048283,
24
+ "mrr_at_100": 0.005170980682014603,
25
+ "mrr_at_1000": 0.005371938321099836,
26
+ "mrr_at_20": 0.004822607995513001,
27
+ "mrr_at_3": 0.0038454964963254143,
28
+ "mrr_at_5": 0.004195863954879508,
29
+ "nauc_map_at_1000_diff1": -0.27416046541710665,
30
+ "nauc_map_at_1000_max": -0.12307636241646212,
31
+ "nauc_map_at_1000_std": -0.3169779465421886,
32
+ "nauc_map_at_100_diff1": -0.2867723753018623,
33
+ "nauc_map_at_100_max": -0.11793114543437405,
34
+ "nauc_map_at_100_std": -0.32768340793729833,
35
+ "nauc_map_at_10_diff1": -0.3071810467939698,
36
+ "nauc_map_at_10_max": -0.09394849271438202,
37
+ "nauc_map_at_10_std": -0.3443159235101931,
38
+ "nauc_map_at_1_diff1": -0.045574562309770715,
39
+ "nauc_map_at_1_max": -0.415009003625047,
40
+ "nauc_map_at_1_std": -0.28503182744193584,
41
+ "nauc_map_at_20_diff1": -0.30073635348293454,
42
+ "nauc_map_at_20_max": -0.1035440934145476,
43
+ "nauc_map_at_20_std": -0.33728144942994526,
44
+ "nauc_map_at_3_diff1": -0.36276475560891563,
45
+ "nauc_map_at_3_max": -0.09000122816382457,
46
+ "nauc_map_at_3_std": -0.35808488719288767,
47
+ "nauc_map_at_5_diff1": -0.34649671639377566,
48
+ "nauc_map_at_5_max": -0.07741484623960085,
49
+ "nauc_map_at_5_std": -0.3454332041446047,
50
+ "nauc_mrr_at_1000_diff1": -0.32007654216936365,
51
+ "nauc_mrr_at_1000_max": -0.05306747639186935,
52
+ "nauc_mrr_at_1000_std": -0.33505538550557523,
53
+ "nauc_mrr_at_100_diff1": -0.33152877543566905,
54
+ "nauc_mrr_at_100_max": -0.04652715811851764,
55
+ "nauc_mrr_at_100_std": -0.3439648778335655,
56
+ "nauc_mrr_at_10_diff1": -0.35260191778612204,
57
+ "nauc_mrr_at_10_max": -0.018284442733176375,
58
+ "nauc_mrr_at_10_std": -0.3583806093519501,
59
+ "nauc_mrr_at_1_diff1": -0.49322913632443244,
60
+ "nauc_mrr_at_1_max": 0.18386885076318166,
61
+ "nauc_mrr_at_1_std": -0.36881544615998557,
62
+ "nauc_mrr_at_20_diff1": -0.34523116750414906,
63
+ "nauc_mrr_at_20_max": -0.030423846920737567,
64
+ "nauc_mrr_at_20_std": -0.3523413443042862,
65
+ "nauc_mrr_at_3_diff1": -0.3986937025555519,
66
+ "nauc_mrr_at_3_max": 0.00596597350896994,
67
+ "nauc_mrr_at_3_std": -0.37595027480484544,
68
+ "nauc_mrr_at_5_diff1": -0.37058906995780483,
69
+ "nauc_mrr_at_5_max": 0.0023804395413750843,
70
+ "nauc_mrr_at_5_std": -0.3649770343981212,
71
+ "nauc_ndcg_at_1000_diff1": -0.12191989446547287,
72
+ "nauc_ndcg_at_1000_max": -0.18069129976379253,
73
+ "nauc_ndcg_at_1000_std": -0.21737660540578904,
74
+ "nauc_ndcg_at_100_diff1": -0.21534614581420813,
75
+ "nauc_ndcg_at_100_max": -0.16549108196966383,
76
+ "nauc_ndcg_at_100_std": -0.2967519876094673,
77
+ "nauc_ndcg_at_10_diff1": -0.2766087694329189,
78
+ "nauc_ndcg_at_10_max": -0.10425653229278331,
79
+ "nauc_ndcg_at_10_std": -0.34614483144111813,
80
+ "nauc_ndcg_at_1_diff1": -0.045574562309770715,
81
+ "nauc_ndcg_at_1_max": -0.415009003625047,
82
+ "nauc_ndcg_at_1_std": -0.28503182744193584,
83
+ "nauc_ndcg_at_20_diff1": -0.26495356113264346,
84
+ "nauc_ndcg_at_20_max": -0.12302281530014428,
85
+ "nauc_ndcg_at_20_std": -0.33040207062914734,
86
+ "nauc_ndcg_at_3_diff1": -0.35550615579366496,
87
+ "nauc_ndcg_at_3_max": -0.09065063772541752,
88
+ "nauc_ndcg_at_3_std": -0.3666750120549603,
89
+ "nauc_ndcg_at_5_diff1": -0.3367147607777083,
90
+ "nauc_ndcg_at_5_max": -0.07594752160761341,
91
+ "nauc_ndcg_at_5_std": -0.349392770228869,
92
+ "nauc_precision_at_1000_diff1": -0.05796266193135331,
93
+ "nauc_precision_at_1000_max": -0.19596247289607774,
94
+ "nauc_precision_at_1000_std": -0.1726159439969235,
95
+ "nauc_precision_at_100_diff1": -0.1623283482675489,
96
+ "nauc_precision_at_100_max": -0.20045006262758877,
97
+ "nauc_precision_at_100_std": -0.2711122975734177,
98
+ "nauc_precision_at_10_diff1": -0.23692269420435214,
99
+ "nauc_precision_at_10_max": -0.11995105342526458,
100
+ "nauc_precision_at_10_std": -0.34723986176409266,
101
+ "nauc_precision_at_1_diff1": -0.045574562309770715,
102
+ "nauc_precision_at_1_max": -0.415009003625047,
103
+ "nauc_precision_at_1_std": -0.28503182744193584,
104
+ "nauc_precision_at_20_diff1": -0.22656401175983737,
105
+ "nauc_precision_at_20_max": -0.14607723359403244,
106
+ "nauc_precision_at_20_std": -0.3206744368813374,
107
+ "nauc_precision_at_3_diff1": -0.3421859065827053,
108
+ "nauc_precision_at_3_max": -0.09374847026615557,
109
+ "nauc_precision_at_3_std": -0.37955719702776525,
110
+ "nauc_precision_at_5_diff1": -0.3217864826892486,
111
+ "nauc_precision_at_5_max": -0.07574764495371311,
112
+ "nauc_precision_at_5_std": -0.35431940648491467,
113
+ "nauc_recall_at_1000_diff1": -0.057962661931353035,
114
+ "nauc_recall_at_1000_max": -0.19596247289607757,
115
+ "nauc_recall_at_1000_std": -0.17261594399692332,
116
+ "nauc_recall_at_100_diff1": -0.16232834826754888,
117
+ "nauc_recall_at_100_max": -0.20045006262758874,
118
+ "nauc_recall_at_100_std": -0.2711122975734177,
119
+ "nauc_recall_at_10_diff1": -0.2369226942043523,
120
+ "nauc_recall_at_10_max": -0.11995105342526483,
121
+ "nauc_recall_at_10_std": -0.34723986176409277,
122
+ "nauc_recall_at_1_diff1": -0.045574562309770715,
123
+ "nauc_recall_at_1_max": -0.415009003625047,
124
+ "nauc_recall_at_1_std": -0.28503182744193584,
125
+ "nauc_recall_at_20_diff1": -0.22656401175983737,
126
+ "nauc_recall_at_20_max": -0.14607723359403255,
127
+ "nauc_recall_at_20_std": -0.3206744368813374,
128
+ "nauc_recall_at_3_diff1": -0.3421859065827052,
129
+ "nauc_recall_at_3_max": -0.09374847026615546,
130
+ "nauc_recall_at_3_std": -0.37955719702776536,
131
+ "nauc_recall_at_5_diff1": -0.3217864826892487,
132
+ "nauc_recall_at_5_max": -0.07574764495371322,
133
+ "nauc_recall_at_5_std": -0.3543194064849148,
134
+ "ndcg_at_1": 0.00034,
135
+ "ndcg_at_10": 0.00489,
136
+ "ndcg_at_100": 0.00885,
137
+ "ndcg_at_1000": 0.01629,
138
+ "ndcg_at_20": 0.00592,
139
+ "ndcg_at_3": 0.00322,
140
+ "ndcg_at_5": 0.00394,
141
+ "precision_at_1": 0.00034,
142
+ "precision_at_10": 0.00099,
143
+ "precision_at_100": 0.00031,
144
+ "precision_at_1000": 9e-05,
145
+ "precision_at_20": 0.0007,
146
+ "precision_at_3": 0.00177,
147
+ "precision_at_5": 0.0014,
148
+ "recall_at_1": 0.00034,
149
+ "recall_at_10": 0.00991,
150
+ "recall_at_100": 0.03076,
151
+ "recall_at_1000": 0.09383,
152
+ "recall_at_20": 0.01401,
153
+ "recall_at_3": 0.0053,
154
+ "recall_at_5": 0.00701
155
+ }
156
+ ]
157
+ },
158
+ "task_name": "SyntheticText2SQL"
159
+ }
mteb_results/mteb_parsed_results.json DELETED
@@ -1,3 +0,0 @@
1
- {
2
- "gte-Qwen2-7B-instruct-M2V-Distilled": "ResultSet(datasets={'Banking77Classification': DatasetResult(scores=[0.4396103896103896], time=6.451777696609497), 'StackExchangeClustering': DatasetResult(scores=[0.2747977935355363], time=1075.5739603042603), 'STSBenchmark': DatasetResult(scores=[0.42973159111999676], time=0.12331175804138184), 'CQADupstackProgrammersRetrieval': DatasetResult(scores=[0.0501], time=99.69791841506958), 'SprintDuplicateQuestions': DatasetResult(scores=[0.6386707007383838], time=1.9629368782043457)})"
3
- }
 
 
 
 
mteb_results/mteb_raw_results.json DELETED
@@ -1,7 +0,0 @@
1
- [
2
- "dataset_revision='0fd18e25b25c072e09e0d92ab615fda904d66300' task_name='Banking77Classification' mteb_version='1.14.15' scores={'test': [{'accuracy': 0.4396103896103896, 'f1': 0.4142711532114576, 'f1_weighted': 0.4142711532114576, 'scores_per_experiment': [{'accuracy': 0.4279220779220779, 'f1': 0.4030476288783657, 'f1_weighted': 0.4030476288783656}, {'accuracy': 0.4211038961038961, 'f1': 0.39776168133611584, 'f1_weighted': 0.39776168133611584}, {'accuracy': 0.45064935064935063, 'f1': 0.42872843564828145, 'f1_weighted': 0.42872843564828145}, {'accuracy': 0.4448051948051948, 'f1': 0.420756828398419, 'f1_weighted': 0.42075682839841905}, {'accuracy': 0.44675324675324674, 'f1': 0.42100682221185654, 'f1_weighted': 0.42100682221185654}, {'accuracy': 0.45324675324675323, 'f1': 0.4392342490231314, 'f1_weighted': 0.4392342490231314}, {'accuracy': 0.437012987012987, 'f1': 0.4056017558988273, 'f1_weighted': 0.40560175589882724}, {'accuracy': 0.42337662337662335, 'f1': 0.39123709562594644, 'f1_weighted': 0.39123709562594655}, {'accuracy': 0.44512987012987015, 'f1': 0.41578171494860966, 'f1_weighted': 0.41578171494860966}, {'accuracy': 0.4461038961038961, 'f1': 0.4195553201450221, 'f1_weighted': 0.419555320145022}], 'main_score': 0.4396103896103896, 'hf_subset': 'default', 'languages': ['eng-Latn']}]} evaluation_time=6.451777696609497 kg_co2_emissions=None",
3
- "dataset_revision='6cbc1f7b2bc0622f2e39d2c77fa502909748c259' task_name='StackExchangeClustering' mteb_version='1.14.15' scores={'test': [{'v_measure': 0.2747977935355363, 'v_measure_std': 0.04408138950391278, 'v_measures': [0.2671568735697825, 0.35324106044655595, 0.2134334295678833, 0.26069561242914296, 0.2360037867112385, 0.18352010080864292, 0.21227539957559294, 0.22564157353303899, 0.31014309699664405, 0.2792317143409387, 0.30736400840236383, 0.33654065468328326, 0.3375811203083562, 0.23635769205347795, 0.2889733490218442, 0.2628972368553193, 0.2892573063858698, 0.3093369539018476, 0.2778955236652676, 0.29489160764728006, 0.3092126928451642, 0.22100223054084894, 0.23711645754707986, 0.3264131545037563, 0.2937622020471872], 'main_score': 0.2747977935355363, 'hf_subset': 'default', 'languages': ['eng-Latn']}]} evaluation_time=1075.5739603042603 kg_co2_emissions=None",
4
- "dataset_revision='b0fddb56ed78048fa8b90373c8a3cfc37b684831' task_name='STSBenchmark' mteb_version='1.14.15' scores={'test': [{'pearson': 0.34632056143460516, 'spearman': 0.42973159111999676, 'cosine_pearson': 0.34632056143460516, 'cosine_spearman': 0.42973159111999676, 'manhattan_pearson': 0.511950240807258, 'manhattan_spearman': 0.5019330550880601, 'euclidean_pearson': 0.4043313982401531, 'euclidean_spearman': 0.42973159111999676, 'main_score': 0.42973159111999676, 'hf_subset': 'default', 'languages': ['eng-Latn']}]} evaluation_time=0.12331175804138184 kg_co2_emissions=None",
5
- "dataset_revision='6184bc1440d2dbc7612be22b50686b8826d22b32' task_name='CQADupstackProgrammersRetrieval' mteb_version='1.14.15' scores={'test': [{'ndcg_at_1': 0.03082, 'ndcg_at_3': 0.03989, 'ndcg_at_5': 0.04484, 'ndcg_at_10': 0.0501, 'ndcg_at_20': 0.05662, 'ndcg_at_100': 0.07072, 'ndcg_at_1000': 0.09327, 'map_at_1': 0.02467, 'map_at_3': 0.03388, 'map_at_5': 0.03693, 'map_at_10': 0.03898, 'map_at_20': 0.04068, 'map_at_100': 0.04261, 'map_at_1000': 0.04333, 'recall_at_1': 0.02467, 'recall_at_3': 0.04648, 'recall_at_5': 0.05869, 'recall_at_10': 0.07499, 'recall_at_20': 0.09901, 'recall_at_100': 0.16969, 'recall_at_1000': 0.33718, 'precision_at_1': 0.03082, 'precision_at_3': 0.02017, 'precision_at_5': 0.0153, 'precision_at_10': 0.00993, 'precision_at_20': 0.00685, 'precision_at_100': 0.00241, 'precision_at_1000': 0.00052, 'mrr_at_1': 0.030821917808219176, 'mrr_at_3': 0.04280821917808219, 'mrr_at_5': 0.04634703196347032, 'mrr_at_10': 0.04904462926723201, 'mrr_at_20': 0.05126402659708249, 'mrr_at_100': 0.05339942610218758, 'mrr_at_1000': 0.05413492750157237, 'nauc_ndcg_at_1_max': 0.35182174117717013, 'nauc_ndcg_at_1_std': -0.24426280067522707, 'nauc_ndcg_at_1_diff1': 0.1772995319079407, 'nauc_ndcg_at_3_max': 0.23212930749840155, 'nauc_ndcg_at_3_std': -0.1728371812831961, 'nauc_ndcg_at_3_diff1': 0.03670154146101528, 'nauc_ndcg_at_5_max': 0.20474332948099355, 'nauc_ndcg_at_5_std': -0.1734952739301359, 'nauc_ndcg_at_5_diff1': 0.0107566708693031, 'nauc_ndcg_at_10_max': 0.19884193622357532, 'nauc_ndcg_at_10_std': -0.16919003671988075, 'nauc_ndcg_at_10_diff1': 0.0026192804576363727, 'nauc_ndcg_at_20_max': 0.20925361343315524, 'nauc_ndcg_at_20_std': -0.17106125631597793, 'nauc_ndcg_at_20_diff1': 0.0031543394811079034, 'nauc_ndcg_at_100_max': 0.20125970115134734, 'nauc_ndcg_at_100_std': -0.15865628929382014, 'nauc_ndcg_at_100_diff1': 0.0023309149151885546, 'nauc_ndcg_at_1000_max': 0.20925878430027478, 'nauc_ndcg_at_1000_std': -0.1717044268161809, 'nauc_ndcg_at_1000_diff1': 
-0.010372586628261796, 'nauc_map_at_1_max': 0.33459947679728974, 'nauc_map_at_1_std': -0.23115450977179597, 'nauc_map_at_1_diff1': 0.1731091343679673, 'nauc_map_at_3_max': 0.2486807528974488, 'nauc_map_at_3_std': -0.18512855007450404, 'nauc_map_at_3_diff1': 0.06042780588964212, 'nauc_map_at_5_max': 0.22647048266105405, 'nauc_map_at_5_std': -0.18107585673560017, 'nauc_map_at_5_diff1': 0.04407217741234605, 'nauc_map_at_10_max': 0.22061594321968936, 'nauc_map_at_10_std': -0.17777470317814356, 'nauc_map_at_10_diff1': 0.03906418656483989, 'nauc_map_at_20_max': 0.22396003211648763, 'nauc_map_at_20_std': -0.17867373725662639, 'nauc_map_at_20_diff1': 0.03795725531499195, 'nauc_map_at_100_max': 0.22324901446317413, 'nauc_map_at_100_std': -0.17630470695891512, 'nauc_map_at_100_diff1': 0.03759221625144172, 'nauc_map_at_1000_max': 0.2240572170754659, 'nauc_map_at_1000_std': -0.17708810912472517, 'nauc_map_at_1000_diff1': 0.03644747951501248, 'nauc_recall_at_1_max': 0.33459947679728974, 'nauc_recall_at_1_std': -0.23115450977179597, 'nauc_recall_at_1_diff1': 0.1731091343679673, 'nauc_recall_at_3_max': 0.1864107664448688, 'nauc_recall_at_3_std': -0.14586036842324565, 'nauc_recall_at_3_diff1': -0.021696811828998432, 'nauc_recall_at_5_max': 0.1453135254521713, 'nauc_recall_at_5_std': -0.1531619473747777, 'nauc_recall_at_5_diff1': -0.0538517948884412, 'nauc_recall_at_10_max': 0.1384336247044034, 'nauc_recall_at_10_std': -0.14737738059263306, 'nauc_recall_at_10_diff1': -0.051375323084735164, 'nauc_recall_at_20_max': 0.16386688869593355, 'nauc_recall_at_20_std': -0.1528456365862212, 'nauc_recall_at_20_diff1': -0.03578815918976938, 'nauc_recall_at_100_max': 0.14861973646512244, 'nauc_recall_at_100_std': -0.12240747671934184, 'nauc_recall_at_100_diff1': -0.023004658252697183, 'nauc_recall_at_1000_max': 0.16414155669676642, 'nauc_recall_at_1000_std': -0.1513320281746568, 'nauc_recall_at_1000_diff1': -0.047075752528689695, 'nauc_precision_at_1_max': 0.35182174117717013, 
'nauc_precision_at_1_std': -0.24426280067522707, 'nauc_precision_at_1_diff1': 0.1772995319079407, 'nauc_precision_at_3_max': 0.21285488271783465, 'nauc_precision_at_3_std': -0.1483164417030193, 'nauc_precision_at_3_diff1': -0.013044619440245884, 'nauc_precision_at_5_max': 0.1756649379589832, 'nauc_precision_at_5_std': -0.15632134056178232, 'nauc_precision_at_5_diff1': -0.05113181393685194, 'nauc_precision_at_10_max': 0.18962064467698705, 'nauc_precision_at_10_std': -0.14827004787357115, 'nauc_precision_at_10_diff1': -0.052513811685878764, 'nauc_precision_at_20_max': 0.22086458009752882, 'nauc_precision_at_20_std': -0.14430508663959002, 'nauc_precision_at_20_diff1': -0.040789324913047875, 'nauc_precision_at_100_max': 0.22138981394024387, 'nauc_precision_at_100_std': -0.13384472263037697, 'nauc_precision_at_100_diff1': -0.04518222914182943, 'nauc_precision_at_1000_max': 0.2542912736794115, 'nauc_precision_at_1000_std': -0.1881459402790264, 'nauc_precision_at_1000_diff1': -0.07195606207962846, 'nauc_mrr_at_1_max': 0.35182174117717013, 'nauc_mrr_at_1_std': -0.24426280067522707, 'nauc_mrr_at_1_diff1': 0.1772995319079407, 'nauc_mrr_at_3_max': 0.26889485727748363, 'nauc_mrr_at_3_std': -0.19153801111553947, 'nauc_mrr_at_3_diff1': 0.06173430027850725, 'nauc_mrr_at_5_max': 0.253857849052297, 'nauc_mrr_at_5_std': -0.19604549670316734, 'nauc_mrr_at_5_diff1': 0.036743759763164886, 'nauc_mrr_at_10_max': 0.25392922716866984, 'nauc_mrr_at_10_std': -0.1935061134919541, 'nauc_mrr_at_10_diff1': 0.03361519179733555, 'nauc_mrr_at_20_max': 0.25624951214228564, 'nauc_mrr_at_20_std': -0.19212268093923462, 'nauc_mrr_at_20_diff1': 0.03479828151019169, 'nauc_mrr_at_100_max': 0.2523932973431928, 'nauc_mrr_at_100_std': -0.1900913512193067, 'nauc_mrr_at_100_diff1': 0.03435870935950355, 'nauc_mrr_at_1000_max': 0.2523936325136619, 'nauc_mrr_at_1000_std': -0.19078164353963076, 'nauc_mrr_at_1000_diff1': 0.033601872249839834, 'main_score': 0.0501, 'hf_subset': 'default', 'languages': ['eng-Latn']}]} 
evaluation_time=99.69791841506958 kg_co2_emissions=None",
6
- "dataset_revision='d66bd1f72af766a5cc4b0ca5e00c162f89e8cc46' task_name='SprintDuplicateQuestions' mteb_version='1.14.15' scores={'test': [{'similarity_accuracy': 0.9926237623762376, 'similarity_accuracy_threshold': 0.9106360077857971, 'similarity_f1': 0.4925187032418952, 'similarity_f1_threshold': 0.8986777067184448, 'similarity_precision': 0.6539735099337748, 'similarity_recall': 0.395, 'similarity_ap': 0.4700755863552174, 'cosine_accuracy': 0.9926237623762376, 'cosine_accuracy_threshold': 0.9106360077857971, 'cosine_f1': 0.4925187032418952, 'cosine_f1_threshold': 0.8986777067184448, 'cosine_precision': 0.6539735099337748, 'cosine_recall': 0.395, 'cosine_ap': 0.4700755863552174, 'manhattan_accuracy': 0.9939207920792079, 'manhattan_accuracy_threshold': 4.824772834777832, 'manhattan_f1': 0.6293103448275862, 'manhattan_f1_threshold': 5.194998741149902, 'manhattan_precision': 0.6822429906542056, 'manhattan_recall': 0.584, 'manhattan_ap': 0.6386707007383838, 'euclidean_accuracy': 0.9926237623762376, 'euclidean_accuracy_threshold': 0.42276236414909363, 'euclidean_f1': 0.4925187032418952, 'euclidean_f1_threshold': 0.4501606225967407, 'euclidean_precision': 0.6539735099337748, 'euclidean_recall': 0.395, 'euclidean_ap': 0.47007558217981027, 'dot_accuracy': 0.9926237623762376, 'dot_accuracy_threshold': 0.9106361269950867, 'dot_f1': 0.4925187032418952, 'dot_f1_threshold': 0.8986777663230896, 'dot_precision': 0.6539735099337748, 'dot_recall': 0.395, 'dot_ap': 0.47007548398718707, 'max_accuracy': 0.9939207920792079, 'max_f1': 0.6293103448275862, 'max_precision': 0.6822429906542056, 'max_recall': 0.584, 'max_ap': 0.6386707007383838, 'main_score': 0.6386707007383838, 'hf_subset': 'default', 'languages': ['eng-Latn']}]} evaluation_time=1.9629368782043457 kg_co2_emissions=None"
7
- ]
 
 
 
 
 
 
 
 
mteb_results/mteb_report.txt DELETED
@@ -1,21 +0,0 @@
1
- ================================================================================
2
- MTEB Evaluation Report
3
- ================================================================================
4
-
5
- Model: gte-Qwen2-7B-instruct-M2V-Distilled
6
- Model Path: .
7
- Evaluation Time: 1235.71 seconds
8
- Total Datasets: 1
9
-
10
- Summary Statistics:
11
- Average Score: 0.0501
12
- Median Score: 0.0501
13
- Standard Deviation: 0.0000
14
- Score Range: 0.0501 - 0.0501
15
-
16
- Detailed Results:
17
- --------------------------------------------------
18
- Model Average (All) Average (MTEB) Classification Clustering PairClassification Reranking Retrieval STS Summarization PEARL WordSim
19
- gte-Qwen2-7B-instruct-M2V-Distilled nan nan nan nan nan nan 5.01 nan nan nan nan
20
-
21
- ================================================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
mteb_results/mteb_summary.json DELETED
@@ -1,20 +0,0 @@
1
- {
2
- "model_name": "gte-Qwen2-7B-instruct-M2V-Distilled",
3
- "evaluation_time_seconds": 1235.7057559490204,
4
- "task_scores": {
5
- "gte-Qwen2-7B-instruct-M2V-Distilled": {
6
- "task_means": "Classification NaN\nClustering NaN\nPairClassification NaN\nReranking NaN\nRetrieval 0.0501\nSTS NaN\nSummarization NaN\nPEARL NaN\nWordSim NaN\ndtype: float64",
7
- "dataset_scores": {
8
- "CQADupstack": 0.0501
9
- }
10
- }
11
- },
12
- "summary_stats": {
13
- "total_datasets": 1,
14
- "average_score": 0.0501,
15
- "median_score": 0.0501,
16
- "std_dev": 0.0,
17
- "min_score": 0.0501,
18
- "max_score": 0.0501
19
- }
20
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pyproject.toml CHANGED
@@ -12,6 +12,7 @@ dependencies = [
12
  "matplotlib>=3.10.3",
13
  "model2vec[train]>=0.5.0",
14
  "mteb>=1.14.15",
 
15
  "psutil>=7.0.0",
16
  "scikit-learn>=1.6.1",
17
  "sentence-transformers>=4.1.0",
 
12
  "matplotlib>=3.10.3",
13
  "model2vec[train]>=0.5.0",
14
  "mteb>=1.14.15",
15
+ "numpy>=1.26.4",
16
  "psutil>=7.0.0",
17
  "scikit-learn>=1.6.1",
18
  "sentence-transformers>=4.1.0",
uv.lock CHANGED
@@ -498,6 +498,7 @@ dependencies = [
498
  { name = "matplotlib" },
499
  { name = "model2vec", extra = ["train"] },
500
  { name = "mteb" },
 
501
  { name = "psutil" },
502
  { name = "scikit-learn" },
503
  { name = "sentence-transformers" },
@@ -519,6 +520,7 @@ requires-dist = [
519
  { name = "matplotlib", specifier = ">=3.10.3" },
520
  { name = "model2vec", extras = ["train"], specifier = ">=0.5.0" },
521
  { name = "mteb", specifier = ">=1.14.15" },
 
522
  { name = "psutil", specifier = ">=7.0.0" },
523
  { name = "scikit-learn", specifier = ">=1.6.1" },
524
  { name = "sentence-transformers", specifier = ">=4.1.0" },
 
498
  { name = "matplotlib" },
499
  { name = "model2vec", extra = ["train"] },
500
  { name = "mteb" },
501
+ { name = "numpy" },
502
  { name = "psutil" },
503
  { name = "scikit-learn" },
504
  { name = "sentence-transformers" },
 
520
  { name = "matplotlib", specifier = ">=3.10.3" },
521
  { name = "model2vec", extras = ["train"], specifier = ">=0.5.0" },
522
  { name = "mteb", specifier = ">=1.14.15" },
523
+ { name = "numpy", specifier = ">=1.26.4" },
524
  { name = "psutil", specifier = ">=7.0.0" },
525
  { name = "scikit-learn", specifier = ">=1.6.1" },
526
  { name = "sentence-transformers", specifier = ">=4.1.0" },