Sarthak
committed on
Commit
·
eb5363b
1
Parent(s):
b82c1c9
feat: added MTEB evaluation scripts
Browse files- MTEB_evaluate.py +268 -275
- README.md +50 -0
- analyze_mteb_results.py +311 -0
- mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/AmazonCounterfactualClassification.json +1 -1
- mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/AmazonReviewsClassification.json +73 -0
- mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/AppsRetrieval.json +159 -0
- mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/ArguAna.json +158 -0
- mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/AskUbuntuDupQuestions.json +26 -0
- mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/BIOSSES.json +26 -0
- mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/Banking77Classification.json +0 -73
- mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/BiorxivClusteringS2S.json +32 -0
- mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/COIRCodeSearchNetRetrieval.json +8 -0
- mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/CQADupstackProgrammersRetrieval.json +0 -158
- mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/CodeFeedbackMT.json +158 -0
- mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/CodeSearchNetCCRetrieval.json +8 -0
- mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/CodeTransOceanContest.json +159 -0
- mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/CodeTransOceanDL.json +158 -0
- mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/CosQA.json +159 -0
- mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/STSBenchmark.json +0 -26
- mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/SprintDuplicateQuestions.json +0 -58
- mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/StackExchangeClustering.json +0 -47
- mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/StackOverflowQA.json +158 -0
- mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/SyntheticText2SQL.json +159 -0
- mteb_results/mteb_parsed_results.json +0 -3
- mteb_results/mteb_raw_results.json +0 -7
- mteb_results/mteb_report.txt +0 -21
- mteb_results/mteb_summary.json +0 -20
- pyproject.toml +1 -0
- uv.lock +2 -0
MTEB_evaluate.py
CHANGED
@@ -1,349 +1,342 @@
|
|
1 |
#!/usr/bin/env python
|
2 |
"""
|
3 |
-
MTEB Evaluation Script
|
4 |
|
5 |
-
This script evaluates
|
6 |
-
|
7 |
-
|
8 |
-
- Classification: Tests ability to distinguish between different categories (e.g., programming languages)
|
9 |
-
- Clustering: Tests ability to group similar code by functionality
|
10 |
-
- STS: Tests semantic similarity understanding between code snippets
|
11 |
-
- Retrieval: Tests code search and duplicate detection capabilities
|
12 |
|
13 |
Features:
|
14 |
-
-
|
15 |
-
-
|
16 |
-
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
"""
|
25 |
|
|
|
26 |
import json
|
27 |
import logging
|
|
|
28 |
import sys
|
|
|
29 |
import time
|
30 |
from pathlib import Path
|
31 |
|
32 |
-
import
|
33 |
-
from model2vec import StaticModel
|
34 |
-
from mteb import ModelMeta
|
35 |
-
|
36 |
-
from evaluation import (
|
37 |
-
CustomMTEB,
|
38 |
-
get_tasks,
|
39 |
-
make_leaderboard,
|
40 |
-
parse_mteb_results,
|
41 |
-
summarize_results,
|
42 |
-
)
|
43 |
|
44 |
# =============================================================================
|
45 |
-
# CONFIGURATION
|
46 |
# =============================================================================
|
47 |
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
#
|
53 |
-
OUTPUT_DIR = "mteb_results" # Directory to save evaluation results
|
54 |
-
|
55 |
-
EVAL_ALL_TASKS = True
|
56 |
-
|
57 |
-
# Specific tasks most relevant for code evaluation (focused selection)
|
58 |
-
CODE_SPECIFIC_TASKS = [
|
59 |
-
# Classification - Programming language/category classification
|
60 |
-
"Banking77Classification", # Fine-grained classification (77 classes)
|
61 |
-
# Clustering - Code grouping by functionality
|
62 |
-
"StackExchangeClustering.v2", # Technical Q&A clustering (most relevant)
|
63 |
-
# STS - Code similarity understanding
|
64 |
-
"STSBenchmark", # Standard semantic similarity benchmark
|
65 |
-
# Retrieval - Code search capabilities
|
66 |
-
"CQADupstackProgrammersRetrieval", # Programming Q&A retrieval
|
67 |
-
# PairClassification - Duplicate/similar code detection
|
68 |
-
"SprintDuplicateQuestions", # Duplicate question detection
|
69 |
-
]
|
70 |
-
|
71 |
-
# Evaluation settings
|
72 |
-
EVAL_SPLITS = ["test"] # Dataset splits to evaluate on
|
73 |
-
VERBOSITY = 2 # MTEB verbosity level
|
74 |
|
75 |
-
#
|
|
|
76 |
|
77 |
# Configure logging
|
78 |
-
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(
|
79 |
logger = logging.getLogger(__name__)
|
80 |
|
|
|
|
|
|
|
81 |
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
|
|
|
|
|
|
86 |
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
|
92 |
-
if result_file.exists():
|
93 |
-
completed_tasks.append(task_name)
|
94 |
-
logger.info(f"Skipping {task_name} - results already exist")
|
95 |
-
else:
|
96 |
-
remaining_tasks.append(task)
|
97 |
|
98 |
-
|
99 |
-
|
|
|
|
|
|
|
100 |
|
101 |
-
|
|
|
|
|
|
|
|
|
|
|
102 |
|
103 |
|
104 |
-
def
|
105 |
-
"""
|
106 |
-
|
107 |
-
if parsed_results_file.exists():
|
108 |
-
try:
|
109 |
-
with parsed_results_file.open("r") as f:
|
110 |
-
return json.load(f)
|
111 |
-
except (json.JSONDecodeError, OSError) as e:
|
112 |
-
logger.warning(f"Could not load existing parsed results: {e}")
|
113 |
-
return {}
|
114 |
-
|
115 |
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
|
123 |
-
|
124 |
-
logger.info("EXISTING MTEB EVALUATION RESULTS:")
|
125 |
-
logger.info("=" * 80)
|
126 |
|
127 |
-
stats = summary.get("summary_stats")
|
128 |
-
if stats:
|
129 |
-
logger.info(f"Total Datasets: {stats.get('total_datasets', 'N/A')}")
|
130 |
-
logger.info(f"Average Score: {stats.get('average_score', 0):.4f}")
|
131 |
-
logger.info(f"Median Score: {stats.get('median_score', 0):.4f}")
|
132 |
|
133 |
-
|
134 |
-
|
135 |
-
|
|
|
|
|
|
|
136 |
|
|
|
|
|
|
|
137 |
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
output_path.mkdir(parents=True, exist_ok=True)
|
142 |
|
143 |
-
|
144 |
-
model = StaticModel.from_pretrained(MODEL_PATH)
|
145 |
-
logger.info("Model loaded successfully")
|
146 |
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
151 |
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
|
161 |
-
logger.info(f"Found {len(all_tasks)} total tasks")
|
162 |
|
163 |
-
|
164 |
-
|
165 |
-
|
|
|
|
|
166 |
|
167 |
-
|
168 |
-
|
|
|
|
|
169 |
|
170 |
-
# Load and display existing results
|
171 |
-
logger.info("Loading existing results...")
|
172 |
try:
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
evaluation = CustomMTEB(tasks=tasks)
|
180 |
-
|
181 |
-
# Run the evaluation
|
182 |
-
logger.info("Starting MTEB evaluation...")
|
183 |
-
start_time = time.time()
|
184 |
|
185 |
-
|
186 |
|
187 |
-
end_time = time.time()
|
188 |
-
evaluation_time = end_time - start_time
|
189 |
-
logger.info(f"Evaluation completed in {evaluation_time:.2f} seconds")
|
190 |
|
191 |
-
|
192 |
-
|
193 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
194 |
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
logger.info("Combining with existing results...")
|
199 |
-
# Convert to dict for merging
|
200 |
-
parsed_dict = dict(parsed_results) if hasattr(parsed_results, "items") else {}
|
201 |
-
# Simple merge - existing results take precedence to avoid overwriting
|
202 |
-
for key, value in existing_results.items():
|
203 |
-
if key not in parsed_dict:
|
204 |
-
parsed_dict[key] = value
|
205 |
-
parsed_results = parsed_dict
|
206 |
|
207 |
-
|
208 |
|
209 |
-
# Save results in different formats
|
210 |
-
save_results(output_path, results, parsed_results, task_scores, evaluation_time)
|
211 |
|
212 |
-
|
213 |
-
|
214 |
-
logger.info("
|
215 |
-
|
216 |
-
logger.info(
|
217 |
-
logger.info("=" * 80)
|
218 |
|
219 |
-
|
|
|
|
|
220 |
|
|
|
|
|
221 |
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
raw_results_file = output_path / "mteb_raw_results.json"
|
228 |
-
with raw_results_file.open("w") as f:
|
229 |
-
json.dump(raw_results, f, indent=2, default=str)
|
230 |
-
logger.info(f"Raw results saved to {raw_results_file}")
|
231 |
|
232 |
-
|
233 |
-
parsed_results_file = output_path / "mteb_parsed_results.json"
|
234 |
-
with parsed_results_file.open("w") as f:
|
235 |
-
json.dump(parsed_results, f, indent=2, default=str)
|
236 |
-
logger.info(f"Parsed results saved to {parsed_results_file}")
|
237 |
|
238 |
-
#
|
239 |
-
|
|
|
240 |
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
"evaluation_time_seconds": evaluation_time,
|
245 |
-
"task_scores": task_scores,
|
246 |
-
"summary_stats": summary_stats,
|
247 |
-
}
|
248 |
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
|
|
253 |
|
254 |
-
|
255 |
-
|
256 |
-
generate_report(output_path, task_scores, summary_stats, evaluation_time)
|
257 |
-
logger.info(f"Report saved to {report_file}")
|
258 |
|
|
|
|
|
259 |
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
|
|
|
|
|
|
264 |
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
if isinstance(model_data, dict) and "dataset_scores" in model_data:
|
269 |
-
dataset_scores = model_data["dataset_scores"]
|
270 |
-
if isinstance(dataset_scores, dict):
|
271 |
-
all_scores.extend(
|
272 |
-
[
|
273 |
-
float(score)
|
274 |
-
for score in dataset_scores.values()
|
275 |
-
if isinstance(score, int | float) and str(score).lower() != "nan"
|
276 |
-
]
|
277 |
-
)
|
278 |
-
|
279 |
-
if not all_scores:
|
280 |
-
return {}
|
281 |
|
282 |
-
|
|
|
283 |
|
284 |
-
|
285 |
-
"total_datasets": len(all_scores),
|
286 |
-
"average_score": float(np.mean(all_scores)),
|
287 |
-
"median_score": float(np.median(all_scores)),
|
288 |
-
"std_dev": float(np.std(all_scores)),
|
289 |
-
"min_score": float(np.min(all_scores)),
|
290 |
-
"max_score": float(np.max(all_scores)),
|
291 |
-
}
|
292 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
293 |
|
294 |
-
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
with report_file.open("w") as f:
|
299 |
-
f.write("=" * 80 + "\n")
|
300 |
-
f.write("MTEB Evaluation Report\n")
|
301 |
-
f.write("=" * 80 + "\n\n")
|
302 |
-
f.write(f"Model: {MODEL_NAME}\n")
|
303 |
-
f.write(f"Model Path: {MODEL_PATH}\n")
|
304 |
-
f.write(f"Evaluation Time: {evaluation_time:.2f} seconds\n")
|
305 |
-
|
306 |
-
# Write summary stats
|
307 |
-
if summary_stats:
|
308 |
-
f.write(f"Total Datasets: {summary_stats['total_datasets']}\n\n")
|
309 |
-
f.write("Summary Statistics:\n")
|
310 |
-
f.write(f" Average Score: {summary_stats['average_score']:.4f}\n")
|
311 |
-
f.write(f" Median Score: {summary_stats['median_score']:.4f}\n")
|
312 |
-
f.write(f" Standard Deviation: {summary_stats['std_dev']:.4f}\n")
|
313 |
-
f.write(f" Score Range: {summary_stats['min_score']:.4f} - {summary_stats['max_score']:.4f}\n\n")
|
314 |
-
else:
|
315 |
-
f.write("Summary Statistics: No valid results found\n\n")
|
316 |
-
|
317 |
-
# Write leaderboard
|
318 |
-
f.write("Detailed Results:\n")
|
319 |
-
f.write("-" * 50 + "\n")
|
320 |
-
if task_scores:
|
321 |
-
leaderboard = make_leaderboard(task_scores) # type: ignore[arg-type]
|
322 |
-
f.write(leaderboard.to_string(index=False))
|
323 |
-
else:
|
324 |
-
f.write("No results available\n")
|
325 |
|
326 |
-
|
|
|
327 |
|
|
|
|
|
328 |
|
329 |
-
|
330 |
-
"""Main evaluation function."""
|
331 |
-
logger.info(f"Starting MTEB evaluation for {MODEL_NAME}")
|
332 |
-
logger.info(f"Model path: {MODEL_PATH}")
|
333 |
-
logger.info(f"Output directory: {OUTPUT_DIR}")
|
334 |
-
logger.info("Running focused MTEB evaluation on code-relevant tasks:")
|
335 |
-
logger.info(" - Classification: Programming language classification")
|
336 |
-
logger.info(" - Clustering: Code clustering by functionality")
|
337 |
-
logger.info(" - STS: Semantic similarity between code snippets")
|
338 |
-
logger.info(" - Retrieval: Code search and retrieval")
|
339 |
|
340 |
-
|
341 |
-
|
342 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
343 |
|
344 |
-
|
345 |
-
|
346 |
-
sys.exit(1)
|
347 |
|
348 |
|
349 |
if __name__ == "__main__":
|
|
|
1 |
#!/usr/bin/env python
|
2 |
"""
|
3 |
+
MTEB Evaluation Script with Subprocess Isolation (Code Information Retrieval Tasks).
|
4 |
|
5 |
+
This script evaluates models using MTEB with subprocess isolation to prevent
|
6 |
+
memory issues and process killing.
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
Features:
|
9 |
+
- Each task runs in a separate subprocess to isolate memory
|
10 |
+
- 30-second timeout per task (see TASK_TIMEOUT)
|
11 |
+
- No retries - if task fails or times out, move to next one
|
12 |
+
- Logs total system memory at startup and cleans up temporary runner scripts
|
13 |
+
|
14 |
+
Note: Multi-threading is NOT used here because:
|
15 |
+
1. Memory is the main bottleneck, not CPU
|
16 |
+
2. Running multiple tasks simultaneously would increase memory pressure
|
17 |
+
3. Many tasks are being killed (return code -9) due to OOM conditions
|
18 |
+
4. Sequential processing with subprocess isolation is more stable
|
19 |
"""
|
20 |
|
21 |
+
import contextlib
|
22 |
import json
|
23 |
import logging
|
24 |
+
import subprocess
|
25 |
import sys
|
26 |
+
import tempfile
|
27 |
import time
|
28 |
from pathlib import Path
|
29 |
|
30 |
+
import psutil
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
|
32 |
# =============================================================================
|
33 |
+
# CONFIGURATION
|
34 |
# =============================================================================
|
35 |
|
36 |
+
MODEL_PATH = "."
|
37 |
+
MODEL_NAME = "gte-Qwen2-7B-instruct-M2V-Distilled"
|
38 |
+
OUTPUT_DIR = "mteb_results"
|
39 |
+
TASK_TIMEOUT = 30 # 30 seconds timeout per task
|
40 |
+
MAX_RETRIES = 0 # No retries - move to next task if failed/timeout
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
|
42 |
+
# Constants
|
43 |
+
SIGKILL_RETURN_CODE = -9 # Process killed by SIGKILL (usually OOM)
|
44 |
|
45 |
# Configure logging
|
46 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
47 |
logger = logging.getLogger(__name__)
|
48 |
|
49 |
+
# =============================================================================
|
50 |
+
# SINGLE TASK RUNNER SCRIPT
|
51 |
+
# =============================================================================
|
52 |
|
53 |
+
TASK_RUNNER_SCRIPT = """
|
54 |
+
import sys
|
55 |
+
import os
|
56 |
+
import json
|
57 |
+
import tempfile
|
58 |
+
import traceback
|
59 |
+
from pathlib import Path
|
60 |
|
61 |
+
# Add current directory to path
|
62 |
+
sys.path.insert(0, ".")
|
63 |
+
|
64 |
+
try:
|
65 |
+
import mteb
|
66 |
+
from model2vec import StaticModel
|
67 |
+
from mteb import ModelMeta
|
68 |
+
from evaluation import CustomMTEB
|
69 |
+
|
70 |
+
def run_single_task():
|
71 |
+
# Get arguments
|
72 |
+
model_path = sys.argv[1]
|
73 |
+
task_name = sys.argv[2]
|
74 |
+
output_dir = sys.argv[3]
|
75 |
+
model_name = sys.argv[4]
|
76 |
+
|
77 |
+
# Load model
|
78 |
+
model = StaticModel.from_pretrained(model_path)
|
79 |
+
model.mteb_model_meta = ModelMeta(
|
80 |
+
name=model_name, revision="distilled", release_date=None, languages=["eng"]
|
81 |
+
)
|
82 |
+
|
83 |
+
# Get and run task
|
84 |
+
task = mteb.get_task(task_name, languages=["eng"])
|
85 |
+
evaluation = CustomMTEB(tasks=[task])
|
86 |
+
|
87 |
+
results = evaluation.run(
|
88 |
+
model,
|
89 |
+
eval_splits=["test"],
|
90 |
+
output_folder=output_dir,
|
91 |
+
verbosity=0
|
92 |
+
)
|
93 |
+
|
94 |
+
# Save results to temp file for parent process
|
95 |
+
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as f:
|
96 |
+
json.dump({
|
97 |
+
"success": True,
|
98 |
+
"task_name": task_name,
|
99 |
+
"results": results
|
100 |
+
}, f)
|
101 |
+
temp_file = f.name
|
102 |
+
|
103 |
+
print(f"RESULT_FILE:{temp_file}")
|
104 |
+
return 0
|
105 |
+
|
106 |
+
if __name__ == "__main__":
|
107 |
+
exit(run_single_task())
|
108 |
+
|
109 |
+
except Exception as e:
|
110 |
+
print(f"ERROR: {str(e)}")
|
111 |
+
print(f"TRACEBACK: {traceback.format_exc()}")
|
112 |
+
exit(1)
|
113 |
+
"""
|
114 |
|
|
|
|
|
|
|
|
|
|
|
115 |
|
116 |
+
def get_available_tasks() -> list[str]:
    """Return the string form of every task in the CoIR benchmark.

    ``mteb`` is imported lazily inside the ``try`` so that an import failure
    is handled like any other lookup failure: the exception is logged and an
    empty list is returned.
    """
    task_names: list[str] = []
    try:
        import mteb
        import mteb.benchmarks

        # CoIR is the benchmark object this script evaluates against.
        coir_benchmark = mteb.benchmarks.CoIR
        for task in coir_benchmark.tasks:
            task_names.append(str(task))  # every task, no filtering
    except Exception:
        logger.exception("Failed to get tasks")
        return []
    return task_names
|
128 |
|
129 |
|
130 |
+
def check_existing_results(output_path: Path, task_names: list[str]) -> list[str]:
    """Return the tasks from ``task_names`` that have no result file yet.

    A task counts as done when
    ``<output_path>/<MODEL_NAME>/distilled/<task_name>.json`` exists; each
    such task is logged and left out of the returned list. The remaining
    tasks keep the order they had in ``task_names``.
    """
    results_dir = output_path / MODEL_NAME / "distilled"
    pending: list[str] = []

    for task_name in task_names:
        if not (results_dir / f"{task_name}.json").exists():
            pending.append(task_name)
            continue
        logger.info(f"Skipping {task_name} - results already exist")

    return pending
|
|
|
|
|
142 |
|
|
|
|
|
|
|
|
|
|
|
143 |
|
144 |
+
def run_task_subprocess(task_name: str, output_dir: str) -> tuple[bool, str, float]:
    """Run a single task in an isolated subprocess with a wall-clock timeout.

    ``TASK_RUNNER_SCRIPT`` is written to a temporary ``.py`` file and executed
    with the current interpreter. Only a time limit (``TASK_TIMEOUT``) is
    enforced here; no memory limit is applied. Isolation means the child can
    be killed without taking this process down.

    Args:
        task_name: Task name passed to the runner script.
        output_dir: Folder passed to the runner script as its output folder.

    Returns:
        ``(success, task_name, duration)``. ``duration`` is the measured
        wall-clock time in seconds, except that it is exactly
        ``TASK_TIMEOUT`` when the task timed out and ``0.0`` when running
        the subprocess raised an unexpected exception.
    """
    # delete=False: the file must outlive this ``with`` so the child can read
    # it; it is removed in the ``finally`` below.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
        f.write(TASK_RUNNER_SCRIPT)
        script_path = f.name

    try:
        logger.info(f"Running task: {task_name}")
        start_time = time.time()

        # subprocess security: We control all inputs (script path and known arguments)
        cmd = [sys.executable, script_path, MODEL_PATH, task_name, output_dir, MODEL_NAME]
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)  # noqa: S603

        try:
            stdout, stderr = process.communicate(timeout=TASK_TIMEOUT)
        except subprocess.TimeoutExpired:
            logger.warning(f"⏱ Task {task_name} timed out after {TASK_TIMEOUT}s")
            process.kill()
            # Finish communication after the kill so both pipes are drained
            # and closed and the child is reaped.
            process.communicate()
            return False, task_name, TASK_TIMEOUT

        duration = time.time() - start_time

        if process.returncode == 0:
            for line in stdout.splitlines():
                if not line.startswith("RESULT_FILE:"):
                    continue
                result_file = Path(line.split(":", 1)[1])
                try:
                    with result_file.open() as rf:
                        json.load(rf)  # only verifies the hand-off file is readable JSON
                except (json.JSONDecodeError, OSError):
                    # A bad hand-off file does not fail the task; it is only logged.
                    logger.exception("Failed to read result file")
                finally:
                    # Always remove the hand-off file, even when it was unreadable.
                    with contextlib.suppress(OSError):
                        result_file.unlink()
                break

            logger.info(f"✓ Completed {task_name} in {duration:.2f}s")
            return True, task_name, duration

        if process.returncode == SIGKILL_RETURN_CODE:
            logger.error(f"✗ Task {task_name} killed (OOM) - return code {process.returncode}")
        else:
            logger.error(f"✗ Task {task_name} failed with return code {process.returncode}")
        if stderr:
            logger.error(f"Error output: {stderr}")
        if stdout:
            # The runner script reports caught exceptions on stdout
            # ("ERROR:" / "TRACEBACK:" lines), so surface that stream too.
            logger.error(f"Standard output: {stdout}")
        return False, task_name, duration

    except Exception:
        logger.exception(f"✗ Failed to run task {task_name}")
        return False, task_name, 0.0

    finally:
        # Clean up script file
        with contextlib.suppress(Exception):
            Path(script_path).unlink()
|
203 |
|
|
|
204 |
|
205 |
+
def collect_results(output_path: Path) -> dict:
    """Load every per-task result JSON saved for the model.

    Reads ``<output_path>/<MODEL_NAME>/distilled/*.json`` (skipping
    ``model_meta.json``) and returns the parsed contents keyed by file stem.
    Files that cannot be opened or parsed are logged as warnings and left
    out; a missing results directory yields an empty dict.
    """
    results_dir = output_path / MODEL_NAME / "distilled"
    collected: dict = {}
    if not results_dir.exists():
        return collected

    candidates = (p for p in results_dir.glob("*.json") if p.name != "model_meta.json")
    for path in candidates:
        try:
            with path.open() as handle:
                collected[path.stem] = json.load(handle)
        except (json.JSONDecodeError, OSError) as e:
            logger.warning(f"Could not load {path}: {e}")

    return collected
|
225 |
|
|
|
|
|
|
|
226 |
|
227 |
+
def save_summary(output_path: Path, results: dict, stats: dict) -> None:
    """Write ``mteb_summary.json`` into ``output_path``.

    The file bundles the model name, the current timestamp, the configured
    task timeout, the run statistics and the per-task results. Values that
    ``json`` cannot serialize natively are written via ``str``.
    """
    summary_file = output_path / "mteb_summary.json"
    payload = {
        "model_name": MODEL_NAME,
        "timestamp": time.time(),
        "task_timeout": TASK_TIMEOUT,
        "stats": stats,
        "task_results": results,
    }

    with summary_file.open("w") as out:
        json.dump(payload, out, indent=2, default=str)

    logger.info(f"Summary saved to {summary_file}")
|
242 |
|
|
|
|
|
243 |
|
244 |
+
def main() -> None:
    """Evaluate every pending task sequentially and write a run summary.

    Steps: log the configuration and total system memory, make sure the
    output directory exists, list the available tasks, drop those that
    already have results, run the rest one at a time in subprocesses, then
    log and save the outcome. Returns early (without writing a summary) when
    no tasks are found or nothing is left to run.
    """
    logger.info(f"Starting MTEB evaluation for {MODEL_NAME}")
    logger.info(f"Task timeout: {TASK_TIMEOUT}s (no retries)")
    logger.info("Memory isolation: Each task runs in separate subprocess")

    # System info
    total_memory_gb = psutil.virtual_memory().total / (1024**3)
    logger.info(f"System memory: {total_memory_gb:.1f} GB total")

    output_path = Path(OUTPUT_DIR)
    output_path.mkdir(parents=True, exist_ok=True)

    all_tasks = get_available_tasks()
    if not all_tasks:
        logger.error("No tasks found!")
        return
    logger.info(f"Found {len(all_tasks)} tasks")

    remaining_tasks = check_existing_results(output_path, all_tasks)
    task_count = len(remaining_tasks)
    logger.info(f"Will evaluate {task_count} remaining tasks")
    if not remaining_tasks:
        logger.info("All tasks already completed!")
        return

    successful_tasks = []
    failed_tasks = []
    timed_out_tasks = []
    start_time = time.time()

    # One attempt per task, strictly one after another.
    for position, task_name in enumerate(remaining_tasks, start=1):
        logger.info(f"[{position}/{task_count}] Processing: {task_name}")

        success, name, duration = run_task_subprocess(task_name, str(output_path))

        if success:
            successful_tasks.append((name, duration))
        else:
            # A timeout is signalled by duration being exactly TASK_TIMEOUT;
            # OOM kills are already logged inside run_task_subprocess.
            bucket = timed_out_tasks if duration == TASK_TIMEOUT else failed_tasks
            bucket.append(name)

        progress = (position / task_count) * 100
        logger.info(f"Progress: {position}/{task_count} ({progress:.1f}%)")

        # Brief pause between tasks
        time.sleep(1)

    total_time = time.time() - start_time

    separator = "=" * 80
    logger.info(separator)
    logger.info("EVALUATION SUMMARY")
    logger.info(separator)
    logger.info(f"Total tasks: {task_count}")
    logger.info(f"Successful: {len(successful_tasks)}")
    logger.info(f"Failed: {len(failed_tasks)}")
    logger.info(f"Timed out: {len(timed_out_tasks)}")
    logger.info(f"Total time: {total_time:.2f}s")

    avg_time = 0  # stays 0 when nothing succeeded
    if successful_tasks:
        avg_time = sum(duration for _, duration in successful_tasks) / len(successful_tasks)
        logger.info(f"Average successful task time: {avg_time:.2f}s")

    if failed_tasks:
        logger.warning(f"Failed tasks: {failed_tasks}")

    if timed_out_tasks:
        logger.warning(f"Timed out tasks: {timed_out_tasks}")

    logger.info(separator)

    # Gather everything on disk (including earlier runs) and persist the summary.
    all_results = collect_results(output_path)
    stats = {
        "total_tasks": task_count,
        "successful": len(successful_tasks),
        "failed": len(failed_tasks),
        "timed_out": len(timed_out_tasks),
        "total_time": total_time,
        "avg_time": avg_time,
        "successful_task_details": successful_tasks,
        "failed_tasks": failed_tasks,
        "timed_out_tasks": timed_out_tasks,
    }

    save_summary(output_path, all_results, stats)
    logger.info("Evaluation completed!")
|
|
|
340 |
|
341 |
|
342 |
if __name__ == "__main__":
|
README.md
CHANGED
@@ -134,6 +134,56 @@ Detailed evaluation results, including similarity plots and performance metrics,
|
|
134 |
- `trained_code_classifier/` - Directory containing trained classification model
|
135 |
- `mteb_results/` - Directory containing MTEB evaluation results
|
136 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
137 |
## Acknowledgments
|
138 |
|
139 |
This project is built upon the following technologies:
|
|
|
134 |
- `trained_code_classifier/` - Directory containing trained classification model
|
135 |
- `mteb_results/` - Directory containing MTEB evaluation results
|
136 |
|
137 |
+
## MTEB Benchmark Results (Partial)
|
138 |
+
|
139 |
+
**Overall Average Score: 0.1962**
|
140 |
+
|
141 |
+
| Category | Task | Score |
|
142 |
+
|----------|------|-------|
|
143 |
+
| **Classification** | **Average** | **0.4164** |
|
144 |
+
| | AmazonCounterfactualClassification | 0.5690 |
|
145 |
+
| | AmazonReviewsClassification | 0.2637 |
|
146 |
+
| | | |
|
147 |
+
| **Clustering** | **Average** | **0.0775** |
|
148 |
+
| | BiorxivClusteringS2S | 0.0775 |
|
149 |
+
| | | |
|
150 |
+
| **Reranking** | **Average** | **0.4643** |
|
151 |
+
| | AskUbuntuDupQuestions | 0.4643 |
|
152 |
+
| | | |
|
153 |
+
| **Retrieval** | **Average** | **0.1509** |
|
154 |
+
| | ArguAna | 0.1509 |
|
155 |
+
| | | |
|
156 |
+
| **CodeRetrieval** | **Average** | **0.1034** |
|
157 |
+
| | AppsRetrieval | 0.0008 |
|
158 |
+
| | COIRCodeSearchNetRetrieval | Failed |
|
159 |
+
| | CodeFeedbackMT | 0.1594 |
|
160 |
+
| | CodeSearchNetCCRetrieval | Failed |
|
161 |
+
| | CodeTransOceanContest | 0.0951 |
|
162 |
+
| | CodeTransOceanDL | 0.2780 |
|
163 |
+
| | CosQA | 0.0097 |
|
164 |
+
| | StackOverflowQA | 0.1762 |
|
165 |
+
| | SyntheticText2SQL | 0.0049 |
|
166 |
+
| | | |
|
167 |
+
| **STS** | **Average** | **0.3016** |
|
168 |
+
| | BIOSSES | 0.3016 |
|
169 |
+
| | | |
|
170 |
+
|
171 |
+
### Summary Statistics
|
172 |
+
|
173 |
+
- **Total Tasks**: 15
|
174 |
+
- **Successful Tasks**: 13
|
175 |
+
- **Failed Tasks**: 2
|
176 |
+
- **Overall Average**: 0.1962
|
177 |
+
|
178 |
+
### Category Averages
|
179 |
+
|
180 |
+
- **Classification**: 0.4164 (2 tasks)
|
181 |
+
- **Clustering**: 0.0775 (1 task)
|
182 |
+
- **Reranking**: 0.4643 (1 task)
|
183 |
+
- **Retrieval**: 0.1509 (1 task)
|
184 |
+
- **CodeRetrieval**: 0.1034 (7 tasks)
|
185 |
+
- **STS**: 0.3016 (1 task)
|
186 |
+
|
187 |
## Acknowledgments
|
188 |
|
189 |
This project is built upon the following technologies:
|
analyze_mteb_results.py
ADDED
@@ -0,0 +1,311 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
"""
|
3 |
+
MTEB Results Analysis Script.
|
4 |
+
|
5 |
+
This script analyzes MTEB benchmark results from the results directory,
|
6 |
+
categorizes tasks, calculates averages, and updates the README.md with
|
7 |
+
a comprehensive results table.
|
8 |
+
"""
|
9 |
+
|
10 |
+
import json
|
11 |
+
import re
|
12 |
+
from pathlib import Path
|
13 |
+
|
14 |
+
# Task category mappings based on MTEB benchmark structure
|
15 |
+
TASK_CATEGORIES = {
|
16 |
+
# Classification tasks
|
17 |
+
"AmazonCounterfactualClassification": "Classification",
|
18 |
+
"AmazonReviewsClassification": "Classification",
|
19 |
+
"Banking77Classification": "Classification",
|
20 |
+
"EmotionClassification": "Classification",
|
21 |
+
"ImdbClassification": "Classification",
|
22 |
+
"MassiveIntentClassification": "Classification",
|
23 |
+
"MassiveScenarioClassification": "Classification",
|
24 |
+
"MTOPDomainClassification": "Classification",
|
25 |
+
"MTOPIntentClassification": "Classification",
|
26 |
+
"ToxicConversationsClassification": "Classification",
|
27 |
+
"TweetSentimentExtractionClassification": "Classification",
|
28 |
+
# Clustering tasks
|
29 |
+
"ArxivClusteringP2P": "Clustering",
|
30 |
+
"ArxivClusteringS2S": "Clustering",
|
31 |
+
"BiorxivClusteringP2P": "Clustering",
|
32 |
+
"BiorxivClusteringS2S": "Clustering",
|
33 |
+
"MedrxivClusteringP2P": "Clustering",
|
34 |
+
"MedrxivClusteringS2S": "Clustering",
|
35 |
+
"RedditClustering": "Clustering",
|
36 |
+
"RedditClusteringP2P": "Clustering",
|
37 |
+
"StackExchangeClustering": "Clustering",
|
38 |
+
"StackExchangeClusteringP2P": "Clustering",
|
39 |
+
"TwentyNewsgroupsClustering": "Clustering",
|
40 |
+
# Pair Classification tasks
|
41 |
+
"SprintDuplicateQuestions": "PairClassification",
|
42 |
+
"TwitterSemEval2015": "PairClassification",
|
43 |
+
"TwitterURLCorpus": "PairClassification",
|
44 |
+
# Reranking tasks
|
45 |
+
"AskUbuntuDupQuestions": "Reranking",
|
46 |
+
"MindSmallReranking": "Reranking",
|
47 |
+
"SciDocsRR": "Reranking",
|
48 |
+
"StackOverflowDupQuestions": "Reranking",
|
49 |
+
# Retrieval tasks
|
50 |
+
"ArguAna": "Retrieval",
|
51 |
+
"ClimateFEVER": "Retrieval",
|
52 |
+
"CQADupstackRetrieval": "Retrieval",
|
53 |
+
"DBPedia": "Retrieval",
|
54 |
+
"FEVER": "Retrieval",
|
55 |
+
"FiQA2018": "Retrieval",
|
56 |
+
"HotpotQA": "Retrieval",
|
57 |
+
"MSMARCO": "Retrieval",
|
58 |
+
"NFCorpus": "Retrieval",
|
59 |
+
"NQ": "Retrieval",
|
60 |
+
"QuoraRetrieval": "Retrieval",
|
61 |
+
"SCIDOCS": "Retrieval",
|
62 |
+
"SciFact": "Retrieval",
|
63 |
+
"Touche2020": "Retrieval",
|
64 |
+
"TRECCOVID": "Retrieval",
|
65 |
+
# Code retrieval tasks
|
66 |
+
"CodeSearchNetCCRetrieval": "CodeRetrieval",
|
67 |
+
"COIRCodeSearchNetRetrieval": "CodeRetrieval",
|
68 |
+
"StackOverflowQA": "CodeRetrieval",
|
69 |
+
"AppsRetrieval": "CodeRetrieval",
|
70 |
+
"CodeTransOceanContest": "CodeRetrieval",
|
71 |
+
"CodeTransOceanDL": "CodeRetrieval",
|
72 |
+
"CodeFeedbackMT": "CodeRetrieval",
|
73 |
+
"SyntheticText2SQL": "CodeRetrieval",
|
74 |
+
"CosQA": "CodeRetrieval",
|
75 |
+
# STS (Semantic Textual Similarity) tasks
|
76 |
+
"BIOSSES": "STS",
|
77 |
+
"SICK-R": "STS",
|
78 |
+
"STS12": "STS",
|
79 |
+
"STS13": "STS",
|
80 |
+
"STS14": "STS",
|
81 |
+
"STS15": "STS",
|
82 |
+
"STS16": "STS",
|
83 |
+
"STS17": "STS",
|
84 |
+
"STS22": "STS",
|
85 |
+
"STSBenchmark": "STS",
|
86 |
+
"SummEval": "STS",
|
87 |
+
}
|
88 |
+
|
89 |
+
|
90 |
+
def load_mteb_results(results_dir: Path) -> dict[str, dict]:
    """Load every MTEB task result stored as a JSON file in ``results_dir``.

    Only ``*.json`` files directly inside ``results_dir`` are read, and
    ``model_meta.json`` is ignored. Each result is keyed by the ``"task_name"``
    field of its payload, falling back to the file stem when that field is
    absent.

    Loading is best-effort: a file that is not valid JSON, is not valid UTF-8,
    or whose top-level value is not a JSON object is skipped with a logged
    warning instead of aborting the whole run.

    Args:
        results_dir: Directory containing one JSON file per evaluated task.

    Returns:
        Mapping of task name to the parsed JSON object for that task.
    """
    # Function-scope import so the block does not rely on a module-level
    # ``logging`` import being present in the file.
    import logging

    logger = logging.getLogger(__name__)
    results: dict[str, dict] = {}

    # Sorted so that, if two files declare the same task name, the same one
    # wins on every run (glob order is otherwise filesystem dependent).
    for json_file in sorted(results_dir.glob("*.json")):
        if json_file.name == "model_meta.json":
            continue

        try:
            with json_file.open(encoding="utf-8") as f:
                data = json.load(f)
        except ValueError as exc:
            # json.JSONDecodeError and UnicodeDecodeError both derive from ValueError.
            logger.warning("Skipping unparsable MTEB result %s: %s", json_file, exc)
            continue

        if not isinstance(data, dict):
            # A list/number/string payload has no ``.get`` and no scores to read.
            logger.warning("Skipping MTEB result %s: top-level JSON value is not an object", json_file)
            continue

        results[data.get("task_name", json_file.stem)] = data

    return results
|
107 |
+
|
108 |
+
|
109 |
+
def extract_main_score(result_data: dict) -> float:
    """Return the main score of a task result, or ``0.0`` when unavailable.

    The score is read from ``result_data["scores"]["test"][0]["main_score"]``,
    i.e. the first entry of the ``test`` split. Callers treat ``0.0`` as
    "task failed", so every kind of missing or unusable score maps to it:
    an empty ``scores`` object, a missing ``test`` split, an empty split,
    a missing ``main_score`` key, or a ``main_score`` that is null or
    otherwise not convertible to a number.

    Args:
        result_data: Parsed JSON object of a single task result.

    Returns:
        The main score as a ``float``, or ``0.0`` if it cannot be extracted.
    """
    try:
        # float() guarantees the annotated return type; a null score raises
        # TypeError and a non-numeric string raises ValueError.
        return float(result_data["scores"]["test"][0]["main_score"])
    except (KeyError, IndexError, TypeError, ValueError):
        return 0.0
|
116 |
+
|
117 |
+
|
118 |
+
def categorize_tasks(results: dict[str, dict]) -> dict[str, list[tuple[str, float]]]:
    """Group task results by category and attach each task's main score.

    The category comes from ``TASK_CATEGORIES`` when the task is listed there.
    Otherwise it is inferred from substrings of the task name, falling back to
    ``"Other"``. The inference rules are evaluated in order and the first match
    wins, so more specific markers are placed before the generic ones they
    would otherwise be shadowed by:

    * ``PairClassification`` before ``Classification``;
    * ``Code``/``SQL`` before ``Retrieval``/``QA`` (code-retrieval task names
      usually contain ``Retrieval`` or ``QA`` as well).

    Args:
        results: Mapping of task name to parsed result JSON.

    Returns:
        Mapping of category name to a list of ``(task_name, score)`` tuples,
        each list sorted by task name. Only categories with at least one task
        are present.
    """
    # (substrings, category) pairs; a rule matches if any substring occurs in the name.
    inference_rules = (
        (("PairClassification",), "PairClassification"),
        (("Classification",), "Classification"),
        (("Clustering",), "Clustering"),
        (("Reranking",), "Reranking"),
        (("Code", "SQL"), "CodeRetrieval"),
        (("Retrieval", "QA"), "Retrieval"),
        (("STS", "SICK", "BIOSSES"), "STS"),
    )

    categories: dict[str, list[tuple[str, float]]] = {}

    for task_name, result_data in results.items():
        category = TASK_CATEGORIES.get(task_name)

        if not category:
            category = next(
                (
                    label
                    for markers, label in inference_rules
                    if any(marker in task_name for marker in markers)
                ),
                "Other",
            )

        score = extract_main_score(result_data)
        categories.setdefault(category, []).append((task_name, score))

    # Sort tasks within each category for stable, readable output.
    for category_tasks in categories.values():
        category_tasks.sort(key=lambda item: item[0])

    return categories
|
152 |
+
|
153 |
+
|
154 |
+
def calculate_averages(categories: dict[str, list[tuple[str, float]]]) -> dict[str, float]:
    """Compute the mean score of the successful tasks in every category.

    A task counts as successful only when its score is strictly positive;
    a score of ``0`` marks a failed task and is left out of the mean. A
    category with no successful task gets an average of ``0.0``.

    Args:
        categories: Mapping of category name to ``(task_name, score)`` tuples.

    Returns:
        Mapping of category name to its average score, in the same key order
        as ``categories``.
    """

    def _mean_of_successes(entries: list[tuple[str, float]]) -> float:
        passed = [value for _, value in entries if value > 0]
        return sum(passed) / len(passed) if passed else 0.0

    return {name: _mean_of_successes(entries) for name, entries in categories.items()}
|
166 |
+
|
167 |
+
|
168 |
+
def generate_results_table(categories: dict[str, list[tuple[str, float]]], averages: dict[str, float]) -> str:
    """Render the benchmark results as a markdown section.

    The section consists of a heading with the overall average, a
    ``Category | Task | Score`` table (one bold average row per category
    followed by its tasks), summary statistics, and a list of category
    averages. Tasks with a score that is not strictly positive are shown as
    ``Failed`` and are excluded from every average and from the
    "successful" counts.

    Categories are emitted in a fixed preferred order. Any category that is
    not part of that order is appended afterwards in alphabetical order, so
    that every task counted in the totals also appears in the table.

    Args:
        categories: Mapping of category name to ``(task_name, score)`` tuples.
        averages: Mapping of category name to its average score.

    Returns:
        The markdown text, lines joined with ``"\\n"`` and no trailing newline.

    Raises:
        KeyError: If a non-empty category in ``categories`` has no entry in
            ``averages``.
    """
    # Scores of successful tasks only; a score of 0 marks a failed task.
    all_scores = [score for tasks in categories.values() for _, score in tasks if score > 0]
    overall_avg = sum(all_scores) / len(all_scores) if all_scores else 0.0
    total_tasks = sum(len(tasks) for tasks in categories.values())

    table_lines = [
        "## MTEB Benchmark Results",
        "",
        f"**Overall Average Score: {overall_avg:.4f}**",
        "",
        "| Category | Task | Score |",
        "|----------|------|-------|",
    ]

    # Preferred ordering for the known categories.
    category_order = [
        "Classification",
        "Clustering",
        "PairClassification",
        "Reranking",
        "Retrieval",
        "CodeRetrieval",
        "STS",
        "Other",
    ]
    # Unknown categories still contribute to the totals above, so they must be
    # rendered too; append them after the known ones in a deterministic order.
    ordered_categories = [name for name in category_order if name in categories]
    ordered_categories += sorted(name for name in categories if name not in category_order)

    for category in ordered_categories:
        tasks = categories[category]
        if not tasks:
            continue

        # Category average row
        avg_score = averages[category]
        table_lines.append(f"| **{category}** | **Average** | **{avg_score:.4f}** |")

        # Individual tasks
        for task_name, score in tasks:
            if score > 0:  # Only successful tasks get a numeric score
                table_lines.append(f"| | {task_name} | {score:.4f} |")
            else:
                table_lines.append(f"| | {task_name} | Failed |")

        table_lines.append("| | | |")  # Empty row for spacing

    table_lines.extend(
        [
            "",
            "### Summary Statistics",
            "",
            f"- **Total Tasks**: {total_tasks}",
            f"- **Successful Tasks**: {len(all_scores)}",
            f"- **Failed Tasks**: {total_tasks - len(all_scores)}",
            f"- **Overall Average**: {overall_avg:.4f}",
            "",
            "### Category Averages",
            "",
        ]
    )

    for category in ordered_categories:
        if category in averages and categories[category]:
            avg = averages[category]
            task_count = len([s for _, s in categories[category] if s > 0])
            table_lines.append(f"- **{category}**: {avg:.4f} ({task_count} tasks)")

    return "\n".join(table_lines)
|
243 |
+
|
244 |
+
|
245 |
+
def update_readme(results_table: str, readme_path: Path = Path("README.md")) -> None:
    """Insert or refresh the MTEB results section of a README file.

    Placement rules, in order:

    1. If the README already has a ``## MTEB Benchmark Results`` heading, that
       section (up to the next ``#``/``##`` heading or the end of the file) is
       replaced.
    2. Otherwise the section is inserted before the first
       ``## Acknowledgments`` heading, or failing that before the first
       ``## License`` heading.
    3. Otherwise it is appended to the end of the file.

    Headings are only recognised at the start of a line, so deeper headings
    such as ``### License`` are never mistaken for an insertion point. Running
    the function twice with the same table leaves the file unchanged. If
    ``readme_path`` does not exist, nothing is written.

    Args:
        results_table: Markdown produced for the results section; expected to
            start with the ``## MTEB Benchmark Results`` heading.
        readme_path: README file to update in place.
    """
    if not readme_path.exists():
        return

    with readme_path.open(encoding="utf-8") as f:
        content = f.read()

    # Exactly one trailing newline, so the section is followed by a blank line
    # when spliced in front of the "\n## ..." that precedes the next heading.
    section = results_table.rstrip("\n") + "\n"

    section_pattern = re.compile(
        r"^## MTEB Benchmark Results.*?(?=\n## |\n# |\Z)",
        re.DOTALL | re.MULTILINE,
    )

    if section_pattern.search(content):
        # A callable replacement inserts the text verbatim; a plain string
        # would have its backslashes interpreted as regex escapes/group refs.
        new_content = section_pattern.sub(lambda _match: section, content)
    else:
        for heading in ("## Acknowledgments", "## License"):
            anchor = re.search("^" + re.escape(heading), content, re.MULTILINE)
            if anchor:
                start = anchor.start()
                new_content = f"{content[:start]}{section}\n{content[start:]}"
                break
        else:
            # No suitable heading: append at the end, separated by one blank line.
            body = content.rstrip("\n")
            new_content = f"{body}\n\n{section}" if body else section

    with readme_path.open("w", encoding="utf-8") as f:
        f.write(new_content)
|
272 |
+
|
273 |
+
|
274 |
+
def main() -> None:
    """Analyze the stored MTEB results and write them into the README.

    Loads every task result from the hard-coded results directory, groups the
    tasks by category, computes per-category averages, renders the markdown
    section and updates ``README.md`` in the current working directory.

    The function returns without doing anything when the results directory
    does not exist or contains no loadable results. The README is the only
    output; nothing is printed to the console.
    """
    results_dir = Path("mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled")

    if not results_dir.exists():
        return

    results = load_mteb_results(results_dir)

    if not results:
        return

    categories = categorize_tasks(results)
    averages = calculate_averages(categories)
    results_table = generate_results_table(categories, averages)

    update_readme(results_table)


if __name__ == "__main__":
    main()
|
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/AmazonCounterfactualClassification.json
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
{
|
2 |
"dataset_revision": "e8379541af4e31359cca9fbcf4b00f2671dba205",
|
3 |
-
"evaluation_time":
|
4 |
"kg_co2_emissions": null,
|
5 |
"mteb_version": "1.14.15",
|
6 |
"scores": {
|
|
|
1 |
{
|
2 |
"dataset_revision": "e8379541af4e31359cca9fbcf4b00f2671dba205",
|
3 |
+
"evaluation_time": 7.698482990264893,
|
4 |
"kg_co2_emissions": null,
|
5 |
"mteb_version": "1.14.15",
|
6 |
"scores": {
|
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/AmazonReviewsClassification.json
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset_revision": "1399c76144fd37290681b995c656ef9b2e06e26d",
|
3 |
+
"evaluation_time": 5.071816444396973,
|
4 |
+
"kg_co2_emissions": null,
|
5 |
+
"mteb_version": "1.14.15",
|
6 |
+
"scores": {
|
7 |
+
"test": [
|
8 |
+
{
|
9 |
+
"accuracy": 0.26374,
|
10 |
+
"f1": 0.25472288926645315,
|
11 |
+
"f1_weighted": 0.25472288926645315,
|
12 |
+
"hf_subset": "en",
|
13 |
+
"languages": [
|
14 |
+
"eng-Latn"
|
15 |
+
],
|
16 |
+
"main_score": 0.26374,
|
17 |
+
"scores_per_experiment": [
|
18 |
+
{
|
19 |
+
"accuracy": 0.29,
|
20 |
+
"f1": 0.2830487996396496,
|
21 |
+
"f1_weighted": 0.2830487996396495
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"accuracy": 0.276,
|
25 |
+
"f1": 0.26621916451801775,
|
26 |
+
"f1_weighted": 0.2662191645180177
|
27 |
+
},
|
28 |
+
{
|
29 |
+
"accuracy": 0.2682,
|
30 |
+
"f1": 0.24934092172665734,
|
31 |
+
"f1_weighted": 0.24934092172665728
|
32 |
+
},
|
33 |
+
{
|
34 |
+
"accuracy": 0.297,
|
35 |
+
"f1": 0.29141160920496506,
|
36 |
+
"f1_weighted": 0.29141160920496506
|
37 |
+
},
|
38 |
+
{
|
39 |
+
"accuracy": 0.268,
|
40 |
+
"f1": 0.2528895121087961,
|
41 |
+
"f1_weighted": 0.2528895121087961
|
42 |
+
},
|
43 |
+
{
|
44 |
+
"accuracy": 0.2548,
|
45 |
+
"f1": 0.25158219767608686,
|
46 |
+
"f1_weighted": 0.2515821976760869
|
47 |
+
},
|
48 |
+
{
|
49 |
+
"accuracy": 0.2192,
|
50 |
+
"f1": 0.21535453372408658,
|
51 |
+
"f1_weighted": 0.21535453372408656
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"accuracy": 0.264,
|
55 |
+
"f1": 0.2493331111938578,
|
56 |
+
"f1_weighted": 0.24933311119385781
|
57 |
+
},
|
58 |
+
{
|
59 |
+
"accuracy": 0.2694,
|
60 |
+
"f1": 0.2569449221084947,
|
61 |
+
"f1_weighted": 0.2569449221084947
|
62 |
+
},
|
63 |
+
{
|
64 |
+
"accuracy": 0.2308,
|
65 |
+
"f1": 0.23110412076392003,
|
66 |
+
"f1_weighted": 0.23110412076392003
|
67 |
+
}
|
68 |
+
]
|
69 |
+
}
|
70 |
+
]
|
71 |
+
},
|
72 |
+
"task_name": "AmazonReviewsClassification"
|
73 |
+
}
|
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/AppsRetrieval.json
ADDED
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset_revision": "f22508f96b7a36c2415181ed8bb76f76e04ae2d5",
|
3 |
+
"evaluation_time": 7.666281223297119,
|
4 |
+
"kg_co2_emissions": null,
|
5 |
+
"mteb_version": "1.14.15",
|
6 |
+
"scores": {
|
7 |
+
"test": [
|
8 |
+
{
|
9 |
+
"hf_subset": "default",
|
10 |
+
"languages": [
|
11 |
+
"eng-Latn",
|
12 |
+
"python-Code"
|
13 |
+
],
|
14 |
+
"main_score": 0.00085,
|
15 |
+
"map_at_1": 0.00053,
|
16 |
+
"map_at_10": 0.00071,
|
17 |
+
"map_at_100": 0.00084,
|
18 |
+
"map_at_1000": 0.00102,
|
19 |
+
"map_at_20": 0.00078,
|
20 |
+
"map_at_3": 0.00062,
|
21 |
+
"map_at_5": 0.00062,
|
22 |
+
"mrr_at_1": 0.0005312084993359894,
|
23 |
+
"mrr_at_10": 0.0007082779991146524,
|
24 |
+
"mrr_at_100": 0.0008420281651420131,
|
25 |
+
"mrr_at_1000": 0.0010194100996475369,
|
26 |
+
"mrr_at_20": 0.000778427190426253,
|
27 |
+
"mrr_at_3": 0.000619743249225321,
|
28 |
+
"mrr_at_5": 0.000619743249225321,
|
29 |
+
"nauc_map_at_1000_diff1": 0.17541432452987074,
|
30 |
+
"nauc_map_at_1000_max": -0.4154388332336005,
|
31 |
+
"nauc_map_at_1000_std": -0.3819043407981619,
|
32 |
+
"nauc_map_at_100_diff1": 0.20679746840207688,
|
33 |
+
"nauc_map_at_100_max": -0.4833278286603358,
|
34 |
+
"nauc_map_at_100_std": -0.4458942676390952,
|
35 |
+
"nauc_map_at_10_diff1": 0.24647952852769084,
|
36 |
+
"nauc_map_at_10_max": -0.5373695050821257,
|
37 |
+
"nauc_map_at_10_std": -0.5236953500263536,
|
38 |
+
"nauc_map_at_1_diff1": 0.29581300303762115,
|
39 |
+
"nauc_map_at_1_max": -0.5373695050821257,
|
40 |
+
"nauc_map_at_1_std": -0.5373695050821257,
|
41 |
+
"nauc_map_at_20_diff1": 0.21533097655723032,
|
42 |
+
"nauc_map_at_20_max": -0.5291249767848446,
|
43 |
+
"nauc_map_at_20_std": -0.4922264006934063,
|
44 |
+
"nauc_map_at_3_diff1": 0.26762244617480385,
|
45 |
+
"nauc_map_at_3_max": -0.5320020257576935,
|
46 |
+
"nauc_map_at_3_std": -0.542736984406558,
|
47 |
+
"nauc_map_at_5_diff1": 0.26762244617480385,
|
48 |
+
"nauc_map_at_5_max": -0.5320020257576935,
|
49 |
+
"nauc_map_at_5_std": -0.542736984406558,
|
50 |
+
"nauc_mrr_at_1000_diff1": 0.17541443100790596,
|
51 |
+
"nauc_mrr_at_1000_max": -0.4154385500206541,
|
52 |
+
"nauc_mrr_at_1000_std": -0.38190393644049175,
|
53 |
+
"nauc_mrr_at_100_diff1": 0.20679746840207688,
|
54 |
+
"nauc_mrr_at_100_max": -0.4833278286603358,
|
55 |
+
"nauc_mrr_at_100_std": -0.4458942676390952,
|
56 |
+
"nauc_mrr_at_10_diff1": 0.24647952852769084,
|
57 |
+
"nauc_mrr_at_10_max": -0.5373695050821257,
|
58 |
+
"nauc_mrr_at_10_std": -0.5236953500263536,
|
59 |
+
"nauc_mrr_at_1_diff1": 0.29581300303762115,
|
60 |
+
"nauc_mrr_at_1_max": -0.5373695050821257,
|
61 |
+
"nauc_mrr_at_1_std": -0.5373695050821257,
|
62 |
+
"nauc_mrr_at_20_diff1": 0.21533097655723032,
|
63 |
+
"nauc_mrr_at_20_max": -0.5291249767848446,
|
64 |
+
"nauc_mrr_at_20_std": -0.4922264006934063,
|
65 |
+
"nauc_mrr_at_3_diff1": 0.26762244617480385,
|
66 |
+
"nauc_mrr_at_3_max": -0.5320020257576935,
|
67 |
+
"nauc_mrr_at_3_std": -0.542736984406558,
|
68 |
+
"nauc_mrr_at_5_diff1": 0.26762244617480385,
|
69 |
+
"nauc_mrr_at_5_max": -0.5320020257576935,
|
70 |
+
"nauc_mrr_at_5_std": -0.542736984406558,
|
71 |
+
"nauc_ndcg_at_1000_diff1": 0.03711794407808404,
|
72 |
+
"nauc_ndcg_at_1000_max": -0.10620944898582887,
|
73 |
+
"nauc_ndcg_at_1000_std": -0.07214854599247035,
|
74 |
+
"nauc_ndcg_at_100_diff1": 0.1478165352946149,
|
75 |
+
"nauc_ndcg_at_100_max": -0.3266890379270042,
|
76 |
+
"nauc_ndcg_at_100_std": -0.24237793463929755,
|
77 |
+
"nauc_ndcg_at_10_diff1": 0.22133616828561992,
|
78 |
+
"nauc_ndcg_at_10_max": -0.5398539007431512,
|
79 |
+
"nauc_ndcg_at_10_std": -0.5106250645273135,
|
80 |
+
"nauc_ndcg_at_1_diff1": 0.29581300303762115,
|
81 |
+
"nauc_ndcg_at_1_max": -0.5373695050821257,
|
82 |
+
"nauc_ndcg_at_1_std": -0.5373695050821257,
|
83 |
+
"nauc_ndcg_at_20_diff1": 0.15496100992740594,
|
84 |
+
"nauc_ndcg_at_20_max": -0.5151090921512047,
|
85 |
+
"nauc_ndcg_at_20_std": -0.42331797746940425,
|
86 |
+
"nauc_ndcg_at_3_diff1": 0.25634622342967694,
|
87 |
+
"nauc_ndcg_at_3_max": -0.5298550340279207,
|
88 |
+
"nauc_ndcg_at_3_std": -0.5448839761363309,
|
89 |
+
"nauc_ndcg_at_5_diff1": 0.25634622342967694,
|
90 |
+
"nauc_ndcg_at_5_max": -0.5298550340279207,
|
91 |
+
"nauc_ndcg_at_5_std": -0.5448839761363309,
|
92 |
+
"nauc_precision_at_1000_diff1": 0.020777159599148352,
|
93 |
+
"nauc_precision_at_1000_max": -0.06655316040754289,
|
94 |
+
"nauc_precision_at_1000_std": -0.035219149425472995,
|
95 |
+
"nauc_precision_at_100_diff1": 0.11476528495195117,
|
96 |
+
"nauc_precision_at_100_max": -0.1810698361522713,
|
97 |
+
"nauc_precision_at_100_std": -0.0631149349365322,
|
98 |
+
"nauc_precision_at_10_diff1": 0.17741266421378812,
|
99 |
+
"nauc_precision_at_10_max": -0.5448839761363309,
|
100 |
+
"nauc_precision_at_10_std": -0.48609773784945,
|
101 |
+
"nauc_precision_at_1_diff1": 0.29581300303762115,
|
102 |
+
"nauc_precision_at_1_max": -0.5373695050821257,
|
103 |
+
"nauc_precision_at_1_std": -0.5373695050821257,
|
104 |
+
"nauc_precision_at_20_diff1": 0.0803859336130361,
|
105 |
+
"nauc_precision_at_20_max": -0.4938580214848722,
|
106 |
+
"nauc_precision_at_20_std": -0.32580885579158325,
|
107 |
+
"nauc_precision_at_3_diff1": 0.23003503702438047,
|
108 |
+
"nauc_precision_at_3_max": -0.5248453866584504,
|
109 |
+
"nauc_precision_at_3_std": -0.549893623505801,
|
110 |
+
"nauc_precision_at_5_diff1": 0.2300350370243805,
|
111 |
+
"nauc_precision_at_5_max": -0.5248453866584505,
|
112 |
+
"nauc_precision_at_5_std": -0.5498936235058011,
|
113 |
+
"nauc_recall_at_1000_diff1": 0.020777159599148387,
|
114 |
+
"nauc_recall_at_1000_max": -0.06655316040754268,
|
115 |
+
"nauc_recall_at_1000_std": -0.03521914942547286,
|
116 |
+
"nauc_recall_at_100_diff1": 0.11476528495195135,
|
117 |
+
"nauc_recall_at_100_max": -0.18106983615227104,
|
118 |
+
"nauc_recall_at_100_std": -0.06311493493653193,
|
119 |
+
"nauc_recall_at_10_diff1": 0.17741266421378812,
|
120 |
+
"nauc_recall_at_10_max": -0.5448839761363309,
|
121 |
+
"nauc_recall_at_10_std": -0.48609773784945015,
|
122 |
+
"nauc_recall_at_1_diff1": 0.29581300303762115,
|
123 |
+
"nauc_recall_at_1_max": -0.5373695050821257,
|
124 |
+
"nauc_recall_at_1_std": -0.5373695050821257,
|
125 |
+
"nauc_recall_at_20_diff1": 0.08038593361303595,
|
126 |
+
"nauc_recall_at_20_max": -0.4938580214848721,
|
127 |
+
"nauc_recall_at_20_std": -0.3258088557915833,
|
128 |
+
"nauc_recall_at_3_diff1": 0.23003503702438058,
|
129 |
+
"nauc_recall_at_3_max": -0.5248453866584504,
|
130 |
+
"nauc_recall_at_3_std": -0.549893623505801,
|
131 |
+
"nauc_recall_at_5_diff1": 0.23003503702438058,
|
132 |
+
"nauc_recall_at_5_max": -0.5248453866584504,
|
133 |
+
"nauc_recall_at_5_std": -0.549893623505801,
|
134 |
+
"ndcg_at_1": 0.00053,
|
135 |
+
"ndcg_at_10": 0.00085,
|
136 |
+
"ndcg_at_100": 0.00164,
|
137 |
+
"ndcg_at_1000": 0.01024,
|
138 |
+
"ndcg_at_20": 0.00112,
|
139 |
+
"ndcg_at_3": 0.00066,
|
140 |
+
"ndcg_at_5": 0.00066,
|
141 |
+
"precision_at_1": 0.00053,
|
142 |
+
"precision_at_10": 0.00013,
|
143 |
+
"precision_at_100": 5e-05,
|
144 |
+
"precision_at_1000": 8e-05,
|
145 |
+
"precision_at_20": 0.00012,
|
146 |
+
"precision_at_3": 0.00027,
|
147 |
+
"precision_at_5": 0.00016,
|
148 |
+
"recall_at_1": 0.00053,
|
149 |
+
"recall_at_10": 0.00133,
|
150 |
+
"recall_at_100": 0.00531,
|
151 |
+
"recall_at_1000": 0.08234,
|
152 |
+
"recall_at_20": 0.00239,
|
153 |
+
"recall_at_3": 0.0008,
|
154 |
+
"recall_at_5": 0.0008
|
155 |
+
}
|
156 |
+
]
|
157 |
+
},
|
158 |
+
"task_name": "AppsRetrieval"
|
159 |
+
}
|
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/ArguAna.json
ADDED
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset_revision": "c22ab2a51041ffd869aaddef7af8d8215647e41a",
|
3 |
+
"evaluation_time": 2.6541521549224854,
|
4 |
+
"kg_co2_emissions": null,
|
5 |
+
"mteb_version": "1.14.15",
|
6 |
+
"scores": {
|
7 |
+
"test": [
|
8 |
+
{
|
9 |
+
"hf_subset": "default",
|
10 |
+
"languages": [
|
11 |
+
"eng-Latn"
|
12 |
+
],
|
13 |
+
"main_score": 0.1509,
|
14 |
+
"map_at_1": 0.0761,
|
15 |
+
"map_at_10": 0.12154,
|
16 |
+
"map_at_100": 0.12944,
|
17 |
+
"map_at_1000": 0.13039,
|
18 |
+
"map_at_20": 0.12583,
|
19 |
+
"map_at_3": 0.10218,
|
20 |
+
"map_at_5": 0.11381,
|
21 |
+
"mrr_at_1": 0.07681365576102418,
|
22 |
+
"mrr_at_10": 0.12187591727065411,
|
23 |
+
"mrr_at_100": 0.1297644188004288,
|
24 |
+
"mrr_at_1000": 0.13071256214837015,
|
25 |
+
"mrr_at_20": 0.12612116393992717,
|
26 |
+
"mrr_at_3": 0.10229966808914177,
|
27 |
+
"mrr_at_5": 0.11389284020862968,
|
28 |
+
"nauc_map_at_1000_diff1": 0.16146733687467354,
|
29 |
+
"nauc_map_at_1000_max": 0.06481036694891384,
|
30 |
+
"nauc_map_at_1000_std": 0.06943873238380074,
|
31 |
+
"nauc_map_at_100_diff1": 0.16139928093588565,
|
32 |
+
"nauc_map_at_100_max": 0.0644505695492588,
|
33 |
+
"nauc_map_at_100_std": 0.06974933502492409,
|
34 |
+
"nauc_map_at_10_diff1": 0.16658684660598214,
|
35 |
+
"nauc_map_at_10_max": 0.06224588862021987,
|
36 |
+
"nauc_map_at_10_std": 0.06880011677576231,
|
37 |
+
"nauc_map_at_1_diff1": 0.19579759176541026,
|
38 |
+
"nauc_map_at_1_max": 0.02603892226990134,
|
39 |
+
"nauc_map_at_1_std": 0.04620265141724082,
|
40 |
+
"nauc_map_at_20_diff1": 0.16382252282054222,
|
41 |
+
"nauc_map_at_20_max": 0.06529226434404913,
|
42 |
+
"nauc_map_at_20_std": 0.06849826441400649,
|
43 |
+
"nauc_map_at_3_diff1": 0.16402956096741358,
|
44 |
+
"nauc_map_at_3_max": 0.0419122332975646,
|
45 |
+
"nauc_map_at_3_std": 0.05925639235658917,
|
46 |
+
"nauc_map_at_5_diff1": 0.16894147524916653,
|
47 |
+
"nauc_map_at_5_max": 0.052414170749768195,
|
48 |
+
"nauc_map_at_5_std": 0.07005093386964208,
|
49 |
+
"nauc_mrr_at_1000_diff1": 0.15689213701351912,
|
50 |
+
"nauc_mrr_at_1000_max": 0.06318320049791439,
|
51 |
+
"nauc_mrr_at_1000_std": 0.06796437033671639,
|
52 |
+
"nauc_mrr_at_100_diff1": 0.1568336618890221,
|
53 |
+
"nauc_mrr_at_100_max": 0.06282844172684152,
|
54 |
+
"nauc_mrr_at_100_std": 0.06827651320612166,
|
55 |
+
"nauc_mrr_at_10_diff1": 0.1613861747091082,
|
56 |
+
"nauc_mrr_at_10_max": 0.06048932175951958,
|
57 |
+
"nauc_mrr_at_10_std": 0.06744723486463321,
|
58 |
+
"nauc_mrr_at_1_diff1": 0.1886592359414356,
|
59 |
+
"nauc_mrr_at_1_max": 0.025143721566481553,
|
60 |
+
"nauc_mrr_at_1_std": 0.04192879681303956,
|
61 |
+
"nauc_mrr_at_20_diff1": 0.15895555488261146,
|
62 |
+
"nauc_mrr_at_20_max": 0.06337596031238824,
|
63 |
+
"nauc_mrr_at_20_std": 0.06705300695703223,
|
64 |
+
"nauc_mrr_at_3_diff1": 0.15808017173425612,
|
65 |
+
"nauc_mrr_at_3_max": 0.03873273590791373,
|
66 |
+
"nauc_mrr_at_3_std": 0.05873440646581739,
|
67 |
+
"nauc_mrr_at_5_diff1": 0.1623674451736993,
|
68 |
+
"nauc_mrr_at_5_max": 0.048599887137470155,
|
69 |
+
"nauc_mrr_at_5_std": 0.06946191051556377,
|
70 |
+
"nauc_ndcg_at_1000_diff1": 0.14057148135021394,
|
71 |
+
"nauc_ndcg_at_1000_max": 0.09398561431514359,
|
72 |
+
"nauc_ndcg_at_1000_std": 0.06871748094502036,
|
73 |
+
"nauc_ndcg_at_100_diff1": 0.14160219742898073,
|
74 |
+
"nauc_ndcg_at_100_max": 0.08793842988004247,
|
75 |
+
"nauc_ndcg_at_100_std": 0.08107847041025427,
|
76 |
+
"nauc_ndcg_at_10_diff1": 0.16112940300466752,
|
77 |
+
"nauc_ndcg_at_10_max": 0.08282286934634887,
|
78 |
+
"nauc_ndcg_at_10_std": 0.07481333025577913,
|
79 |
+
"nauc_ndcg_at_1_diff1": 0.19579759176541026,
|
80 |
+
"nauc_ndcg_at_1_max": 0.02603892226990134,
|
81 |
+
"nauc_ndcg_at_1_std": 0.04620265141724082,
|
82 |
+
"nauc_ndcg_at_20_diff1": 0.15445394581844324,
|
83 |
+
"nauc_ndcg_at_20_max": 0.09290741055177616,
|
84 |
+
"nauc_ndcg_at_20_std": 0.0739310085946421,
|
85 |
+
"nauc_ndcg_at_3_diff1": 0.1574151504354397,
|
86 |
+
"nauc_ndcg_at_3_max": 0.04636430630581481,
|
87 |
+
"nauc_ndcg_at_3_std": 0.06191664189704533,
|
88 |
+
"nauc_ndcg_at_5_diff1": 0.1658753822856203,
|
89 |
+
"nauc_ndcg_at_5_max": 0.06313482448309465,
|
90 |
+
"nauc_ndcg_at_5_std": 0.07904072628627579,
|
91 |
+
"nauc_precision_at_1000_diff1": 0.0609588525314078,
|
92 |
+
"nauc_precision_at_1000_max": 0.1870041318251064,
|
93 |
+
"nauc_precision_at_1000_std": 0.019658161418599534,
|
94 |
+
"nauc_precision_at_100_diff1": 0.09473113411767209,
|
95 |
+
"nauc_precision_at_100_max": 0.1309396613797298,
|
96 |
+
"nauc_precision_at_100_std": 0.10623324275765494,
|
97 |
+
"nauc_precision_at_10_diff1": 0.15121181172955667,
|
98 |
+
"nauc_precision_at_10_max": 0.12477733598184097,
|
99 |
+
"nauc_precision_at_10_std": 0.08475912589528253,
|
100 |
+
"nauc_precision_at_1_diff1": 0.19579759176541026,
|
101 |
+
"nauc_precision_at_1_max": 0.02603892226990134,
|
102 |
+
"nauc_precision_at_1_std": 0.04620265141724082,
|
103 |
+
"nauc_precision_at_20_diff1": 0.1370251724378167,
|
104 |
+
"nauc_precision_at_20_max": 0.14912154538482067,
|
105 |
+
"nauc_precision_at_20_std": 0.08184312031151385,
|
106 |
+
"nauc_precision_at_3_diff1": 0.14253682467701162,
|
107 |
+
"nauc_precision_at_3_max": 0.05671718495423438,
|
108 |
+
"nauc_precision_at_3_std": 0.06788353997677292,
|
109 |
+
"nauc_precision_at_5_diff1": 0.16082986625463053,
|
110 |
+
"nauc_precision_at_5_max": 0.08573137277943063,
|
111 |
+
"nauc_precision_at_5_std": 0.09793524405071982,
|
112 |
+
"nauc_recall_at_1000_diff1": 0.06095885253140698,
|
113 |
+
"nauc_recall_at_1000_max": 0.1870041318251063,
|
114 |
+
"nauc_recall_at_1000_std": 0.019658161418598927,
|
115 |
+
"nauc_recall_at_100_diff1": 0.09473113411767223,
|
116 |
+
"nauc_recall_at_100_max": 0.1309396613797295,
|
117 |
+
"nauc_recall_at_100_std": 0.10623324275765476,
|
118 |
+
"nauc_recall_at_10_diff1": 0.1512118117295564,
|
119 |
+
"nauc_recall_at_10_max": 0.12477733598184074,
|
120 |
+
"nauc_recall_at_10_std": 0.08475912589528235,
|
121 |
+
"nauc_recall_at_1_diff1": 0.19579759176541026,
|
122 |
+
"nauc_recall_at_1_max": 0.02603892226990134,
|
123 |
+
"nauc_recall_at_1_std": 0.04620265141724082,
|
124 |
+
"nauc_recall_at_20_diff1": 0.13702517243781645,
|
125 |
+
"nauc_recall_at_20_max": 0.14912154538482042,
|
126 |
+
"nauc_recall_at_20_std": 0.08184312031151388,
|
127 |
+
"nauc_recall_at_3_diff1": 0.14253682467701162,
|
128 |
+
"nauc_recall_at_3_max": 0.056717184954234404,
|
129 |
+
"nauc_recall_at_3_std": 0.06788353997677302,
|
130 |
+
"nauc_recall_at_5_diff1": 0.16082986625463033,
|
131 |
+
"nauc_recall_at_5_max": 0.08573137277943055,
|
132 |
+
"nauc_recall_at_5_std": 0.0979352440507195,
|
133 |
+
"ndcg_at_1": 0.0761,
|
134 |
+
"ndcg_at_10": 0.1509,
|
135 |
+
"ndcg_at_100": 0.19506,
|
136 |
+
"ndcg_at_1000": 0.22612,
|
137 |
+
"ndcg_at_20": 0.16665,
|
138 |
+
"ndcg_at_3": 0.11065,
|
139 |
+
"ndcg_at_5": 0.13182,
|
140 |
+
"precision_at_1": 0.0761,
|
141 |
+
"precision_at_10": 0.02468,
|
142 |
+
"precision_at_100": 0.00467,
|
143 |
+
"precision_at_1000": 0.00072,
|
144 |
+
"precision_at_20": 0.01547,
|
145 |
+
"precision_at_3": 0.04505,
|
146 |
+
"precision_at_5": 0.03741,
|
147 |
+
"recall_at_1": 0.0761,
|
148 |
+
"recall_at_10": 0.2468,
|
149 |
+
"recall_at_100": 0.46728,
|
150 |
+
"recall_at_1000": 0.72475,
|
151 |
+
"recall_at_20": 0.30939,
|
152 |
+
"recall_at_3": 0.13514,
|
153 |
+
"recall_at_5": 0.18706
|
154 |
+
}
|
155 |
+
]
|
156 |
+
},
|
157 |
+
"task_name": "ArguAna"
|
158 |
+
}
|
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/AskUbuntuDupQuestions.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset_revision": "2000358ca161889fa9c082cb41daa8dcfb161a54",
|
3 |
+
"evaluation_time": 0.4332466125488281,
|
4 |
+
"kg_co2_emissions": null,
|
5 |
+
"mteb_version": "1.14.15",
|
6 |
+
"scores": {
|
7 |
+
"test": [
|
8 |
+
{
|
9 |
+
"hf_subset": "default",
|
10 |
+
"languages": [
|
11 |
+
"eng-Latn"
|
12 |
+
],
|
13 |
+
"main_score": 0.46429757499703045,
|
14 |
+
"map": 0.46429757499703045,
|
15 |
+
"mrr": 0.601056368992657,
|
16 |
+
"nAUC_map_diff1": 0.16940332025233937,
|
17 |
+
"nAUC_map_max": 0.15925499774951668,
|
18 |
+
"nAUC_map_std": 0.05294826509824163,
|
19 |
+
"nAUC_mrr_diff1": 0.19481488519394907,
|
20 |
+
"nAUC_mrr_max": 0.21250668851129054,
|
21 |
+
"nAUC_mrr_std": 0.022766508692728404
|
22 |
+
}
|
23 |
+
]
|
24 |
+
},
|
25 |
+
"task_name": "AskUbuntuDupQuestions"
|
26 |
+
}
|
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/BIOSSES.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset_revision": "d3fb88f8f02e40887cd149695127462bbcf29b4a",
|
3 |
+
"evaluation_time": 0.0452265739440918,
|
4 |
+
"kg_co2_emissions": null,
|
5 |
+
"mteb_version": "1.14.15",
|
6 |
+
"scores": {
|
7 |
+
"test": [
|
8 |
+
{
|
9 |
+
"cosine_pearson": 0.2956207137346794,
|
10 |
+
"cosine_spearman": 0.30161530624430144,
|
11 |
+
"euclidean_pearson": 0.2995531537590785,
|
12 |
+
"euclidean_spearman": 0.30161530624430144,
|
13 |
+
"hf_subset": "default",
|
14 |
+
"languages": [
|
15 |
+
"eng-Latn"
|
16 |
+
],
|
17 |
+
"main_score": 0.30161530624430144,
|
18 |
+
"manhattan_pearson": 0.33453615581396934,
|
19 |
+
"manhattan_spearman": 0.3532610613411196,
|
20 |
+
"pearson": 0.2956207137346794,
|
21 |
+
"spearman": 0.30161530624430144
|
22 |
+
}
|
23 |
+
]
|
24 |
+
},
|
25 |
+
"task_name": "BIOSSES"
|
26 |
+
}
|
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/Banking77Classification.json
DELETED
@@ -1,73 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"dataset_revision": "0fd18e25b25c072e09e0d92ab615fda904d66300",
|
3 |
-
"evaluation_time": 6.451777696609497,
|
4 |
-
"kg_co2_emissions": null,
|
5 |
-
"mteb_version": "1.14.15",
|
6 |
-
"scores": {
|
7 |
-
"test": [
|
8 |
-
{
|
9 |
-
"accuracy": 0.4396103896103896,
|
10 |
-
"f1": 0.4142711532114576,
|
11 |
-
"f1_weighted": 0.4142711532114576,
|
12 |
-
"hf_subset": "default",
|
13 |
-
"languages": [
|
14 |
-
"eng-Latn"
|
15 |
-
],
|
16 |
-
"main_score": 0.4396103896103896,
|
17 |
-
"scores_per_experiment": [
|
18 |
-
{
|
19 |
-
"accuracy": 0.4279220779220779,
|
20 |
-
"f1": 0.4030476288783657,
|
21 |
-
"f1_weighted": 0.4030476288783656
|
22 |
-
},
|
23 |
-
{
|
24 |
-
"accuracy": 0.4211038961038961,
|
25 |
-
"f1": 0.39776168133611584,
|
26 |
-
"f1_weighted": 0.39776168133611584
|
27 |
-
},
|
28 |
-
{
|
29 |
-
"accuracy": 0.45064935064935063,
|
30 |
-
"f1": 0.42872843564828145,
|
31 |
-
"f1_weighted": 0.42872843564828145
|
32 |
-
},
|
33 |
-
{
|
34 |
-
"accuracy": 0.4448051948051948,
|
35 |
-
"f1": 0.420756828398419,
|
36 |
-
"f1_weighted": 0.42075682839841905
|
37 |
-
},
|
38 |
-
{
|
39 |
-
"accuracy": 0.44675324675324674,
|
40 |
-
"f1": 0.42100682221185654,
|
41 |
-
"f1_weighted": 0.42100682221185654
|
42 |
-
},
|
43 |
-
{
|
44 |
-
"accuracy": 0.45324675324675323,
|
45 |
-
"f1": 0.4392342490231314,
|
46 |
-
"f1_weighted": 0.4392342490231314
|
47 |
-
},
|
48 |
-
{
|
49 |
-
"accuracy": 0.437012987012987,
|
50 |
-
"f1": 0.4056017558988273,
|
51 |
-
"f1_weighted": 0.40560175589882724
|
52 |
-
},
|
53 |
-
{
|
54 |
-
"accuracy": 0.42337662337662335,
|
55 |
-
"f1": 0.39123709562594644,
|
56 |
-
"f1_weighted": 0.39123709562594655
|
57 |
-
},
|
58 |
-
{
|
59 |
-
"accuracy": 0.44512987012987015,
|
60 |
-
"f1": 0.41578171494860966,
|
61 |
-
"f1_weighted": 0.41578171494860966
|
62 |
-
},
|
63 |
-
{
|
64 |
-
"accuracy": 0.4461038961038961,
|
65 |
-
"f1": 0.4195553201450221,
|
66 |
-
"f1_weighted": 0.419555320145022
|
67 |
-
}
|
68 |
-
]
|
69 |
-
}
|
70 |
-
]
|
71 |
-
},
|
72 |
-
"task_name": "Banking77Classification"
|
73 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/BiorxivClusteringS2S.json
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset_revision": "258694dd0231531bc1fd9de6ceb52a0853c6d908",
|
3 |
+
"evaluation_time": 6.352599620819092,
|
4 |
+
"kg_co2_emissions": null,
|
5 |
+
"mteb_version": "1.14.15",
|
6 |
+
"scores": {
|
7 |
+
"test": [
|
8 |
+
{
|
9 |
+
"hf_subset": "default",
|
10 |
+
"languages": [
|
11 |
+
"eng-Latn"
|
12 |
+
],
|
13 |
+
"main_score": 0.07745778878625219,
|
14 |
+
"v_measure": 0.07745778878625219,
|
15 |
+
"v_measure_std": 0.006515604585361752,
|
16 |
+
"v_measures": [
|
17 |
+
0.07151497621642194,
|
18 |
+
0.07152886858477273,
|
19 |
+
0.07533936305694591,
|
20 |
+
0.07390923787342664,
|
21 |
+
0.07147679207450276,
|
22 |
+
0.07213600223586297,
|
23 |
+
0.08611746483041241,
|
24 |
+
0.08170353591216682,
|
25 |
+
0.08028322075745065,
|
26 |
+
0.09056842632055917
|
27 |
+
]
|
28 |
+
}
|
29 |
+
]
|
30 |
+
},
|
31 |
+
"task_name": "BiorxivClusteringS2S"
|
32 |
+
}
|
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/COIRCodeSearchNetRetrieval.json
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset_revision": "4adc7bc41202b5c13543c9c886a25f340634dab3",
|
3 |
+
"evaluation_time": 0.001447916030883789,
|
4 |
+
"kg_co2_emissions": null,
|
5 |
+
"mteb_version": "1.14.15",
|
6 |
+
"scores": {},
|
7 |
+
"task_name": "COIRCodeSearchNetRetrieval"
|
8 |
+
}
|
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/CQADupstackProgrammersRetrieval.json
DELETED
@@ -1,158 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"dataset_revision": "6184bc1440d2dbc7612be22b50686b8826d22b32",
|
3 |
-
"evaluation_time": 99.69791841506958,
|
4 |
-
"kg_co2_emissions": null,
|
5 |
-
"mteb_version": "1.14.15",
|
6 |
-
"scores": {
|
7 |
-
"test": [
|
8 |
-
{
|
9 |
-
"hf_subset": "default",
|
10 |
-
"languages": [
|
11 |
-
"eng-Latn"
|
12 |
-
],
|
13 |
-
"main_score": 0.0501,
|
14 |
-
"map_at_1": 0.02467,
|
15 |
-
"map_at_10": 0.03898,
|
16 |
-
"map_at_100": 0.04261,
|
17 |
-
"map_at_1000": 0.04333,
|
18 |
-
"map_at_20": 0.04068,
|
19 |
-
"map_at_3": 0.03388,
|
20 |
-
"map_at_5": 0.03693,
|
21 |
-
"mrr_at_1": 0.030821917808219176,
|
22 |
-
"mrr_at_10": 0.04904462926723201,
|
23 |
-
"mrr_at_100": 0.05339942610218758,
|
24 |
-
"mrr_at_1000": 0.05413492750157237,
|
25 |
-
"mrr_at_20": 0.05126402659708249,
|
26 |
-
"mrr_at_3": 0.04280821917808219,
|
27 |
-
"mrr_at_5": 0.04634703196347032,
|
28 |
-
"nauc_map_at_1000_diff1": 0.03644747951501248,
|
29 |
-
"nauc_map_at_1000_max": 0.2240572170754659,
|
30 |
-
"nauc_map_at_1000_std": -0.17708810912472517,
|
31 |
-
"nauc_map_at_100_diff1": 0.03759221625144172,
|
32 |
-
"nauc_map_at_100_max": 0.22324901446317413,
|
33 |
-
"nauc_map_at_100_std": -0.17630470695891512,
|
34 |
-
"nauc_map_at_10_diff1": 0.03906418656483989,
|
35 |
-
"nauc_map_at_10_max": 0.22061594321968936,
|
36 |
-
"nauc_map_at_10_std": -0.17777470317814356,
|
37 |
-
"nauc_map_at_1_diff1": 0.1731091343679673,
|
38 |
-
"nauc_map_at_1_max": 0.33459947679728974,
|
39 |
-
"nauc_map_at_1_std": -0.23115450977179597,
|
40 |
-
"nauc_map_at_20_diff1": 0.03795725531499195,
|
41 |
-
"nauc_map_at_20_max": 0.22396003211648763,
|
42 |
-
"nauc_map_at_20_std": -0.17867373725662639,
|
43 |
-
"nauc_map_at_3_diff1": 0.06042780588964212,
|
44 |
-
"nauc_map_at_3_max": 0.2486807528974488,
|
45 |
-
"nauc_map_at_3_std": -0.18512855007450404,
|
46 |
-
"nauc_map_at_5_diff1": 0.04407217741234605,
|
47 |
-
"nauc_map_at_5_max": 0.22647048266105405,
|
48 |
-
"nauc_map_at_5_std": -0.18107585673560017,
|
49 |
-
"nauc_mrr_at_1000_diff1": 0.033601872249839834,
|
50 |
-
"nauc_mrr_at_1000_max": 0.2523936325136619,
|
51 |
-
"nauc_mrr_at_1000_std": -0.19078164353963076,
|
52 |
-
"nauc_mrr_at_100_diff1": 0.03435870935950355,
|
53 |
-
"nauc_mrr_at_100_max": 0.2523932973431928,
|
54 |
-
"nauc_mrr_at_100_std": -0.1900913512193067,
|
55 |
-
"nauc_mrr_at_10_diff1": 0.03361519179733555,
|
56 |
-
"nauc_mrr_at_10_max": 0.25392922716866984,
|
57 |
-
"nauc_mrr_at_10_std": -0.1935061134919541,
|
58 |
-
"nauc_mrr_at_1_diff1": 0.1772995319079407,
|
59 |
-
"nauc_mrr_at_1_max": 0.35182174117717013,
|
60 |
-
"nauc_mrr_at_1_std": -0.24426280067522707,
|
61 |
-
"nauc_mrr_at_20_diff1": 0.03479828151019169,
|
62 |
-
"nauc_mrr_at_20_max": 0.25624951214228564,
|
63 |
-
"nauc_mrr_at_20_std": -0.19212268093923462,
|
64 |
-
"nauc_mrr_at_3_diff1": 0.06173430027850725,
|
65 |
-
"nauc_mrr_at_3_max": 0.26889485727748363,
|
66 |
-
"nauc_mrr_at_3_std": -0.19153801111553947,
|
67 |
-
"nauc_mrr_at_5_diff1": 0.036743759763164886,
|
68 |
-
"nauc_mrr_at_5_max": 0.253857849052297,
|
69 |
-
"nauc_mrr_at_5_std": -0.19604549670316734,
|
70 |
-
"nauc_ndcg_at_1000_diff1": -0.010372586628261796,
|
71 |
-
"nauc_ndcg_at_1000_max": 0.20925878430027478,
|
72 |
-
"nauc_ndcg_at_1000_std": -0.1717044268161809,
|
73 |
-
"nauc_ndcg_at_100_diff1": 0.0023309149151885546,
|
74 |
-
"nauc_ndcg_at_100_max": 0.20125970115134734,
|
75 |
-
"nauc_ndcg_at_100_std": -0.15865628929382014,
|
76 |
-
"nauc_ndcg_at_10_diff1": 0.0026192804576363727,
|
77 |
-
"nauc_ndcg_at_10_max": 0.19884193622357532,
|
78 |
-
"nauc_ndcg_at_10_std": -0.16919003671988075,
|
79 |
-
"nauc_ndcg_at_1_diff1": 0.1772995319079407,
|
80 |
-
"nauc_ndcg_at_1_max": 0.35182174117717013,
|
81 |
-
"nauc_ndcg_at_1_std": -0.24426280067522707,
|
82 |
-
"nauc_ndcg_at_20_diff1": 0.0031543394811079034,
|
83 |
-
"nauc_ndcg_at_20_max": 0.20925361343315524,
|
84 |
-
"nauc_ndcg_at_20_std": -0.17106125631597793,
|
85 |
-
"nauc_ndcg_at_3_diff1": 0.03670154146101528,
|
86 |
-
"nauc_ndcg_at_3_max": 0.23212930749840155,
|
87 |
-
"nauc_ndcg_at_3_std": -0.1728371812831961,
|
88 |
-
"nauc_ndcg_at_5_diff1": 0.0107566708693031,
|
89 |
-
"nauc_ndcg_at_5_max": 0.20474332948099355,
|
90 |
-
"nauc_ndcg_at_5_std": -0.1734952739301359,
|
91 |
-
"nauc_precision_at_1000_diff1": -0.07195606207962846,
|
92 |
-
"nauc_precision_at_1000_max": 0.2542912736794115,
|
93 |
-
"nauc_precision_at_1000_std": -0.1881459402790264,
|
94 |
-
"nauc_precision_at_100_diff1": -0.04518222914182943,
|
95 |
-
"nauc_precision_at_100_max": 0.22138981394024387,
|
96 |
-
"nauc_precision_at_100_std": -0.13384472263037697,
|
97 |
-
"nauc_precision_at_10_diff1": -0.052513811685878764,
|
98 |
-
"nauc_precision_at_10_max": 0.18962064467698705,
|
99 |
-
"nauc_precision_at_10_std": -0.14827004787357115,
|
100 |
-
"nauc_precision_at_1_diff1": 0.1772995319079407,
|
101 |
-
"nauc_precision_at_1_max": 0.35182174117717013,
|
102 |
-
"nauc_precision_at_1_std": -0.24426280067522707,
|
103 |
-
"nauc_precision_at_20_diff1": -0.040789324913047875,
|
104 |
-
"nauc_precision_at_20_max": 0.22086458009752882,
|
105 |
-
"nauc_precision_at_20_std": -0.14430508663959002,
|
106 |
-
"nauc_precision_at_3_diff1": -0.013044619440245884,
|
107 |
-
"nauc_precision_at_3_max": 0.21285488271783465,
|
108 |
-
"nauc_precision_at_3_std": -0.1483164417030193,
|
109 |
-
"nauc_precision_at_5_diff1": -0.05113181393685194,
|
110 |
-
"nauc_precision_at_5_max": 0.1756649379589832,
|
111 |
-
"nauc_precision_at_5_std": -0.15632134056178232,
|
112 |
-
"nauc_recall_at_1000_diff1": -0.047075752528689695,
|
113 |
-
"nauc_recall_at_1000_max": 0.16414155669676642,
|
114 |
-
"nauc_recall_at_1000_std": -0.1513320281746568,
|
115 |
-
"nauc_recall_at_100_diff1": -0.023004658252697183,
|
116 |
-
"nauc_recall_at_100_max": 0.14861973646512244,
|
117 |
-
"nauc_recall_at_100_std": -0.12240747671934184,
|
118 |
-
"nauc_recall_at_10_diff1": -0.051375323084735164,
|
119 |
-
"nauc_recall_at_10_max": 0.1384336247044034,
|
120 |
-
"nauc_recall_at_10_std": -0.14737738059263306,
|
121 |
-
"nauc_recall_at_1_diff1": 0.1731091343679673,
|
122 |
-
"nauc_recall_at_1_max": 0.33459947679728974,
|
123 |
-
"nauc_recall_at_1_std": -0.23115450977179597,
|
124 |
-
"nauc_recall_at_20_diff1": -0.03578815918976938,
|
125 |
-
"nauc_recall_at_20_max": 0.16386688869593355,
|
126 |
-
"nauc_recall_at_20_std": -0.1528456365862212,
|
127 |
-
"nauc_recall_at_3_diff1": -0.021696811828998432,
|
128 |
-
"nauc_recall_at_3_max": 0.1864107664448688,
|
129 |
-
"nauc_recall_at_3_std": -0.14586036842324565,
|
130 |
-
"nauc_recall_at_5_diff1": -0.0538517948884412,
|
131 |
-
"nauc_recall_at_5_max": 0.1453135254521713,
|
132 |
-
"nauc_recall_at_5_std": -0.1531619473747777,
|
133 |
-
"ndcg_at_1": 0.03082,
|
134 |
-
"ndcg_at_10": 0.0501,
|
135 |
-
"ndcg_at_100": 0.07072,
|
136 |
-
"ndcg_at_1000": 0.09327,
|
137 |
-
"ndcg_at_20": 0.05662,
|
138 |
-
"ndcg_at_3": 0.03989,
|
139 |
-
"ndcg_at_5": 0.04484,
|
140 |
-
"precision_at_1": 0.03082,
|
141 |
-
"precision_at_10": 0.00993,
|
142 |
-
"precision_at_100": 0.00241,
|
143 |
-
"precision_at_1000": 0.00052,
|
144 |
-
"precision_at_20": 0.00685,
|
145 |
-
"precision_at_3": 0.02017,
|
146 |
-
"precision_at_5": 0.0153,
|
147 |
-
"recall_at_1": 0.02467,
|
148 |
-
"recall_at_10": 0.07499,
|
149 |
-
"recall_at_100": 0.16969,
|
150 |
-
"recall_at_1000": 0.33718,
|
151 |
-
"recall_at_20": 0.09901,
|
152 |
-
"recall_at_3": 0.04648,
|
153 |
-
"recall_at_5": 0.05869
|
154 |
-
}
|
155 |
-
]
|
156 |
-
},
|
157 |
-
"task_name": "CQADupstackProgrammersRetrieval"
|
158 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/CodeFeedbackMT.json
ADDED
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset_revision": "b0f12fa0c0dd67f59c95a5c33d02aeeb4c398c5f",
|
3 |
+
"evaluation_time": 86.56418371200562,
|
4 |
+
"kg_co2_emissions": null,
|
5 |
+
"mteb_version": "1.14.15",
|
6 |
+
"scores": {
|
7 |
+
"test": [
|
8 |
+
{
|
9 |
+
"hf_subset": "default",
|
10 |
+
"languages": [
|
11 |
+
"eng-Latn"
|
12 |
+
],
|
13 |
+
"main_score": 0.15938,
|
14 |
+
"map_at_1": 0.12171,
|
15 |
+
"map_at_10": 0.14602,
|
16 |
+
"map_at_100": 0.14933,
|
17 |
+
"map_at_1000": 0.14984,
|
18 |
+
"map_at_20": 0.14772,
|
19 |
+
"map_at_3": 0.13875,
|
20 |
+
"map_at_5": 0.14277,
|
21 |
+
"mrr_at_1": 0.12171424267530316,
|
22 |
+
"mrr_at_10": 0.14602286924159336,
|
23 |
+
"mrr_at_100": 0.14933480468311353,
|
24 |
+
"mrr_at_1000": 0.14984386023850896,
|
25 |
+
"mrr_at_20": 0.14771560552079413,
|
26 |
+
"mrr_at_3": 0.1387487133137506,
|
27 |
+
"mrr_at_5": 0.14277070623383797,
|
28 |
+
"nauc_map_at_1000_diff1": 0.49472962132811066,
|
29 |
+
"nauc_map_at_1000_max": 0.1288096788830561,
|
30 |
+
"nauc_map_at_1000_std": 0.09402130699097373,
|
31 |
+
"nauc_map_at_100_diff1": 0.49521525169736585,
|
32 |
+
"nauc_map_at_100_max": 0.12889826157883777,
|
33 |
+
"nauc_map_at_100_std": 0.09387665760881421,
|
34 |
+
"nauc_map_at_10_diff1": 0.5007792220812604,
|
35 |
+
"nauc_map_at_10_max": 0.13096854092843976,
|
36 |
+
"nauc_map_at_10_std": 0.09297542921420311,
|
37 |
+
"nauc_map_at_1_diff1": 0.576490513877843,
|
38 |
+
"nauc_map_at_1_max": 0.155866059169816,
|
39 |
+
"nauc_map_at_1_std": 0.09440442396510458,
|
40 |
+
"nauc_map_at_20_diff1": 0.4980762076056278,
|
41 |
+
"nauc_map_at_20_max": 0.12991844827572516,
|
42 |
+
"nauc_map_at_20_std": 0.09346830652976015,
|
43 |
+
"nauc_map_at_3_diff1": 0.5220433951797554,
|
44 |
+
"nauc_map_at_3_max": 0.1391271534357672,
|
45 |
+
"nauc_map_at_3_std": 0.09400942293158544,
|
46 |
+
"nauc_map_at_5_diff1": 0.5107250849461592,
|
47 |
+
"nauc_map_at_5_max": 0.13530094789210456,
|
48 |
+
"nauc_map_at_5_std": 0.09342049003345741,
|
49 |
+
"nauc_mrr_at_1000_diff1": 0.49472962954673433,
|
50 |
+
"nauc_mrr_at_1000_max": 0.12880968585736355,
|
51 |
+
"nauc_mrr_at_1000_std": 0.0940213068870858,
|
52 |
+
"nauc_mrr_at_100_diff1": 0.49521525169736585,
|
53 |
+
"nauc_mrr_at_100_max": 0.12889826157883777,
|
54 |
+
"nauc_mrr_at_100_std": 0.09387665760881421,
|
55 |
+
"nauc_mrr_at_10_diff1": 0.5007792220812604,
|
56 |
+
"nauc_mrr_at_10_max": 0.13096854092843976,
|
57 |
+
"nauc_mrr_at_10_std": 0.09297542921420311,
|
58 |
+
"nauc_mrr_at_1_diff1": 0.576490513877843,
|
59 |
+
"nauc_mrr_at_1_max": 0.155866059169816,
|
60 |
+
"nauc_mrr_at_1_std": 0.09440442396510458,
|
61 |
+
"nauc_mrr_at_20_diff1": 0.4980762076056278,
|
62 |
+
"nauc_mrr_at_20_max": 0.12991844827572516,
|
63 |
+
"nauc_mrr_at_20_std": 0.09346830652976015,
|
64 |
+
"nauc_mrr_at_3_diff1": 0.5220433951797554,
|
65 |
+
"nauc_mrr_at_3_max": 0.1391271534357672,
|
66 |
+
"nauc_mrr_at_3_std": 0.09400942293158544,
|
67 |
+
"nauc_mrr_at_5_diff1": 0.5107250849461592,
|
68 |
+
"nauc_mrr_at_5_max": 0.13530094789210456,
|
69 |
+
"nauc_mrr_at_5_std": 0.09342049003345741,
|
70 |
+
"nauc_ndcg_at_1000_diff1": 0.42556848285754595,
|
71 |
+
"nauc_ndcg_at_1000_max": 0.1074330906576106,
|
72 |
+
"nauc_ndcg_at_1000_std": 0.09931415214354576,
|
73 |
+
"nauc_ndcg_at_100_diff1": 0.4389633172139021,
|
74 |
+
"nauc_ndcg_at_100_max": 0.10912358012253182,
|
75 |
+
"nauc_ndcg_at_100_std": 0.09591996585185938,
|
76 |
+
"nauc_ndcg_at_10_diff1": 0.4656271351032459,
|
77 |
+
"nauc_ndcg_at_10_max": 0.11811051132398084,
|
78 |
+
"nauc_ndcg_at_10_std": 0.09195643910816585,
|
79 |
+
"nauc_ndcg_at_1_diff1": 0.576490513877843,
|
80 |
+
"nauc_ndcg_at_1_max": 0.155866059169816,
|
81 |
+
"nauc_ndcg_at_1_std": 0.09440442396510458,
|
82 |
+
"nauc_ndcg_at_20_diff1": 0.45697106335736143,
|
83 |
+
"nauc_ndcg_at_20_max": 0.115023380566875,
|
84 |
+
"nauc_ndcg_at_20_std": 0.09369132873791501,
|
85 |
+
"nauc_ndcg_at_3_diff1": 0.5061759461194467,
|
86 |
+
"nauc_ndcg_at_3_max": 0.13434966943537516,
|
87 |
+
"nauc_ndcg_at_3_std": 0.09382725647213368,
|
88 |
+
"nauc_ndcg_at_5_diff1": 0.48712512841939637,
|
89 |
+
"nauc_ndcg_at_5_max": 0.12776188612692832,
|
90 |
+
"nauc_ndcg_at_5_std": 0.09280417774911971,
|
91 |
+
"nauc_precision_at_1000_diff1": 0.22171401911333807,
|
92 |
+
"nauc_precision_at_1000_max": 0.05180228755438657,
|
93 |
+
"nauc_precision_at_1000_std": 0.121478173960711,
|
94 |
+
"nauc_precision_at_100_diff1": 0.2930513840339096,
|
95 |
+
"nauc_precision_at_100_max": 0.058457996208423325,
|
96 |
+
"nauc_precision_at_100_std": 0.10329586184541412,
|
97 |
+
"nauc_precision_at_10_diff1": 0.37748222270492887,
|
98 |
+
"nauc_precision_at_10_max": 0.08516307019678841,
|
99 |
+
"nauc_precision_at_10_std": 0.08936548083478481,
|
100 |
+
"nauc_precision_at_1_diff1": 0.576490513877843,
|
101 |
+
"nauc_precision_at_1_max": 0.155866059169816,
|
102 |
+
"nauc_precision_at_1_std": 0.09440442396510458,
|
103 |
+
"nauc_precision_at_20_diff1": 0.35370118406718887,
|
104 |
+
"nauc_precision_at_20_max": 0.07720501737285508,
|
105 |
+
"nauc_precision_at_20_std": 0.09512670518828382,
|
106 |
+
"nauc_precision_at_3_diff1": 0.4648455680777127,
|
107 |
+
"nauc_precision_at_3_max": 0.12193379632419739,
|
108 |
+
"nauc_precision_at_3_std": 0.09333400762182767,
|
109 |
+
"nauc_precision_at_5_diff1": 0.42689240448557475,
|
110 |
+
"nauc_precision_at_5_max": 0.10840841308271118,
|
111 |
+
"nauc_precision_at_5_std": 0.09114478125877269,
|
112 |
+
"nauc_recall_at_1000_diff1": 0.22171401911333835,
|
113 |
+
"nauc_recall_at_1000_max": 0.05180228755438666,
|
114 |
+
"nauc_recall_at_1000_std": 0.12147817396071107,
|
115 |
+
"nauc_recall_at_100_diff1": 0.2930513840339097,
|
116 |
+
"nauc_recall_at_100_max": 0.05845799620842323,
|
117 |
+
"nauc_recall_at_100_std": 0.103295861845414,
|
118 |
+
"nauc_recall_at_10_diff1": 0.37748222270492904,
|
119 |
+
"nauc_recall_at_10_max": 0.08516307019678845,
|
120 |
+
"nauc_recall_at_10_std": 0.0893654808347849,
|
121 |
+
"nauc_recall_at_1_diff1": 0.576490513877843,
|
122 |
+
"nauc_recall_at_1_max": 0.155866059169816,
|
123 |
+
"nauc_recall_at_1_std": 0.09440442396510458,
|
124 |
+
"nauc_recall_at_20_diff1": 0.353701184067189,
|
125 |
+
"nauc_recall_at_20_max": 0.07720501737285505,
|
126 |
+
"nauc_recall_at_20_std": 0.09512670518828369,
|
127 |
+
"nauc_recall_at_3_diff1": 0.46484556807771255,
|
128 |
+
"nauc_recall_at_3_max": 0.12193379632419747,
|
129 |
+
"nauc_recall_at_3_std": 0.09333400762182767,
|
130 |
+
"nauc_recall_at_5_diff1": 0.4268924044855751,
|
131 |
+
"nauc_recall_at_5_max": 0.10840841308271101,
|
132 |
+
"nauc_recall_at_5_std": 0.09114478125877268,
|
133 |
+
"ndcg_at_1": 0.12171,
|
134 |
+
"ndcg_at_10": 0.15938,
|
135 |
+
"ndcg_at_100": 0.17773,
|
136 |
+
"ndcg_at_1000": 0.19422,
|
137 |
+
"ndcg_at_20": 0.16545,
|
138 |
+
"ndcg_at_3": 0.14423,
|
139 |
+
"ndcg_at_5": 0.1515,
|
140 |
+
"precision_at_1": 0.12171,
|
141 |
+
"precision_at_10": 0.02022,
|
142 |
+
"precision_at_100": 0.00293,
|
143 |
+
"precision_at_1000": 0.00043,
|
144 |
+
"precision_at_20": 0.0113,
|
145 |
+
"precision_at_3": 0.05335,
|
146 |
+
"precision_at_5": 0.03555,
|
147 |
+
"recall_at_1": 0.12171,
|
148 |
+
"recall_at_10": 0.20215,
|
149 |
+
"recall_at_100": 0.29329,
|
150 |
+
"recall_at_1000": 0.42977,
|
151 |
+
"recall_at_20": 0.22595,
|
152 |
+
"recall_at_3": 0.16005,
|
153 |
+
"recall_at_5": 0.17775
|
154 |
+
}
|
155 |
+
]
|
156 |
+
},
|
157 |
+
"task_name": "CodeFeedbackMT"
|
158 |
+
}
|
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/CodeSearchNetCCRetrieval.json
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset_revision": "6e1effa2c03723c5fde48ee912b5ee08d4f211e8",
|
3 |
+
"evaluation_time": 0.0003421306610107422,
|
4 |
+
"kg_co2_emissions": null,
|
5 |
+
"mteb_version": "1.14.15",
|
6 |
+
"scores": {},
|
7 |
+
"task_name": "CodeSearchNetCCRetrieval"
|
8 |
+
}
|
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/CodeTransOceanContest.json
ADDED
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset_revision": "20da4eb20a4b17300c0986ee148c90867a7f2a4d",
|
3 |
+
"evaluation_time": 0.8471865653991699,
|
4 |
+
"kg_co2_emissions": null,
|
5 |
+
"mteb_version": "1.14.15",
|
6 |
+
"scores": {
|
7 |
+
"test": [
|
8 |
+
{
|
9 |
+
"hf_subset": "default",
|
10 |
+
"languages": [
|
11 |
+
"python-Code",
|
12 |
+
"c++-Code"
|
13 |
+
],
|
14 |
+
"main_score": 0.09511,
|
15 |
+
"map_at_1": 0.06787,
|
16 |
+
"map_at_10": 0.08418,
|
17 |
+
"map_at_100": 0.08966,
|
18 |
+
"map_at_1000": 0.09203,
|
19 |
+
"map_at_20": 0.08539,
|
20 |
+
"map_at_3": 0.07994,
|
21 |
+
"map_at_5": 0.08107,
|
22 |
+
"mrr_at_1": 0.06787330316742081,
|
23 |
+
"mrr_at_10": 0.08418085182791066,
|
24 |
+
"mrr_at_100": 0.08966102694364947,
|
25 |
+
"mrr_at_1000": 0.09203289825977753,
|
26 |
+
"mrr_at_20": 0.08539287509875745,
|
27 |
+
"mrr_at_3": 0.0799396681749623,
|
28 |
+
"mrr_at_5": 0.08107088989441931,
|
29 |
+
"nauc_map_at_1000_diff1": 0.4002206493865976,
|
30 |
+
"nauc_map_at_1000_max": 0.05565781891778103,
|
31 |
+
"nauc_map_at_1000_std": -0.004874041232219024,
|
32 |
+
"nauc_map_at_100_diff1": 0.39935983795466523,
|
33 |
+
"nauc_map_at_100_max": 0.05300217863316174,
|
34 |
+
"nauc_map_at_100_std": -0.003035576698055301,
|
35 |
+
"nauc_map_at_10_diff1": 0.41045827644014476,
|
36 |
+
"nauc_map_at_10_max": 0.06092815179963274,
|
37 |
+
"nauc_map_at_10_std": -0.005952146225472054,
|
38 |
+
"nauc_map_at_1_diff1": 0.5123648006404755,
|
39 |
+
"nauc_map_at_1_max": 0.06018186942687983,
|
40 |
+
"nauc_map_at_1_std": 0.003062256335957277,
|
41 |
+
"nauc_map_at_20_diff1": 0.41026580715260286,
|
42 |
+
"nauc_map_at_20_max": 0.05961500732393842,
|
43 |
+
"nauc_map_at_20_std": -0.008812971012975637,
|
44 |
+
"nauc_map_at_3_diff1": 0.43328416235104994,
|
45 |
+
"nauc_map_at_3_max": 0.061612024091789064,
|
46 |
+
"nauc_map_at_3_std": 0.0009811686045742218,
|
47 |
+
"nauc_map_at_5_diff1": 0.42638117055302016,
|
48 |
+
"nauc_map_at_5_max": 0.062409134446330596,
|
49 |
+
"nauc_map_at_5_std": -3.141760809509476e-05,
|
50 |
+
"nauc_mrr_at_1000_diff1": 0.4002206493865976,
|
51 |
+
"nauc_mrr_at_1000_max": 0.05565781891778103,
|
52 |
+
"nauc_mrr_at_1000_std": -0.004874041232219024,
|
53 |
+
"nauc_mrr_at_100_diff1": 0.39935983795466523,
|
54 |
+
"nauc_mrr_at_100_max": 0.05300217863316174,
|
55 |
+
"nauc_mrr_at_100_std": -0.003035576698055301,
|
56 |
+
"nauc_mrr_at_10_diff1": 0.41045827644014476,
|
57 |
+
"nauc_mrr_at_10_max": 0.06092815179963274,
|
58 |
+
"nauc_mrr_at_10_std": -0.005952146225472054,
|
59 |
+
"nauc_mrr_at_1_diff1": 0.5123648006404755,
|
60 |
+
"nauc_mrr_at_1_max": 0.06018186942687983,
|
61 |
+
"nauc_mrr_at_1_std": 0.003062256335957277,
|
62 |
+
"nauc_mrr_at_20_diff1": 0.41026580715260286,
|
63 |
+
"nauc_mrr_at_20_max": 0.05961500732393842,
|
64 |
+
"nauc_mrr_at_20_std": -0.008812971012975637,
|
65 |
+
"nauc_mrr_at_3_diff1": 0.43328416235104994,
|
66 |
+
"nauc_mrr_at_3_max": 0.061612024091789064,
|
67 |
+
"nauc_mrr_at_3_std": 0.0009811686045742218,
|
68 |
+
"nauc_mrr_at_5_diff1": 0.42638117055302016,
|
69 |
+
"nauc_mrr_at_5_max": 0.062409134446330596,
|
70 |
+
"nauc_mrr_at_5_std": -3.141760809509476e-05,
|
71 |
+
"nauc_ndcg_at_1000_diff1": 0.3401849332107565,
|
72 |
+
"nauc_ndcg_at_1000_max": 0.05887650595047429,
|
73 |
+
"nauc_ndcg_at_1000_std": 0.004274830251501765,
|
74 |
+
"nauc_ndcg_at_100_diff1": 0.3017142674492828,
|
75 |
+
"nauc_ndcg_at_100_max": 0.01657746093566299,
|
76 |
+
"nauc_ndcg_at_100_std": 0.020445323924594527,
|
77 |
+
"nauc_ndcg_at_10_diff1": 0.3606925243087163,
|
78 |
+
"nauc_ndcg_at_10_max": 0.05993698215407892,
|
79 |
+
"nauc_ndcg_at_10_std": -0.012383471019315629,
|
80 |
+
"nauc_ndcg_at_1_diff1": 0.5123648006404755,
|
81 |
+
"nauc_ndcg_at_1_max": 0.06018186942687983,
|
82 |
+
"nauc_ndcg_at_1_std": 0.003062256335957277,
|
83 |
+
"nauc_ndcg_at_20_diff1": 0.3627658572653584,
|
84 |
+
"nauc_ndcg_at_20_max": 0.05503924863968874,
|
85 |
+
"nauc_ndcg_at_20_std": -0.022353744095367632,
|
86 |
+
"nauc_ndcg_at_3_diff1": 0.40774589816759704,
|
87 |
+
"nauc_ndcg_at_3_max": 0.06078295183380332,
|
88 |
+
"nauc_ndcg_at_3_std": 0.002631991326812176,
|
89 |
+
"nauc_ndcg_at_5_diff1": 0.39699453568762005,
|
90 |
+
"nauc_ndcg_at_5_max": 0.06208096521525048,
|
91 |
+
"nauc_ndcg_at_5_std": 0.0009741567889838872,
|
92 |
+
"nauc_precision_at_1000_diff1": -0.332421505946305,
|
93 |
+
"nauc_precision_at_1000_max": 1.0,
|
94 |
+
"nauc_precision_at_1000_std": 0.9564489112227755,
|
95 |
+
"nauc_precision_at_100_diff1": 0.12129385857557387,
|
96 |
+
"nauc_precision_at_100_max": -0.0634555570739123,
|
97 |
+
"nauc_precision_at_100_std": 0.08437119311025783,
|
98 |
+
"nauc_precision_at_10_diff1": 0.2477538993229102,
|
99 |
+
"nauc_precision_at_10_max": 0.058120653790512844,
|
100 |
+
"nauc_precision_at_10_std": -0.028666404671314694,
|
101 |
+
"nauc_precision_at_1_diff1": 0.5123648006404755,
|
102 |
+
"nauc_precision_at_1_max": 0.06018186942687983,
|
103 |
+
"nauc_precision_at_1_std": 0.003062256335957277,
|
104 |
+
"nauc_precision_at_20_diff1": 0.2655368456031618,
|
105 |
+
"nauc_precision_at_20_max": 0.04343249021784076,
|
106 |
+
"nauc_precision_at_20_std": -0.05672812486089926,
|
107 |
+
"nauc_precision_at_3_diff1": 0.3471986913496342,
|
108 |
+
"nauc_precision_at_3_max": 0.05854588807862574,
|
109 |
+
"nauc_precision_at_3_std": 0.007034303620076266,
|
110 |
+
"nauc_precision_at_5_diff1": 0.3279845328741994,
|
111 |
+
"nauc_precision_at_5_max": 0.06114433941272132,
|
112 |
+
"nauc_precision_at_5_std": 0.003670428141042012,
|
113 |
+
"nauc_recall_at_1000_diff1": -0.33242150594628805,
|
114 |
+
"nauc_recall_at_1000_max": 1.0,
|
115 |
+
"nauc_recall_at_1000_std": 0.9564489112227793,
|
116 |
+
"nauc_recall_at_100_diff1": 0.1212938585755736,
|
117 |
+
"nauc_recall_at_100_max": -0.06345555707391222,
|
118 |
+
"nauc_recall_at_100_std": 0.08437119311025772,
|
119 |
+
"nauc_recall_at_10_diff1": 0.24775389932291023,
|
120 |
+
"nauc_recall_at_10_max": 0.05812065379051293,
|
121 |
+
"nauc_recall_at_10_std": -0.02866640467131462,
|
122 |
+
"nauc_recall_at_1_diff1": 0.5123648006404755,
|
123 |
+
"nauc_recall_at_1_max": 0.06018186942687983,
|
124 |
+
"nauc_recall_at_1_std": 0.003062256335957277,
|
125 |
+
"nauc_recall_at_20_diff1": 0.2655368456031617,
|
126 |
+
"nauc_recall_at_20_max": 0.04343249021784082,
|
127 |
+
"nauc_recall_at_20_std": -0.05672812486089907,
|
128 |
+
"nauc_recall_at_3_diff1": 0.34719869134963405,
|
129 |
+
"nauc_recall_at_3_max": 0.058545888078625666,
|
130 |
+
"nauc_recall_at_3_std": 0.007034303620076171,
|
131 |
+
"nauc_recall_at_5_diff1": 0.3279845328741993,
|
132 |
+
"nauc_recall_at_5_max": 0.06114433941272142,
|
133 |
+
"nauc_recall_at_5_std": 0.0036704281410421266,
|
134 |
+
"ndcg_at_1": 0.06787,
|
135 |
+
"ndcg_at_10": 0.09511,
|
136 |
+
"ndcg_at_100": 0.13674,
|
137 |
+
"ndcg_at_1000": 0.21258,
|
138 |
+
"ndcg_at_20": 0.09964,
|
139 |
+
"ndcg_at_3": 0.0849,
|
140 |
+
"ndcg_at_5": 0.08684,
|
141 |
+
"precision_at_1": 0.06787,
|
142 |
+
"precision_at_10": 0.01312,
|
143 |
+
"precision_at_100": 0.00362,
|
144 |
+
"precision_at_1000": 0.00099,
|
145 |
+
"precision_at_20": 0.00747,
|
146 |
+
"precision_at_3": 0.03318,
|
147 |
+
"precision_at_5": 0.02081,
|
148 |
+
"recall_at_1": 0.06787,
|
149 |
+
"recall_at_10": 0.13122,
|
150 |
+
"recall_at_100": 0.36199,
|
151 |
+
"recall_at_1000": 0.98643,
|
152 |
+
"recall_at_20": 0.14932,
|
153 |
+
"recall_at_3": 0.09955,
|
154 |
+
"recall_at_5": 0.10407
|
155 |
+
}
|
156 |
+
]
|
157 |
+
},
|
158 |
+
"task_name": "CodeTransOceanContest"
|
159 |
+
}
|
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/CodeTransOceanDL.json
ADDED
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset_revision": "281562cb8a1265ab5c0824bfa6ddcd9b0a15618f",
|
3 |
+
"evaluation_time": 0.3599967956542969,
|
4 |
+
"kg_co2_emissions": null,
|
5 |
+
"mteb_version": "1.14.15",
|
6 |
+
"scores": {
|
7 |
+
"test": [
|
8 |
+
{
|
9 |
+
"hf_subset": "default",
|
10 |
+
"languages": [
|
11 |
+
"python-Code"
|
12 |
+
],
|
13 |
+
"main_score": 0.27797,
|
14 |
+
"map_at_1": 0.06667,
|
15 |
+
"map_at_10": 0.16857,
|
16 |
+
"map_at_100": 0.18958,
|
17 |
+
"map_at_1000": 0.18973,
|
18 |
+
"map_at_20": 0.18736,
|
19 |
+
"map_at_3": 0.08704,
|
20 |
+
"map_at_5": 0.12065,
|
21 |
+
"mrr_at_1": 0.022222222222222223,
|
22 |
+
"mrr_at_10": 0.14406305114638449,
|
23 |
+
"mrr_at_100": 0.16441129312841948,
|
24 |
+
"mrr_at_1000": 0.1645598395211377,
|
25 |
+
"mrr_at_20": 0.16211789469464907,
|
26 |
+
"mrr_at_3": 0.057407407407407414,
|
27 |
+
"mrr_at_5": 0.09407407407407407,
|
28 |
+
"nauc_map_at_1000_diff1": 0.02784086538826234,
|
29 |
+
"nauc_map_at_1000_max": -0.287573307732991,
|
30 |
+
"nauc_map_at_1000_std": -0.034745406422382066,
|
31 |
+
"nauc_map_at_100_diff1": 0.027731715108106954,
|
32 |
+
"nauc_map_at_100_max": -0.28771336576146495,
|
33 |
+
"nauc_map_at_100_std": -0.03383814176825187,
|
34 |
+
"nauc_map_at_10_diff1": 0.03945986957137019,
|
35 |
+
"nauc_map_at_10_max": -0.25226936866501254,
|
36 |
+
"nauc_map_at_10_std": -0.03190349240293486,
|
37 |
+
"nauc_map_at_1_diff1": -0.0031079509882836883,
|
38 |
+
"nauc_map_at_1_max": -0.38109292549861384,
|
39 |
+
"nauc_map_at_1_std": 0.01117967981397013,
|
40 |
+
"nauc_map_at_20_diff1": 0.02588133080683189,
|
41 |
+
"nauc_map_at_20_max": -0.2867971359129569,
|
42 |
+
"nauc_map_at_20_std": -0.04076734332555616,
|
43 |
+
"nauc_map_at_3_diff1": -0.041422641218933104,
|
44 |
+
"nauc_map_at_3_max": -0.3935239742048571,
|
45 |
+
"nauc_map_at_3_std": -0.016444699737666612,
|
46 |
+
"nauc_map_at_5_diff1": 0.09205138060696524,
|
47 |
+
"nauc_map_at_5_max": -0.33277332112682373,
|
48 |
+
"nauc_map_at_5_std": -0.03392255678772473,
|
49 |
+
"nauc_mrr_at_1000_diff1": -0.28331752487610157,
|
50 |
+
"nauc_mrr_at_1000_max": -0.2111897323809926,
|
51 |
+
"nauc_mrr_at_1000_std": -0.16180758984470822,
|
52 |
+
"nauc_mrr_at_100_diff1": -0.2830509416012681,
|
53 |
+
"nauc_mrr_at_100_max": -0.21149355358382807,
|
54 |
+
"nauc_mrr_at_100_std": -0.16037144078976506,
|
55 |
+
"nauc_mrr_at_10_diff1": -0.24908906607383133,
|
56 |
+
"nauc_mrr_at_10_max": -0.16222471422585077,
|
57 |
+
"nauc_mrr_at_10_std": -0.13732552303818502,
|
58 |
+
"nauc_mrr_at_1_diff1": -0.5747249798765764,
|
59 |
+
"nauc_mrr_at_1_max": -0.4543198282801182,
|
60 |
+
"nauc_mrr_at_1_std": -0.35008049369466065,
|
61 |
+
"nauc_mrr_at_20_diff1": -0.2788790690994075,
|
62 |
+
"nauc_mrr_at_20_max": -0.21245333313324236,
|
63 |
+
"nauc_mrr_at_20_std": -0.16798426695097868,
|
64 |
+
"nauc_mrr_at_3_diff1": -0.2978755408478371,
|
65 |
+
"nauc_mrr_at_3_max": -0.3738950777316688,
|
66 |
+
"nauc_mrr_at_3_std": -0.16400629993717764,
|
67 |
+
"nauc_mrr_at_5_diff1": -0.2553974200779292,
|
68 |
+
"nauc_mrr_at_5_max": -0.2566333148288954,
|
69 |
+
"nauc_mrr_at_5_std": -0.1662715837606456,
|
70 |
+
"nauc_ndcg_at_1000_diff1": 0.018095130261789123,
|
71 |
+
"nauc_ndcg_at_1000_max": -0.25210611817093725,
|
72 |
+
"nauc_ndcg_at_1000_std": -0.045639669938462205,
|
73 |
+
"nauc_ndcg_at_100_diff1": 0.014294076823156266,
|
74 |
+
"nauc_ndcg_at_100_max": -0.2523941368276548,
|
75 |
+
"nauc_ndcg_at_100_std": -0.024740265353583573,
|
76 |
+
"nauc_ndcg_at_10_diff1": 0.028517877606712184,
|
77 |
+
"nauc_ndcg_at_10_max": -0.1379350447346928,
|
78 |
+
"nauc_ndcg_at_10_std": -0.04890416556969064,
|
79 |
+
"nauc_ndcg_at_1_diff1": -0.0031079509882836883,
|
80 |
+
"nauc_ndcg_at_1_max": -0.38109292549861384,
|
81 |
+
"nauc_ndcg_at_1_std": 0.01117967981397013,
|
82 |
+
"nauc_ndcg_at_20_diff1": -0.01798223055051044,
|
83 |
+
"nauc_ndcg_at_20_max": -0.21587479592623202,
|
84 |
+
"nauc_ndcg_at_20_std": -0.08674791082336787,
|
85 |
+
"nauc_ndcg_at_3_diff1": -0.055225744089572794,
|
86 |
+
"nauc_ndcg_at_3_max": -0.3980023359780902,
|
87 |
+
"nauc_ndcg_at_3_std": -0.026396552418542944,
|
88 |
+
"nauc_ndcg_at_5_diff1": 0.1484750076478242,
|
89 |
+
"nauc_ndcg_at_5_max": -0.3149749102906245,
|
90 |
+
"nauc_ndcg_at_5_std": -0.0507138930089742,
|
91 |
+
"nauc_precision_at_1000_diff1": 1.0,
|
92 |
+
"nauc_precision_at_1000_max": 1.0,
|
93 |
+
"nauc_precision_at_1000_std": 1.0,
|
94 |
+
"nauc_precision_at_100_diff1": -0.17098506069093805,
|
95 |
+
"nauc_precision_at_100_max": -0.22292250233425662,
|
96 |
+
"nauc_precision_at_100_std": 0.8978758169934575,
|
97 |
+
"nauc_precision_at_10_diff1": -0.017575447383332443,
|
98 |
+
"nauc_precision_at_10_max": 0.1212641892262422,
|
99 |
+
"nauc_precision_at_10_std": -0.08237519878626094,
|
100 |
+
"nauc_precision_at_1_diff1": -0.0031079509882836883,
|
101 |
+
"nauc_precision_at_1_max": -0.38109292549861384,
|
102 |
+
"nauc_precision_at_1_std": 0.01117967981397013,
|
103 |
+
"nauc_precision_at_20_diff1": -0.3151420415930391,
|
104 |
+
"nauc_precision_at_20_max": 0.12029359793394034,
|
105 |
+
"nauc_precision_at_20_std": -0.33358252911059727,
|
106 |
+
"nauc_precision_at_3_diff1": -0.08457460873068837,
|
107 |
+
"nauc_precision_at_3_max": -0.4075244582667053,
|
108 |
+
"nauc_precision_at_3_std": -0.04755669155513028,
|
109 |
+
"nauc_precision_at_5_diff1": 0.2490820314035684,
|
110 |
+
"nauc_precision_at_5_max": -0.2860130677842162,
|
111 |
+
"nauc_precision_at_5_std": -0.07902785749667773,
|
112 |
+
"nauc_recall_at_1000_diff1": NaN,
|
113 |
+
"nauc_recall_at_1000_max": NaN,
|
114 |
+
"nauc_recall_at_1000_std": NaN,
|
115 |
+
"nauc_recall_at_100_diff1": -0.1709850606909425,
|
116 |
+
"nauc_recall_at_100_max": -0.2229225023342648,
|
117 |
+
"nauc_recall_at_100_std": 0.8978758169934654,
|
118 |
+
"nauc_recall_at_10_diff1": -0.017575447383333363,
|
119 |
+
"nauc_recall_at_10_max": 0.12126418922624167,
|
120 |
+
"nauc_recall_at_10_std": -0.08237519878626104,
|
121 |
+
"nauc_recall_at_1_diff1": -0.0031079509882836883,
|
122 |
+
"nauc_recall_at_1_max": -0.38109292549861384,
|
123 |
+
"nauc_recall_at_1_std": 0.01117967981397013,
|
124 |
+
"nauc_recall_at_20_diff1": -0.3151420415930406,
|
125 |
+
"nauc_recall_at_20_max": 0.12029359793393885,
|
126 |
+
"nauc_recall_at_20_std": -0.33358252911059794,
|
127 |
+
"nauc_recall_at_3_diff1": -0.08457460873068838,
|
128 |
+
"nauc_recall_at_3_max": -0.4075244582667053,
|
129 |
+
"nauc_recall_at_3_std": -0.047556691555130225,
|
130 |
+
"nauc_recall_at_5_diff1": 0.24908203140356805,
|
131 |
+
"nauc_recall_at_5_max": -0.2860130677842164,
|
132 |
+
"nauc_recall_at_5_std": -0.07902785749667794,
|
133 |
+
"ndcg_at_1": 0.06667,
|
134 |
+
"ndcg_at_10": 0.27797,
|
135 |
+
"ndcg_at_100": 0.35629,
|
136 |
+
"ndcg_at_1000": 0.35936,
|
137 |
+
"ndcg_at_20": 0.33924,
|
138 |
+
"ndcg_at_3": 0.09722,
|
139 |
+
"ndcg_at_5": 0.15724,
|
140 |
+
"precision_at_1": 0.06667,
|
141 |
+
"precision_at_10": 0.06556,
|
142 |
+
"precision_at_100": 0.00978,
|
143 |
+
"precision_at_1000": 0.001,
|
144 |
+
"precision_at_20": 0.04417,
|
145 |
+
"precision_at_3": 0.04259,
|
146 |
+
"precision_at_5": 0.05444,
|
147 |
+
"recall_at_1": 0.06667,
|
148 |
+
"recall_at_10": 0.65556,
|
149 |
+
"recall_at_100": 0.97778,
|
150 |
+
"recall_at_1000": 1.0,
|
151 |
+
"recall_at_20": 0.88333,
|
152 |
+
"recall_at_3": 0.12778,
|
153 |
+
"recall_at_5": 0.27222
|
154 |
+
}
|
155 |
+
]
|
156 |
+
},
|
157 |
+
"task_name": "CodeTransOceanDL"
|
158 |
+
}
|
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/CosQA.json
ADDED
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset_revision": "bc5efb7e9d437246ce393ed19d772e08e4a79535",
|
3 |
+
"evaluation_time": 20.75157332420349,
|
4 |
+
"kg_co2_emissions": null,
|
5 |
+
"mteb_version": "1.14.15",
|
6 |
+
"scores": {
|
7 |
+
"test": [
|
8 |
+
{
|
9 |
+
"hf_subset": "default",
|
10 |
+
"languages": [
|
11 |
+
"eng-Latn",
|
12 |
+
"python-Code"
|
13 |
+
],
|
14 |
+
"main_score": 0.00971,
|
15 |
+
"map_at_1": 0.004,
|
16 |
+
"map_at_10": 0.00722,
|
17 |
+
"map_at_100": 0.01022,
|
18 |
+
"map_at_1000": 0.01074,
|
19 |
+
"map_at_20": 0.00905,
|
20 |
+
"map_at_3": 0.005,
|
21 |
+
"map_at_5": 0.0065,
|
22 |
+
"mrr_at_1": 0.002,
|
23 |
+
"mrr_at_10": 0.004341269841269841,
|
24 |
+
"mrr_at_100": 0.007548299135165892,
|
25 |
+
"mrr_at_1000": 0.008049580536282804,
|
26 |
+
"mrr_at_20": 0.0062800911077226865,
|
27 |
+
"mrr_at_3": 0.003,
|
28 |
+
"mrr_at_5": 0.003,
|
29 |
+
"nauc_map_at_1000_diff1": -0.3294146795633615,
|
30 |
+
"nauc_map_at_1000_max": -0.24406048510714173,
|
31 |
+
"nauc_map_at_1000_std": -0.1753914769148377,
|
32 |
+
"nauc_map_at_100_diff1": -0.3401799456138604,
|
33 |
+
"nauc_map_at_100_max": -0.25047032754654014,
|
34 |
+
"nauc_map_at_100_std": -0.17739627719079182,
|
35 |
+
"nauc_map_at_10_diff1": -0.4378248126973643,
|
36 |
+
"nauc_map_at_10_max": -0.30162019359765524,
|
37 |
+
"nauc_map_at_10_std": -0.21082330602051547,
|
38 |
+
"nauc_map_at_1_diff1": -0.5747249798765764,
|
39 |
+
"nauc_map_at_1_max": -0.20056345586262406,
|
40 |
+
"nauc_map_at_1_std": -0.24282264555943125,
|
41 |
+
"nauc_map_at_20_diff1": -0.3606250545571463,
|
42 |
+
"nauc_map_at_20_max": -0.24877416436848493,
|
43 |
+
"nauc_map_at_20_std": -0.1867708065128874,
|
44 |
+
"nauc_map_at_3_diff1": -0.5596994902066005,
|
45 |
+
"nauc_map_at_3_max": -0.27539576066541455,
|
46 |
+
"nauc_map_at_3_std": -0.2772739468741615,
|
47 |
+
"nauc_map_at_5_diff1": -0.4422612536376958,
|
48 |
+
"nauc_map_at_5_max": -0.3124806505541681,
|
49 |
+
"nauc_map_at_5_std": -0.21923180120121363,
|
50 |
+
"nauc_mrr_at_1000_diff1": -0.301017062901502,
|
51 |
+
"nauc_mrr_at_1000_max": -0.30574852118541557,
|
52 |
+
"nauc_mrr_at_1000_std": -0.12721072093737698,
|
53 |
+
"nauc_mrr_at_100_diff1": -0.31271510738527736,
|
54 |
+
"nauc_mrr_at_100_max": -0.3177547103171918,
|
55 |
+
"nauc_mrr_at_100_std": -0.12828398711778335,
|
56 |
+
"nauc_mrr_at_10_diff1": -0.4292943091791753,
|
57 |
+
"nauc_mrr_at_10_max": -0.45746906437484464,
|
58 |
+
"nauc_mrr_at_10_std": -0.1527977322458919,
|
59 |
+
"nauc_mrr_at_1_diff1": -0.5747249798765764,
|
60 |
+
"nauc_mrr_at_1_max": -0.4995975315266972,
|
61 |
+
"nauc_mrr_at_1_std": -0.0705661389857795,
|
62 |
+
"nauc_mrr_at_20_diff1": -0.3407197064284155,
|
63 |
+
"nauc_mrr_at_20_max": -0.33440320216781877,
|
64 |
+
"nauc_mrr_at_20_std": -0.12890471230942302,
|
65 |
+
"nauc_mrr_at_3_diff1": -0.5496824970932833,
|
66 |
+
"nauc_mrr_at_3_max": -0.5246400143099902,
|
67 |
+
"nauc_mrr_at_3_std": -0.18540381003488063,
|
68 |
+
"nauc_mrr_at_5_diff1": -0.5496824970932833,
|
69 |
+
"nauc_mrr_at_5_max": -0.5246400143099902,
|
70 |
+
"nauc_mrr_at_5_std": -0.18540381003488063,
|
71 |
+
"nauc_ndcg_at_1000_diff1": -0.1726642525674944,
|
72 |
+
"nauc_ndcg_at_1000_max": -0.1519851274416735,
|
73 |
+
"nauc_ndcg_at_1000_std": -0.152970784901727,
|
74 |
+
"nauc_ndcg_at_100_diff1": -0.22648229459252223,
|
75 |
+
"nauc_ndcg_at_100_max": -0.20905633164487697,
|
76 |
+
"nauc_ndcg_at_100_std": -0.15127742985051915,
|
77 |
+
"nauc_ndcg_at_10_diff1": -0.3920183074503633,
|
78 |
+
"nauc_ndcg_at_10_max": -0.31340312237742524,
|
79 |
+
"nauc_ndcg_at_10_std": -0.18755048697604484,
|
80 |
+
"nauc_ndcg_at_1_diff1": -0.5747249798765764,
|
81 |
+
"nauc_ndcg_at_1_max": -0.20056345586262406,
|
82 |
+
"nauc_ndcg_at_1_std": -0.24282264555943125,
|
83 |
+
"nauc_ndcg_at_20_diff1": -0.24984338312909435,
|
84 |
+
"nauc_ndcg_at_20_max": -0.19884254695674725,
|
85 |
+
"nauc_ndcg_at_20_std": -0.13845214629934277,
|
86 |
+
"nauc_ndcg_at_3_diff1": -0.556708481180822,
|
87 |
+
"nauc_ndcg_at_3_max": -0.2902920538313011,
|
88 |
+
"nauc_ndcg_at_3_std": -0.28413190328326815,
|
89 |
+
"nauc_ndcg_at_5_diff1": -0.3900872909119857,
|
90 |
+
"nauc_ndcg_at_5_max": -0.338313007450872,
|
91 |
+
"nauc_ndcg_at_5_std": -0.19903625569609631,
|
92 |
+
"nauc_precision_at_1000_diff1": -0.11872279831421322,
|
93 |
+
"nauc_precision_at_1000_max": -0.10043235608226829,
|
94 |
+
"nauc_precision_at_1000_std": -0.1582434744663016,
|
95 |
+
"nauc_precision_at_100_diff1": -0.1709441093780988,
|
96 |
+
"nauc_precision_at_100_max": -0.18360728409607915,
|
97 |
+
"nauc_precision_at_100_std": -0.14930413784253577,
|
98 |
+
"nauc_precision_at_10_diff1": -0.3340488328414273,
|
99 |
+
"nauc_precision_at_10_max": -0.32435977700265317,
|
100 |
+
"nauc_precision_at_10_std": -0.155978892764511,
|
101 |
+
"nauc_precision_at_1_diff1": -0.5747249798765764,
|
102 |
+
"nauc_precision_at_1_max": -0.20056345586262406,
|
103 |
+
"nauc_precision_at_1_std": -0.24282264555943125,
|
104 |
+
"nauc_precision_at_20_diff1": -0.1562553357562748,
|
105 |
+
"nauc_precision_at_20_max": -0.1475961655730907,
|
106 |
+
"nauc_precision_at_20_std": -0.09494597165646255,
|
107 |
+
"nauc_precision_at_3_diff1": -0.5496824970932834,
|
108 |
+
"nauc_precision_at_3_max": -0.32528396386727493,
|
109 |
+
"nauc_precision_at_3_std": -0.3002414810839818,
|
110 |
+
"nauc_precision_at_5_diff1": -0.30024148108398174,
|
111 |
+
"nauc_precision_at_5_max": -0.3806904570253108,
|
112 |
+
"nauc_precision_at_5_std": -0.1629997316876844,
|
113 |
+
"nauc_recall_at_1000_diff1": -0.11872279831421319,
|
114 |
+
"nauc_recall_at_1000_max": -0.10043235608226833,
|
115 |
+
"nauc_recall_at_1000_std": -0.1582434744663014,
|
116 |
+
"nauc_recall_at_100_diff1": -0.17094410937809862,
|
117 |
+
"nauc_recall_at_100_max": -0.1836072840960791,
|
118 |
+
"nauc_recall_at_100_std": -0.14930413784253557,
|
119 |
+
"nauc_recall_at_10_diff1": -0.33404883284142745,
|
120 |
+
"nauc_recall_at_10_max": -0.3243597770026533,
|
121 |
+
"nauc_recall_at_10_std": -0.15597889276451127,
|
122 |
+
"nauc_recall_at_1_diff1": -0.5747249798765764,
|
123 |
+
"nauc_recall_at_1_max": -0.20056345586262406,
|
124 |
+
"nauc_recall_at_1_std": -0.24282264555943125,
|
125 |
+
"nauc_recall_at_20_diff1": -0.15625533575627482,
|
126 |
+
"nauc_recall_at_20_max": -0.14759616557309066,
|
127 |
+
"nauc_recall_at_20_std": -0.09494597165646257,
|
128 |
+
"nauc_recall_at_3_diff1": -0.5496824970932833,
|
129 |
+
"nauc_recall_at_3_max": -0.3252839638672748,
|
130 |
+
"nauc_recall_at_3_std": -0.3002414810839818,
|
131 |
+
"nauc_recall_at_5_diff1": -0.30024148108398174,
|
132 |
+
"nauc_recall_at_5_max": -0.38069045702531085,
|
133 |
+
"nauc_recall_at_5_std": -0.1629997316876844,
|
134 |
+
"ndcg_at_1": 0.004,
|
135 |
+
"ndcg_at_10": 0.00971,
|
136 |
+
"ndcg_at_100": 0.02508,
|
137 |
+
"ndcg_at_1000": 0.04516,
|
138 |
+
"ndcg_at_20": 0.01632,
|
139 |
+
"ndcg_at_3": 0.00526,
|
140 |
+
"ndcg_at_5": 0.00785,
|
141 |
+
"precision_at_1": 0.004,
|
142 |
+
"precision_at_10": 0.0018,
|
143 |
+
"precision_at_100": 0.00092,
|
144 |
+
"precision_at_1000": 0.00026,
|
145 |
+
"precision_at_20": 0.0022,
|
146 |
+
"precision_at_3": 0.002,
|
147 |
+
"precision_at_5": 0.0024,
|
148 |
+
"recall_at_1": 0.004,
|
149 |
+
"recall_at_10": 0.018,
|
150 |
+
"recall_at_100": 0.092,
|
151 |
+
"recall_at_1000": 0.264,
|
152 |
+
"recall_at_20": 0.044,
|
153 |
+
"recall_at_3": 0.006,
|
154 |
+
"recall_at_5": 0.012
|
155 |
+
}
|
156 |
+
]
|
157 |
+
},
|
158 |
+
"task_name": "CosQA"
|
159 |
+
}
|
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/STSBenchmark.json
DELETED
@@ -1,26 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"dataset_revision": "b0fddb56ed78048fa8b90373c8a3cfc37b684831",
|
3 |
-
"evaluation_time": 0.12331175804138184,
|
4 |
-
"kg_co2_emissions": null,
|
5 |
-
"mteb_version": "1.14.15",
|
6 |
-
"scores": {
|
7 |
-
"test": [
|
8 |
-
{
|
9 |
-
"cosine_pearson": 0.34632056143460516,
|
10 |
-
"cosine_spearman": 0.42973159111999676,
|
11 |
-
"euclidean_pearson": 0.4043313982401531,
|
12 |
-
"euclidean_spearman": 0.42973159111999676,
|
13 |
-
"hf_subset": "default",
|
14 |
-
"languages": [
|
15 |
-
"eng-Latn"
|
16 |
-
],
|
17 |
-
"main_score": 0.42973159111999676,
|
18 |
-
"manhattan_pearson": 0.511950240807258,
|
19 |
-
"manhattan_spearman": 0.5019330550880601,
|
20 |
-
"pearson": 0.34632056143460516,
|
21 |
-
"spearman": 0.42973159111999676
|
22 |
-
}
|
23 |
-
]
|
24 |
-
},
|
25 |
-
"task_name": "STSBenchmark"
|
26 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/SprintDuplicateQuestions.json
DELETED
@@ -1,58 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"dataset_revision": "d66bd1f72af766a5cc4b0ca5e00c162f89e8cc46",
|
3 |
-
"evaluation_time": 1.9629368782043457,
|
4 |
-
"kg_co2_emissions": null,
|
5 |
-
"mteb_version": "1.14.15",
|
6 |
-
"scores": {
|
7 |
-
"test": [
|
8 |
-
{
|
9 |
-
"cosine_accuracy": 0.9926237623762376,
|
10 |
-
"cosine_accuracy_threshold": 0.9106360077857971,
|
11 |
-
"cosine_ap": 0.4700755863552174,
|
12 |
-
"cosine_f1": 0.4925187032418952,
|
13 |
-
"cosine_f1_threshold": 0.8986777067184448,
|
14 |
-
"cosine_precision": 0.6539735099337748,
|
15 |
-
"cosine_recall": 0.395,
|
16 |
-
"dot_accuracy": 0.9926237623762376,
|
17 |
-
"dot_accuracy_threshold": 0.9106361269950867,
|
18 |
-
"dot_ap": 0.47007548398718707,
|
19 |
-
"dot_f1": 0.4925187032418952,
|
20 |
-
"dot_f1_threshold": 0.8986777663230896,
|
21 |
-
"dot_precision": 0.6539735099337748,
|
22 |
-
"dot_recall": 0.395,
|
23 |
-
"euclidean_accuracy": 0.9926237623762376,
|
24 |
-
"euclidean_accuracy_threshold": 0.42276236414909363,
|
25 |
-
"euclidean_ap": 0.47007558217981027,
|
26 |
-
"euclidean_f1": 0.4925187032418952,
|
27 |
-
"euclidean_f1_threshold": 0.4501606225967407,
|
28 |
-
"euclidean_precision": 0.6539735099337748,
|
29 |
-
"euclidean_recall": 0.395,
|
30 |
-
"hf_subset": "default",
|
31 |
-
"languages": [
|
32 |
-
"eng-Latn"
|
33 |
-
],
|
34 |
-
"main_score": 0.6386707007383838,
|
35 |
-
"manhattan_accuracy": 0.9939207920792079,
|
36 |
-
"manhattan_accuracy_threshold": 4.824772834777832,
|
37 |
-
"manhattan_ap": 0.6386707007383838,
|
38 |
-
"manhattan_f1": 0.6293103448275862,
|
39 |
-
"manhattan_f1_threshold": 5.194998741149902,
|
40 |
-
"manhattan_precision": 0.6822429906542056,
|
41 |
-
"manhattan_recall": 0.584,
|
42 |
-
"max_accuracy": 0.9939207920792079,
|
43 |
-
"max_ap": 0.6386707007383838,
|
44 |
-
"max_f1": 0.6293103448275862,
|
45 |
-
"max_precision": 0.6822429906542056,
|
46 |
-
"max_recall": 0.584,
|
47 |
-
"similarity_accuracy": 0.9926237623762376,
|
48 |
-
"similarity_accuracy_threshold": 0.9106360077857971,
|
49 |
-
"similarity_ap": 0.4700755863552174,
|
50 |
-
"similarity_f1": 0.4925187032418952,
|
51 |
-
"similarity_f1_threshold": 0.8986777067184448,
|
52 |
-
"similarity_precision": 0.6539735099337748,
|
53 |
-
"similarity_recall": 0.395
|
54 |
-
}
|
55 |
-
]
|
56 |
-
},
|
57 |
-
"task_name": "SprintDuplicateQuestions"
|
58 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/StackExchangeClustering.json
DELETED
@@ -1,47 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"dataset_revision": "6cbc1f7b2bc0622f2e39d2c77fa502909748c259",
|
3 |
-
"evaluation_time": 1075.5739603042603,
|
4 |
-
"kg_co2_emissions": null,
|
5 |
-
"mteb_version": "1.14.15",
|
6 |
-
"scores": {
|
7 |
-
"test": [
|
8 |
-
{
|
9 |
-
"hf_subset": "default",
|
10 |
-
"languages": [
|
11 |
-
"eng-Latn"
|
12 |
-
],
|
13 |
-
"main_score": 0.2747977935355363,
|
14 |
-
"v_measure": 0.2747977935355363,
|
15 |
-
"v_measure_std": 0.04408138950391278,
|
16 |
-
"v_measures": [
|
17 |
-
0.2671568735697825,
|
18 |
-
0.35324106044655595,
|
19 |
-
0.2134334295678833,
|
20 |
-
0.26069561242914296,
|
21 |
-
0.2360037867112385,
|
22 |
-
0.18352010080864292,
|
23 |
-
0.21227539957559294,
|
24 |
-
0.22564157353303899,
|
25 |
-
0.31014309699664405,
|
26 |
-
0.2792317143409387,
|
27 |
-
0.30736400840236383,
|
28 |
-
0.33654065468328326,
|
29 |
-
0.3375811203083562,
|
30 |
-
0.23635769205347795,
|
31 |
-
0.2889733490218442,
|
32 |
-
0.2628972368553193,
|
33 |
-
0.2892573063858698,
|
34 |
-
0.3093369539018476,
|
35 |
-
0.2778955236652676,
|
36 |
-
0.29489160764728006,
|
37 |
-
0.3092126928451642,
|
38 |
-
0.22100223054084894,
|
39 |
-
0.23711645754707986,
|
40 |
-
0.3264131545037563,
|
41 |
-
0.2937622020471872
|
42 |
-
]
|
43 |
-
}
|
44 |
-
]
|
45 |
-
},
|
46 |
-
"task_name": "StackExchangeClustering"
|
47 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/StackOverflowQA.json
ADDED
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset_revision": "db8f169f3894c14a00251061f957b2063eef2bd5",
|
3 |
+
"evaluation_time": 21.146663904190063,
|
4 |
+
"kg_co2_emissions": null,
|
5 |
+
"mteb_version": "1.14.15",
|
6 |
+
"scores": {
|
7 |
+
"test": [
|
8 |
+
{
|
9 |
+
"hf_subset": "default",
|
10 |
+
"languages": [
|
11 |
+
"eng-Latn"
|
12 |
+
],
|
13 |
+
"main_score": 0.17615,
|
14 |
+
"map_at_1": 0.14142,
|
15 |
+
"map_at_10": 0.16367,
|
16 |
+
"map_at_100": 0.16807,
|
17 |
+
"map_at_1000": 0.16867,
|
18 |
+
"map_at_20": 0.16588,
|
19 |
+
"map_at_3": 0.1568,
|
20 |
+
"map_at_5": 0.16034,
|
21 |
+
"mrr_at_1": 0.14142427281845538,
|
22 |
+
"mrr_at_10": 0.1636685851204407,
|
23 |
+
"mrr_at_100": 0.16806598010525844,
|
24 |
+
"mrr_at_1000": 0.16867443260066448,
|
25 |
+
"mrr_at_20": 0.16587850269947257,
|
26 |
+
"mrr_at_3": 0.15680374456703444,
|
27 |
+
"mrr_at_5": 0.1603393513874958,
|
28 |
+
"nauc_map_at_1000_diff1": 0.5441830305562326,
|
29 |
+
"nauc_map_at_1000_max": 0.2166816774885428,
|
30 |
+
"nauc_map_at_1000_std": 0.14505555737829307,
|
31 |
+
"nauc_map_at_100_diff1": 0.5446431157527537,
|
32 |
+
"nauc_map_at_100_max": 0.21689938576550866,
|
33 |
+
"nauc_map_at_100_std": 0.14493387106545103,
|
34 |
+
"nauc_map_at_10_diff1": 0.5511736320008027,
|
35 |
+
"nauc_map_at_10_max": 0.21922402299128418,
|
36 |
+
"nauc_map_at_10_std": 0.14589505600247163,
|
37 |
+
"nauc_map_at_1_diff1": 0.6335084714813365,
|
38 |
+
"nauc_map_at_1_max": 0.2416674532732567,
|
39 |
+
"nauc_map_at_1_std": 0.15189301837631614,
|
40 |
+
"nauc_map_at_20_diff1": 0.5481004538160913,
|
41 |
+
"nauc_map_at_20_max": 0.21754392477744908,
|
42 |
+
"nauc_map_at_20_std": 0.14574143361267317,
|
43 |
+
"nauc_map_at_3_diff1": 0.569602881272386,
|
44 |
+
"nauc_map_at_3_max": 0.22657597605102178,
|
45 |
+
"nauc_map_at_3_std": 0.14362624083093203,
|
46 |
+
"nauc_map_at_5_diff1": 0.5601655545127238,
|
47 |
+
"nauc_map_at_5_max": 0.22021980923815318,
|
48 |
+
"nauc_map_at_5_std": 0.145190486252428,
|
49 |
+
"nauc_mrr_at_1000_diff1": 0.5441830305562326,
|
50 |
+
"nauc_mrr_at_1000_max": 0.2166816774885428,
|
51 |
+
"nauc_mrr_at_1000_std": 0.14505555737829307,
|
52 |
+
"nauc_mrr_at_100_diff1": 0.5446431157527537,
|
53 |
+
"nauc_mrr_at_100_max": 0.21689938576550866,
|
54 |
+
"nauc_mrr_at_100_std": 0.14493387106545103,
|
55 |
+
"nauc_mrr_at_10_diff1": 0.5511736320008027,
|
56 |
+
"nauc_mrr_at_10_max": 0.21922402299128418,
|
57 |
+
"nauc_mrr_at_10_std": 0.14589505600247163,
|
58 |
+
"nauc_mrr_at_1_diff1": 0.6335084714813365,
|
59 |
+
"nauc_mrr_at_1_max": 0.2416674532732567,
|
60 |
+
"nauc_mrr_at_1_std": 0.15189301837631614,
|
61 |
+
"nauc_mrr_at_20_diff1": 0.5481004538160913,
|
62 |
+
"nauc_mrr_at_20_max": 0.21754392477744908,
|
63 |
+
"nauc_mrr_at_20_std": 0.14574143361267317,
|
64 |
+
"nauc_mrr_at_3_diff1": 0.569602881272386,
|
65 |
+
"nauc_mrr_at_3_max": 0.22657597605102178,
|
66 |
+
"nauc_mrr_at_3_std": 0.14362624083093203,
|
67 |
+
"nauc_mrr_at_5_diff1": 0.5601655545127238,
|
68 |
+
"nauc_mrr_at_5_max": 0.22021980923815318,
|
69 |
+
"nauc_mrr_at_5_std": 0.145190486252428,
|
70 |
+
"nauc_ndcg_at_1000_diff1": 0.4728678699567455,
|
71 |
+
"nauc_ndcg_at_1000_max": 0.18937253079534216,
|
72 |
+
"nauc_ndcg_at_1000_std": 0.14596120873695492,
|
73 |
+
"nauc_ndcg_at_100_diff1": 0.4829489403420902,
|
74 |
+
"nauc_ndcg_at_100_max": 0.19711295138806267,
|
75 |
+
"nauc_ndcg_at_100_std": 0.14004483265553003,
|
76 |
+
"nauc_ndcg_at_10_diff1": 0.5147356366280121,
|
77 |
+
"nauc_ndcg_at_10_max": 0.20936478000130024,
|
78 |
+
"nauc_ndcg_at_10_std": 0.14480134602662714,
|
79 |
+
"nauc_ndcg_at_1_diff1": 0.6335084714813365,
|
80 |
+
"nauc_ndcg_at_1_max": 0.2416674532732567,
|
81 |
+
"nauc_ndcg_at_1_std": 0.15189301837631614,
|
82 |
+
"nauc_ndcg_at_20_diff1": 0.5045372953308567,
|
83 |
+
"nauc_ndcg_at_20_max": 0.20390468798029948,
|
84 |
+
"nauc_ndcg_at_20_std": 0.14429100965430774,
|
85 |
+
"nauc_ndcg_at_3_diff1": 0.5501813298382772,
|
86 |
+
"nauc_ndcg_at_3_max": 0.22229855178363508,
|
87 |
+
"nauc_ndcg_at_3_std": 0.1399986570615583,
|
88 |
+
"nauc_ndcg_at_5_diff1": 0.5343279242377332,
|
89 |
+
"nauc_ndcg_at_5_max": 0.21164562906788129,
|
90 |
+
"nauc_ndcg_at_5_std": 0.14278785553527687,
|
91 |
+
"nauc_precision_at_1000_diff1": 0.2504046219335285,
|
92 |
+
"nauc_precision_at_1000_max": 0.08591924265428995,
|
93 |
+
"nauc_precision_at_1000_std": 0.1677320203837767,
|
94 |
+
"nauc_precision_at_100_diff1": 0.31425670977915415,
|
95 |
+
"nauc_precision_at_100_max": 0.1387542114851391,
|
96 |
+
"nauc_precision_at_100_std": 0.1261904558936239,
|
97 |
+
"nauc_precision_at_10_diff1": 0.41968706662348626,
|
98 |
+
"nauc_precision_at_10_max": 0.18390157987927358,
|
99 |
+
"nauc_precision_at_10_std": 0.14312672622707642,
|
100 |
+
"nauc_precision_at_1_diff1": 0.6335084714813365,
|
101 |
+
"nauc_precision_at_1_max": 0.2416674532732567,
|
102 |
+
"nauc_precision_at_1_std": 0.15189301837631614,
|
103 |
+
"nauc_precision_at_20_diff1": 0.39118835707188254,
|
104 |
+
"nauc_precision_at_20_max": 0.16759815130477784,
|
105 |
+
"nauc_precision_at_20_std": 0.14154312425469426,
|
106 |
+
"nauc_precision_at_3_diff1": 0.4986851913309839,
|
107 |
+
"nauc_precision_at_3_max": 0.2110426423927967,
|
108 |
+
"nauc_precision_at_3_std": 0.13007101364000376,
|
109 |
+
"nauc_precision_at_5_diff1": 0.4672079991177685,
|
110 |
+
"nauc_precision_at_5_max": 0.18897950891809692,
|
111 |
+
"nauc_precision_at_5_std": 0.13674491342908243,
|
112 |
+
"nauc_recall_at_1000_diff1": 0.25040462193352936,
|
113 |
+
"nauc_recall_at_1000_max": 0.08591924265429102,
|
114 |
+
"nauc_recall_at_1000_std": 0.1677320203837774,
|
115 |
+
"nauc_recall_at_100_diff1": 0.3142567097791538,
|
116 |
+
"nauc_recall_at_100_max": 0.1387542114851391,
|
117 |
+
"nauc_recall_at_100_std": 0.12619045589362404,
|
118 |
+
"nauc_recall_at_10_diff1": 0.41968706662348615,
|
119 |
+
"nauc_recall_at_10_max": 0.18390157987927366,
|
120 |
+
"nauc_recall_at_10_std": 0.1431267262270766,
|
121 |
+
"nauc_recall_at_1_diff1": 0.6335084714813365,
|
122 |
+
"nauc_recall_at_1_max": 0.2416674532732567,
|
123 |
+
"nauc_recall_at_1_std": 0.15189301837631614,
|
124 |
+
"nauc_recall_at_20_diff1": 0.3911883570718826,
|
125 |
+
"nauc_recall_at_20_max": 0.16759815130477776,
|
126 |
+
"nauc_recall_at_20_std": 0.1415431242546944,
|
127 |
+
"nauc_recall_at_3_diff1": 0.49868519133098393,
|
128 |
+
"nauc_recall_at_3_max": 0.21104264239279674,
|
129 |
+
"nauc_recall_at_3_std": 0.13007101364000365,
|
130 |
+
"nauc_recall_at_5_diff1": 0.4672079991177685,
|
131 |
+
"nauc_recall_at_5_max": 0.18897950891809673,
|
132 |
+
"nauc_recall_at_5_std": 0.13674491342908224,
|
133 |
+
"ndcg_at_1": 0.14142,
|
134 |
+
"ndcg_at_10": 0.17615,
|
135 |
+
"ndcg_at_100": 0.20104,
|
136 |
+
"ndcg_at_1000": 0.22165,
|
137 |
+
"ndcg_at_20": 0.18433,
|
138 |
+
"ndcg_at_3": 0.16187,
|
139 |
+
"ndcg_at_5": 0.16825,
|
140 |
+
"precision_at_1": 0.14142,
|
141 |
+
"precision_at_10": 0.02161,
|
142 |
+
"precision_at_100": 0.00341,
|
143 |
+
"precision_at_1000": 0.00051,
|
144 |
+
"precision_at_20": 0.01244,
|
145 |
+
"precision_at_3": 0.05884,
|
146 |
+
"precision_at_5": 0.03842,
|
147 |
+
"recall_at_1": 0.14142,
|
148 |
+
"recall_at_10": 0.21615,
|
149 |
+
"recall_at_100": 0.34052,
|
150 |
+
"recall_at_1000": 0.51254,
|
151 |
+
"recall_at_20": 0.24875,
|
152 |
+
"recall_at_3": 0.17653,
|
153 |
+
"recall_at_5": 0.19208
|
154 |
+
}
|
155 |
+
]
|
156 |
+
},
|
157 |
+
"task_name": "StackOverflowQA"
|
158 |
+
}
|
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/SyntheticText2SQL.json
ADDED
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset_revision": "686b87296c3a0191b5d9415a00526c62db9fce09",
|
3 |
+
"evaluation_time": 92.1711049079895,
|
4 |
+
"kg_co2_emissions": null,
|
5 |
+
"mteb_version": "1.14.15",
|
6 |
+
"scores": {
|
7 |
+
"test": [
|
8 |
+
{
|
9 |
+
"hf_subset": "default",
|
10 |
+
"languages": [
|
11 |
+
"eng-Latn",
|
12 |
+
"sql-Code"
|
13 |
+
],
|
14 |
+
"main_score": 0.00489,
|
15 |
+
"map_at_1": 0.00034,
|
16 |
+
"map_at_10": 0.00331,
|
17 |
+
"map_at_100": 0.00393,
|
18 |
+
"map_at_1000": 0.00414,
|
19 |
+
"map_at_20": 0.00359,
|
20 |
+
"map_at_3": 0.00251,
|
21 |
+
"map_at_5": 0.00291,
|
22 |
+
"mrr_at_1": 0.002221842420099128,
|
23 |
+
"mrr_at_10": 0.004530496754048283,
|
24 |
+
"mrr_at_100": 0.005170980682014603,
|
25 |
+
"mrr_at_1000": 0.005371938321099836,
|
26 |
+
"mrr_at_20": 0.004822607995513001,
|
27 |
+
"mrr_at_3": 0.0038454964963254143,
|
28 |
+
"mrr_at_5": 0.004195863954879508,
|
29 |
+
"nauc_map_at_1000_diff1": -0.27416046541710665,
|
30 |
+
"nauc_map_at_1000_max": -0.12307636241646212,
|
31 |
+
"nauc_map_at_1000_std": -0.3169779465421886,
|
32 |
+
"nauc_map_at_100_diff1": -0.2867723753018623,
|
33 |
+
"nauc_map_at_100_max": -0.11793114543437405,
|
34 |
+
"nauc_map_at_100_std": -0.32768340793729833,
|
35 |
+
"nauc_map_at_10_diff1": -0.3071810467939698,
|
36 |
+
"nauc_map_at_10_max": -0.09394849271438202,
|
37 |
+
"nauc_map_at_10_std": -0.3443159235101931,
|
38 |
+
"nauc_map_at_1_diff1": -0.045574562309770715,
|
39 |
+
"nauc_map_at_1_max": -0.415009003625047,
|
40 |
+
"nauc_map_at_1_std": -0.28503182744193584,
|
41 |
+
"nauc_map_at_20_diff1": -0.30073635348293454,
|
42 |
+
"nauc_map_at_20_max": -0.1035440934145476,
|
43 |
+
"nauc_map_at_20_std": -0.33728144942994526,
|
44 |
+
"nauc_map_at_3_diff1": -0.36276475560891563,
|
45 |
+
"nauc_map_at_3_max": -0.09000122816382457,
|
46 |
+
"nauc_map_at_3_std": -0.35808488719288767,
|
47 |
+
"nauc_map_at_5_diff1": -0.34649671639377566,
|
48 |
+
"nauc_map_at_5_max": -0.07741484623960085,
|
49 |
+
"nauc_map_at_5_std": -0.3454332041446047,
|
50 |
+
"nauc_mrr_at_1000_diff1": -0.32007654216936365,
|
51 |
+
"nauc_mrr_at_1000_max": -0.05306747639186935,
|
52 |
+
"nauc_mrr_at_1000_std": -0.33505538550557523,
|
53 |
+
"nauc_mrr_at_100_diff1": -0.33152877543566905,
|
54 |
+
"nauc_mrr_at_100_max": -0.04652715811851764,
|
55 |
+
"nauc_mrr_at_100_std": -0.3439648778335655,
|
56 |
+
"nauc_mrr_at_10_diff1": -0.35260191778612204,
|
57 |
+
"nauc_mrr_at_10_max": -0.018284442733176375,
|
58 |
+
"nauc_mrr_at_10_std": -0.3583806093519501,
|
59 |
+
"nauc_mrr_at_1_diff1": -0.49322913632443244,
|
60 |
+
"nauc_mrr_at_1_max": 0.18386885076318166,
|
61 |
+
"nauc_mrr_at_1_std": -0.36881544615998557,
|
62 |
+
"nauc_mrr_at_20_diff1": -0.34523116750414906,
|
63 |
+
"nauc_mrr_at_20_max": -0.030423846920737567,
|
64 |
+
"nauc_mrr_at_20_std": -0.3523413443042862,
|
65 |
+
"nauc_mrr_at_3_diff1": -0.3986937025555519,
|
66 |
+
"nauc_mrr_at_3_max": 0.00596597350896994,
|
67 |
+
"nauc_mrr_at_3_std": -0.37595027480484544,
|
68 |
+
"nauc_mrr_at_5_diff1": -0.37058906995780483,
|
69 |
+
"nauc_mrr_at_5_max": 0.0023804395413750843,
|
70 |
+
"nauc_mrr_at_5_std": -0.3649770343981212,
|
71 |
+
"nauc_ndcg_at_1000_diff1": -0.12191989446547287,
|
72 |
+
"nauc_ndcg_at_1000_max": -0.18069129976379253,
|
73 |
+
"nauc_ndcg_at_1000_std": -0.21737660540578904,
|
74 |
+
"nauc_ndcg_at_100_diff1": -0.21534614581420813,
|
75 |
+
"nauc_ndcg_at_100_max": -0.16549108196966383,
|
76 |
+
"nauc_ndcg_at_100_std": -0.2967519876094673,
|
77 |
+
"nauc_ndcg_at_10_diff1": -0.2766087694329189,
|
78 |
+
"nauc_ndcg_at_10_max": -0.10425653229278331,
|
79 |
+
"nauc_ndcg_at_10_std": -0.34614483144111813,
|
80 |
+
"nauc_ndcg_at_1_diff1": -0.045574562309770715,
|
81 |
+
"nauc_ndcg_at_1_max": -0.415009003625047,
|
82 |
+
"nauc_ndcg_at_1_std": -0.28503182744193584,
|
83 |
+
"nauc_ndcg_at_20_diff1": -0.26495356113264346,
|
84 |
+
"nauc_ndcg_at_20_max": -0.12302281530014428,
|
85 |
+
"nauc_ndcg_at_20_std": -0.33040207062914734,
|
86 |
+
"nauc_ndcg_at_3_diff1": -0.35550615579366496,
|
87 |
+
"nauc_ndcg_at_3_max": -0.09065063772541752,
|
88 |
+
"nauc_ndcg_at_3_std": -0.3666750120549603,
|
89 |
+
"nauc_ndcg_at_5_diff1": -0.3367147607777083,
|
90 |
+
"nauc_ndcg_at_5_max": -0.07594752160761341,
|
91 |
+
"nauc_ndcg_at_5_std": -0.349392770228869,
|
92 |
+
"nauc_precision_at_1000_diff1": -0.05796266193135331,
|
93 |
+
"nauc_precision_at_1000_max": -0.19596247289607774,
|
94 |
+
"nauc_precision_at_1000_std": -0.1726159439969235,
|
95 |
+
"nauc_precision_at_100_diff1": -0.1623283482675489,
|
96 |
+
"nauc_precision_at_100_max": -0.20045006262758877,
|
97 |
+
"nauc_precision_at_100_std": -0.2711122975734177,
|
98 |
+
"nauc_precision_at_10_diff1": -0.23692269420435214,
|
99 |
+
"nauc_precision_at_10_max": -0.11995105342526458,
|
100 |
+
"nauc_precision_at_10_std": -0.34723986176409266,
|
101 |
+
"nauc_precision_at_1_diff1": -0.045574562309770715,
|
102 |
+
"nauc_precision_at_1_max": -0.415009003625047,
|
103 |
+
"nauc_precision_at_1_std": -0.28503182744193584,
|
104 |
+
"nauc_precision_at_20_diff1": -0.22656401175983737,
|
105 |
+
"nauc_precision_at_20_max": -0.14607723359403244,
|
106 |
+
"nauc_precision_at_20_std": -0.3206744368813374,
|
107 |
+
"nauc_precision_at_3_diff1": -0.3421859065827053,
|
108 |
+
"nauc_precision_at_3_max": -0.09374847026615557,
|
109 |
+
"nauc_precision_at_3_std": -0.37955719702776525,
|
110 |
+
"nauc_precision_at_5_diff1": -0.3217864826892486,
|
111 |
+
"nauc_precision_at_5_max": -0.07574764495371311,
|
112 |
+
"nauc_precision_at_5_std": -0.35431940648491467,
|
113 |
+
"nauc_recall_at_1000_diff1": -0.057962661931353035,
|
114 |
+
"nauc_recall_at_1000_max": -0.19596247289607757,
|
115 |
+
"nauc_recall_at_1000_std": -0.17261594399692332,
|
116 |
+
"nauc_recall_at_100_diff1": -0.16232834826754888,
|
117 |
+
"nauc_recall_at_100_max": -0.20045006262758874,
|
118 |
+
"nauc_recall_at_100_std": -0.2711122975734177,
|
119 |
+
"nauc_recall_at_10_diff1": -0.2369226942043523,
|
120 |
+
"nauc_recall_at_10_max": -0.11995105342526483,
|
121 |
+
"nauc_recall_at_10_std": -0.34723986176409277,
|
122 |
+
"nauc_recall_at_1_diff1": -0.045574562309770715,
|
123 |
+
"nauc_recall_at_1_max": -0.415009003625047,
|
124 |
+
"nauc_recall_at_1_std": -0.28503182744193584,
|
125 |
+
"nauc_recall_at_20_diff1": -0.22656401175983737,
|
126 |
+
"nauc_recall_at_20_max": -0.14607723359403255,
|
127 |
+
"nauc_recall_at_20_std": -0.3206744368813374,
|
128 |
+
"nauc_recall_at_3_diff1": -0.3421859065827052,
|
129 |
+
"nauc_recall_at_3_max": -0.09374847026615546,
|
130 |
+
"nauc_recall_at_3_std": -0.37955719702776536,
|
131 |
+
"nauc_recall_at_5_diff1": -0.3217864826892487,
|
132 |
+
"nauc_recall_at_5_max": -0.07574764495371322,
|
133 |
+
"nauc_recall_at_5_std": -0.3543194064849148,
|
134 |
+
"ndcg_at_1": 0.00034,
|
135 |
+
"ndcg_at_10": 0.00489,
|
136 |
+
"ndcg_at_100": 0.00885,
|
137 |
+
"ndcg_at_1000": 0.01629,
|
138 |
+
"ndcg_at_20": 0.00592,
|
139 |
+
"ndcg_at_3": 0.00322,
|
140 |
+
"ndcg_at_5": 0.00394,
|
141 |
+
"precision_at_1": 0.00034,
|
142 |
+
"precision_at_10": 0.00099,
|
143 |
+
"precision_at_100": 0.00031,
|
144 |
+
"precision_at_1000": 9e-05,
|
145 |
+
"precision_at_20": 0.0007,
|
146 |
+
"precision_at_3": 0.00177,
|
147 |
+
"precision_at_5": 0.0014,
|
148 |
+
"recall_at_1": 0.00034,
|
149 |
+
"recall_at_10": 0.00991,
|
150 |
+
"recall_at_100": 0.03076,
|
151 |
+
"recall_at_1000": 0.09383,
|
152 |
+
"recall_at_20": 0.01401,
|
153 |
+
"recall_at_3": 0.0053,
|
154 |
+
"recall_at_5": 0.00701
|
155 |
+
}
|
156 |
+
]
|
157 |
+
},
|
158 |
+
"task_name": "SyntheticText2SQL"
|
159 |
+
}
|
mteb_results/mteb_parsed_results.json
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"gte-Qwen2-7B-instruct-M2V-Distilled": "ResultSet(datasets={'Banking77Classification': DatasetResult(scores=[0.4396103896103896], time=6.451777696609497), 'StackExchangeClustering': DatasetResult(scores=[0.2747977935355363], time=1075.5739603042603), 'STSBenchmark': DatasetResult(scores=[0.42973159111999676], time=0.12331175804138184), 'CQADupstackProgrammersRetrieval': DatasetResult(scores=[0.0501], time=99.69791841506958), 'SprintDuplicateQuestions': DatasetResult(scores=[0.6386707007383838], time=1.9629368782043457)})"
|
3 |
-
}
|
|
|
|
|
|
|
|
mteb_results/mteb_raw_results.json
DELETED
@@ -1,7 +0,0 @@
|
|
1 |
-
[
|
2 |
-
"dataset_revision='0fd18e25b25c072e09e0d92ab615fda904d66300' task_name='Banking77Classification' mteb_version='1.14.15' scores={'test': [{'accuracy': 0.4396103896103896, 'f1': 0.4142711532114576, 'f1_weighted': 0.4142711532114576, 'scores_per_experiment': [{'accuracy': 0.4279220779220779, 'f1': 0.4030476288783657, 'f1_weighted': 0.4030476288783656}, {'accuracy': 0.4211038961038961, 'f1': 0.39776168133611584, 'f1_weighted': 0.39776168133611584}, {'accuracy': 0.45064935064935063, 'f1': 0.42872843564828145, 'f1_weighted': 0.42872843564828145}, {'accuracy': 0.4448051948051948, 'f1': 0.420756828398419, 'f1_weighted': 0.42075682839841905}, {'accuracy': 0.44675324675324674, 'f1': 0.42100682221185654, 'f1_weighted': 0.42100682221185654}, {'accuracy': 0.45324675324675323, 'f1': 0.4392342490231314, 'f1_weighted': 0.4392342490231314}, {'accuracy': 0.437012987012987, 'f1': 0.4056017558988273, 'f1_weighted': 0.40560175589882724}, {'accuracy': 0.42337662337662335, 'f1': 0.39123709562594644, 'f1_weighted': 0.39123709562594655}, {'accuracy': 0.44512987012987015, 'f1': 0.41578171494860966, 'f1_weighted': 0.41578171494860966}, {'accuracy': 0.4461038961038961, 'f1': 0.4195553201450221, 'f1_weighted': 0.419555320145022}], 'main_score': 0.4396103896103896, 'hf_subset': 'default', 'languages': ['eng-Latn']}]} evaluation_time=6.451777696609497 kg_co2_emissions=None",
|
3 |
-
"dataset_revision='6cbc1f7b2bc0622f2e39d2c77fa502909748c259' task_name='StackExchangeClustering' mteb_version='1.14.15' scores={'test': [{'v_measure': 0.2747977935355363, 'v_measure_std': 0.04408138950391278, 'v_measures': [0.2671568735697825, 0.35324106044655595, 0.2134334295678833, 0.26069561242914296, 0.2360037867112385, 0.18352010080864292, 0.21227539957559294, 0.22564157353303899, 0.31014309699664405, 0.2792317143409387, 0.30736400840236383, 0.33654065468328326, 0.3375811203083562, 0.23635769205347795, 0.2889733490218442, 0.2628972368553193, 0.2892573063858698, 0.3093369539018476, 0.2778955236652676, 0.29489160764728006, 0.3092126928451642, 0.22100223054084894, 0.23711645754707986, 0.3264131545037563, 0.2937622020471872], 'main_score': 0.2747977935355363, 'hf_subset': 'default', 'languages': ['eng-Latn']}]} evaluation_time=1075.5739603042603 kg_co2_emissions=None",
|
4 |
-
"dataset_revision='b0fddb56ed78048fa8b90373c8a3cfc37b684831' task_name='STSBenchmark' mteb_version='1.14.15' scores={'test': [{'pearson': 0.34632056143460516, 'spearman': 0.42973159111999676, 'cosine_pearson': 0.34632056143460516, 'cosine_spearman': 0.42973159111999676, 'manhattan_pearson': 0.511950240807258, 'manhattan_spearman': 0.5019330550880601, 'euclidean_pearson': 0.4043313982401531, 'euclidean_spearman': 0.42973159111999676, 'main_score': 0.42973159111999676, 'hf_subset': 'default', 'languages': ['eng-Latn']}]} evaluation_time=0.12331175804138184 kg_co2_emissions=None",
|
5 |
-
"dataset_revision='6184bc1440d2dbc7612be22b50686b8826d22b32' task_name='CQADupstackProgrammersRetrieval' mteb_version='1.14.15' scores={'test': [{'ndcg_at_1': 0.03082, 'ndcg_at_3': 0.03989, 'ndcg_at_5': 0.04484, 'ndcg_at_10': 0.0501, 'ndcg_at_20': 0.05662, 'ndcg_at_100': 0.07072, 'ndcg_at_1000': 0.09327, 'map_at_1': 0.02467, 'map_at_3': 0.03388, 'map_at_5': 0.03693, 'map_at_10': 0.03898, 'map_at_20': 0.04068, 'map_at_100': 0.04261, 'map_at_1000': 0.04333, 'recall_at_1': 0.02467, 'recall_at_3': 0.04648, 'recall_at_5': 0.05869, 'recall_at_10': 0.07499, 'recall_at_20': 0.09901, 'recall_at_100': 0.16969, 'recall_at_1000': 0.33718, 'precision_at_1': 0.03082, 'precision_at_3': 0.02017, 'precision_at_5': 0.0153, 'precision_at_10': 0.00993, 'precision_at_20': 0.00685, 'precision_at_100': 0.00241, 'precision_at_1000': 0.00052, 'mrr_at_1': 0.030821917808219176, 'mrr_at_3': 0.04280821917808219, 'mrr_at_5': 0.04634703196347032, 'mrr_at_10': 0.04904462926723201, 'mrr_at_20': 0.05126402659708249, 'mrr_at_100': 0.05339942610218758, 'mrr_at_1000': 0.05413492750157237, 'nauc_ndcg_at_1_max': 0.35182174117717013, 'nauc_ndcg_at_1_std': -0.24426280067522707, 'nauc_ndcg_at_1_diff1': 0.1772995319079407, 'nauc_ndcg_at_3_max': 0.23212930749840155, 'nauc_ndcg_at_3_std': -0.1728371812831961, 'nauc_ndcg_at_3_diff1': 0.03670154146101528, 'nauc_ndcg_at_5_max': 0.20474332948099355, 'nauc_ndcg_at_5_std': -0.1734952739301359, 'nauc_ndcg_at_5_diff1': 0.0107566708693031, 'nauc_ndcg_at_10_max': 0.19884193622357532, 'nauc_ndcg_at_10_std': -0.16919003671988075, 'nauc_ndcg_at_10_diff1': 0.0026192804576363727, 'nauc_ndcg_at_20_max': 0.20925361343315524, 'nauc_ndcg_at_20_std': -0.17106125631597793, 'nauc_ndcg_at_20_diff1': 0.0031543394811079034, 'nauc_ndcg_at_100_max': 0.20125970115134734, 'nauc_ndcg_at_100_std': -0.15865628929382014, 'nauc_ndcg_at_100_diff1': 0.0023309149151885546, 'nauc_ndcg_at_1000_max': 0.20925878430027478, 'nauc_ndcg_at_1000_std': -0.1717044268161809, 'nauc_ndcg_at_1000_diff1': 
-0.010372586628261796, 'nauc_map_at_1_max': 0.33459947679728974, 'nauc_map_at_1_std': -0.23115450977179597, 'nauc_map_at_1_diff1': 0.1731091343679673, 'nauc_map_at_3_max': 0.2486807528974488, 'nauc_map_at_3_std': -0.18512855007450404, 'nauc_map_at_3_diff1': 0.06042780588964212, 'nauc_map_at_5_max': 0.22647048266105405, 'nauc_map_at_5_std': -0.18107585673560017, 'nauc_map_at_5_diff1': 0.04407217741234605, 'nauc_map_at_10_max': 0.22061594321968936, 'nauc_map_at_10_std': -0.17777470317814356, 'nauc_map_at_10_diff1': 0.03906418656483989, 'nauc_map_at_20_max': 0.22396003211648763, 'nauc_map_at_20_std': -0.17867373725662639, 'nauc_map_at_20_diff1': 0.03795725531499195, 'nauc_map_at_100_max': 0.22324901446317413, 'nauc_map_at_100_std': -0.17630470695891512, 'nauc_map_at_100_diff1': 0.03759221625144172, 'nauc_map_at_1000_max': 0.2240572170754659, 'nauc_map_at_1000_std': -0.17708810912472517, 'nauc_map_at_1000_diff1': 0.03644747951501248, 'nauc_recall_at_1_max': 0.33459947679728974, 'nauc_recall_at_1_std': -0.23115450977179597, 'nauc_recall_at_1_diff1': 0.1731091343679673, 'nauc_recall_at_3_max': 0.1864107664448688, 'nauc_recall_at_3_std': -0.14586036842324565, 'nauc_recall_at_3_diff1': -0.021696811828998432, 'nauc_recall_at_5_max': 0.1453135254521713, 'nauc_recall_at_5_std': -0.1531619473747777, 'nauc_recall_at_5_diff1': -0.0538517948884412, 'nauc_recall_at_10_max': 0.1384336247044034, 'nauc_recall_at_10_std': -0.14737738059263306, 'nauc_recall_at_10_diff1': -0.051375323084735164, 'nauc_recall_at_20_max': 0.16386688869593355, 'nauc_recall_at_20_std': -0.1528456365862212, 'nauc_recall_at_20_diff1': -0.03578815918976938, 'nauc_recall_at_100_max': 0.14861973646512244, 'nauc_recall_at_100_std': -0.12240747671934184, 'nauc_recall_at_100_diff1': -0.023004658252697183, 'nauc_recall_at_1000_max': 0.16414155669676642, 'nauc_recall_at_1000_std': -0.1513320281746568, 'nauc_recall_at_1000_diff1': -0.047075752528689695, 'nauc_precision_at_1_max': 0.35182174117717013, 
'nauc_precision_at_1_std': -0.24426280067522707, 'nauc_precision_at_1_diff1': 0.1772995319079407, 'nauc_precision_at_3_max': 0.21285488271783465, 'nauc_precision_at_3_std': -0.1483164417030193, 'nauc_precision_at_3_diff1': -0.013044619440245884, 'nauc_precision_at_5_max': 0.1756649379589832, 'nauc_precision_at_5_std': -0.15632134056178232, 'nauc_precision_at_5_diff1': -0.05113181393685194, 'nauc_precision_at_10_max': 0.18962064467698705, 'nauc_precision_at_10_std': -0.14827004787357115, 'nauc_precision_at_10_diff1': -0.052513811685878764, 'nauc_precision_at_20_max': 0.22086458009752882, 'nauc_precision_at_20_std': -0.14430508663959002, 'nauc_precision_at_20_diff1': -0.040789324913047875, 'nauc_precision_at_100_max': 0.22138981394024387, 'nauc_precision_at_100_std': -0.13384472263037697, 'nauc_precision_at_100_diff1': -0.04518222914182943, 'nauc_precision_at_1000_max': 0.2542912736794115, 'nauc_precision_at_1000_std': -0.1881459402790264, 'nauc_precision_at_1000_diff1': -0.07195606207962846, 'nauc_mrr_at_1_max': 0.35182174117717013, 'nauc_mrr_at_1_std': -0.24426280067522707, 'nauc_mrr_at_1_diff1': 0.1772995319079407, 'nauc_mrr_at_3_max': 0.26889485727748363, 'nauc_mrr_at_3_std': -0.19153801111553947, 'nauc_mrr_at_3_diff1': 0.06173430027850725, 'nauc_mrr_at_5_max': 0.253857849052297, 'nauc_mrr_at_5_std': -0.19604549670316734, 'nauc_mrr_at_5_diff1': 0.036743759763164886, 'nauc_mrr_at_10_max': 0.25392922716866984, 'nauc_mrr_at_10_std': -0.1935061134919541, 'nauc_mrr_at_10_diff1': 0.03361519179733555, 'nauc_mrr_at_20_max': 0.25624951214228564, 'nauc_mrr_at_20_std': -0.19212268093923462, 'nauc_mrr_at_20_diff1': 0.03479828151019169, 'nauc_mrr_at_100_max': 0.2523932973431928, 'nauc_mrr_at_100_std': -0.1900913512193067, 'nauc_mrr_at_100_diff1': 0.03435870935950355, 'nauc_mrr_at_1000_max': 0.2523936325136619, 'nauc_mrr_at_1000_std': -0.19078164353963076, 'nauc_mrr_at_1000_diff1': 0.033601872249839834, 'main_score': 0.0501, 'hf_subset': 'default', 'languages': ['eng-Latn']}]} 
evaluation_time=99.69791841506958 kg_co2_emissions=None",
|
6 |
-
"dataset_revision='d66bd1f72af766a5cc4b0ca5e00c162f89e8cc46' task_name='SprintDuplicateQuestions' mteb_version='1.14.15' scores={'test': [{'similarity_accuracy': 0.9926237623762376, 'similarity_accuracy_threshold': 0.9106360077857971, 'similarity_f1': 0.4925187032418952, 'similarity_f1_threshold': 0.8986777067184448, 'similarity_precision': 0.6539735099337748, 'similarity_recall': 0.395, 'similarity_ap': 0.4700755863552174, 'cosine_accuracy': 0.9926237623762376, 'cosine_accuracy_threshold': 0.9106360077857971, 'cosine_f1': 0.4925187032418952, 'cosine_f1_threshold': 0.8986777067184448, 'cosine_precision': 0.6539735099337748, 'cosine_recall': 0.395, 'cosine_ap': 0.4700755863552174, 'manhattan_accuracy': 0.9939207920792079, 'manhattan_accuracy_threshold': 4.824772834777832, 'manhattan_f1': 0.6293103448275862, 'manhattan_f1_threshold': 5.194998741149902, 'manhattan_precision': 0.6822429906542056, 'manhattan_recall': 0.584, 'manhattan_ap': 0.6386707007383838, 'euclidean_accuracy': 0.9926237623762376, 'euclidean_accuracy_threshold': 0.42276236414909363, 'euclidean_f1': 0.4925187032418952, 'euclidean_f1_threshold': 0.4501606225967407, 'euclidean_precision': 0.6539735099337748, 'euclidean_recall': 0.395, 'euclidean_ap': 0.47007558217981027, 'dot_accuracy': 0.9926237623762376, 'dot_accuracy_threshold': 0.9106361269950867, 'dot_f1': 0.4925187032418952, 'dot_f1_threshold': 0.8986777663230896, 'dot_precision': 0.6539735099337748, 'dot_recall': 0.395, 'dot_ap': 0.47007548398718707, 'max_accuracy': 0.9939207920792079, 'max_f1': 0.6293103448275862, 'max_precision': 0.6822429906542056, 'max_recall': 0.584, 'max_ap': 0.6386707007383838, 'main_score': 0.6386707007383838, 'hf_subset': 'default', 'languages': ['eng-Latn']}]} evaluation_time=1.9629368782043457 kg_co2_emissions=None"
|
7 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
mteb_results/mteb_report.txt
DELETED
@@ -1,21 +0,0 @@
|
|
1 |
-
================================================================================
|
2 |
-
MTEB Evaluation Report
|
3 |
-
================================================================================
|
4 |
-
|
5 |
-
Model: gte-Qwen2-7B-instruct-M2V-Distilled
|
6 |
-
Model Path: .
|
7 |
-
Evaluation Time: 1235.71 seconds
|
8 |
-
Total Datasets: 1
|
9 |
-
|
10 |
-
Summary Statistics:
|
11 |
-
Average Score: 0.0501
|
12 |
-
Median Score: 0.0501
|
13 |
-
Standard Deviation: 0.0000
|
14 |
-
Score Range: 0.0501 - 0.0501
|
15 |
-
|
16 |
-
Detailed Results:
|
17 |
-
--------------------------------------------------
|
18 |
-
Model Average (All) Average (MTEB) Classification Clustering PairClassification Reranking Retrieval STS Summarization PEARL WordSim
|
19 |
-
gte-Qwen2-7B-instruct-M2V-Distilled nan nan nan nan nan nan 5.01 nan nan nan nan
|
20 |
-
|
21 |
-
================================================================================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
mteb_results/mteb_summary.json
DELETED
@@ -1,20 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"model_name": "gte-Qwen2-7B-instruct-M2V-Distilled",
|
3 |
-
"evaluation_time_seconds": 1235.7057559490204,
|
4 |
-
"task_scores": {
|
5 |
-
"gte-Qwen2-7B-instruct-M2V-Distilled": {
|
6 |
-
"task_means": "Classification NaN\nClustering NaN\nPairClassification NaN\nReranking NaN\nRetrieval 0.0501\nSTS NaN\nSummarization NaN\nPEARL NaN\nWordSim NaN\ndtype: float64",
|
7 |
-
"dataset_scores": {
|
8 |
-
"CQADupstack": 0.0501
|
9 |
-
}
|
10 |
-
}
|
11 |
-
},
|
12 |
-
"summary_stats": {
|
13 |
-
"total_datasets": 1,
|
14 |
-
"average_score": 0.0501,
|
15 |
-
"median_score": 0.0501,
|
16 |
-
"std_dev": 0.0,
|
17 |
-
"min_score": 0.0501,
|
18 |
-
"max_score": 0.0501
|
19 |
-
}
|
20 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pyproject.toml
CHANGED
@@ -12,6 +12,7 @@ dependencies = [
|
|
12 |
"matplotlib>=3.10.3",
|
13 |
"model2vec[train]>=0.5.0",
|
14 |
"mteb>=1.14.15",
|
|
|
15 |
"psutil>=7.0.0",
|
16 |
"scikit-learn>=1.6.1",
|
17 |
"sentence-transformers>=4.1.0",
|
|
|
12 |
"matplotlib>=3.10.3",
|
13 |
"model2vec[train]>=0.5.0",
|
14 |
"mteb>=1.14.15",
|
15 |
+
"numpy>=1.26.4",
|
16 |
"psutil>=7.0.0",
|
17 |
"scikit-learn>=1.6.1",
|
18 |
"sentence-transformers>=4.1.0",
|
uv.lock
CHANGED
@@ -498,6 +498,7 @@ dependencies = [
|
|
498 |
{ name = "matplotlib" },
|
499 |
{ name = "model2vec", extra = ["train"] },
|
500 |
{ name = "mteb" },
|
|
|
501 |
{ name = "psutil" },
|
502 |
{ name = "scikit-learn" },
|
503 |
{ name = "sentence-transformers" },
|
@@ -519,6 +520,7 @@ requires-dist = [
|
|
519 |
{ name = "matplotlib", specifier = ">=3.10.3" },
|
520 |
{ name = "model2vec", extras = ["train"], specifier = ">=0.5.0" },
|
521 |
{ name = "mteb", specifier = ">=1.14.15" },
|
|
|
522 |
{ name = "psutil", specifier = ">=7.0.0" },
|
523 |
{ name = "scikit-learn", specifier = ">=1.6.1" },
|
524 |
{ name = "sentence-transformers", specifier = ">=4.1.0" },
|
|
|
498 |
{ name = "matplotlib" },
|
499 |
{ name = "model2vec", extra = ["train"] },
|
500 |
{ name = "mteb" },
|
501 |
+
{ name = "numpy" },
|
502 |
{ name = "psutil" },
|
503 |
{ name = "scikit-learn" },
|
504 |
{ name = "sentence-transformers" },
|
|
|
520 |
{ name = "matplotlib", specifier = ">=3.10.3" },
|
521 |
{ name = "model2vec", extras = ["train"], specifier = ">=0.5.0" },
|
522 |
{ name = "mteb", specifier = ">=1.14.15" },
|
523 |
+
{ name = "numpy", specifier = ">=1.26.4" },
|
524 |
{ name = "psutil", specifier = ">=7.0.0" },
|
525 |
{ name = "scikit-learn", specifier = ">=1.6.1" },
|
526 |
{ name = "sentence-transformers", specifier = ">=4.1.0" },
|