""" | |
Create individual markdown reports for each BEIR dataset | |
Analyze Score 1 vs Score 2 meanings with examples | |
""" | |
import json | |
import os | |
import random | |
from pathlib import Path | |
from collections import defaultdict | |
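
# The script assumes the standard BEIR on-disk layout for each dataset directory
# (this matches the fields read below; adjust if your dump differs):
#   <dataset>/corpus.jsonl                 one JSON object per line: {"_id": ..., "title": ..., "text": ...}
#   <dataset>/queries.jsonl                one JSON object per line: {"_id": ..., "text": ...}
#   <dataset>/qrels/{train,dev,test}.tsv   tab-separated relevance judgments: query-id, corpus-id, score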


def create_dataset_report(dataset_name, beir_base_path):
    """Create a detailed markdown report for a single dataset."""
    dataset_path = Path(beir_base_path) / dataset_name
    if not dataset_path.exists():
        return None

    report = []
    report.append(f"# {dataset_name.upper()} Dataset Analysis\n")
    report.append("## Overview\n")

    # Check which splits exist
    qrels_path = dataset_path / "qrels"
    if not qrels_path.exists():
        report.append("No qrels directory found.\n")
        return "\n".join(report)

    # Load queries
    queries = {}
    queries_file = dataset_path / "queries.jsonl"
    if queries_file.exists():
        with open(queries_file, 'r') as f:
            for line in f:
                q = json.loads(line)
                queries[q['_id']] = q['text']
    report.append(f"- Total queries available: {len(queries)}\n")

    # Load a large slice of the corpus so example documents can be shown
    corpus = {}
    corpus_count = 0
    corpus_file = dataset_path / "corpus.jsonl"
    if corpus_file.exists():
        with open(corpus_file, 'r') as f:
            for i, line in enumerate(f):
                corpus_count += 1
                if i < 50000:  # Cap the number of loaded documents to bound memory use
                    doc = json.loads(line)
                    corpus[doc['_id']] = {
                        'title': doc.get('title', ''),
                        'text': doc.get('text', '')
                    }
    report.append(f"- Total corpus documents: {corpus_count}\n")
    report.append(f"- Corpus documents loaded for examples: {len(corpus)}\n")

    # Analyze each split
    split_data = {}
    all_score_examples = {}  # score -> "split:query_id" -> [doc_ids], collected across all splits
    for split in ["train", "dev", "test"]:
        qrel_file = qrels_path / f"{split}.tsv"
        if qrel_file.exists():
            score_dist = {}
            queries_set = set()
            total_pairs = 0
            examples_by_query = {}
            with open(qrel_file, 'r') as f:
                # Skip the header line if present
                first_line = f.readline().strip()
                if not first_line.startswith("query"):
                    f.seek(0)
                for line in f:
                    parts = line.strip().split('\t')
                    if len(parts) >= 3:
                        if len(parts) == 3:
                            # BEIR format: query-id, corpus-id, score
                            query_id, doc_id, score = parts
                        else:
                            # len(parts) == 4, TREC format: query-id, Q0, doc-id, score
                            query_id = parts[0]
                            doc_id = parts[2]
                            score = parts[3]
                        score = int(float(score))
                        queries_set.add(query_id)
                        score_dist[score] = score_dist.get(score, 0) + 1
                        total_pairs += 1

                        # Collect examples, keyed by split and query
                        if query_id in queries and doc_id in corpus:
                            if score not in all_score_examples:
                                all_score_examples[score] = {}
                            key = f"{split}:{query_id}"
                            if key not in all_score_examples[score]:
                                all_score_examples[score][key] = []
                            all_score_examples[score][key].append(doc_id)

                            if query_id not in examples_by_query:
                                examples_by_query[query_id] = {}
                            if score not in examples_by_query[query_id]:
                                examples_by_query[query_id][score] = []
                            examples_by_query[query_id][score].append(doc_id)
            split_data[split] = {
                "queries": len(queries_set),
                "total_pairs": total_pairs,
                "score_dist": score_dist,
                "examples": examples_by_query
            }

    # Write split statistics
    report.append("\n## Split Statistics\n")
    report.append("| Split | Queries | Total Pairs | Score Distribution |\n")
    report.append("|-------|---------|-------------|-------------------|\n")
    for split in ["train", "dev", "test"]:
        if split in split_data:
            data = split_data[split]
            scores_str = ", ".join([f"{k}:{v}" for k, v in sorted(data["score_dist"].items())])
            report.append(f"| {split} | {data['queries']} | {data['total_pairs']} | {scores_str} |\n")
        else:
            report.append(f"| {split} | N/A | N/A | N/A |\n")

    # Analyze score meanings
    report.append("\n## Score Analysis\n")
    unique_scores = set()
    for split in split_data.values():
        unique_scores.update(split["score_dist"].keys())
    if len(unique_scores) == 1:
        report.append(f"**Only one score level ({list(unique_scores)[0]}) found in this dataset.**\n")
        report.append("- This means all labeled documents are considered equally relevant\n")
        report.append("- Unlabeled documents (not in qrels) are implicitly score 0 (not relevant)\n")
    else:
        report.append(f"**Multiple score levels found: {sorted(unique_scores)}**\n")

    # Add examples
    report.append("\n## Examples\n")

    # Show multiple examples with score comparisons
    report.append("### Score Comparison Examples\n")

    # Find queries with multiple score levels
    queries_with_multiple_scores = []
    for split_name, split in split_data.items():
        for query_id, scores_dict in split["examples"].items():
            if len(scores_dict) > 1 and query_id in queries:
                queries_with_multiple_scores.append((split_name, query_id, scores_dict))

    # Show up to 5 comparison examples
    if queries_with_multiple_scores:
        for i, (split_name, query_id, scores_dict) in enumerate(queries_with_multiple_scores[:5], 1):
            report.append(f"#### Comparison Example {i}\n")
            report.append(f"**Query:** {queries[query_id]}\n")
            report.append(f"**From:** {split_name} split\n\n")
            for score in sorted(scores_dict.keys(), reverse=True):
                doc_ids = scores_dict[score]
                report.append(f"**Score {score} Documents:**\n")
                shown = 0
                for doc_id in doc_ids[:3]:  # Show up to 3 docs per score
                    if doc_id in corpus:
                        doc = corpus[doc_id]
                        shown += 1
                        report.append(f"\n*Document {shown}:*\n")
                        if doc['title']:
                            report.append(f"- Title: {doc['title'][:200]}\n")
                        text_preview = doc['text'][:400].replace('\n', ' ')
                        report.append(f"- Text: {text_preview}...\n")
                report.append("\n")
            report.append("---\n\n")

    # Show examples per split
    report.append("### Examples by Split and Score\n")
    for split in ["train", "dev", "test"]:
        if split not in split_data:
            continue
        report.append(f"\n#### {split.upper()} Split\n")

        # Get scores for this split
        split_scores = sorted(split_data[split]["score_dist"].keys())
        report.append(f"**Scores in {split}: {split_scores}**\n")

        for score in split_scores:
            report.append(f"\n##### Score {score} Examples ({split})\n")

            # Find examples for this score in this split
            examples_shown = 0
            for key, doc_ids in all_score_examples.get(score, {}).items():
                if key.startswith(f"{split}:"):
                    query_id = key.split(':', 1)[1]
                    if query_id in queries and examples_shown < 10:  # Show up to 10 examples per score
                        examples_shown += 1
                        report.append(f"\n**Example {examples_shown}:**\n")
                        report.append(f"- Query: {queries[query_id]}\n")

                        # Show only the first document for this query/score pair
                        for doc_id in doc_ids[:1]:
                            if doc_id in corpus:
                                doc = corpus[doc_id]
                                if doc['title']:
                                    report.append(f"- Doc Title: {doc['title']}\n")
                                text_preview = doc['text'][:400].replace('\n', ' ')  # truncate like the comparison examples above
                                report.append(f"- Doc Text: {text_preview}...\n")
                            break
            if examples_shown == 0:
                report.append("No examples found in loaded corpus.\n")

    # Add sections to be filled
    report.append("\n## Training Recommendations\n")
    report.append("[TO BE FILLED BASED ON ANALYSIS]\n")

    report.append("\n## Observations\n")

    # Check for a train/test score mismatch
    if "train" in split_data and "test" in split_data:
        train_scores = set(split_data["train"]["score_dist"].keys())
        test_scores = set(split_data["test"]["score_dist"].keys())
        if train_scores != test_scores:
            report.append("⚠️ **Score mismatch between train and test:**\n")
            report.append(f"- Train has scores: {sorted(train_scores)}\n")
            report.append(f"- Test has scores: {sorted(test_scores)}\n")
            report.append("- This could cause issues when training models\n")

    # Check for a single score level in train
    if "train" in split_data:
        train_scores = set(split_data["train"]["score_dist"].keys())
        if len(train_scores) == 1 and 1 in train_scores:
            report.append("⚠️ **Training data only has Score 1:**\n")
            report.append("- All training documents are marked as equally relevant\n")
            report.append("- Model cannot learn to distinguish relevance levels\n")
            report.append("- May indicate collapsed/merged relevance levels\n")

    return "\n".join(report)


def main():
    beir_base = "../beir_data"
    output_dir = Path(beir_base).parent / "test_scores" / "dataset_reports"
    output_dir.mkdir(parents=True, exist_ok=True)  # parents=True so test_scores/ is created if missing

    # List all datasets
    datasets = [d for d in os.listdir(beir_base)
                if os.path.isdir(os.path.join(beir_base, d))]

    print("=" * 80)
    print("Creating individual dataset reports...")
    print("=" * 80)

    for dataset in sorted(datasets):
        print(f"Processing {dataset}...")
        report = create_dataset_report(dataset, beir_base)
        if report:
            # Save report
            output_file = output_dir / f"{dataset}_analysis.md"
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(report)
            print(f"  ✓ Report saved to: {output_file}")

    print(f"\nAll reports saved in: {output_dir}")


if __name__ == "__main__":
    main()
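
# Example invocation (a sketch; the filename below is illustrative, use whatever name this
# file is saved under). With ../beir_data holding one subdirectory per BEIR dataset:
#
#   python create_dataset_reports.py
#
# One markdown report per dataset is written to ../test_scores/dataset_reports/<dataset>_analysis.md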