# train-mbed/test_scores/analyze_each_dataset.py
"""
Create individual markdown reports for each BEIR dataset
Analyze Score 1 vs Score 2 meanings with examples
"""
import json
import os
from pathlib import Path
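
# Layout the script expects under each <beir_base>/<dataset>/ directory:
#   corpus.jsonl   - one JSON object per line with "_id" (and optionally "title", "text")
#   queries.jsonl  - one JSON object per line with "_id" and "text"
#   qrels/{train,dev,test}.tsv - tab-separated relevance judgments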


def create_dataset_report(dataset_name, beir_base_path):
    """Create a detailed markdown report for a single dataset"""
    dataset_path = Path(beir_base_path) / dataset_name
    if not dataset_path.exists():
        return None

    report = []
    report.append(f"# {dataset_name.upper()} Dataset Analysis\n")
    report.append("## Overview\n")

    # Check which splits exist
    qrels_path = dataset_path / "qrels"
    if not qrels_path.exists():
        report.append("No qrels directory found.\n")
        return "\n".join(report)

    # Load queries
    queries = {}
    queries_file = dataset_path / "queries.jsonl"
    if queries_file.exists():
        with open(queries_file, 'r', encoding='utf-8') as f:
            for line in f:
                q = json.loads(line)
                queries[q['_id']] = q['text']
    report.append(f"- Total queries available: {len(queries)}\n")
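
    # queries maps query _id -> query text. Qrels rows whose query is not found
    # here still count toward the statistics but contribute no displayed examples.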

    # Load a large slice of the corpus so example documents can be resolved
    corpus = {}
    corpus_count = 0
    corpus_file = dataset_path / "corpus.jsonl"
    if corpus_file.exists():
        with open(corpus_file, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                corpus_count += 1
                if i < 50000:  # Keep at most the first 50,000 documents in memory
                    doc = json.loads(line)
                    corpus[doc['_id']] = {
                        'title': doc.get('title', ''),
                        'text': doc.get('text', '')
                    }
    report.append(f"- Total corpus documents: {corpus_count}\n")
    report.append(f"- Corpus documents loaded for examples: {len(corpus)}\n")
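
    # The qrels parsing below accepts two tab-separated layouts:
    #   3 columns: query_id <TAB> doc_id <TAB> score
    #   4 columns (TREC style): query_id <TAB> iteration <TAB> doc_id <TAB> score
    # A header row starting with "query" (e.g. "query-id", "corpus-id", "score") is skipped.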

    # Analyze each split
    split_data = {}
    all_score_examples = {}  # Collect examples across all splits
    for split in ["train", "dev", "test"]:
        qrel_file = qrels_path / f"{split}.tsv"
        if qrel_file.exists():
            score_dist = {}
            queries_set = set()
            total_pairs = 0
            examples_by_query = {}
            with open(qrel_file, 'r', encoding='utf-8') as f:
                # Skip the header row if one exists
                first_line = f.readline().strip()
                if not first_line.startswith("query"):
                    f.seek(0)
                for line in f:
                    parts = line.strip().split('\t')
                    if len(parts) >= 3:
                        if len(parts) == 3:
                            query_id, doc_id, score = parts
                        else:  # len(parts) == 4, TREC format: query_id, iteration, doc_id, score
                            query_id = parts[0]
                            doc_id = parts[2]
                            score = parts[3]
                        score = int(float(score))
                        queries_set.add(query_id)
                        score_dist[score] = score_dist.get(score, 0) + 1
                        total_pairs += 1

                        # Collect all examples, keyed by split and query
                        if query_id in queries and doc_id in corpus:
                            if score not in all_score_examples:
                                all_score_examples[score] = {}
                            key = f"{split}:{query_id}"
                            if key not in all_score_examples[score]:
                                all_score_examples[score][key] = []
                            all_score_examples[score][key].append(doc_id)

                            if query_id not in examples_by_query:
                                examples_by_query[query_id] = {}
                            if score not in examples_by_query[query_id]:
                                examples_by_query[query_id][score] = []
                            examples_by_query[query_id][score].append(doc_id)

            split_data[split] = {
                "queries": len(queries_set),
                "total_pairs": total_pairs,
                "score_dist": score_dist,
                "examples": examples_by_query
            }
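
    # Shape reference: split_data maps each split name to
    #   {"queries": int, "total_pairs": int, "score_dist": {score: count},
    #    "examples": {query_id: {score: [doc_id, ...]}}}
    # while all_score_examples maps score -> "split:query_id" -> [doc_id, ...].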

    # Write split statistics
    report.append("\n## Split Statistics\n")
    report.append("| Split | Queries | Total Pairs | Score Distribution |\n")
    report.append("|-------|---------|-------------|-------------------|\n")
    for split in ["train", "dev", "test"]:
        if split in split_data:
            data = split_data[split]
            scores_str = ", ".join([f"{k}:{v}" for k, v in sorted(data["score_dist"].items())])
            report.append(f"| {split} | {data['queries']} | {data['total_pairs']} | {scores_str} |\n")
        else:
            report.append(f"| {split} | N/A | N/A | N/A |\n")

    # Analyze score meanings
    report.append("\n## Score Analysis\n")
    unique_scores = set()
    for split in split_data.values():
        unique_scores.update(split["score_dist"].keys())

    if len(unique_scores) == 1:
        report.append(f"**Only one score level ({list(unique_scores)[0]}) found in this dataset.**\n")
        report.append("- This means all labeled documents are considered equally relevant\n")
        report.append("- Unlabeled documents (not in qrels) are implicitly score 0 (not relevant)\n")
    else:
        report.append(f"**Multiple score levels found: {sorted(unique_scores)}**\n")

    # Add examples
    report.append("\n## Examples\n")

    # Show multiple examples with score comparisons
    report.append("### Score Comparison Examples\n")

    # Find queries with multiple score levels
    queries_with_multiple_scores = []
    for split_name, split in split_data.items():
        for query_id, scores_dict in split["examples"].items():
            if len(scores_dict) > 1 and query_id in queries:
                queries_with_multiple_scores.append((split_name, query_id, scores_dict))

    # Show up to 5 comparison examples
    if queries_with_multiple_scores:
        for i, (split_name, query_id, scores_dict) in enumerate(queries_with_multiple_scores[:5], 1):
            report.append(f"#### Comparison Example {i}\n")
            report.append(f"**Query:** {queries[query_id]}\n")
            report.append(f"**From:** {split_name} split\n\n")
            for score in sorted(scores_dict.keys(), reverse=True):
                doc_ids = scores_dict[score]
                report.append(f"**Score {score} Documents:**\n")
                shown = 0
                for doc_id in doc_ids[:3]:  # Show up to 3 docs per score
                    if doc_id in corpus:
                        doc = corpus[doc_id]
                        shown += 1
                        report.append(f"\n*Document {shown}:*\n")
                        if doc['title']:
                            report.append(f"- Title: {doc['title'][:200]}\n")
                        text_preview = doc['text'][:400].replace('\n', ' ')
                        report.append(f"- Text: {text_preview}...\n")
                report.append("\n")
            report.append("---\n\n")

    # Show examples per split and score
    report.append("### Examples by Split and Score\n")
    for split in ["train", "dev", "test"]:
        if split not in split_data:
            continue
        report.append(f"\n#### {split.upper()} Split\n")

        # Get scores for this split
        split_scores = sorted(split_data[split]["score_dist"].keys())
        report.append(f"**Scores in {split}: {split_scores}**\n")

        for score in split_scores:
            report.append(f"\n##### Score {score} Examples ({split})\n")

            # Find examples for this score in this split
            examples_shown = 0
            for key, doc_ids in all_score_examples.get(score, {}).items():
                if key.startswith(f"{split}:"):
                    query_id = key.split(':', 1)[1]
                    if query_id in queries and examples_shown < 10:  # Show up to 10 examples per score
                        examples_shown += 1
                        report.append(f"\n**Example {examples_shown}:**\n")
                        report.append(f"- Query: {queries[query_id]}\n")
                        # Show only the first document for this query/score
                        for doc_id in doc_ids[:1]:
                            if doc_id in corpus:
                                doc = corpus[doc_id]
                                if doc['title']:
                                    report.append(f"- Doc Title: {doc['title']}\n")
                                text_preview = doc['text'].replace('\n', ' ')
                                report.append(f"- Doc Text: {text_preview}...\n")
                            break

            if examples_shown == 0:
                report.append("No examples found in loaded corpus.\n")

    # Add sections to be filled
    report.append("\n## Training Recommendations\n")
    report.append("[TO BE FILLED BASED ON ANALYSIS]\n")

    report.append("\n## Observations\n")

    # Check for train/test mismatch
    if "train" in split_data and "test" in split_data:
        train_scores = set(split_data["train"]["score_dist"].keys())
        test_scores = set(split_data["test"]["score_dist"].keys())
        if train_scores != test_scores:
            report.append("⚠️ **Score mismatch between train and test:**\n")
            report.append(f"- Train has scores: {sorted(train_scores)}\n")
            report.append(f"- Test has scores: {sorted(test_scores)}\n")
            report.append("- This could cause issues when training models\n")

    # Check for single score in train
    if "train" in split_data:
        train_scores = set(split_data["train"]["score_dist"].keys())
        if len(train_scores) == 1 and 1 in train_scores:
            report.append("⚠️ **Training data only has Score 1:**\n")
            report.append("- All training documents are marked as equally relevant\n")
            report.append("- Model cannot learn to distinguish relevance levels\n")
            report.append("- May indicate collapsed/merged relevance levels\n")

    return "\n".join(report)
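
# Quick-inspection sketch (hypothetical, not part of the script): build a single
# report directly, assuming a dataset named "scifact" is unpacked under ../beir_data:
#   report_md = create_dataset_report("scifact", "../beir_data")
#   if report_md:
#       print(report_md[:1000])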


def main():
    beir_base = "../beir_data"
    output_dir = Path(beir_base).parent / "test_scores" / "dataset_reports"
    output_dir.mkdir(parents=True, exist_ok=True)

    # List all datasets (every directory under beir_base)
    datasets = [d for d in os.listdir(beir_base)
                if os.path.isdir(os.path.join(beir_base, d))]

    print("=" * 80)
    print("Creating individual dataset reports...")
    print("=" * 80)

    for dataset in sorted(datasets):
        print(f"Processing {dataset}...")
        report = create_dataset_report(dataset, beir_base)
        if report:
            # Save report
            output_file = output_dir / f"{dataset}_analysis.md"
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(report)
            print(f"  ✓ Report saved to: {output_file}")

    print(f"\nAll reports saved in: {output_dir}")
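
# Usage sketch (assumes this script lives in test_scores/ next to a beir_data/
# directory of unpacked BEIR datasets, as the relative paths above imply):
#   cd test_scores && python analyze_each_dataset.py
# Reports are written to test_scores/dataset_reports/<dataset>_analysis.md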


if __name__ == "__main__":
    main()