""" Create individual markdown reports for each BEIR dataset Analyze Score 1 vs Score 2 meanings with examples """ import json import os import random from pathlib import Path from collections import defaultdict def create_dataset_report(dataset_name, beir_base_path): """Create a detailed markdown report for a single dataset""" dataset_path = Path(beir_base_path) / dataset_name if not dataset_path.exists(): return None report = [] report.append(f"# {dataset_name.upper()} Dataset Analysis\n") report.append(f"## Overview\n") # Check which splits exist qrels_path = dataset_path / "qrels" if not qrels_path.exists(): report.append("No qrels directory found.\n") return "\n".join(report) # Load queries queries = {} queries_file = dataset_path / "queries.jsonl" if queries_file.exists(): with open(queries_file, 'r') as f: for line in f: q = json.loads(line) queries[q['_id']] = q['text'] report.append(f"- Total queries available: {len(queries)}\n") # Load MORE corpus for better examples corpus = {} corpus_count = 0 corpus_file = dataset_path / "corpus.jsonl" if corpus_file.exists(): with open(corpus_file, 'r') as f: for i, line in enumerate(f): corpus_count += 1 if i < 50000: # Load MUCH MORE for examples doc = json.loads(line) corpus[doc['_id']] = { 'title': doc.get('title', ''), 'text': doc.get('text', '') } report.append(f"- Total corpus documents: {corpus_count}\n") report.append(f"- Corpus documents loaded for examples: {len(corpus)}\n") # Analyze each split split_data = {} all_score_examples = {} # Collect examples across all splits for split in ["train", "dev", "test"]: qrel_file = qrels_path / f"{split}.tsv" if qrel_file.exists(): score_dist = {} queries_set = set() total_pairs = 0 examples_by_query = {} with open(qrel_file, 'r') as f: # Skip header if exists first_line = f.readline().strip() if not first_line.startswith("query"): f.seek(0) for line in f: parts = line.strip().split('\t') if len(parts) >= 3: if len(parts) == 3: query_id, doc_id, score = parts else: # len(parts) == 4, TREC format query_id = parts[0] doc_id = parts[2] score = parts[3] score = int(float(score)) queries_set.add(query_id) score_dist[score] = score_dist.get(score, 0) + 1 total_pairs += 1 # Collect ALL examples WITH SPLIT INFO if query_id in queries and doc_id in corpus: if score not in all_score_examples: all_score_examples[score] = {} key = f"{split}:{query_id}" if key not in all_score_examples[score]: all_score_examples[score][key] = [] all_score_examples[score][key].append(doc_id) if query_id not in examples_by_query: examples_by_query[query_id] = {} if score not in examples_by_query[query_id]: examples_by_query[query_id][score] = [] examples_by_query[query_id][score].append(doc_id) split_data[split] = { "queries": len(queries_set), "total_pairs": total_pairs, "score_dist": score_dist, "examples": examples_by_query } # Write split statistics report.append("\n## Split Statistics\n") report.append("| Split | Queries | Total Pairs | Score Distribution |\n") report.append("|-------|---------|-------------|-------------------|\n") for split in ["train", "dev", "test"]: if split in split_data: data = split_data[split] scores_str = ", ".join([f"{k}:{v}" for k, v in sorted(data["score_dist"].items())]) report.append(f"| {split} | {data['queries']} | {data['total_pairs']} | {scores_str} |\n") else: report.append(f"| {split} | N/A | N/A | N/A |\n") # Analyze score meanings report.append("\n## Score Analysis\n") unique_scores = set() for split in split_data.values(): unique_scores.update(split["score_dist"].keys()) if 
    if len(unique_scores) == 1:
        report.append(f"**Only one score level ({list(unique_scores)[0]}) found in this dataset.**\n")
        report.append("- This means all labeled documents are considered equally relevant\n")
        report.append("- Unlabeled documents (not in qrels) are implicitly score 0 (not relevant)\n")
    else:
        report.append(f"**Multiple score levels found: {sorted(unique_scores)}**\n")

    # Add examples
    report.append("\n## Examples\n")

    # Show multiple examples with score comparisons
    report.append("### Score Comparison Examples\n")

    # Find queries judged at more than one score level
    queries_with_multiple_scores = []
    for split_name, data in split_data.items():
        for query_id, scores_dict in data["examples"].items():
            if len(scores_dict) > 1 and query_id in queries:
                queries_with_multiple_scores.append((split_name, query_id, scores_dict))

    # Show up to 5 comparison examples
    if queries_with_multiple_scores:
        for i, (split_name, query_id, scores_dict) in enumerate(queries_with_multiple_scores[:5], 1):
            report.append(f"#### Comparison Example {i}\n")
            report.append(f"**Query:** {queries[query_id]}\n")
            report.append(f"**From:** {split_name} split\n\n")

            for score in sorted(scores_dict.keys(), reverse=True):
                doc_ids = scores_dict[score]
                report.append(f"**Score {score} Documents:**\n")
                shown = 0
                for doc_id in doc_ids[:3]:  # Show up to 3 docs per score
                    if doc_id in corpus:
                        doc = corpus[doc_id]
                        shown += 1
                        report.append(f"\n*Document {shown}:*\n")
                        if doc['title']:
                            report.append(f"- Title: {doc['title'][:200]}\n")
                        text_preview = doc['text'][:400].replace('\n', ' ')
                        report.append(f"- Text: {text_preview}...\n")
                report.append("\n")
            report.append("---\n\n")

    # Show examples per split
    report.append("### Examples by Split and Score\n")
    for split in ["train", "dev", "test"]:
        if split not in split_data:
            continue
        report.append(f"\n#### {split.upper()} Split\n")

        # Get scores for this split
        split_scores = sorted(split_data[split]["score_dist"].keys())
        report.append(f"**Scores in {split}: {split_scores}**\n")

        for score in split_scores:
            report.append(f"\n##### Score {score} Examples ({split})\n")

            # Find examples for this score in this split
            examples_shown = 0
            for key, doc_ids in all_score_examples.get(score, {}).items():
                if key.startswith(f"{split}:"):
                    query_id = key.split(':', 1)[1]
                    if query_id in queries and examples_shown < 10:  # Show up to 10 examples per score
                        examples_shown += 1
                        report.append(f"\n**Example {examples_shown}:**\n")
                        report.append(f"- Query: {queries[query_id]}\n")

                        # Show only the first judged doc for this query
                        doc_id = doc_ids[0]
                        if doc_id in corpus:
                            doc = corpus[doc_id]
                            if doc['title']:
                                report.append(f"- Doc Title: {doc['title']}\n")
                            text_preview = doc['text'][:400].replace('\n', ' ')
                            report.append(f"- Doc Text: {text_preview}...\n")

            if examples_shown == 0:
                report.append("No examples found in loaded corpus.\n")

    # Add sections to be filled
    report.append("\n## Training Recommendations\n")
    report.append("[TO BE FILLED BASED ON ANALYSIS]\n")

    report.append("\n## Observations\n")

    # Check for train/test mismatch
    if "train" in split_data and "test" in split_data:
        train_scores = set(split_data["train"]["score_dist"].keys())
        test_scores = set(split_data["test"]["score_dist"].keys())
        if train_scores != test_scores:
            report.append("⚠️ **Score mismatch between train and test:**\n")
            report.append(f"- Train has scores: {sorted(train_scores)}\n")
            report.append(f"- Test has scores: {sorted(test_scores)}\n")
            report.append("- This could cause issues when training models\n")
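    # A train split with a single positive level still supports contrastive
    # training (positives vs. sampled negatives), but not graded-relevance
    # objectives; the check below flags that case.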
set(split_data["train"]["score_dist"].keys()) if len(train_scores) == 1 and 1 in train_scores: report.append(f"⚠️ **Training data only has Score 1:**\n") report.append(f"- All training documents are marked as equally relevant\n") report.append(f"- Model cannot learn to distinguish relevance levels\n") report.append(f"- May indicate collapsed/merged relevance levels\n") return "\n".join(report) def main(): beir_base = "../beir_data" output_dir = Path(beir_base).parent / "test_scores" / "dataset_reports" output_dir.mkdir(exist_ok=True) # List all datasets datasets = [d for d in os.listdir(beir_base) if os.path.isdir(os.path.join(beir_base, d))] print("="*80) print("Creating individual dataset reports...") print("="*80) for dataset in sorted(datasets): print(f"Processing {dataset}...") report = create_dataset_report(dataset, beir_base) if report: # Save report output_file = output_dir / f"{dataset}_analysis.md" with open(output_file, 'w', encoding='utf-8') as f: f.write(report) print(f" ✓ Report saved to: {output_file}") print(f"\nAll reports saved in: {output_dir}") if __name__ == "__main__": main()