"""
Create individual markdown reports for each BEIR dataset
Analyze Score 1 vs Score 2 meanings with examples
"""
import json
import os
from pathlib import Path
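
# Expected on-disk layout for each dataset (assumed from the loaders below;
# adjust if your BEIR download is organized differently):
#
#   <beir_base_path>/<dataset_name>/
#       corpus.jsonl    # one JSON object per line: {"_id", "title", "text"}
#       queries.jsonl   # one JSON object per line: {"_id", "text"}
#       qrels/
#           train.tsv, dev.tsv, test.tsv   # relevance judgments per split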


def create_dataset_report(dataset_name, beir_base_path):
    """Create a detailed markdown report for a single dataset."""
    dataset_path = Path(beir_base_path) / dataset_name
    if not dataset_path.exists():
        return None

    report = []
    report.append(f"# {dataset_name.upper()} Dataset Analysis\n")
    report.append("## Overview\n")

    # Check which splits exist
    qrels_path = dataset_path / "qrels"
    if not qrels_path.exists():
        report.append("No qrels directory found.\n")
        return "\n".join(report)
    # Load queries
    queries = {}
    queries_file = dataset_path / "queries.jsonl"
    if queries_file.exists():
        with open(queries_file, 'r') as f:
            for line in f:
                q = json.loads(line)
                queries[q['_id']] = q['text']
    report.append(f"- Total queries available: {len(queries)}\n")

    # Load a large slice of the corpus so example documents can be resolved
    corpus = {}
    corpus_count = 0
    corpus_file = dataset_path / "corpus.jsonl"
    if corpus_file.exists():
        with open(corpus_file, 'r') as f:
            for i, line in enumerate(f):
                corpus_count += 1
                if i < 50000:  # cap the in-memory sample used for examples
                    doc = json.loads(line)
                    corpus[doc['_id']] = {
                        'title': doc.get('title', ''),
                        'text': doc.get('text', '')
                    }
    report.append(f"- Total corpus documents: {corpus_count}\n")
    report.append(f"- Corpus documents loaded for examples: {len(corpus)}\n")
    # Analyze each split
    split_data = {}
    all_score_examples = {}  # Collect examples across all splits

    for split in ["train", "dev", "test"]:
        qrel_file = qrels_path / f"{split}.tsv"
        if qrel_file.exists():
            score_dist = {}
            queries_set = set()
            total_pairs = 0
            examples_by_query = {}

            with open(qrel_file, 'r') as f:
                # Skip the header line if it exists
                first_line = f.readline().strip()
                if not first_line.startswith("query"):
                    f.seek(0)

                for line in f:
                    parts = line.strip().split('\t')
                    if len(parts) >= 3:
                        if len(parts) == 3:
                            query_id, doc_id, score = parts
                        else:  # len(parts) == 4, TREC format
                            query_id = parts[0]
                            doc_id = parts[2]
                            score = parts[3]

                        score = int(float(score))
                        queries_set.add(query_id)
                        score_dist[score] = score_dist.get(score, 0) + 1
                        total_pairs += 1

                        # Collect examples, remembering which split they came from
                        if query_id in queries and doc_id in corpus:
                            if score not in all_score_examples:
                                all_score_examples[score] = {}
                            key = f"{split}:{query_id}"
                            if key not in all_score_examples[score]:
                                all_score_examples[score][key] = []
                            all_score_examples[score][key].append(doc_id)

                            if query_id not in examples_by_query:
                                examples_by_query[query_id] = {}
                            if score not in examples_by_query[query_id]:
                                examples_by_query[query_id][score] = []
                            examples_by_query[query_id][score].append(doc_id)

            split_data[split] = {
                "queries": len(queries_set),
                "total_pairs": total_pairs,
                "score_dist": score_dist,
                "examples": examples_by_query
            }
    # Write split statistics
    report.append("\n## Split Statistics\n")
    report.append("| Split | Queries | Total Pairs | Score Distribution |\n")
    report.append("|-------|---------|-------------|-------------------|\n")

    for split in ["train", "dev", "test"]:
        if split in split_data:
            data = split_data[split]
            scores_str = ", ".join([f"{k}:{v}" for k, v in sorted(data["score_dist"].items())])
            report.append(f"| {split} | {data['queries']} | {data['total_pairs']} | {scores_str} |\n")
        else:
            report.append(f"| {split} | N/A | N/A | N/A |\n")
    # Analyze score meanings
    report.append("\n## Score Analysis\n")

    unique_scores = set()
    for split in split_data.values():
        unique_scores.update(split["score_dist"].keys())

    if len(unique_scores) == 1:
        report.append(f"**Only one score level ({list(unique_scores)[0]}) found in this dataset.**\n")
        report.append("- This means all labeled documents are considered equally relevant\n")
        report.append("- Unlabeled documents (not in qrels) are implicitly score 0 (not relevant)\n")
    else:
        report.append(f"**Multiple score levels found: {sorted(unique_scores)}**\n")
    # Add examples
    report.append("\n## Examples\n")

    # Show multiple examples with score comparisons
    report.append("### Score Comparison Examples\n")

    # Find queries with multiple score levels
    queries_with_multiple_scores = []
    for split_name, split in split_data.items():
        for query_id, scores_dict in split["examples"].items():
            if len(scores_dict) > 1 and query_id in queries:
                queries_with_multiple_scores.append((split_name, query_id, scores_dict))

    # Show up to 5 comparison examples
    if queries_with_multiple_scores:
        for i, (split_name, query_id, scores_dict) in enumerate(queries_with_multiple_scores[:5], 1):
            report.append(f"#### Comparison Example {i}\n")
            report.append(f"**Query:** {queries[query_id]}\n")
            report.append(f"**From:** {split_name} split\n\n")

            for score in sorted(scores_dict.keys(), reverse=True):
                doc_ids = scores_dict[score]
                report.append(f"**Score {score} Documents:**\n")

                shown = 0
                for doc_id in doc_ids[:3]:  # Show up to 3 docs per score
                    if doc_id in corpus:
                        doc = corpus[doc_id]
                        shown += 1
                        report.append(f"\n*Document {shown}:*\n")
                        if doc['title']:
                            report.append(f"- Title: {doc['title'][:200]}\n")
                        text_preview = doc['text'][:400].replace('\n', ' ')
                        report.append(f"- Text: {text_preview}...\n")
                report.append("\n")
            report.append("---\n\n")
    # Show examples per split
    report.append("### Examples by Split and Score\n")

    for split in ["train", "dev", "test"]:
        if split not in split_data:
            continue

        report.append(f"\n#### {split.upper()} Split\n")

        # Get scores for this split
        split_scores = sorted(split_data[split]["score_dist"].keys())
        report.append(f"**Scores in {split}: {split_scores}**\n")

        for score in split_scores:
            report.append(f"\n##### Score {score} Examples ({split})\n")

            # Find examples for this score in this split
            examples_shown = 0
            for key, doc_ids in all_score_examples.get(score, {}).items():
                if key.startswith(f"{split}:"):
                    query_id = key.split(':', 1)[1]
                    if query_id in queries and examples_shown < 10:  # Show up to 10 examples per score
                        examples_shown += 1
                        report.append(f"\n**Example {examples_shown}:**\n")
                        report.append(f"- Query: {queries[query_id]}\n")

                        # Show first doc
                        for doc_id in doc_ids[:1]:
                            if doc_id in corpus:
                                doc = corpus[doc_id]
                                if doc['title']:
                                    report.append(f"- Doc Title: {doc['title']}\n")
                                text_preview = doc['text'].replace('\n', ' ')
                                report.append(f"- Doc Text: {text_preview}...\n")
                                break

            if examples_shown == 0:
                report.append("No examples found in loaded corpus.\n")
    # Add sections to be filled
    report.append("\n## Training Recommendations\n")
    report.append("[TO BE FILLED BASED ON ANALYSIS]\n")

    report.append("\n## Observations\n")

    # Check for train/test mismatch
    if "train" in split_data and "test" in split_data:
        train_scores = set(split_data["train"]["score_dist"].keys())
        test_scores = set(split_data["test"]["score_dist"].keys())
        if train_scores != test_scores:
            report.append("⚠️ **Score mismatch between train and test:**\n")
            report.append(f"- Train has scores: {sorted(train_scores)}\n")
            report.append(f"- Test has scores: {sorted(test_scores)}\n")
            report.append("- This could cause issues when training models\n")

    # Check for single score in train
    if "train" in split_data:
        train_scores = set(split_data["train"]["score_dist"].keys())
        if len(train_scores) == 1 and 1 in train_scores:
            report.append("⚠️ **Training data only has Score 1:**\n")
            report.append("- All training documents are marked as equally relevant\n")
            report.append("- Model cannot learn to distinguish relevance levels\n")
            report.append("- May indicate collapsed/merged relevance levels\n")

    return "\n".join(report)


def main():
    beir_base = "../beir_data"
    output_dir = Path(beir_base).parent / "test_scores" / "dataset_reports"
    # parents=True so the intermediate "test_scores" directory is created if missing
    output_dir.mkdir(parents=True, exist_ok=True)

    # List all datasets
    datasets = [d for d in os.listdir(beir_base)
                if os.path.isdir(os.path.join(beir_base, d))]

    print("=" * 80)
    print("Creating individual dataset reports...")
    print("=" * 80)

    for dataset in sorted(datasets):
        print(f"Processing {dataset}...")
        report = create_dataset_report(dataset, beir_base)

        if report:
            # Save report
            output_file = output_dir / f"{dataset}_analysis.md"
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(report)
            print(f" ✓ Report saved to: {output_file}")

    print(f"\nAll reports saved in: {output_dir}")


if __name__ == "__main__":
    main()
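
# Usage note (path assumptions taken from main() above): run this script from a
# directory that sits next to beir_data; reports are written to
# ../test_scores/dataset_reports/<dataset>_analysis.md.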