"""
Create individual markdown reports for each BEIR dataset
Analyze Score 1 vs Score 2 meanings with examples
"""

import json
import os
from pathlib import Path
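
# Expected layout of each BEIR dataset directory (as read below):
#   <dataset>/corpus.jsonl                 one JSON object per line: "_id", "title", "text"
#   <dataset>/queries.jsonl                one JSON object per line: "_id", "text"
#   <dataset>/qrels/{train,dev,test}.tsv   tab-separated relevance judgments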

def create_dataset_report(dataset_name, beir_base_path):
    """Create a detailed markdown report for a single dataset"""
    dataset_path = Path(beir_base_path) / dataset_name
    
    if not dataset_path.exists():
        return None
    
    report = []
    report.append(f"# {dataset_name.upper()} Dataset Analysis\n")
    report.append(f"## Overview\n")
    
    # Check which splits exist
    qrels_path = dataset_path / "qrels"
    if not qrels_path.exists():
        report.append("No qrels directory found.\n")
        return "\n".join(report)
    
    # Load queries
    queries = {}
    queries_file = dataset_path / "queries.jsonl"
    if queries_file.exists():
        with open(queries_file, 'r') as f:
            for line in f:
                q = json.loads(line)
                queries[q['_id']] = q['text']
    report.append(f"- Total queries available: {len(queries)}\n")
    
    # Load a large prefix of the corpus so qrels examples can be shown with document text
    corpus = {}
    corpus_count = 0
    corpus_file = dataset_path / "corpus.jsonl"
    if corpus_file.exists():
        with open(corpus_file, 'r') as f:
            for i, line in enumerate(f):
                corpus_count += 1
                if i < 50000:  # only the first 50,000 documents are kept in memory for examples
                    doc = json.loads(line)
                    corpus[doc['_id']] = {
                        'title': doc.get('title', ''),
                        'text': doc.get('text', '')
                    }
    report.append(f"- Total corpus documents: {corpus_count}\n")
    report.append(f"- Corpus documents loaded for examples: {len(corpus)}\n")
    
    # Analyze each split
    split_data = {}
    all_score_examples = {}  # Collect examples across all splits
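    # all_score_examples structure: {score: {"<split>:<query_id>": [doc_id, ...]}}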
    
    for split in ["train", "dev", "test"]:
        qrel_file = qrels_path / f"{split}.tsv"
        
        if qrel_file.exists():
            score_dist = {}
            queries_set = set()
            total_pairs = 0
            examples_by_query = {}
            
            # Qrels files are tab-separated, optionally with a header row, and use
            # either the 3-column BEIR form (query-id, corpus-id, score) or the
            # 4-column TREC form (query-id, iteration, doc-id, relevance).
            with open(qrel_file, 'r') as f:
                # Skip the header row if present; otherwise rewind and parse from the start
                first_line = f.readline().strip()
                if not first_line.startswith("query"):
                    f.seek(0)
                
                for line in f:
                    parts = line.strip().split('\t')
                    if len(parts) >= 3:
                        if len(parts) == 3:
                            query_id, doc_id, score = parts
                        else:  # 4+ columns: TREC qrels format (query-id, iteration, doc-id, relevance)
                            query_id = parts[0]
                            doc_id = parts[2]
                            score = parts[3]
                        score = int(float(score))
                        
                        queries_set.add(query_id)
                        score_dist[score] = score_dist.get(score, 0) + 1
                        total_pairs += 1
                        
                        # Record every resolvable example, keyed by split and query, for the report sections below
                        if query_id in queries and doc_id in corpus:
                            if score not in all_score_examples:
                                all_score_examples[score] = {}
                            key = f"{split}:{query_id}"
                            if key not in all_score_examples[score]:
                                all_score_examples[score][key] = []
                            all_score_examples[score][key].append(doc_id)
                            
                            if query_id not in examples_by_query:
                                examples_by_query[query_id] = {}
                            if score not in examples_by_query[query_id]:
                                examples_by_query[query_id][score] = []
                            examples_by_query[query_id][score].append(doc_id)
            
            split_data[split] = {
                "queries": len(queries_set),
                "total_pairs": total_pairs,
                "score_dist": score_dist,
                "examples": examples_by_query
            }
    
    # Write split statistics
    report.append("\n## Split Statistics\n")
    report.append("| Split | Queries | Total Pairs | Score Distribution |\n")
    report.append("|-------|---------|-------------|-------------------|\n")
    
    for split in ["train", "dev", "test"]:
        if split in split_data:
            data = split_data[split]
            scores_str = ", ".join([f"{k}:{v}" for k, v in sorted(data["score_dist"].items())])
            report.append(f"| {split} | {data['queries']} | {data['total_pairs']} | {scores_str} |\n")
        else:
            report.append(f"| {split} | N/A | N/A | N/A |\n")
    
    # Analyze score meanings
    report.append("\n## Score Analysis\n")
    
    unique_scores = set()
    for split in split_data.values():
        unique_scores.update(split["score_dist"].keys())
    
    if len(unique_scores) == 1:
        report.append(f"**Only one score level ({list(unique_scores)[0]}) found in this dataset.**\n")
        report.append("- This means all labeled documents are considered equally relevant\n")
        report.append("- Unlabeled documents (not in qrels) are implicitly score 0 (not relevant)\n")
    else:
        report.append(f"**Multiple score levels found: {sorted(unique_scores)}**\n")

    
    # Add examples
    report.append("\n## Examples\n")
    
    # Show multiple examples with score comparisons
    report.append("### Score Comparison Examples\n")
    
    # Find queries with multiple score levels
    queries_with_multiple_scores = []
    for split_name, split in split_data.items():
        for query_id, scores_dict in split["examples"].items():
            if len(scores_dict) > 1 and query_id in queries:
                queries_with_multiple_scores.append((split_name, query_id, scores_dict))
    
    # Show up to 5 comparison examples
    if queries_with_multiple_scores:
        for i, (split_name, query_id, scores_dict) in enumerate(queries_with_multiple_scores[:5], 1):
            report.append(f"#### Comparison Example {i}\n")
            report.append(f"**Query:** {queries[query_id]}\n")
            report.append(f"**From:** {split_name} split\n\n")
            
            for score in sorted(scores_dict.keys(), reverse=True):
                doc_ids = scores_dict[score]
                report.append(f"**Score {score} Documents:**\n")
                
                shown = 0
                for doc_id in doc_ids[:3]:  # Show up to 3 docs per score
                    if doc_id in corpus:
                        doc = corpus[doc_id]
                        shown += 1
                        report.append(f"\n*Document {shown}:*\n")
                        if doc['title']:
                            report.append(f"- Title: {doc['title'][:200]}\n")
                        text_preview = doc['text'][:400].replace('\n', ' ')
                        report.append(f"- Text: {text_preview}...\n")
                
                report.append("\n")
            report.append("---\n\n")
    
    # Show examples PER SPLIT
    report.append("### Examples by Split and Score\n")
    
    for split in ["train", "dev", "test"]:
        if split not in split_data:
            continue
            
        report.append(f"\n#### {split.upper()} Split\n")
        
        # Get scores for this split
        split_scores = sorted(split_data[split]["score_dist"].keys())
        report.append(f"**Scores in {split}: {split_scores}**\n")
        
        for score in split_scores:
            report.append(f"\n##### Score {score} Examples ({split})\n")
            
            # Find examples for this score in this split
            examples_shown = 0
            for key, doc_ids in all_score_examples.get(score, {}).items():
                if key.startswith(f"{split}:"):
                    query_id = key.split(':', 1)[1]
                    if query_id in queries and examples_shown < 10:  # show up to 10 examples per score
                        examples_shown += 1
                        report.append(f"\n**Example {examples_shown}:**\n")
                        report.append(f"- Query: {queries[query_id]}\n")
                        
                        # Show first doc
                        for doc_id in doc_ids[:1]:
                            if doc_id in corpus:
                                doc = corpus[doc_id]
                                if doc['title']:
                                    report.append(f"- Doc Title: {doc['title']}\n")
                                text_preview = doc['text'][:400].replace('\n', ' ')  # truncate to match the comparison examples above
                                report.append(f"- Doc Text: {text_preview}...\n")
                                break
            
            if examples_shown == 0:
                report.append("No examples found in loaded corpus.\n")
    
    # Add sections to be filled
    report.append("\n## Training Recommendations\n")
    report.append("[TO BE FILLED BASED ON ANALYSIS]\n")
    
    report.append("\n## Observations\n")
    
    # Check for train/test mismatch
    if "train" in split_data and "test" in split_data:
        train_scores = set(split_data["train"]["score_dist"].keys())
        test_scores = set(split_data["test"]["score_dist"].keys())
        
        if train_scores != test_scores:
            report.append(f"⚠️ **Score mismatch between train and test:**\n")
            report.append(f"- Train has scores: {sorted(train_scores)}\n")
            report.append(f"- Test has scores: {sorted(test_scores)}\n")
            report.append(f"- This could cause issues when training models\n")
    
    # Check for single score in train
    if "train" in split_data:
        train_scores = set(split_data["train"]["score_dist"].keys())
        if len(train_scores) == 1 and 1 in train_scores:
            report.append(f"⚠️ **Training data only has Score 1:**\n")
            report.append(f"- All training documents are marked as equally relevant\n")
            report.append(f"- Model cannot learn to distinguish relevance levels\n")
            report.append(f"- May indicate collapsed/merged relevance levels\n")
    
    return "\n".join(report)

def main():
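    # Paths are relative to the current working directory: datasets are read
    # from ../beir_data and reports are written to ../test_scores/dataset_reports/.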
    beir_base = "../beir_data"
    output_dir = Path(beir_base).parent / "test_scores" / "dataset_reports"
    output_dir.mkdir(parents=True, exist_ok=True)  # also create test_scores/ if it does not exist yet
    
    # List all datasets
    datasets = [d for d in os.listdir(beir_base) 
                if os.path.isdir(os.path.join(beir_base, d))]
    
    print("="*80)
    print("Creating individual dataset reports...")
    print("="*80)
    
    for dataset in sorted(datasets):
        print(f"Processing {dataset}...")
        report = create_dataset_report(dataset, beir_base)
        
        if report:
            # Save report
            output_file = output_dir / f"{dataset}_analysis.md"
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(report)
            print(f"  ✓ Report saved to: {output_file}")
    
    print(f"\nAll reports saved in: {output_dir}")

if __name__ == "__main__":
    main()