# train-mbed / test_scores / test_fiqa_relevance.py
# Repository page header (uploader: amos1088; commit "no", cae25d0)
"""
Test FiQA score 1 documents to see how relevant they really are
"""
import json
from pathlib import Path
from collections import defaultdict
import random
def load_fiqa_data(base_path=None):
    """Load FiQA queries, corpus, and train qrels from a BEIR-style directory.

    Args:
        base_path: Directory holding ``queries.jsonl``, ``corpus.jsonl`` and
            ``qrels/train.tsv``. When omitted, ``../beir_data/fiqa`` is used,
            which is resolved against the current working directory.

    Returns:
        A ``(queries, corpus, train_qrels)`` tuple:

        * ``queries``: dict mapping query ``_id`` to its ``text``.
        * ``corpus``: dict mapping document ``_id`` to a dict with ``'title'``
          and ``'text'`` keys (each defaults to ``''`` when absent).
        * ``train_qrels``: ``defaultdict(list)`` mapping query id to the
          document ids listed for it, in file order.

    Raises:
        OSError: If one of the three files cannot be opened.
        json.JSONDecodeError: If a non-blank JSONL line is not valid JSON.
    """
    base_path = Path("../beir_data/fiqa") if base_path is None else Path(base_path)

    # Load queries. The encoding is pinned so non-ASCII text does not depend
    # on the platform's locale default.
    queries = {}
    with open(base_path / "queries.jsonl", 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank or trailing empty lines
            q = json.loads(line)
            queries[q['_id']] = q['text']

    # Load corpus
    corpus = {}
    with open(base_path / "corpus.jsonl", 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            doc = json.loads(line)
            corpus[doc['_id']] = {
                'title': doc.get('title', ''),
                'text': doc.get('text', '')
            }

    # Load train qrels. The score column is read but not used for filtering:
    # every well-formed three-column row is kept.
    train_qrels = defaultdict(list)
    with open(base_path / "qrels/train.tsv", 'r', encoding='utf-8') as f:
        next(f, None)  # Skip header; the default avoids StopIteration on an empty file
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) == 3:
                query_id, doc_id, _score = parts
                train_qrels[query_id].append(doc_id)

    return queries, corpus, train_qrels
def examine_fiqa_examples(data=None):
    """Print FiQA training examples so their relevance can be judged by eye.

    The report has three parts: summary counts, the first five train queries
    (each with up to two of its labeled documents), and a side-by-side of one
    labeled document and one randomly drawn unlabeled document for the first
    train query.

    Args:
        data: Optional ``(queries, corpus, train_qrels)`` tuple in the shape
            returned by ``load_fiqa_data()``. When omitted, the data is loaded
            by calling ``load_fiqa_data()``.

    Returns:
        None. All output goes to stdout.
    """
    queries, corpus, train_qrels = load_fiqa_data() if data is None else data

    print("="*80)
    print("FiQA Training Data Analysis")
    print("="*80)

    print(f"Total queries in train: {len(train_qrels)}")
    print(f"Total corpus size: {len(corpus)}")

    if not train_qrels:
        # min()/max() raise on an empty sequence and the average would divide
        # by zero, so stop here instead of crashing.
        print("No train qrels found; nothing to examine.")
        return

    # Get distribution of docs per query
    docs_per_query = [len(docs) for docs in train_qrels.values()]
    print(f"Docs per query: min={min(docs_per_query)}, max={max(docs_per_query)}, avg={sum(docs_per_query)/len(docs_per_query):.1f}")

    print("\n" + "="*80)
    print("EXAMINING SPECIFIC EXAMPLES")
    print("="*80)

    # Take first 5 queries with their documents
    sample_queries = list(train_qrels)[:5]

    for i, query_id in enumerate(sample_queries, 1):
        if query_id not in queries:
            continue

        print(f"\n[EXAMPLE {i}]")
        print(f"QUERY: {queries[query_id]}")
        print("-"*80)

        docs = train_qrels[query_id]
        print(f"Number of 'relevant' docs: {len(docs)}")

        # Show first 2 docs for this query
        for j, doc_id in enumerate(docs[:2], 1):
            if doc_id in corpus:
                doc = corpus[doc_id]
                print(f"\nDoc {j} (ID: {doc_id}):")
                print(f"Title: {doc['title'][:100]}" if doc['title'] else "[No title]")
                print(f"Text: {doc['text'][:300]}...")

        print("\n" + "="*80)

    # Now compare against a random document that is not in the qrels
    print("\nCOMPARING WITH RANDOM (UNLABELED) DOCUMENTS")
    print("="*80)

    # Pick a query and show both labeled and unlabeled docs
    test_query_id = sample_queries[0]
    if test_query_id in queries:
        print(f"\nQUERY: {queries[test_query_id]}")
        print("-"*80)

        labeled_ids = train_qrels[test_query_id]
        labeled_docs = set(labeled_ids)  # O(1) membership for the filter below

        # Take the labeled doc in qrels order rather than from the set, whose
        # iteration order can differ between runs, and skip ids that are not
        # present in the corpus.
        labeled_id = next((d for d in labeled_ids if d in corpus), None)

        # Only one unlabeled doc is displayed, so only one is sampled.
        unlabeled_docs = [d for d in corpus if d not in labeled_docs]
        random_unlabeled = random.sample(unlabeled_docs, min(1, len(unlabeled_docs)))

        print("\n[LABELED AS RELEVANT (Score 1)]:")
        if labeled_id is not None:
            print(f"Text: {corpus[labeled_id]['text'][:250]}...")

        print("\n[UNLABELED (Implicit Score 0)]:")
        for doc_id in random_unlabeled:
            print(f"Text: {corpus[doc_id]['text'][:250]}...")
def analyze_relevance_quality(data=None):
    """Print one multi-document query so its labeled docs can be compared.

    Selects the first train query that has more than two labeled documents
    and whose text is available, then prints snippets of up to four of those
    documents followed by a prompt asking whether they look equally relevant.

    Args:
        data: Optional ``(queries, corpus, train_qrels)`` tuple in the shape
            returned by ``load_fiqa_data()``. When omitted, the data is loaded
            by calling ``load_fiqa_data()``.

    Returns:
        None. All output goes to stdout.
    """
    queries, corpus, train_qrels = load_fiqa_data() if data is None else data

    print("\n" + "="*80)
    print("RELEVANCE QUALITY ANALYSIS")
    print("="*80)

    # Find the first query with more than two labeled docs. Requiring the
    # query text here means a missing entry in `queries` no longer hides
    # later candidates, and the generator stops at the first match instead of
    # building the full candidate list.
    candidate = next(
        (
            (q, docs)
            for q, docs in train_qrels.items()
            if len(docs) > 2 and q in queries
        ),
        None,
    )
    if candidate is None:
        print("\nNo query with more than 2 labeled docs found.")
        return

    query_id, doc_ids = candidate
    print(f"\nQuery with {len(doc_ids)} 'relevant' docs:")
    print(f"QUERY: {queries[query_id]}")
    print("-"*80)

    # Show up to four labeled docs to see if they're equally relevant
    for i, doc_id in enumerate(doc_ids[:4], 1):
        if doc_id in corpus:
            doc = corpus[doc_id]
            print(f"\nRelevant Doc {i}:")
            print(f"Text snippet: {doc['text'][:200]}...")

    print("\n[QUESTION]: Do all these docs seem EQUALLY relevant to the query?")
    print("If some are more relevant than others, then Score 1 is mixing different relevance levels!")
if __name__ == "__main__":
    # Script entry point: run the example walkthrough first, then the
    # relevance-quality report, in that order.
    for report in (examine_fiqa_examples, analyze_relevance_quality):
        report()