""" | |
Test FiQA score 1 documents to see how relevant they really are | |
""" | |
import json | |
from pathlib import Path | |
from collections import defaultdict | |
import random | |
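
# Fixed seed so the randomly sampled "unlabeled" documents below are
# reproducible across runs (added for convenience; remove to resample each run).
random.seed(0)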


def load_fiqa_data():
    """Load FiQA queries, corpus, and qrels"""
    base_path = Path("../beir_data/fiqa")

    # Load queries
    queries = {}
    with open(base_path / "queries.jsonl", 'r') as f:
        for line in f:
            q = json.loads(line)
            queries[q['_id']] = q['text']

    # Load corpus
    corpus = {}
    with open(base_path / "corpus.jsonl", 'r') as f:
        for line in f:
            doc = json.loads(line)
            corpus[doc['_id']] = {
                'title': doc.get('title', ''),
                'text': doc.get('text', '')
            }

    # Load train qrels (all score 1)
    train_qrels = defaultdict(list)
    with open(base_path / "qrels/train.tsv", 'r') as f:
        next(f)  # Skip header
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) == 3:
                query_id, doc_id, score = parts
                train_qrels[query_id].append(doc_id)

    return queries, corpus, train_qrels
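

# Note: if the `beir` package is installed, the same data can be loaded with its
# bundled loader instead of the hand-rolled parser above. A minimal sketch,
# assuming the standard BEIR directory layout used by this script:
#
#   from beir.datasets.data_loader import GenericDataLoader
#   corpus, queries, qrels = GenericDataLoader("../beir_data/fiqa").load(split="train")
#
# The manual version is kept here to avoid the extra dependency.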


def examine_fiqa_examples():
    """Look at actual FiQA examples to judge relevance"""
    queries, corpus, train_qrels = load_fiqa_data()

    print("="*80)
    print("FiQA Training Data Analysis")
    print("="*80)
    print(f"Total queries in train: {len(train_qrels)}")
    print(f"Total corpus size: {len(corpus)}")

    # Get distribution of docs per query
    docs_per_query = [len(docs) for docs in train_qrels.values()]
    print(f"Docs per query: min={min(docs_per_query)}, "
          f"max={max(docs_per_query)}, "
          f"avg={sum(docs_per_query) / len(docs_per_query):.1f}")

    print("\n" + "="*80)
    print("EXAMINING SPECIFIC EXAMPLES")
    print("="*80)

    # Take first 5 queries with their documents
    sample_queries = list(train_qrels.keys())[:5]
    for i, query_id in enumerate(sample_queries, 1):
        if query_id not in queries:
            continue
        print(f"\n[EXAMPLE {i}]")
        print(f"QUERY: {queries[query_id]}")
        print("-"*80)
        docs = train_qrels[query_id]
        print(f"Number of 'relevant' docs: {len(docs)}")

        # Show first 2 docs for this query
        for j, doc_id in enumerate(docs[:2], 1):
            if doc_id in corpus:
                doc = corpus[doc_id]
                print(f"\nDoc {j} (ID: {doc_id}):")
                print(f"Title: {doc['title'][:100]}" if doc['title'] else "Title: [No title]")
                print(f"Text: {doc['text'][:300]}...")

        print("\n" + "="*80)

    # Now let's check some random negative examples (not in qrels)
    print("\nCOMPARING WITH RANDOM (UNLABELED) DOCUMENTS")
    print("="*80)

    # Pick a query and show both labeled and unlabeled docs
    test_query_id = sample_queries[0]
    if test_query_id in queries:
        print(f"\nQUERY: {queries[test_query_id]}")
        print("-"*80)

        # Get labeled docs
        labeled_docs = set(train_qrels[test_query_id])

        # Get some random unlabeled docs
        all_doc_ids = list(corpus.keys())
        unlabeled_docs = [d for d in all_doc_ids if d not in labeled_docs]
        random_unlabeled = random.sample(unlabeled_docs, min(2, len(unlabeled_docs)))

        print("\n[LABELED AS RELEVANT (Score 1)]:")
        for doc_id in list(labeled_docs)[:1]:
            if doc_id in corpus:
                doc = corpus[doc_id]
                print(f"Text: {doc['text'][:250]}...")

        print("\n[UNLABELED (Implicit Score 0)]:")
        for doc_id in random_unlabeled[:1]:
            if doc_id in corpus:
                doc = corpus[doc_id]
                print(f"Text: {doc['text'][:250]}...")


def analyze_relevance_quality():
    """Analyze if Score 1 docs are truly highly relevant or mixed"""
    queries, corpus, train_qrels = load_fiqa_data()

    print("\n" + "="*80)
    print("RELEVANCE QUALITY ANALYSIS")
    print("="*80)

    # Find queries with multiple relevant docs to see if they're all equally relevant
    multi_doc_queries = [(q, docs) for q, docs in train_qrels.items() if len(docs) > 2]
    if multi_doc_queries:
        query_id, doc_ids = multi_doc_queries[0]
        if query_id in queries:
            print(f"\nQuery with {len(doc_ids)} 'relevant' docs:")
            print(f"QUERY: {queries[query_id]}")
            print("-"*80)

            # Show up to 4 of the relevant docs to see if they're equally relevant
            for i, doc_id in enumerate(doc_ids[:4], 1):
                if doc_id in corpus:
                    doc = corpus[doc_id]
                    print(f"\nRelevant Doc {i}:")
                    print(f"Text snippet: {doc['text'][:200]}...")

    print("\n[QUESTION]: Do all these docs seem EQUALLY relevant to the query?")
    print("If some are more relevant than others, then Score 1 is mixing different relevance levels!")
if __name__ == "__main__": | |
examine_fiqa_examples() | |
analyze_relevance_quality() |
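
# Usage note: the paths above assume the script runs from a directory that has
# `beir_data/fiqa` one level up; adjust `base_path` in load_fiqa_data() otherwise.
#
#   python check_fiqa_relevance.py   # hypothetical filename for this script
#
# If the dataset is missing, it can be fetched with the `beir` package, e.g.:
#
#   from beir import util
#   util.download_and_unzip(
#       "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/fiqa.zip",
#       "../beir_data",
#   )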