# safe_rag/data_processing/data_loader.py
# Uploaded by goodmodeler -- commit 0a02cd7: "ADD: pipeline"
import logging
from datasets import load_dataset
logger = logging.getLogger(__name__)
class DataLoader:
    """Load the MS MARCO dataset (config ``v2.1``) through Hugging Face ``datasets``.

    The instance only carries the cache directory that is handed to
    ``load_dataset`` on every load.
    """

    def __init__(self, cache_dir: str = "./cache"):
        """Remember where downloaded datasets should be cached.

        Args:
            cache_dir: Directory forwarded to ``load_dataset`` as ``cache_dir``.
        """
        self.cache_dir = cache_dir

    def load_msmarco_passage(self, split: str = "train"):
        """Load MS MARCO Passage Ranking dataset from Hugging Face (v2.1).

        Args:
            split: Split name forwarded unchanged to ``load_dataset``.

        Returns:
            The object returned by ``load_dataset`` for the requested split.

        Raises:
            Exception: Whatever ``load_dataset`` raises; it is logged once
                here and re-raised unchanged.
        """
        logger.info(f"Downloading MS MARCO Passage Ranking {split} (v2.1) from Hugging Face")
        try:
            # Forward the configured cache directory; previously it was stored
            # on the instance but never reached load_dataset.
            return load_dataset(
                "ms_marco",
                "v2.1",
                split=split,
                cache_dir=self.cache_dir,
            )
        except Exception as e:
            logger.error(f"Failed to load MS MARCO Passage Ranking: {e}")
            raise

    def get_passage_dataset(self, split: str = "train"):
        """Load MS MARCO Passage Ranking dataset and log success.

        Thin wrapper around ``load_msmarco_passage``.

        Args:
            split: Split name forwarded to ``load_msmarco_passage``.

        Returns:
            The dataset returned by ``load_msmarco_passage``.

        Raises:
            Exception: Propagated unchanged from ``load_msmarco_passage``.
        """
        # No try/except here: load_msmarco_passage already logs the failure
        # before re-raising, so catching again would only duplicate the
        # identical error line.
        ds = self.load_msmarco_passage(split)
        logger.info("MS MARCO Passage Ranking loaded successfully")
        return ds