import os from copy import deepcopy from enum import Enum from functools import partial from pathlib import Path from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union import datasets import psutil import torch import transformers as tr from datasets import load_dataset from torch.utils.data import Dataset from tqdm import tqdm from relik.common.log import get_console_logger, get_logger from relik.retriever.common.model_inputs import ModelInputs from relik.retriever.data.base.datasets import BaseDataset, IterableBaseDataset from relik.retriever.data.utils import HardNegativesManager console_logger = get_console_logger() logger = get_logger(__name__) class SubsampleStrategyEnum(Enum): NONE = "none" RANDOM = "random" IN_ORDER = "in_order" class GoldenRetrieverDataset: def __init__( self, name: str, path: Union[str, os.PathLike, List[str], List[os.PathLike]] = None, data: Any = None, tokenizer: Optional[Union[str, tr.PreTrainedTokenizer]] = None, # passages: Union[str, os.PathLike, List[str]] = None, passage_batch_size: int = 32, question_batch_size: int = 32, max_positives: int = -1, max_negatives: int = 0, max_hard_negatives: int = 0, max_question_length: int = 256, max_passage_length: int = 64, shuffle: bool = False, subsample_strategy: Optional[str] = SubsampleStrategyEnum.NONE, subsample_portion: float = 0.1, num_proc: Optional[int] = None, load_from_cache_file: bool = True, keep_in_memory: bool = False, prefetch: bool = True, load_fn_kwargs: Optional[Dict[str, Any]] = None, batch_fn_kwargs: Optional[Dict[str, Any]] = None, collate_fn_kwargs: Optional[Dict[str, Any]] = None, ): if path is None and data is None: raise ValueError("Either `path` or `data` must be provided") if tokenizer is None: raise ValueError("A tokenizer must be provided") # dataset parameters self.name = name self.path = Path(path) or path if path is not None and not isinstance(self.path, Sequence): self.path = [self.path] # self.project_folder = Path(__file__).parent.parent.parent self.data = data # hyper-parameters self.passage_batch_size = passage_batch_size self.question_batch_size = question_batch_size self.max_positives = max_positives self.max_negatives = max_negatives self.max_hard_negatives = max_hard_negatives self.max_question_length = max_question_length self.max_passage_length = max_passage_length self.shuffle = shuffle self.num_proc = num_proc self.load_from_cache_file = load_from_cache_file self.keep_in_memory = keep_in_memory self.prefetch = prefetch self.tokenizer = tokenizer if isinstance(self.tokenizer, str): self.tokenizer = tr.AutoTokenizer.from_pretrained(self.tokenizer) self.padding_ops = { "input_ids": partial( self.pad_sequence, value=self.tokenizer.pad_token_id, ), "attention_mask": partial(self.pad_sequence, value=0), "token_type_ids": partial( self.pad_sequence, value=self.tokenizer.pad_token_type_id, ), } # check if subsample strategy is valid if subsample_strategy is not None: # subsample_strategy can be a string or a SubsampleStrategy if isinstance(subsample_strategy, str): try: subsample_strategy = SubsampleStrategyEnum(subsample_strategy) except ValueError: raise ValueError( f"Subsample strategy {subsample_strategy} is not valid. " f"Valid strategies are: {SubsampleStrategyEnum.__members__}" ) if not isinstance(subsample_strategy, SubsampleStrategyEnum): raise ValueError( f"Subsample strategy {subsample_strategy} is not valid. " f"Valid strategies are: {SubsampleStrategyEnum.__members__}" ) self.subsample_strategy = subsample_strategy self.subsample_portion = subsample_portion # load the dataset if data is None: self.data: Dataset = self.load( self.path, tokenizer=self.tokenizer, load_from_cache_file=load_from_cache_file, load_fn_kwargs=load_fn_kwargs, num_proc=num_proc, shuffle=shuffle, keep_in_memory=keep_in_memory, max_positives=max_positives, max_negatives=max_negatives, max_hard_negatives=max_hard_negatives, max_question_length=max_question_length, max_passage_length=max_passage_length, ) else: self.data: Dataset = data self.hn_manager: Optional[HardNegativesManager] = None # keep track of how many times the dataset has been iterated over self.number_of_complete_iterations = 0 def __repr__(self) -> str: return f"GoldenRetrieverDataset({self.name=}, {self.path=})" def __len__(self) -> int: raise NotImplementedError def __getitem__( self, index ) -> Union[Dict[str, torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]: raise NotImplementedError def to_torch_dataset(self, *args, **kwargs) -> torch.utils.data.Dataset: raise NotImplementedError def load( self, paths: Union[str, os.PathLike, List[str], List[os.PathLike]], tokenizer: tr.PreTrainedTokenizer = None, load_fn_kwargs: Dict = None, load_from_cache_file: bool = True, num_proc: Optional[int] = None, shuffle: bool = False, keep_in_memory: bool = True, max_positives: int = -1, max_negatives: int = -1, max_hard_negatives: int = -1, max_passages: int = -1, max_question_length: int = 256, max_passage_length: int = 64, *args, **kwargs, ) -> Any: # if isinstance(paths, Sequence): # paths = [self.project_folder / path for path in paths] # else: # paths = [self.project_folder / paths] # read the data and put it in a placeholder list for path in paths: if not path.exists(): raise ValueError(f"{path} does not exist") fn_kwargs = dict( tokenizer=tokenizer, max_positives=max_positives, max_negatives=max_negatives, max_hard_negatives=max_hard_negatives, max_passages=max_passages, max_question_length=max_question_length, max_passage_length=max_passage_length, ) if load_fn_kwargs is not None: fn_kwargs.update(load_fn_kwargs) if num_proc is None: num_proc = psutil.cpu_count(logical=False) # The data is a list of dictionaries, each dictionary is a sample # Each sample has the following keys: # - "question": the question # - "answers": a list of answers # - "positive_ctxs": a list of positive passages # - "negative_ctxs": a list of negative passages # - "hard_negative_ctxs": a list of hard negative passages # use the huggingface dataset library to load the data, by default it will load the # data in a dict with the key being "train". logger.info(f"Loading data for dataset {self.name}") data = load_dataset( "json", data_files=[str(p) for p in paths], # datasets needs str paths and not Path split="train", streaming=False, # TODO maybe we can make streaming work keep_in_memory=keep_in_memory, ) # add id if not present if isinstance(data, datasets.Dataset): data = data.add_column("sample_idx", range(len(data))) else: data = data.map( lambda x, idx: x.update({"sample_idx": idx}), with_indices=True ) map_kwargs = dict( function=self.load_fn, fn_kwargs=fn_kwargs, ) if isinstance(data, datasets.Dataset): map_kwargs.update( dict( load_from_cache_file=load_from_cache_file, keep_in_memory=keep_in_memory, num_proc=num_proc, desc="Loading data", ) ) # preprocess the data data = data.map(**map_kwargs) # shuffle the data if shuffle: data.shuffle(seed=42) return data @staticmethod def create_batches( data: Dataset, batch_fn: Callable, batch_fn_kwargs: Optional[Dict[str, Any]] = None, prefetch: bool = True, *args, **kwargs, ) -> Union[Iterable, List]: if not prefetch: # if we are streaming, we don't need to create batches right now # we will create them on the fly when we need them batched_data = ( batch for batch in batch_fn( data, **(batch_fn_kwargs if batch_fn_kwargs is not None else {}) ) ) else: batched_data = [ batch for batch in tqdm( batch_fn( data, **(batch_fn_kwargs if batch_fn_kwargs is not None else {}) ), desc="Creating batches", ) ] return batched_data @staticmethod def collate_batches( batched_data: Union[Iterable, List], collate_fn: Callable, collate_fn_kwargs: Optional[Dict[str, Any]] = None, prefetch: bool = True, *args, **kwargs, ) -> Union[Iterable, List]: if not prefetch: collated_data = ( collate_fn(batch, **(collate_fn_kwargs if collate_fn_kwargs else {})) for batch in batched_data ) else: collated_data = [ collate_fn(batch, **(collate_fn_kwargs if collate_fn_kwargs else {})) for batch in tqdm(batched_data, desc="Collating batches") ] return collated_data @staticmethod def load_fn(sample: Dict, *args, **kwargs) -> Dict: raise NotImplementedError @staticmethod def batch_fn(data: Dataset, *args, **kwargs) -> Any: raise NotImplementedError @staticmethod def collate_fn(batch: Any, *args, **kwargs) -> Any: raise NotImplementedError @staticmethod def pad_sequence( sequence: Union[List, torch.Tensor], length: int, value: Any = None, pad_to_left: bool = False, ) -> Union[List, torch.Tensor]: """ Pad the input to the specified length with the given value. Args: sequence (:obj:`List`, :obj:`torch.Tensor`): Element to pad, it can be either a :obj:`List` or a :obj:`torch.Tensor`. length (:obj:`int`, :obj:`str`, optional, defaults to :obj:`subtoken`): Length after pad. value (:obj:`Any`, optional): Value to use as padding. pad_to_left (:obj:`bool`, optional, defaults to :obj:`False`): If :obj:`True`, pads to the left, right otherwise. Returns: :obj:`List`, :obj:`torch.Tensor`: The padded sequence. """ padding = [value] * abs(length - len(sequence)) if isinstance(sequence, torch.Tensor): if len(sequence.shape) > 1: raise ValueError( f"Sequence tensor must be 1D. Current shape is `{len(sequence.shape)}`" ) padding = torch.as_tensor(padding) if pad_to_left: if isinstance(sequence, torch.Tensor): return torch.cat((padding, sequence), -1) return padding + sequence if isinstance(sequence, torch.Tensor): return torch.cat((sequence, padding), -1) return sequence + padding def convert_to_batch( self, samples: Any, *args, **kwargs ) -> Dict[str, torch.Tensor]: """ Convert the list of samples to a batch. Args: samples (:obj:`List`): List of samples to convert to a batch. Returns: :obj:`Dict[str, torch.Tensor]`: The batch. """ # invert questions from list of dict to dict of list samples = {k: [d[k] for d in samples] for k in samples[0]} # get max length of questions max_len = max(len(x) for x in samples["input_ids"]) # pad the questions for key in samples: if key in self.padding_ops: samples[key] = torch.as_tensor( [self.padding_ops[key](b, max_len) for b in samples[key]] ) return samples def shuffle_data(self, seed: int = 42): self.data = self.data.shuffle(seed=seed) class InBatchNegativesDataset(GoldenRetrieverDataset): def __len__(self) -> int: if isinstance(self.data, datasets.Dataset): return len(self.data) def __getitem__( self, index ) -> Union[Dict[str, torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]: return self.data[index] def to_torch_dataset(self) -> torch.utils.data.Dataset: shuffle_this_time = self.shuffle if ( self.subsample_strategy and self.subsample_strategy != SubsampleStrategyEnum.NONE ): number_of_samples = int(len(self.data) * self.subsample_portion) if self.subsample_strategy == SubsampleStrategyEnum.RANDOM: logger.info( f"Random subsampling {number_of_samples} samples from {len(self.data)}" ) data = ( deepcopy(self.data) .shuffle(seed=42 + self.number_of_complete_iterations) .select(range(0, number_of_samples)) ) elif self.subsample_strategy == SubsampleStrategyEnum.IN_ORDER: # number_of_samples = int(len(self.data) * self.subsample_portion) already_selected = ( number_of_samples * self.number_of_complete_iterations ) logger.info( f"Subsampling {number_of_samples} samples out of {len(self.data)}" ) to_select = min(already_selected + number_of_samples, len(self.data)) logger.info( f"Portion of data selected: {already_selected} " f"to {to_select}" ) data = deepcopy(self.data).select(range(already_selected, to_select)) # don't shuffle the data if we are subsampling, and we have still not completed # one full iteration over the dataset if self.number_of_complete_iterations > 0: shuffle_this_time = False # reset the number of complete iterations if to_select >= len(self.data): # reset the number of complete iterations, # we have completed one full iteration over the dataset # the value is -1 because we want to start from 0 at the next iteration self.number_of_complete_iterations = -1 else: raise ValueError( f"Subsample strategy `{self.subsample_strategy}` is not valid. " f"Valid strategies are: {SubsampleStrategyEnum.__members__}" ) else: data = data = self.data # do we need to shuffle the data? if self.shuffle and shuffle_this_time: logger.info("Shuffling the data") data = data.shuffle(seed=42 + self.number_of_complete_iterations) batch_fn_kwargs = { "passage_batch_size": self.passage_batch_size, "question_batch_size": self.question_batch_size, "hard_negatives_manager": self.hn_manager, } batched_data = self.create_batches( data, batch_fn=self.batch_fn, batch_fn_kwargs=batch_fn_kwargs, prefetch=self.prefetch, ) batched_data = self.collate_batches( batched_data, self.collate_fn, prefetch=self.prefetch ) # increment the number of complete iterations self.number_of_complete_iterations += 1 if self.prefetch: return BaseDataset(name=self.name, data=batched_data) else: return IterableBaseDataset(name=self.name, data=batched_data) @staticmethod def load_fn( sample: Dict, tokenizer: tr.PreTrainedTokenizer, max_positives: int, max_negatives: int, max_hard_negatives: int, max_passages: int = -1, max_question_length: int = 256, max_passage_length: int = 128, *args, **kwargs, ) -> Dict: # remove duplicates and limit the number of passages positives = list(set([p["text"].strip() for p in sample["positive_ctxs"]])) if max_positives != -1: positives = positives[:max_positives] negatives = list(set([n["text"].strip() for n in sample["negative_ctxs"]])) if max_negatives != -1: negatives = negatives[:max_negatives] hard_negatives = list( set([h["text"].strip() for h in sample["hard_negative_ctxs"]]) ) if max_hard_negatives != -1: hard_negatives = hard_negatives[:max_hard_negatives] question = tokenizer( sample["question"], max_length=max_question_length, truncation=True ) passage = positives + negatives + hard_negatives if max_passages != -1: passage = passage[:max_passages] passage = tokenizer(passage, max_length=max_passage_length, truncation=True) # invert the passage data structure from a dict of lists to a list of dicts passage = [dict(zip(passage, t)) for t in zip(*passage.values())] output = dict( question=question, passage=passage, positives=positives, positive_pssgs=passage[: len(positives)], ) return output @staticmethod def batch_fn( data: Dataset, passage_batch_size: int, question_batch_size: int, hard_negatives_manager: Optional[HardNegativesManager] = None, *args, **kwargs, ) -> Dict[str, List[Dict[str, Any]]]: def split_batch( batch: Union[Dict[str, Any], ModelInputs], question_batch_size: int ) -> List[ModelInputs]: """ Split a batch into multiple batches of size `question_batch_size` while keeping the same number of passages. """ def split_fn(x): return [ x[i : i + question_batch_size] for i in range(0, len(x), question_batch_size) ] # split the sample_idx sample_idx = split_fn(batch["sample_idx"]) # split the questions questions = split_fn(batch["questions"]) # split the positives positives = split_fn(batch["positives"]) # split the positives_pssgs positives_pssgs = split_fn(batch["positives_pssgs"]) # collect the new batches batches = [] for i in range(len(questions)): batches.append( ModelInputs( dict( sample_idx=sample_idx[i], questions=questions[i], passages=batch["passages"], positives=positives[i], positives_pssgs=positives_pssgs[i], ) ) ) return batches batch = [] passages_in_batch = {} for sample in data: if len(passages_in_batch) >= passage_batch_size: # create the batch dict batch_dict = ModelInputs( dict( sample_idx=[s["sample_idx"] for s in batch], questions=[s["question"] for s in batch], passages=list(passages_in_batch.values()), positives_pssgs=[s["positive_pssgs"] for s in batch], positives=[s["positives"] for s in batch], ) ) # split the batch if needed if len(batch) > question_batch_size: for splited_batch in split_batch(batch_dict, question_batch_size): yield splited_batch else: yield batch_dict # reset batch batch = [] passages_in_batch = {} batch.append(sample) # yes it's a bit ugly but it works :) # count the number of passages in the batch and stop if we reach the limit # we use a set to avoid counting the same passage twice # we use a tuple because set doesn't support lists # we use input_ids as discriminator passages_in_batch.update( {tuple(passage["input_ids"]): passage for passage in sample["passage"]} ) # check for hard negatives and add with a probability of 0.1 if hard_negatives_manager is not None: if sample["sample_idx"] in hard_negatives_manager: passages_in_batch.update( { tuple(passage["input_ids"]): passage for passage in hard_negatives_manager.get( sample["sample_idx"] ) } ) # left over if len(batch) > 0: # create the batch dict batch_dict = ModelInputs( dict( sample_idx=[s["sample_idx"] for s in batch], questions=[s["question"] for s in batch], passages=list(passages_in_batch.values()), positives_pssgs=[s["positive_pssgs"] for s in batch], positives=[s["positives"] for s in batch], ) ) # split the batch if needed if len(batch) > question_batch_size: for splited_batch in split_batch(batch_dict, question_batch_size): yield splited_batch else: yield batch_dict def collate_fn(self, batch: Any, *args, **kwargs) -> Any: # convert questions and passages to a batch questions = self.convert_to_batch(batch.questions) passages = self.convert_to_batch(batch.passages) # build an index to map the position of the passage in the batch passage_index = {tuple(c["input_ids"]): i for i, c in enumerate(batch.passages)} # now we can create the labels labels = torch.zeros( questions["input_ids"].shape[0], passages["input_ids"].shape[0] ) # iterate over the questions and set the labels to 1 if the passage is positive for sample_idx in range(len(questions["input_ids"])): for pssg in batch["positives_pssgs"][sample_idx]: # get the index of the positive passage index = passage_index[tuple(pssg["input_ids"])] # set the label to 1 labels[sample_idx, index] = 1 model_inputs = ModelInputs( { "questions": questions, "passages": passages, "labels": labels, "positives": batch["positives"], "sample_idx": batch["sample_idx"], } ) return model_inputs class AidaInBatchNegativesDataset(InBatchNegativesDataset): def __init__(self, use_topics: bool = False, *args, **kwargs): if "load_fn_kwargs" not in kwargs: kwargs["load_fn_kwargs"] = {} kwargs["load_fn_kwargs"]["use_topics"] = use_topics super().__init__(*args, **kwargs) @staticmethod def load_fn( sample: Dict, tokenizer: tr.PreTrainedTokenizer, max_positives: int, max_negatives: int, max_hard_negatives: int, max_passages: int = -1, max_question_length: int = 256, max_passage_length: int = 128, use_topics: bool = False, *args, **kwargs, ) -> Dict: # remove duplicates and limit the number of passages positives = list(set([p["text"].strip() for p in sample["positive_ctxs"]])) if max_positives != -1: positives = positives[:max_positives] negatives = list(set([n["text"].strip() for n in sample["negative_ctxs"]])) if max_negatives != -1: negatives = negatives[:max_negatives] hard_negatives = list( set([h["text"].strip() for h in sample["hard_negative_ctxs"]]) ) if max_hard_negatives != -1: hard_negatives = hard_negatives[:max_hard_negatives] question = sample["question"] if "doc_topic" in sample and use_topics: question = tokenizer( question, sample["doc_topic"], max_length=max_question_length, truncation=True, ) else: question = tokenizer( question, max_length=max_question_length, truncation=True ) passage = positives + negatives + hard_negatives if max_passages != -1: passage = passage[:max_passages] passage = tokenizer(passage, max_length=max_passage_length, truncation=True) # invert the passage data structure from a dict of lists to a list of dicts passage = [dict(zip(passage, t)) for t in zip(*passage.values())] output = dict( question=question, passage=passage, positives=positives, positive_pssgs=passage[: len(positives)], ) return output