# Source: Geneformer/geneformer/mtl/collators.py
# Last commit: "dictionaries from parent dir (#405)" by ctheodoris (85f295e, verified)
# imports
import torch
import pickle
from ..collator_for_classification import DataCollatorForGeneClassification
from .. import TOKEN_DICTIONARY_FILE
"""Geneformer collator for multi-task cell classification."""
class DataCollatorForMultitaskCellClassification(DataCollatorForGeneClassification):
    """Collator for multi-task cell classification.

    Pads a list of feature dicts through ``self.tokenizer.pad`` and attaches a
    ``"labels"`` entry that maps each task name to the labels collected for
    that task across the batch.
    """

    class_type = "cell"

    @staticmethod
    def load_token_dictionary():
        """Return the object unpickled from ``TOKEN_DICTIONARY_FILE``."""
        # NOTE(security): pickle can execute arbitrary code while loading, so
        # TOKEN_DICTIONARY_FILE must only ever point at a trusted file.
        with open(TOKEN_DICTIONARY_FILE, "rb") as f:
            return pickle.load(f)

    def __init__(self, *args, **kwargs) -> None:
        """Load the token dictionary and hand it to the parent collator.

        All positional and keyword arguments are forwarded unchanged to the
        parent ``__init__``; ``token_dictionary`` is always supplied here.
        """
        token_dictionary = self.load_token_dictionary()
        super().__init__(*args, token_dictionary=token_dictionary, **kwargs)

    @staticmethod
    def _as_int64_tensor(values):
        """Return ``values`` as a new ``torch.int64`` tensor."""
        if torch.is_tensor(values):
            # torch.tensor(existing_tensor) emits a copy-construct UserWarning;
            # clone().detach() is the copy PyTorch recommends instead.
            return values.clone().detach().to(torch.int64)
        return torch.tensor(values, dtype=torch.int64)

    def _prepare_batch(self, features):
        """Pad ``features`` and gather the per-task labels.

        Args:
            features: Non-empty list of feature dicts. Each has an
                ``"input_ids"`` entry and, optionally, a ``"label"`` dict
                mapping task name -> label. The task names are taken from the
                first feature.

        Returns:
            The padded batch returned by ``self.tokenizer.pad`` with an added
            ``"labels"`` dict keyed by task name.
        """
        batch = self.tokenizer.pad(
            features,
            class_type=self.class_type,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        first = features[0]
        if "label" in first:
            # One list per task, filled in feature order.
            labels = {task: [] for task in first["label"]}
            for feature in features:
                for task, label in feature["label"].items():
                    labels[task].append(label)

            for task, values in labels.items():
                if isinstance(values[0], (list, torch.Tensor)):
                    labels[task] = torch.tensor(values, dtype=torch.long)
                # Scalar labels stay a plain list here and are converted in
                # __call__. Dict-valued labels are also left untouched: the
                # nested case is not resolved, so __call__ will fail on them.

            batch["labels"] = labels
        else:
            # Without labels there are no task names to read. The previous
            # code called ``.keys()`` on ``input_ids`` and raised
            # AttributeError for ordinary (non-mapping) token sequences.
            # Mapping-style ``input_ids`` keep the old result; anything else
            # yields an empty labels dict.
            # NOTE(review): confirm downstream code accepts empty ``labels``.
            input_ids = first["input_ids"]
            task_names = input_ids.keys() if hasattr(input_ids, "keys") else ()
            batch["labels"] = {
                task: torch.tensor([], dtype=torch.long) for task in task_names
            }

        return batch

    def __call__(self, features):
        """Collate ``features`` into a batch whose values are all tensors.

        Tensor entries are copied and detached, dict entries (the per-task
        labels) are converted value by value to ``torch.int64`` tensors, and
        any other entry is converted to a ``torch.int64`` tensor directly.
        """
        batch = self._prepare_batch(features)

        # Only existing keys are reassigned, so iterating items() is safe.
        for key, value in batch.items():
            if torch.is_tensor(value):
                batch[key] = value.clone().detach()
            elif isinstance(value, dict):
                batch[key] = {
                    task: self._as_int64_tensor(task_labels)
                    for task, task_labels in value.items()
                }
            else:
                batch[key] = torch.tensor(value, dtype=torch.int64)

        return batch