# from .processors.builder import build_processors
from .xgpt3_dataset import MultiModalDataset
from mplug_owl_video.processing_mplug_owl import MplugOwlImageProcessor, MplugOwlProcessor


def train_valid_test_datasets_provider(data_path, config, tokenizer, seq_length=1024, loss_objective='sequential'):
    """Build train and valid datasets."""
    print('> building train and validation datasets for mPLUG-Owl ...')
    train_ds, valid_ds = build_train_valid_test_datasets(
        input_file=data_path,
        tokenizer=tokenizer,
        max_length=seq_length,
        config=config, loss_objective=loss_objective)
    print("> finished creating mPLUG-Owl datasets ...")
    return train_ds, valid_ds

def build_train_valid_test_datasets(input_file, tokenizer, max_length=80, config=None, loss_objective='sequential'):
    # train_processors = build_processors(config['train_processors'])
    # valid_processors = build_processors(config['valid_processors'])
    image_processor = MplugOwlImageProcessor.from_pretrained(config['pretrained_ckpt'])
    processor = MplugOwlProcessor(image_processor, tokenizer)
    assert len(input_file) == 2  # If you have more than two files, modify the code here or merge them into train and dev splits
    train_ds = MultiModalDataset(input_file[0], tokenizer, processor, max_length, loss_objective=loss_objective)
    valid_ds = MultiModalDataset(input_file[1], tokenizer, processor, max_length, loss_objective=loss_objective)
    return (train_ds, valid_ds)