# File size: 1,304 Bytes
# f544a5d
from functools import partial
# A tiny in-memory stand-in for a real dataset object: a split name mapping
# to a list of example records, mirroring data-processing library layouts.
dataset = dict(
    train=[
        dict(text="Hello world", id=1),
        dict(text="Partial functions are cool", id=2),
    ]
)
def prepare_train_dataset(example):
    """Preprocess a single example: upper-case its text field.

    Returns a new dict containing only the transformed ``"text"`` key.
    """
    raw_text = example["text"]
    return {"text": raw_text.upper()}
# After mapping, these columns are dropped from each transformed example.
columns_to_remove = ["id"]
# Mock map function for the dataset, mimicking a data-library `.map()` API.
def dataset_map(batch, function, remove_columns, batched, batch_size):
    """Apply *function* to every example and drop *remove_columns* keys.

    The original implementation accepted ``batched`` and ``batch_size`` but
    silently ignored them; here ``batch_size`` actually controls the chunking
    of the iteration (the flattened result is identical either way, so
    callers see the same output).

    Args:
        batch: list of example dicts to process.
        function: callable mapping one example dict to a transformed dict.
        remove_columns: iterable of keys to remove from each result
            (missing keys are ignored).
        batched: when falsy, the whole input is treated as one chunk.
        batch_size: chunk length used when ``batched`` is truthy; ``None``
            or non-positive values fall back to a single chunk.

    Returns:
        A flat list of transformed dicts, one per input example.
    """
    if batched and batch_size is not None and batch_size > 0:
        step = batch_size
    else:
        # Process everything as one chunk; `or 1` keeps range() valid
        # for an empty input.
        step = len(batch) or 1

    transformed_data = []
    for start in range(0, len(batch), step):
        for example in batch[start:start + step]:
            item = function(example)
            # Strip unwanted columns; pop with default tolerates absent keys.
            for column in remove_columns:
                item.pop(column, None)
            transformed_data.append(item)
    return transformed_data
# Pre-bind every argument of the map call with functools.partial so the
# configured function can later be invoked with no arguments at all.
_map_config = {
    "batch": dataset["train"],
    "function": prepare_train_dataset,
    "remove_columns": columns_to_remove,
    "batched": True,
    "batch_size": 2,  # small enough that all data fits in one batch here
}
map_fn_train = partial(dataset_map, **_map_config)
# Run the fully-configured mapping and display its output.
transformed_dataset = map_fn_train()
print(transformed_dataset)
|