#!/bin/bash
#
# Stages LLM training datasets in a GCS bucket: runs download_dataset.sh from
# the maxtext/ directory for C4, copies RedPajama, The Pile and SlimPajama with
# gsutil, then uploads a dataset_index.json that lists all four.
#
# Required environment:
#   GCS_BUCKET  - destination bucket URL, e.g. gs://your-bucket-name
#   PROJECT_ID  - GCP project id (passed to download_dataset.sh)
#
# Run from the directory that contains maxtext/.

# -e           abort on the first failing command
# -u           treat a reference to an unset variable as an error
# -o pipefail  a pipeline fails if any of its stages fails
set -euo pipefail

# ${VAR:-} keeps these checks working under `set -u`, so a missing variable
# produces the friendly message below instead of an "unbound variable" abort.
# Diagnostics go to stderr so they are not mixed into captured stdout.
if [[ -z "${GCS_BUCKET:-}" ]]; then
  echo "Please set the GCS_BUCKET environment variable" >&2
  echo "Example: export GCS_BUCKET=gs://your-bucket-name" >&2
  exit 1
fi

if [[ -z "${PROJECT_ID:-}" ]]; then
  echo "Please set the PROJECT_ID environment variable" >&2
  echo "Example: export PROJECT_ID=your-gcp-project-id" >&2
  exit 1
fi
echo "Downloading and preparing datasets for LLM training..."

# Scratch directory for files generated locally before upload.
TEMP_DIR=$(mktemp -d)
# The script runs under `set -e`, so any failing step exits before the explicit
# cleanup at the end of the file; the EXIT trap removes the directory on every
# exit path instead of leaking it.
trap 'rm -rf -- "$TEMP_DIR"' EXIT
echo "Using temporary directory: $TEMP_DIR"

# The following steps run from inside maxtext/. Fail with an actionable message
# if the script was started from the wrong directory.
cd maxtext || {
  echo "Cannot enter ./maxtext; run this script from the directory that contains maxtext/" >&2
  exit 1
}
# C4 is fetched by the download_dataset.sh helper found in the current
# directory; it is given the project id and the destination bucket as its two
# positional arguments.
# NOTE(review): the dataset index written later in this script points at
# $GCS_BUCKET/c4/en/3.0.1/, so the helper presumably populates that prefix.
# Confirm against download_dataset.sh.
echo "Downloading C4 dataset..."
# Quoted so each value reaches the helper as exactly one argument.
bash download_dataset.sh "$PROJECT_ID" "$GCS_BUCKET"
#######################################
# Announce and recursively copy one dataset with gsutil.
# Arguments:
#   $1 - human-readable dataset name used in the progress message
#   $2 - source URL (may contain a gsutil wildcard)
#   $3 - destination URL
# Outputs:
#   Progress message on stdout, followed by whatever gsutil prints.
# Returns:
#   Exit status of gsutil.
#######################################
mirror_dataset() {
  local label=$1
  local src=$2
  local dest=$3

  echo "Downloading ${label} dataset..."
  # Both URLs are quoted so the shell neither word-splits them nor tries to
  # expand the wildcard itself; gsutil receives them verbatim.
  gsutil -m cp -r "$src" "$dest"
}

# No per-dataset directories are created under $TEMP_DIR: source and
# destination are both handed straight to gsutil, and nothing in this script
# ever wrote into those directories.
# NOTE(review): the source bucket names below are taken as-is; confirm they
# exist and are readable with the credentials this script runs under.
mirror_dataset "RedPajama" 'gs://redpajama-data-1/redpajama-v1/*' "$GCS_BUCKET/redpajama/"
mirror_dataset "The Pile" 'gs://the-pile-v1/*' "$GCS_BUCKET/pile/"
mirror_dataset "SlimPajama" 'gs://cerebras-slimpajama/*' "$GCS_BUCKET/slimpajama/"
#######################################
# Write the dataset index as JSON.
# The index lists the four datasets (c4, redpajama, pile, slimpajama), each
# with its location under the bucket and a weight; the weights sum to 1.0.
# NOTE(review): how consumers interpret "weight" is not visible in this
# script. Presumably a sampling/mixing ratio; confirm with the reader of
# dataset_index.json.
# Arguments:
#   $1 - bucket URL used as the prefix of every "path" entry
#   $2 - local file to write (overwritten if it exists)
# Returns:
#   Non-zero if the file cannot be written.
#######################################
write_dataset_index() {
  local bucket=$1
  local out_file=$2

  # Unquoted delimiter on purpose: ${bucket} must be expanded in the body.
  cat > "$out_file" << EOL
{
  "datasets": [
    {
      "name": "c4",
      "path": "${bucket}/c4/en/3.0.1/",
      "weight": 0.3
    },
    {
      "name": "redpajama",
      "path": "${bucket}/redpajama/",
      "weight": 0.2
    },
    {
      "name": "pile",
      "path": "${bucket}/pile/",
      "weight": 0.2
    },
    {
      "name": "slimpajama",
      "path": "${bucket}/slimpajama/",
      "weight": 0.3
    }
  ]
}
EOL
}

echo "Creating dataset index file..."
write_dataset_index "$GCS_BUCKET" "$TEMP_DIR/dataset_index.json"

# Publish the index next to the datasets; paths are quoted so they are passed
# to gsutil as single arguments.
gsutil cp "$TEMP_DIR/dataset_index.json" "$GCS_BUCKET/dataset_index.json"
# Final summary for the operator.
echo "Dataset preparation complete!"
echo "Datasets are available at: $GCS_BUCKET"
echo "Dataset index file: $GCS_BUCKET/dataset_index.json"

# Remove the scratch directory. The path is quoted and guarded with :? so an
# empty or whitespace-containing value can never turn this into a different
# `rm -rf` than intended; `--` stops option parsing.
rm -rf -- "${TEMP_DIR:?}"

# Undo the earlier `cd maxtext` (only observable if the script is sourced).
cd ..