#!/bin/bash -l
# Activate the Poetry environment (adjust this if needed).
# Note: `poetry shell` spawns an interactive subshell and does not work in a
# non-interactive script, so source the virtualenv directly instead.
source "$(poetry env info --path)/bin/activate"
# Generate a timestamp string (e.g., 20230404123056)
TS=$(date '+%Y%m%d%H%M%S')
CONFIG_FILE="configs/tokenization_config_${TS}.json"
mkdir -p "$(dirname "$CONFIG_FILE")"  # ensure the configs/ directory exists
# Create the config file containing your tokenization arguments
cat <<EOF > "$CONFIG_FILE"
{
  "tokenizer_name_or_path": "teddy/models/teddy_g/400M",
  "gene_id_column": "index",
  "bio_annotations": true,
  "disease_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_disease_mapping.json",
  "tissue_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_tissue_mapping.json",
  "cell_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_cell_mapping.json",
  "sex_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_sex_mapping.json",
  "max_shard_samples": 500,
  "max_seq_len": 2048,
  "pad_length": 2048,
  "add_cls": false,
  "bins": 0,
  "continuous_rank": true,
  "add_disease_annotation": false,
  "include_zero_genes": false,
  "load_dir": "data/processed",
  "save_dir": "data/tokenized"
}
EOF
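# --- Optional pre-flight checks (a minimal sketch; safe to remove) ---
# Confirm the generated config parses as valid JSON. `python -m json.tool`
# is part of the Python standard library, so this adds no dependency.
python -m json.tool "$CONFIG_FILE" > /dev/null || {
    echo "Invalid JSON in $CONFIG_FILE" >&2
    exit 1
}
# Confirm the sample inputs are present before launching (these paths match
# the --data_path/--metadata_path arguments passed below; they are assumed
# to have been produced by an earlier preprocessing step).
for f in data/processed/sample_data.h5ad data/processed/sample_data_metadata.json; do
    [[ -f "$f" ]] || { echo "Missing input file: $f" >&2; exit 1; }
done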
# Execute the tokenization.py script with three arguments:
# --data_path, --metadata_path, and --config_path
python teddy/data_processing/tokenization/tokenization.py \
--data_path data/processed/sample_data.h5ad \
--metadata_path data/processed/sample_data_metadata.json \
--config_path "$CONFIG_FILE"
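# Report the outcome (a small sketch; "data/tokenized" is the save_dir set
# in the config above) and propagate a nonzero exit code on failure.
if [[ $? -eq 0 ]]; then
    echo "Tokenization complete; output written to data/tokenized"
else
    echo "Tokenization failed; see the log output above" >&2
    exit 1
fi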