#!/bin/bash -l
#
# Generate a timestamped tokenization config and run the tokenization script
# on the sample dataset inside the project's Poetry environment.
#
# Steps:
#   1. Write configs/tokenization_config_<timestamp>.json.
#   2. Run teddy/data_processing/tokenization/tokenization.py against
#      data/processed/sample_data.h5ad with that config.
#
# All paths are relative to the current working directory.
# NOTE(review): presumably meant to be launched from the repository root.
#
# Requires: poetry on PATH.

# Abort on the first failing command, on unset variables, and on failures
# anywhere in a pipeline, so a failed config write never reaches the Python step.
set -euo pipefail

die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

command -v poetry >/dev/null || die "poetry not found on PATH"

# Timestamp keeps each run's config in its own file instead of overwriting.
TS=$(date '+%Y%m%d%H%M%S')
readonly TS
readonly CONFIG_FILE="configs/tokenization_config_${TS}.json"

# The redirect below fails if the parent directory is missing.
mkdir -p -- "$(dirname -- "$CONFIG_FILE")"

# Quoted delimiter: the JSON is written literally, with no shell expansion.
cat <<'EOF' > "$CONFIG_FILE"
{
  "tokenizer_name_or_path": "teddy/models/teddy_g/400M",
  "gene_id_column": "index",
  "bio_annotations": true,
  "disease_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_disease_mapping.json",
  "tissue_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_tissue_mapping.json",
  "cell_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_cell_mapping.json",
  "sex_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_sex_mapping.json",
  "max_shard_samples": 500,
  "max_seq_len": 2048,
  "pad_length": 2048,
  "add_cls": false,
  "bins": 0,
  "continuous_rank": true,
  "add_disease_annotation": false,
  "include_zero_genes": false,
  "load_dir": "data/processed",
  "save_dir": "data/tokenized"
}
EOF

# `poetry run` executes the command inside the project's virtualenv.
# `poetry shell` is unsuitable here: it spawns an interactive subshell, so the
# remaining script lines would not run inside the environment.
poetry run python teddy/data_processing/tokenization/tokenization.py \
  --data_path data/processed/sample_data.h5ad \
  --metadata_path data/processed/sample_data_metadata.json \
  --config_path "$CONFIG_FILE"