deeptaxa / config.json
Ahmed Moustafa
Initial commit
f542637
{
"version": "deeptaxa.v1.2",
"model_type": "hybridcnnbert",
"tokenizer_name": "zhihan1996/DNABERT-2-117M",
"variants": {
"full-length": {
"checkpoint_file": "deeptaxa-full-length-v1.pt",
"architecture": {
"max_length": 512,
"embed_dim": 896,
"num_filters": 256,
"kernel_sizes": [
3,
5,
7
],
"num_conv_layers": 1,
"hidden_size": 896,
"num_hidden_layers": 4,
"num_attention_heads": 7,
"intermediate_size": 3584,
"hidden_dropout_prob": 0.2
},
"training_hyperparameters": {
"learning_rate": 0.0005,
"batch_size": 64,
"epochs": 10,
"loss_function": "cross_entropy",
"level_weights": [
1.0,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0
],
"optimizer": "AdamW",
"optimizer_params": {
"lr": 0.0005,
"betas": [
0.9,
0.999
],
"weight_decay": 0.01
},
"scheduler_warmup_ratio": 0.1,
"seed": 42
},
"total_parameters": 76365205,
"training_date": "2026-04-25",
"training_hardware": "NVIDIA GeForce RTX 4090",
"training_dataset": {
"name": "Greengenes2 2024.09 (full-length 16S)",
"train_sequences": 277336,
"test_sequences": 69335,
"train_fasta": "gg_2024_09_training.fna.gz",
"train_taxonomy": "gg_2024_09_training.tsv.gz"
},
"taxonomic_levels": {
"domain": 2,
"phylum": 129,
"class": 349,
"order": 997,
"family": 2250,
"genus": 7287,
"species": 16909
},
"test_metrics": {
"_note": "Single-seed test-set metrics for the published checkpoint (seed 42).",
"domain": {
"accuracy": 0.9999,
"f1_score": 0.9999,
"ece": 0.0001
},
"phylum": {
"accuracy": 0.9968,
"f1_score": 0.9967,
"ece": 0.0024
},
"class": {
"accuracy": 0.9963,
"f1_score": 0.9959,
"ece": 0.0024
},
"order": {
"accuracy": 0.9909,
"f1_score": 0.9899,
"ece": 0.0055
},
"family": {
"accuracy": 0.9861,
"f1_score": 0.9841,
"ece": 0.0075
},
"genus": {
"accuracy": 0.9693,
"f1_score": 0.9651,
"ece": 0.0143
},
"species": {
"accuracy": 0.9288,
"f1_score": 0.9203,
"ece": 0.0251
}
},
"test_metrics_multiseed": {
"_note": "3-seed mean \u00b1 std across seeds 42, 123, 456. Reported alongside the single-seed published metrics for reproducibility.",
"seeds": [
42,
123,
456
],
"domain": {
"accuracy_mean": 0.9998,
"accuracy_std": 0.0,
"f1_mean": 0.9998,
"f1_std": 0.0,
"ece_mean": 0.0001,
"ece_std": 0.0
},
"phylum": {
"accuracy_mean": 0.9969,
"accuracy_std": 0.0002,
"f1_mean": 0.9968,
"f1_std": 0.0002,
"ece_mean": 0.0023,
"ece_std": 0.0002
},
"class": {
"accuracy_mean": 0.9963,
"accuracy_std": 0.0,
"f1_mean": 0.9959,
"f1_std": 0.0,
"ece_mean": 0.0024,
"ece_std": 0.0
},
"order": {
"accuracy_mean": 0.9907,
"accuracy_std": 0.0003,
"f1_mean": 0.9897,
"f1_std": 0.0003,
"ece_mean": 0.0056,
"ece_std": 0.0002
},
"family": {
"accuracy_mean": 0.9861,
"accuracy_std": 0.0001,
"f1_mean": 0.9841,
"f1_std": 0.0002,
"ece_mean": 0.0075,
"ece_std": 0.0001
},
"genus": {
"accuracy_mean": 0.969,
"accuracy_std": 0.0003,
"f1_mean": 0.9648,
"f1_std": 0.0003,
"ece_mean": 0.0144,
"ece_std": 0.0001
},
"species": {
"accuracy_mean": 0.9296,
"accuracy_std": 0.0007,
"f1_mean": 0.9212,
"f1_std": 0.0008,
"ece_mean": 0.0242,
"ece_std": 0.0008
}
},
"derived_from": "Optuna hyperparameter search on Greengenes2 2024.09 full-length sequences. The full-length v1 checkpoint was updated in place in April 2026 to a smaller, faster configuration (76.4 M parameters) that matches or beats the prior 112.3 M parameter configuration at every taxonomic rank under identical evaluation."
},
"v3v4": {
"checkpoint_file": "deeptaxa-v3v4-v1.pt",
"architecture": {
"max_length": 512,
"embed_dim": 896,
"num_filters": 256,
"kernel_sizes": [
3,
5,
7
],
"num_conv_layers": 1,
"hidden_size": 896,
"num_hidden_layers": 4,
"num_attention_heads": 7,
"intermediate_size": 3584,
"hidden_dropout_prob": 0.2
},
"training_hyperparameters": {
"learning_rate": 0.0005,
"batch_size": 64,
"epochs": 10,
"loss_function": "cross_entropy",
"level_weights": [
1.0,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0
],
"optimizer": "AdamW",
"optimizer_params": {
"lr": 0.0005,
"betas": [
0.9,
0.999
],
"weight_decay": 0.01
},
"scheduler_warmup_ratio": 0.1,
"seed": 42
},
"total_parameters": 75813550,
"training_date": "2026-04-25",
"training_hardware": "NVIDIA GeForce RTX 4090",
"training_dataset": {
"name": "Greengenes2 2024.09 (in-silico V3-V4 extractions)",
"train_amplicons": 273003,
"test_amplicons": 68282,
"extraction_yield_train": 0.984,
"extraction_yield_test": 0.985,
"forward_primer": "CCTACGGGNGGCWGCAG",
"reverse_primer": "GACTACHVGGGTATCTAATCC",
"primer_name_forward": "341F",
"primer_name_reverse": "805R",
"max_primer_mismatches": 2,
"amplicon_length_median_bp": 422,
"amplicon_length_mean_bp": 416,
"amplicon_length_range_bp": [
90,
1552
]
},
"taxonomic_levels": {
"domain": 2,
"phylum": 115,
"class": 270,
"order": 709,
"family": 1528,
"genus": 4529,
"species": 8347
},
"test_metrics": {
"_note": "Single-seed test-set metrics for the published checkpoint (seed 42), V3-V4 SMALL canonical configuration.",
"domain": {
"accuracy": 0.9999,
"f1_score": 0.9999,
"ece": 0.0001
},
"phylum": {
"accuracy": 0.9968,
"f1_score": 0.9966,
"ece": 0.002
},
"class": {
"accuracy": 0.9964,
"f1_score": 0.996,
"ece": 0.0019
},
"order": {
"accuracy": 0.9899,
"f1_score": 0.9888,
"ece": 0.0054
},
"family": {
"accuracy": 0.9841,
"f1_score": 0.9819,
"ece": 0.0074
},
"genus": {
"accuracy": 0.9527,
"f1_score": 0.9473,
"ece": 0.017
},
"species": {
"accuracy": 0.8755,
"f1_score": 0.8592,
"ece": 0.0278
}
},
"derived_from": "Canonical SMALL HybridCNNBERT hyperparameters (matching the full-length v1.1 release), applied from scratch to in-silico V3-V4 extractions from Greengenes2 2024.09. Updated in v1.2 in place over the prior LARGE Optuna v3v4 release: under identical evaluation the SMALL configuration achieves equivalent species-level performance (seed-42 Acc 87.55 vs 87.52, F1 85.92 vs 85.79) at roughly 24 percent fewer parameters."
}
}
}