| { |
| "version": "deeptaxa.v1.2", |
| "model_type": "hybridcnnbert", |
| "tokenizer_name": "zhihan1996/DNABERT-2-117M", |
| "variants": { |
| "full-length": { |
| "checkpoint_file": "deeptaxa-full-length-v1.pt", |
| "architecture": { |
| "max_length": 512, |
| "embed_dim": 896, |
| "num_filters": 256, |
| "kernel_sizes": [ |
| 3, |
| 5, |
| 7 |
| ], |
| "num_conv_layers": 1, |
| "hidden_size": 896, |
| "num_hidden_layers": 4, |
| "num_attention_heads": 7, |
| "intermediate_size": 3584, |
| "hidden_dropout_prob": 0.2 |
| }, |
| "training_hyperparameters": { |
| "learning_rate": 0.0005, |
| "batch_size": 64, |
| "epochs": 10, |
| "loss_function": "cross_entropy", |
| "level_weights": [ |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0 |
| ], |
| "optimizer": "AdamW", |
| "optimizer_params": { |
| "lr": 0.0005, |
| "betas": [ |
| 0.9, |
| 0.999 |
| ], |
| "weight_decay": 0.01 |
| }, |
| "scheduler_warmup_ratio": 0.1, |
| "seed": 42 |
| }, |
| "total_parameters": 76365205, |
| "training_date": "2026-04-25", |
| "training_hardware": "NVIDIA GeForce RTX 4090", |
| "training_dataset": { |
| "name": "Greengenes2 2024.09 (full-length 16S)", |
| "train_sequences": 277336, |
| "test_sequences": 69335, |
| "train_fasta": "gg_2024_09_training.fna.gz", |
| "train_taxonomy": "gg_2024_09_training.tsv.gz" |
| }, |
| "taxonomic_levels": { |
| "domain": 2, |
| "phylum": 129, |
| "class": 349, |
| "order": 997, |
| "family": 2250, |
| "genus": 7287, |
| "species": 16909 |
| }, |
| "test_metrics": { |
| "_note": "Single-seed test-set metrics for the published checkpoint (seed 42).", |
| "domain": { |
| "accuracy": 0.9999, |
| "f1_score": 0.9999, |
| "ece": 0.0001 |
| }, |
| "phylum": { |
| "accuracy": 0.9968, |
| "f1_score": 0.9967, |
| "ece": 0.0024 |
| }, |
| "class": { |
| "accuracy": 0.9963, |
| "f1_score": 0.9959, |
| "ece": 0.0024 |
| }, |
| "order": { |
| "accuracy": 0.9909, |
| "f1_score": 0.9899, |
| "ece": 0.0055 |
| }, |
| "family": { |
| "accuracy": 0.9861, |
| "f1_score": 0.9841, |
| "ece": 0.0075 |
| }, |
| "genus": { |
| "accuracy": 0.9693, |
| "f1_score": 0.9651, |
| "ece": 0.0143 |
| }, |
| "species": { |
| "accuracy": 0.9288, |
| "f1_score": 0.9203, |
| "ece": 0.0251 |
| } |
| }, |
| "test_metrics_multiseed": { |
| "_note": "3-seed mean \u00b1 std across seeds 42, 123, 456. Reported alongside the single-seed published metrics for reproducibility.", |
| "seeds": [ |
| 42, |
| 123, |
| 456 |
| ], |
| "domain": { |
| "accuracy_mean": 0.9998, |
| "accuracy_std": 0.0, |
| "f1_mean": 0.9998, |
| "f1_std": 0.0, |
| "ece_mean": 0.0001, |
| "ece_std": 0.0 |
| }, |
| "phylum": { |
| "accuracy_mean": 0.9969, |
| "accuracy_std": 0.0002, |
| "f1_mean": 0.9968, |
| "f1_std": 0.0002, |
| "ece_mean": 0.0023, |
| "ece_std": 0.0002 |
| }, |
| "class": { |
| "accuracy_mean": 0.9963, |
| "accuracy_std": 0.0, |
| "f1_mean": 0.9959, |
| "f1_std": 0.0, |
| "ece_mean": 0.0024, |
| "ece_std": 0.0 |
| }, |
| "order": { |
| "accuracy_mean": 0.9907, |
| "accuracy_std": 0.0003, |
| "f1_mean": 0.9897, |
| "f1_std": 0.0003, |
| "ece_mean": 0.0056, |
| "ece_std": 0.0002 |
| }, |
| "family": { |
| "accuracy_mean": 0.9861, |
| "accuracy_std": 0.0001, |
| "f1_mean": 0.9841, |
| "f1_std": 0.0002, |
| "ece_mean": 0.0075, |
| "ece_std": 0.0001 |
| }, |
| "genus": { |
| "accuracy_mean": 0.969, |
| "accuracy_std": 0.0003, |
| "f1_mean": 0.9648, |
| "f1_std": 0.0003, |
| "ece_mean": 0.0144, |
| "ece_std": 0.0001 |
| }, |
| "species": { |
| "accuracy_mean": 0.9296, |
| "accuracy_std": 0.0007, |
| "f1_mean": 0.9212, |
| "f1_std": 0.0008, |
| "ece_mean": 0.0242, |
| "ece_std": 0.0008 |
| } |
| }, |
| "derived_from": "Optuna hyperparameter search on Greengenes2 2024.09 full-length sequences. The full-length v1 checkpoint was updated in place in April 2026 to a smaller, faster configuration (76.4 M parameters) that matches or beats the prior 112.3 M parameter configuration at every taxonomic rank under identical evaluation." |
| }, |
| "v3v4": { |
| "checkpoint_file": "deeptaxa-v3v4-v1.pt", |
| "architecture": { |
| "max_length": 512, |
| "embed_dim": 896, |
| "num_filters": 256, |
| "kernel_sizes": [ |
| 3, |
| 5, |
| 7 |
| ], |
| "num_conv_layers": 1, |
| "hidden_size": 896, |
| "num_hidden_layers": 4, |
| "num_attention_heads": 7, |
| "intermediate_size": 3584, |
| "hidden_dropout_prob": 0.2 |
| }, |
| "training_hyperparameters": { |
| "learning_rate": 0.0005, |
| "batch_size": 64, |
| "epochs": 10, |
| "loss_function": "cross_entropy", |
| "level_weights": [ |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0, |
| 1.0 |
| ], |
| "optimizer": "AdamW", |
| "optimizer_params": { |
| "lr": 0.0005, |
| "betas": [ |
| 0.9, |
| 0.999 |
| ], |
| "weight_decay": 0.01 |
| }, |
| "scheduler_warmup_ratio": 0.1, |
| "seed": 42 |
| }, |
| "total_parameters": 75813550, |
| "training_date": "2026-04-25", |
| "training_hardware": "NVIDIA GeForce RTX 4090", |
| "training_dataset": { |
| "name": "Greengenes2 2024.09 (in-silico V3-V4 extractions)", |
| "train_amplicons": 273003, |
| "test_amplicons": 68282, |
| "extraction_yield_train": 0.984, |
| "extraction_yield_test": 0.985, |
| "forward_primer": "CCTACGGGNGGCWGCAG", |
| "reverse_primer": "GACTACHVGGGTATCTAATCC", |
| "primer_name_forward": "341F", |
| "primer_name_reverse": "805R", |
| "max_primer_mismatches": 2, |
| "amplicon_length_median_bp": 422, |
| "amplicon_length_mean_bp": 416, |
| "amplicon_length_range_bp": [ |
| 90, |
| 1552 |
| ] |
| }, |
| "taxonomic_levels": { |
| "domain": 2, |
| "phylum": 115, |
| "class": 270, |
| "order": 709, |
| "family": 1528, |
| "genus": 4529, |
| "species": 8347 |
| }, |
| "test_metrics": { |
| "_note": "Single-seed test-set metrics for the published checkpoint (seed 42), V3-V4 SMALL canonical configuration.", |
| "domain": { |
| "accuracy": 0.9999, |
| "f1_score": 0.9999, |
| "ece": 0.0001 |
| }, |
| "phylum": { |
| "accuracy": 0.9968, |
| "f1_score": 0.9966, |
| "ece": 0.002 |
| }, |
| "class": { |
| "accuracy": 0.9964, |
| "f1_score": 0.996, |
| "ece": 0.0019 |
| }, |
| "order": { |
| "accuracy": 0.9899, |
| "f1_score": 0.9888, |
| "ece": 0.0054 |
| }, |
| "family": { |
| "accuracy": 0.9841, |
| "f1_score": 0.9819, |
| "ece": 0.0074 |
| }, |
| "genus": { |
| "accuracy": 0.9527, |
| "f1_score": 0.9473, |
| "ece": 0.017 |
| }, |
| "species": { |
| "accuracy": 0.8755, |
| "f1_score": 0.8592, |
| "ece": 0.0278 |
| } |
| }, |
| "derived_from": "Canonical SMALL HybridCNNBERT hyperparameters (matching the full-length v1.1 release), applied from scratch to in-silico V3-V4 extractions from Greengenes2 2024.09. Updated in v1.2 in place over the prior LARGE Optuna v3v4 release: under identical evaluation the SMALL configuration achieves equivalent species-level performance (seed 42, SMALL vs LARGE, in percent: accuracy 87.55 vs 87.52, F1 85.92 vs 85.79) at roughly 24 percent fewer parameters." |
| } |
| } |
| } |
|
|