config.json · systems-genomics-lab/deeptaxa at main

deeptaxa / config.json

Ahmed Moustafa

Initial commit

f542637 23 days ago

10.6 kB

	{
	"version": "deeptaxa.v1.2",
	"model_type": "hybridcnnbert",
	"tokenizer_name": "zhihan1996/DNABERT-2-117M",
	"variants": {
	"full-length": {
	"checkpoint_file": "deeptaxa-full-length-v1.pt",
	"architecture": {
	"max_length": 512,
	"embed_dim": 896,
	"num_filters": 256,
	"kernel_sizes": [
	3,
	5,
	7
	],
	"num_conv_layers": 1,
	"hidden_size": 896,
	"num_hidden_layers": 4,
	"num_attention_heads": 7,
	"intermediate_size": 3584,
	"hidden_dropout_prob": 0.2
	},
	"training_hyperparameters": {
	"learning_rate": 0.0005,
	"batch_size": 64,
	"epochs": 10,
	"loss_function": "cross_entropy",
	"level_weights": [
	1.0,
	1.0,
	1.0,
	1.0,
	1.0,
	1.0,
	1.0
	],
	"optimizer": "AdamW",
	"optimizer_params": {
	"lr": 0.0005,
	"betas": [
	0.9,
	0.999
	],
	"weight_decay": 0.01
	},
	"scheduler_warmup_ratio": 0.1,
	"seed": 42
	},
	"total_parameters": 76365205,
	"training_date": "2026-04-25",
	"training_hardware": "NVIDIA GeForce RTX 4090",
	"training_dataset": {
	"name": "Greengenes2 2024.09 (full-length 16S)",
	"train_sequences": 277336,
	"test_sequences": 69335,
	"train_fasta": "gg_2024_09_training.fna.gz",
	"train_taxonomy": "gg_2024_09_training.tsv.gz"
	},
	"taxonomic_levels": {
	"domain": 2,
	"phylum": 129,
	"class": 349,
	"order": 997,
	"family": 2250,
	"genus": 7287,
	"species": 16909
	},
	"test_metrics": {
	"_note": "Single-seed test-set metrics for the published checkpoint (seed 42).",
	"domain": {
	"accuracy": 0.9999,
	"f1_score": 0.9999,
	"ece": 0.0001
	},
	"phylum": {
	"accuracy": 0.9968,
	"f1_score": 0.9967,
	"ece": 0.0024
	},
	"class": {
	"accuracy": 0.9963,
	"f1_score": 0.9959,
	"ece": 0.0024
	},
	"order": {
	"accuracy": 0.9909,
	"f1_score": 0.9899,
	"ece": 0.0055
	},
	"family": {
	"accuracy": 0.9861,
	"f1_score": 0.9841,
	"ece": 0.0075
	},
	"genus": {
	"accuracy": 0.9693,
	"f1_score": 0.9651,
	"ece": 0.0143
	},
	"species": {
	"accuracy": 0.9288,
	"f1_score": 0.9203,
	"ece": 0.0251
	}
	},
	"test_metrics_multiseed": {
	"_note": "3-seed mean \u00b1 std across seeds 42, 123, 456. Reported alongside the single-seed published metrics for reproducibility.",
	"seeds": [
	42,
	123,
	456
	],
	"domain": {
	"accuracy_mean": 0.9998,
	"accuracy_std": 0.0,
	"f1_mean": 0.9998,
	"f1_std": 0.0,
	"ece_mean": 0.0001,
	"ece_std": 0.0
	},
	"phylum": {
	"accuracy_mean": 0.9969,
	"accuracy_std": 0.0002,
	"f1_mean": 0.9968,
	"f1_std": 0.0002,
	"ece_mean": 0.0023,
	"ece_std": 0.0002
	},
	"class": {
	"accuracy_mean": 0.9963,
	"accuracy_std": 0.0,
	"f1_mean": 0.9959,
	"f1_std": 0.0,
	"ece_mean": 0.0024,
	"ece_std": 0.0
	},
	"order": {
	"accuracy_mean": 0.9907,
	"accuracy_std": 0.0003,
	"f1_mean": 0.9897,
	"f1_std": 0.0003,
	"ece_mean": 0.0056,
	"ece_std": 0.0002
	},
	"family": {
	"accuracy_mean": 0.9861,
	"accuracy_std": 0.0001,
	"f1_mean": 0.9841,
	"f1_std": 0.0002,
	"ece_mean": 0.0075,
	"ece_std": 0.0001
	},
	"genus": {
	"accuracy_mean": 0.969,
	"accuracy_std": 0.0003,
	"f1_mean": 0.9648,
	"f1_std": 0.0003,
	"ece_mean": 0.0144,
	"ece_std": 0.0001
	},
	"species": {
	"accuracy_mean": 0.9296,
	"accuracy_std": 0.0007,
	"f1_mean": 0.9212,
	"f1_std": 0.0008,
	"ece_mean": 0.0242,
	"ece_std": 0.0008
	}
	},
	"derived_from": "Optuna hyperparameter search on Greengenes2 2024.09 full-length sequences. The full-length v1 checkpoint was updated in place in April 2026 to a smaller, faster configuration (76.4 M parameters) that matches or beats the prior 112.3 M parameter configuration at every taxonomic rank under identical evaluation."
	},
	"v3v4": {
	"checkpoint_file": "deeptaxa-v3v4-v1.pt",
	"architecture": {
	"max_length": 512,
	"embed_dim": 896,
	"num_filters": 256,
	"kernel_sizes": [
	3,
	5,
	7
	],
	"num_conv_layers": 1,
	"hidden_size": 896,
	"num_hidden_layers": 4,
	"num_attention_heads": 7,
	"intermediate_size": 3584,
	"hidden_dropout_prob": 0.2
	},
	"training_hyperparameters": {
	"learning_rate": 0.0005,
	"batch_size": 64,
	"epochs": 10,
	"loss_function": "cross_entropy",
	"level_weights": [
	1.0,
	1.0,
	1.0,
	1.0,
	1.0,
	1.0,
	1.0
	],
	"optimizer": "AdamW",
	"optimizer_params": {
	"lr": 0.0005,
	"betas": [
	0.9,
	0.999
	],
	"weight_decay": 0.01
	},
	"scheduler_warmup_ratio": 0.1,
	"seed": 42
	},
	"total_parameters": 75813550,
	"training_date": "2026-04-25",
	"training_hardware": "NVIDIA GeForce RTX 4090",
	"training_dataset": {
	"name": "Greengenes2 2024.09 (in-silico V3-V4 extractions)",
	"train_amplicons": 273003,
	"test_amplicons": 68282,
	"extraction_yield_train": 0.984,
	"extraction_yield_test": 0.985,
	"forward_primer": "CCTACGGGNGGCWGCAG",
	"reverse_primer": "GACTACHVGGGTATCTAATCC",
	"primer_name_forward": "341F",
	"primer_name_reverse": "805R",
	"max_primer_mismatches": 2,
	"amplicon_length_median_bp": 422,
	"amplicon_length_mean_bp": 416,
	"amplicon_length_range_bp": [
	90,
	1552
	]
	},
	"taxonomic_levels": {
	"domain": 2,
	"phylum": 115,
	"class": 270,
	"order": 709,
	"family": 1528,
	"genus": 4529,
	"species": 8347
	},
	"test_metrics": {
	"_note": "Single-seed test-set metrics for the published checkpoint (seed 42), V3-V4 SMALL canonical configuration.",
	"domain": {
	"accuracy": 0.9999,
	"f1_score": 0.9999,
	"ece": 0.0001
	},
	"phylum": {
	"accuracy": 0.9968,
	"f1_score": 0.9966,
	"ece": 0.002
	},
	"class": {
	"accuracy": 0.9964,
	"f1_score": 0.996,
	"ece": 0.0019
	},
	"order": {
	"accuracy": 0.9899,
	"f1_score": 0.9888,
	"ece": 0.0054
	},
	"family": {
	"accuracy": 0.9841,
	"f1_score": 0.9819,
	"ece": 0.0074
	},
	"genus": {
	"accuracy": 0.9527,
	"f1_score": 0.9473,
	"ece": 0.017
	},
	"species": {
	"accuracy": 0.8755,
	"f1_score": 0.8592,
	"ece": 0.0278
	}
	},
	"derived_from": "Canonical SMALL HybridCNNBERT hyperparameters (matching the full-length v1.1 release), applied from scratch to in-silico V3-V4 extractions from Greengenes2 2024.09. Updated in v1.2 in place over the prior LARGE Optuna v3v4 release: under identical evaluation the SMALL configuration achieves equivalent species-level performance (seed-42 Acc 87.55 vs 87.52, F1 85.92 vs 85.79) at roughly 24 percent fewer parameters."
	}
	}
	}