import math
import os
from dataclasses import dataclass, field
from typing import Dict, List, Optional

dirname, _ = os.path.split(os.path.dirname(__file__))

# NOTE: the original snippet references `max_length` below without defining it.
# It is declared here as a placeholder (0, like the other runtime-inferred
# fields) and should be set to the maximum sequence length before use.
max_length: int = 0


@dataclass
class GeneEmbeddModelConfig:
    model_input: str = ""  # will be inferred
    num_embed_hidden: int = 100  # 30 for exp, 100 for the rest
    ff_input_dim: int = 0  # is inferred later, equals gene expression length
    ff_hidden_dim: List = field(default_factory=lambda: [300])  # 300 for exp hico
    feed_forward1_hidden: int = 256
    num_attention_project: int = 64
    num_encoder_layers: int = 1
    dropout: float = 0.2
    n: int = 121
    relative_attns: List = field(default_factory=lambda: [29, 4, 6, 8, 10, 11])
    num_attention_heads: int = 5

    window: int = 2
    tokens_len: int = math.ceil(max_length / window)
    second_input_token_len: int = 0  # is inferred during runtime
    vocab_size: int = 0  # is inferred during runtime
    second_input_vocab_size: int = 0  # is inferred during runtime
    tokenizer: str = (
        "overlap"  # either overlap, no_overlap or overlap_multi_window
    )

    clf_target: str = 'm'  # sub_class_hico or major_class_hico; hico = high confidence
    num_classes: int = 0  # will be inferred during runtime
    class_mappings: List = field(default_factory=lambda: [])  # will be inferred during runtime
    class_weights: List = field(default_factory=lambda: [])
    # how many extra window sizes other than the default window
    temperatures: List = field(default_factory=lambda: [0, 10])

    tokens_mapping_dict: Optional[Dict] = None
    false_input_perc: float = 0.0


@dataclass
class GeneEmbeddTrainConfig:
    dataset_path_train: str = 'path/to/anndata.h5ad'
    precursor_file_path: str = 'path/to/precursor_file.csv'  # if not provided, sampling from the precursor will not be done
    mapping_dict_path: str = 'path/to/mapping_dict.json'  # required for mapping sub class to major class, e.g. mir-568-3p to miRNA
    device: str = "cuda"
    l2_weight_decay: float = 0.05
    batch_size: int = 512
    batch_per_epoch: int = 0  # is inferred during runtime

    label_smoothing_sim: float = 0.2
    label_smoothing_clf: float = 0.0
    # learning rate
    learning_rate: float = 1e-3  # final learning rate, i.e. the lr annealed to
    lr_warmup_start: float = 0.1  # lr at the start, before the initial linear warmup section
    lr_warmup_end: float = 1  # lr at the end of the linear warmup section, where annealing begins
    # TODO: 122 is the number of train batches per epoch; it should be inferred and set.
    # Warmup batches should be of the form epoch * (train batches per epoch).
    warmup_epoch: int = 10  # how many epochs to linearly warm up for
    final_epoch: int = 20  # final epoch of training, by which the lr is fully annealed

    top_k: int = 10  # int(0.1 * batch_size); if the corresponding RNA/GE appears in the top k, it is considered correctly classified
    cross_val: bool = False
    labels_mapping_path: Optional[str] = None
    filter_seq_length: bool = False

    num_augment_exp: int = 20
    shuffle_exp: bool = False

    max_epochs: int = 3000
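

# ---------------------------------------------------------------------------
# Usage sketch (illustrative; not part of the original file). It shows how the
# two configs might be instantiated and how the runtime-inferred fields could
# be filled in, plus a small helper implementing one plausible reading of the
# warmup/annealing comments above. The concrete values (max length, vocab
# size, number of classes) and the helper itself are assumptions, not the
# project's confirmed API.


def warmup_anneal_lr(epoch: int, cfg: GeneEmbeddTrainConfig) -> float:
    """Linear warmup from lr_warmup_start to lr_warmup_end over the first
    `warmup_epoch` epochs, then linear annealing from lr_warmup_end down to
    `learning_rate` (the 'lr annealed to' value) by `final_epoch`."""
    if epoch < cfg.warmup_epoch:
        frac = epoch / max(cfg.warmup_epoch, 1)
        return cfg.lr_warmup_start + frac * (cfg.lr_warmup_end - cfg.lr_warmup_start)
    span = max(cfg.final_epoch - cfg.warmup_epoch, 1)
    frac = min((epoch - cfg.warmup_epoch) / span, 1.0)
    return cfg.lr_warmup_end + frac * (cfg.learning_rate - cfg.lr_warmup_end)


if __name__ == "__main__":
    model_cfg = GeneEmbeddModelConfig()
    train_cfg = GeneEmbeddTrainConfig()

    # Hypothetical runtime inference of the zero-valued fields; in the real
    # pipeline these would come from the tokenizer and the dataset.
    max_length = 60  # placeholder maximum sequence length
    model_cfg.tokens_len = math.ceil(max_length / model_cfg.window)
    model_cfg.vocab_size = 1200   # e.g. len(tokenizer_vocab)
    model_cfg.num_classes = 13    # e.g. number of major classes

    for epoch in (0, train_cfg.warmup_epoch, train_cfg.final_epoch):
        print(f"epoch {epoch}: lr = {warmup_anneal_lr(epoch, train_cfg):.4f}")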