AlienChen committed
Commit c4b1fea · verified · 1 parent: 9a451d4

Upload 106 files

This view is limited to 50 files because the commit contains too many changes.

Files changed (50)
  1. configs/callbacks/checkpoint_every_n_steps.yaml +8 -0
  2. configs/callbacks/checkpoint_monitor.yaml +10 -0
  3. configs/callbacks/learning_rate_monitor.yaml +3 -0
  4. configs/classifier_model/dimamba-classifier.yaml +14 -0
  5. configs/classifier_model/hyenadna-classifier.yaml +4 -0
  6. configs/classifier_model/small-classifier.yaml +11 -0
  7. configs/classifier_model/tiny-classifier.yaml +11 -0
  8. configs/classifier_model/tiny-dimamba-classifier.yaml +14 -0
  9. configs/config.yaml +129 -0
  10. configs/data/amazon_polarity.yaml +10 -0
  11. configs/data/cifar10.yaml +11 -0
  12. configs/data/lm1b.yaml +8 -0
  13. configs/data/peptide.yaml +8 -0
  14. configs/data/protein.yaml +8 -0
  15. configs/data/qm9.yaml +11 -0
  16. configs/data/ten_species.yaml +11 -0
  17. configs/data/text8.yaml +9 -0
  18. configs/guidance/cbg.yaml +5 -0
  19. configs/guidance/cfg.yaml +3 -0
  20. configs/guidance/fudge.yaml +5 -0
  21. configs/guidance/nos.yaml +6 -0
  22. configs/guidance/pplm.yaml +6 -0
  23. configs/lr_scheduler/constant_warmup.yaml +2 -0
  24. configs/lr_scheduler/cosine_decay_warmup.yaml +7 -0
  25. configs/model/dimamba.yaml +12 -0
  26. configs/model/fudge_predictor.yaml +4 -0
  27. configs/model/hf.yaml +2 -0
  28. configs/model/medium.yaml +10 -0
  29. configs/model/small.yaml +11 -0
  30. configs/model/tiny.yaml +10 -0
  31. configs/model/unet.yaml +19 -0
  32. configs/model/unet_campbell.yaml +19 -0
  33. configs/noise/ar.yaml +2 -0
  34. configs/noise/linear.yaml +3 -0
  35. configs/noise/loglinear.yaml +3 -0
  36. configs/noise/polynomial.yaml +5 -0
  37. configs/strategy/ddp.yaml +2 -0
  38. configs/strategy/fsdp.yaml +3 -0
  39. guidance_eval/__init__.py +0 -0
  40. guidance_eval/amazon_polarity_eval.py +228 -0
  41. guidance_eval/qm9_eval.py +208 -0
  42. guidance_eval/ten_species_eval.py +585 -0
  43. main.py +262 -0
  44. models/__init__.py +4 -0
  45. models/__pycache__/__init__.cpython-310.pyc +0 -0
  46. models/__pycache__/__init__.cpython-39.pyc +0 -0
  47. models/__pycache__/bindevaluator.cpython-310.pyc +0 -0
  48. models/__pycache__/dimamba.cpython-310.pyc +0 -0
  49. models/__pycache__/dimamba.cpython-39.pyc +0 -0
  50. models/__pycache__/dit.cpython-310.pyc +0 -0
configs/callbacks/checkpoint_every_n_steps.yaml ADDED
@@ -0,0 +1,8 @@
+ checkpoint_every_n_steps:
+   _target_: lightning.pytorch.callbacks.ModelCheckpoint
+   save_top_k: -1  # Do not save any "best" models; this callback is used to save a checkpoint every n train steps
+   save_last: True  # save model as ${save_dir}/checkpoints/last.ckpt
+   dirpath: ${checkpointing.save_dir}/checkpoints
+   verbose: True
+   auto_insert_metric_name: False
+   # every_n_train_steps: 500
configs/callbacks/checkpoint_monitor.yaml ADDED
@@ -0,0 +1,10 @@
+ checkpoint_monitor:
+   _target_: lightning.pytorch.callbacks.ModelCheckpoint
+   monitor: val/nll  # name of the logged metric which determines when model is improving
+   mode: min  # can be "max" or "min"
+   save_top_k: 1  # save k best models (determined by above metric)
+   save_last: False  # True = additionally always save model from last epoch
+   dirpath: ${checkpointing.save_dir}/checkpoints
+   filename: best
+   auto_insert_metric_name: False
+   verbose: True
configs/callbacks/learning_rate_monitor.yaml ADDED
@@ -0,0 +1,3 @@
+ learning_rate_monitor:
+   _target_: lightning.pytorch.callbacks.LearningRateMonitor
+   logging_interval: step
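Each of the three callback configs above carries a `_target_` key, the Hydra convention for instantiable objects. A minimal sketch of how such configs are typically turned into Lightning callbacks (illustrative only; `build_callbacks` is a hypothetical helper, not part of this commit):

# Hypothetical illustration of consuming the `_target_`-style callback configs.
import hydra
import lightning as L
from omegaconf import DictConfig

def build_callbacks(config: DictConfig) -> list:
  # Each entry (e.g. `checkpoint_every_n_steps`) holds a `_target_` key,
  # so hydra.utils.instantiate constructs the corresponding callback object.
  return [hydra.utils.instantiate(cb) for cb in config.callbacks.values()]

# trainer = L.Trainer(callbacks=build_callbacks(config), ...)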
configs/classifier_model/dimamba-classifier.yaml ADDED
@@ -0,0 +1,14 @@
+ name: dimamba
+ type: dimamba
+ hidden_size: 256
+ cond_dim: 128
+ length: ${model.length}  # Same length as diffusion model
+ n_blocks: 8
+ scale_by_sigma: True
+ dropout: 0.1
+ tie_word_embeddings: False
+ bidirectional: True
+ bidirectional_strategy: add
+ bidirectional_weight_tie: True
+ num_classes: ${data.num_classes}
+ pooling: mean
configs/classifier_model/hyenadna-classifier.yaml ADDED
@@ -0,0 +1,4 @@
+ name: hyena-32k
+ type: hyenadna
+ hyena_model_name_or_path: ???
+ n_layer: 4
configs/classifier_model/small-classifier.yaml ADDED
@@ -0,0 +1,11 @@
+ name: small
+ type: ddit
+ hidden_size: 768
+ cond_dim: 128
+ length: ${model.length}  # Same length as diffusion model
+ n_blocks: 12
+ n_heads: 12
+ scale_by_sigma: True
+ dropout: 0.1
+ num_classes: ${data.num_classes}
+ pooling: mean
configs/classifier_model/tiny-classifier.yaml ADDED
@@ -0,0 +1,11 @@
+ name: tiny
+ type: ddit
+ hidden_size: 512
+ cond_dim: 128
+ length: ${model.length}  # Same length as diffusion model
+ n_blocks: 8
+ n_heads: 8
+ scale_by_sigma: True
+ dropout: 0.1
+ num_classes: ${data.num_classes}
+ pooling: mean
configs/classifier_model/tiny-dimamba-classifier.yaml ADDED
@@ -0,0 +1,14 @@
+ name: tiny
+ type: dimamba
+ hidden_size: 128
+ cond_dim: 128
+ length: ${model.length}  # Same length as diffusion model
+ n_blocks: 4
+ scale_by_sigma: True
+ dropout: 0.1
+ tie_word_embeddings: False
+ bidirectional: True
+ bidirectional_strategy: add
+ bidirectional_weight_tie: True
+ num_classes: ${data.num_classes}
+ pooling: mean
configs/config.yaml ADDED
@@ -0,0 +1,129 @@
+ defaults:
+   - _self_
+   - /callbacks: [checkpoint_every_n_steps, checkpoint_monitor, learning_rate_monitor]
+   - /data: protein
+   - /model: small
+   - /strategy: ddp
+   - /noise: loglinear
+   - /lr_scheduler: cosine_decay_warmup  # constant_warmup
+   - /classifier_model: null
+   - /guidance: null
+
+ mode: ppl_eval  # train / train_classifier / ppl_eval
+ diffusion: uniform  # absorbing_state / uniform
+ backbone: dit  # dit / dimamba / ar
+ classifier_backbone: null
+ parameterization: d3pm  # subs / d3pm / ar
+ time_conditioning: True  # UDLM is conditioned on time
+ subs_masking: False
+ zero_recon_loss: True  # Use for UDLM
+ T: 0  # 0 (continuous time) / 1000
+
+ is_vision: False
+ seed: 42
+
+ loader:
+   global_batch_size: 512
+   eval_global_batch_size: ${.global_batch_size}
+   # Note: batch_size and eval_batch_size are **per machine**
+   batch_size: ${div_up:${.global_batch_size}, ${eval:${trainer.devices} * ${trainer.num_nodes}}}
+   eval_batch_size: ${div_up:${.eval_global_batch_size}, ${eval:${trainer.devices} * ${trainer.num_nodes}}}
+   num_workers: 0  # ${eval:"len(__import__('os').sched_getaffinity(0))"}
+   pin_memory: True
+   persistent_workers: False  # True
+
+ sampling:
+   use_cache: True
+   steps: 32
+   # Note: batch_size is **per machine**
+   batch_size: 1  # ${loader.eval_batch_size}
+   num_sample_batches: 10  # Total samples: `num_gpus` * `batch_size` * `num_sample_batches`
+   use_float64: False
+
+ eval:
+   checkpoint_path: '/home/tc415/discrete-diffusion-guidance/outputs/peptide/2024.12.31/122818/checkpoints/best.ckpt'  # Used to evaluate a checkpoint after training.
+   # target_sequence: 'MSGIALSRLAQERKAWRKDHPFGFVAVPTKNPDGTMNLMNWECAIPGKKGTPWEGGLFKLRMLFKDDYPSSPPKCKFEPPLFHPNVYPSGTVCLSILEEDKDWRPAITIKQILLGIQELLNEPNIQDPAQAEAYTIYCQNRVEYEKRVRAQAKKFAPS'
+   # target_motifs: '123-127'  # UBC9
+   # target_sequence: 'MAMAEGERTECAEPPRDEPPADGALKRAEELKTQANDYFKAKDYENAIKFYSQAIELNPSNAIYYGNRSLAYLRTECYGYALGDATRAIELDKKYIKGYYRRAASNMALGKFRAALRDYETVVKVKPHDKDAKMKYQECNKIVKQKAFERAIAGDEHKRSVVDSLDIESMTIEDEYSGPKLEDGKVTISFMKELMQWYKDQKKLHRKCAYQILVQVKEVLSKLSTLVETTLKETEKITVCGDTHGQFYDLLNIFELNGLPSETNPYIFNGDFVDRGSFSVEVILTLFGFKLLYPDHFHLLRGNHETDNMNQIYGFEGEVKAKYTAQMYELFSEVFEWLPLAQCINGKVLIMHGGLFSEDGVTLDDIRKIERNRQPPDSGPMCDLLWSDPQPQNGRSISKRGVSCQFGPDVTKAFLEENNLDYIIRSHEVKAEGYEVAHGGRCVTVFSAPNYCDQMGNKASYIHLQGSDLRPQFHQFTAVPHPNVKPMAYANTLLQLGMM'
+   # target_motifs: '94-100'  # PPP5
+   # target_sequence: 'MRHSKRTYCPDWDDKDWDYGKWRSSSSHKRRKRSHSSAQENKRCKYNHSKMCDSHYLESRSINEKDYHSRRYIDEYRNDYTQGCEPGHRQRDHESRYQNHSSKSSGRSGRSSYKSKHRIHHSTSHRRSHGKSHRRKRTRSVEDDEEGHLICQSGDVLSARYEIVDTLGEGAFGKVVECIDHKAGGRHVAVKIVKNVDRYCEAARSEIQVLEHLNTTDPNSTFRCVQMLEWFEHHGHICIVFELLGLSTYDFIKENGFLPFRLDHIRKMAYQICKSVNFLHSNKLTHTDLKPENILFVQSDYTEAYNPKIKRDERTLINPDIKVVDFGSATYDDEHHSTLVSTRHYRAPEVILALGWSQPCDVWSIGCILIEYYLGFTVFPTHDSKEHLAMMERILGPLPKHMIQKTRKRKYFHHDRLDWDEHSSAGRYVSRRCKPLKEFMLSQDVEHERLFDLIQKMLEYDPAKRITLREALKHPFFDLLKKSI'
+   # target_motifs: '336-342'  # CLK1
+   # target_sequence: 'MEYHQPEDPAPGKAGTAEAVIPENHEVLAGPDEHPQDTDARDADGEAREREPADQALLPSQCGDNLESPLPEASSAPPGPTLGTLPEVETIRACSMPQELPQSPRTRQPEPDFYCVKWIPWKGEQTPIITQSTNGPCPLLAIMNILFLQWKVKLPPQKEVITSDELMAHLGNCLLSIKPQEKSEGLQLNFQQNVDDAMTVLPKLATGLDVNVRFTGVSDFEYTPECSVFDLLGIPLYHGWLVDPQSPEAVRAVGKLSYNQLVERIITCKHSSDTNLVTEGLIAEQFLETTAAQLTYHGLCELTAAAKEGELSVFFRNNHFSTMTKHKSHLYLLVTDQGFLQEEQVVWESLHNVDGDSCFCDSDFHLSHSLGKGPGAEGGSGSPETQLQVDQDYLIALSLQQQQPRGPLGLTDLELAQQLQQEEYQQQQAAQPVRMRTRVLSLQGRGATSGRPAGERRQRPKHESDCILL'
+   # target_motifs: '202-210'  # MINDY1
+   # target_sequence: 'MTGNAGEWCLMESDPGVFTELIKGFGCRGAQVEEIWSLEPENFEKLKPVHGLIFLFKWQPGEEPAGSVVQDSRLDTIFFAKQVINNACATQAIVSVLLNCTHQDVHLGETLSEFKEFSQSFDAAMKGLALSNSDVIRQVHNSFARQQMFEFDTKTSAKEEDAFHFVSYVPVNGRLYELDGLREGPIDLGACNQDDWISAVRPVIEKRIQKYSEGEIRFNLMAIVSDRKMIYEQKIAELQRQLAEEEPMDTDQGNSMLSAIQSEVAKNQMLIEEEVQKLKRYKIENIRRKHNYLPFIMELLKTLAEHQQLIPLVEKAKEKQNAKKAQETK'
+   # target_motifs: '152-157'  # UCHL5
+   # target_sequence: 'MSSGCQKTTTSKSIPTRWVTINDATHMPHDYSTTPGGTPFIITPGGTRIIYDRQFLLECRTSPLARTPPYSLPDIPGVTSPPSKHIINVKAHNGEPLNNNIAAPADKSTGDDAQFEMDI'
+   # target_motifs: '40-50'  # 4E-BP2
+   # target_sequence: 'MASTDYSTYSQAAAQQGYSAYTAQPTQGYAQTTQAYGQQSYGTYGQPTDVSYTQAQTTATYGQTAYATSYGQPPTGYTTPTAPQAYSQPVQGYGTGAYDTTTATVTTTQASYAAQSAYGTQPAYPAYGQQPAATAPTRPQDGNKPTETSQPQSSTGGYNQPSLGYGQSNYSYPQVPGSYPMQPVTAPPSYPPTSYSSTQPTSYDQSSYSQQNTYGQPSSYGQQSSYGQQSSYGQQPPTSYPPQTGSYSQAPSQYSQQSSSYGQQNPSYDSVRRGAWGNNMNSGLNKSPPLGGAQTISKNTEQRPQPDPYQILGPTSSRLANPGSGQIQLWQFLLELLSDSANASCITWEGTNGEFKMTDPDEVARRWGERKSKPNMNYDKLSRALRYYYDKNIMTKVHGKRYAYKFDFHGIAQALQPHPTESSMYKYPSDISYMPSYHAHQQKVNFVPPHPSSMPVTSSSFFGAASQYWTSPTGGIYPNPNVPRHPNTHVPSHLGSYY'
+   # target_motifs: '323-330'  # EWS::FLI1
+   target_sequence: 'MLQTKDLIWTLFFLGTAVSLQVDIVPSQGEISVGESKFFLCQVAGDAKDKDISWFSPNGEKLTPNQQRISVVWNDDSSSTLTIYNANIDDAGIYKCVVTGEDGSESEATVNVKIFQKLMFKNAPTPQEFREGEDAVIVCDVVSSLPPTIIWKHKGRDVILKKDVRFIVLSNNYLQIRGIKKTDEGTYRCEGRILARGEINFKDIQVIVNVPPTIQARQNIVNATANLGQSVTLVCDAEGFPEPTMSWTKDGEQIEQEEDDEKYIFSDDSSQLTIKKVDKNDEAEYICIAENKAGEQDATIHLKVFAKPKITYVENQTAMELEEQVTLTCEASGDPIPSITWRTSTRNISSEEKASWTRPEKQETLDGHMVVRSHARVSSLTLKSIQYTDAGEYICTASNTIGQDSQSMYLEVQYAPKLQGPVAVYTWEGNQVNITCEVFAYPSATISWFRDGQLLPSSNYSNIKIYNTPSASYLEVTPDSENDFGNYNCTAVNRIGQESL'
+   target_motifs: '415-430'  # NCAM1_IG
+   # target_sequence: 'TPSSPSIDQVEPYSSTAQVQFDEPEATGGVPILKYKAEWRAVGEEVWHSKWYDAKEASMEGIVTIVGLKPETTYAVRLAALNGKGLGEISAASEFKTQPVQGEPSAPKLEGQMGEDGNSIKVNLIKQDDGGSPIRHYLVRYRALSSEWKPEIRLPSGSDHVMLKSLDWNAEYEVYVVAENQQGKSKAAHFVFRTSAQP'
+   # target_motifs: '98-108'  # NCAM1_FN3
+
+   disable_ema: False
+   generate_samples: True
+   generated_samples_path: ''
+   max_samples: 50_000
+
+ training:
+   ema: 0.9999
+   antithetic_sampling: True
+   importance_sampling: False
+   sampling_eps: 1e-3
+   change_of_variables: False
+   compute_loss_on_pad_tokens: True
+   use_simple_ce_loss: False  # Ignore ELBO; just use CE
+   guidance: null  # Can turn off with `training.guidance: null`
+   # cond_dropout: 0.0
+
+ optim:
+   weight_decay: 1e-4
+   lr: 1e-5
+   beta1: 0.9
+   beta2: 0.999
+   eps: 1e-8
+
+ trainer:
+   _target_: lightning.Trainer
+   accelerator: cuda
+   num_nodes: 1
+   devices: 2  # ${device_count:}
+   accumulate_grad_batches: 1  # ${div_up:${loader.global_batch_size}, ${eval:${trainer.devices} * ${loader.batch_size} * ${trainer.num_nodes}}}
+   gradient_clip_val: 1.0
+   precision: 'bf16-mixed'
+   num_sanity_val_steps: 2
+   # max_epochs: 10
+   max_steps: 1652000
+   log_every_n_steps: 100
+   limit_train_batches: 1.0  # train on full dataset, can be used to toggle quick run
+   limit_val_batches: 1.0  # validate on full dataset, can be used to toggle quick run
+   val_check_interval: 16520  # 2545
+
+ wandb:
+   project: moPPIt-v2
+   job_type: model-training
+   name: protein_medium_100epochs_lr1e-5_gradclip1_wd1e-4_dropout0.1  # epochs10_lr3e-4_bsz8_64-true_all-params_gradclip1_beta-one0.9_beta-two0.999
+   id: ${.name}
+
+ hydra:
+   run:
+     dir: ./outputs/${wandb.name}  # ./outputs/${data.train}/${now:%Y.%m.%d}/${now:%H%M%S}
+   job:
+     chdir: true
+
+ checkpointing:
+   # Use custom `save_dir` if, e.g., saving to S3 bucket, otherwise leave this parameter as is
+   save_dir: ${cwd:}
+   # Note: `checkpoints` path should correspond to `checkpoint_every_n_steps.dirpath`
+   resume_from_ckpt: False
+   resume_ckpt_path: ${.save_dir}/checkpoints/last.ckpt
+
+
+ # target_sequence: 'MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD'
+ # target_motifs: '305-313'  # P53_1
+ # target_motifs: '371-382'  # P53_2
+ # target_motifs: '351-393'  # P53_3
+ # target_motifs: '210-230'  # P53_4
+ # target_sequence: 'MLQTKDLIWTLFFLGTAVSLQVDIVPSQGEISVGESKFFLCQVAGDAKDKDISWFSPNGEKLTPNQQRISVVWNDDSSSTLTIYNANIDDAGIYKCVVTGEDGSESEATVNVKIFQKLMFKNAPTPQEFREGEDAVIVCDVVSSLPPTIIWKHKGRDVILKKDVRFIVLSNNYLQIRGIKKTDEGTYRCEGRILARGEINFKDIQVIVNVPPTIQARQNIVNATANLGQSVTLVCDAEGFPEPTMSWTKDGEQIEQEEDDEKYIFSDDSSQLTIKKVDKNDEAEYICIAENKAGEQDATIHLKVFAKPKITYVENQTAMELEEQVTLTCEASGDPIPSITWRTSTRNISSEEKTLDGHMVVRSHARVSSLTLKSIQYTDAGEYICTASNTIGQDSQSMYLEVQYAPKLQGPVAVYTWEGNQVNITCEVFAYPSATISWFRDGQLLPSSNYSNIKIYNTPSASYLEVTPDSENDFGNYNCTAVNRIGQESLEFILVQADTPSSPSIDQVEPYSSTAQVQFDEPEATGGVPILKYKAEWRAVGEEVWHSKWYDAKEASMEGIVTIVGLKPETTYAVRLAALNGKGLGEISAASEFKTQPVHSPPP'
+ # target_motifs: '28-39'  # NCAM1_ECD
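One detail worth spelling out in this config: `loader.batch_size` is derived from the global batch size via the custom `div_up` and `eval` resolvers (registered in the eval scripts below). A worked example under this commit's defaults:

# How loader.batch_size resolves with global_batch_size=512, devices=2, num_nodes=1.
# div_up is registered as lambda x, y: (x + y - 1) // y (ceiling division).
global_batch_size = 512
devices, num_nodes = 2, 1
batch_size = (global_batch_size + devices * num_nodes - 1) // (devices * num_nodes)
print(batch_size)  # 256 sequences per device per step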
configs/data/amazon_polarity.yaml ADDED
@@ -0,0 +1,10 @@
+ train: amazon_polarity
+ valid: amazon_polarity
+ tokenizer_name_or_path: bert-base-uncased
+ cache_dir: /share/kuleshov/ssahoo/textdiffusion/data
+ wrap: False
+ streaming: False
+ override_cache: False
+ add_special_tokens: True
+ label_col: label
+ num_classes: 2
configs/data/cifar10.yaml ADDED
@@ -0,0 +1,11 @@
+ train: ???  # (Local) Path to CIFAR-10 training data
+ valid: ???  # (Local) Path to CIFAR-10 validation data
+ label_col: labels
+ num_classes: 10
+ streaming: False
+ size: 1024
+ length: 3072
+ add_special_tokens: True
+ add_mask_token: True
+ tokenizer_name_or_path: raw_pixels
+
configs/data/lm1b.yaml ADDED
@@ -0,0 +1,8 @@
+ train: lm1b
+ valid: lm1b
+ tokenizer_name_or_path: bert-base-uncased
+ cache_dir: /share/kuleshov/ssahoo/textdiffusion/data
+ wrap: False
+ streaming: False
+ override_cache: False
+ add_special_tokens: True
configs/data/peptide.yaml ADDED
@@ -0,0 +1,8 @@
+ train: peptide
+ valid: peptide
+ tokenizer_name_or_path: facebook/esm2_t33_650M_UR50D
+ cache_dir: /home/tc415/discrete-diffusion-guidance/dataset
+ wrap: False
+ streaming: False
+ override_cache: False
+ add_special_tokens: True
configs/data/protein.yaml ADDED
@@ -0,0 +1,8 @@
+ train: protein_400k
+ valid: protein_400k
+ tokenizer_name_or_path: facebook/esm2_t33_650M_UR50D
+ cache_dir: /home/tc415/discrete-diffusion-guidance/dataset
+ wrap: False
+ streaming: False
+ override_cache: False
+ add_special_tokens: True
configs/data/qm9.yaml ADDED
@@ -0,0 +1,11 @@
+ train: qm9
+ valid: qm9
+ tokenizer_name_or_path: yairschiff/qm9-tokenizer
+ cache_dir: /share/kuleshov/ssahoo/textdiffusion/data
+ wrap: False
+ streaming: False
+ override_cache: False
+ add_special_tokens: True
+ label_col: qed
+ label_col_pctile: 90
+ num_classes: 2
configs/data/ten_species.yaml ADDED
@@ -0,0 +1,11 @@
+ train: ten_species
+ valid: ten_species
+ tokenizer_name_or_path: kuleshov-group/caduceus-ps_seqlen-131k_d_model-256_n_layer-16
+ cache_dir: /share/kuleshov/ssahoo/textdiffusion/data
+ wrap: False
+ streaming: False
+ override_cache: False
+ add_special_tokens: False
+ label_col: species_label
+ num_classes: 10
+ rc_aug: False
configs/data/text8.yaml ADDED
@@ -0,0 +1,9 @@
+ # TODO: When using this dataset, set model.length = 256 to match D3PM setup
+ train: text8
+ valid: text8
+ tokenizer_name_or_path: text8
+ cache_dir: /share/kuleshov/ssahoo/textdiffusion/data
+ wrap: True
+ streaming: False
+ override_cache: False
+ add_special_tokens: False
configs/guidance/cbg.yaml ADDED
@@ -0,0 +1,5 @@
+ method: cbg
+ condition: 0
+ classifier_checkpoint_path: '/home/tc415/discrete-diffusion-guidance/model_path/finetune_bindevaluator_0/model-epoch=30-val_mcc=0.60-val_loss=0.51.ckpt'
+ gamma: 2.0
+ use_approx: False  # use first-order approximation
configs/guidance/cfg.yaml ADDED
@@ -0,0 +1,3 @@
+ method: cfg
+ condition: 0
+ gamma: 1.0
configs/guidance/fudge.yaml ADDED
@@ -0,0 +1,5 @@
+ method: fudge
+ condition: 0
+ classifier_checkpoint_path: ''
+ topk: 20
+ gamma: 1.0
configs/guidance/nos.yaml ADDED
@@ -0,0 +1,6 @@
+ method: nos
+ condition: 0
+ classifier_checkpoint_path: ''
+ num_nos_steps: 1
+ nos_step_size: 0.1
+ nos_stability_coef: 0.01
configs/guidance/pplm.yaml ADDED
@@ -0,0 +1,6 @@
+ method: pplm
+ condition: 0
+ classifier_checkpoint_path: ''
+ num_pplm_steps: 1
+ pplm_step_size: 0.1
+ pplm_stability_coef: 0.01
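All five guidance configs expose a `condition` (the target class) plus a strength knob: `gamma` for cbg/cfg/fudge, step sizes for nos/pplm. As a rough schematic of how a `gamma`-style knob typically enters classifier-free vs. classifier-based guidance (an illustration of the general technique, not this repo's implementation):

# Schematic only: how a guidance strength `gamma` commonly re-weights logits.
# Function and variable names are hypothetical, not from this commit.
import torch

def cfg_logits(cond_logits, uncond_logits, gamma: float):
  # Classifier-free guidance: extrapolate from the unconditional toward
  # the conditional denoiser output; gamma=1 recovers the conditional model.
  return uncond_logits + gamma * (cond_logits - uncond_logits)

def cbg_logits(diffusion_log_probs, classifier_log_probs, gamma: float):
  # Classifier-based guidance: tilt the diffusion distribution by the
  # classifier likelihood raised to the power gamma, then renormalize.
  return torch.log_softmax(
      diffusion_log_probs + gamma * classifier_log_probs, dim=-1)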
configs/lr_scheduler/constant_warmup.yaml ADDED
@@ -0,0 +1,2 @@
+ _target_: transformers.get_constant_schedule_with_warmup
+ num_warmup_steps: 2500
configs/lr_scheduler/cosine_decay_warmup.yaml ADDED
@@ -0,0 +1,7 @@
+ _target_: utils.CosineDecayWarmupLRScheduler
+ t_in_epochs: False
+ t_initial: ${eval:${trainer.max_steps}-${.warmup_t}}
+ warmup_prefix: True
+ warmup_lr_init: 1e-7
+ warmup_t: ${eval:0.1*${trainer.max_steps}}
+ lr_min: 1e-7
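Given `trainer.max_steps: 1652000` in configs/config.yaml, the two `eval`-resolved fields above work out to:

# Resolution of the cosine_decay_warmup fields under this commit's trainer config.
max_steps = 1_652_000             # trainer.max_steps
warmup_t = 0.1 * max_steps        # ${eval:0.1*${trainer.max_steps}} -> 165,200 warmup steps
t_initial = max_steps - warmup_t  # ${eval:...-${.warmup_t}} -> 1,486,800 steps of cosine decay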
configs/model/dimamba.yaml ADDED
@@ -0,0 +1,12 @@
+ name: dimamba
+ type: dimamba
+ hidden_size: 256
+ cond_dim: 128
+ length: 32768
+ n_blocks: 8
+ scale_by_sigma: True
+ dropout: 0.1
+ tie_word_embeddings: False
+ bidirectional: True
+ bidirectional_strategy: add
+ bidirectional_weight_tie: True
configs/model/fudge_predictor.yaml ADDED
@@ -0,0 +1,4 @@
+ name: fudge_predictor
+ type: lstm
+ hidden_dim: 300
+ length: 1024
configs/model/hf.yaml ADDED
@@ -0,0 +1,2 @@
+ pretrained_model_name_or_path: null
+ length: 128
configs/model/medium.yaml ADDED
@@ -0,0 +1,10 @@
+ name: medium
+ type: ddit
+ hidden_size: 1024
+ cond_dim: 128
+ length: 4096
+ n_blocks: 24
+ n_heads: 16
+ scale_by_sigma: True
+ dropout: 0.1
+ tie_word_embeddings: False
configs/model/small.yaml ADDED
@@ -0,0 +1,11 @@
+ name: small
+ type: ddit
+ hidden_size: 768
+ cond_dim: 128
+ length: null
+ length_range: '25,27,28,31,35,43-49'
+ n_blocks: 12
+ n_heads: 12
+ scale_by_sigma: True
+ dropout: 0.1
+ tie_word_embeddings: False
configs/model/tiny.yaml ADDED
@@ -0,0 +1,10 @@
+ name: tiny
+ type: ddit
+ hidden_size: 512
+ cond_dim: 128
+ length: 1024
+ n_blocks: 8
+ n_heads: 8
+ scale_by_sigma: True
+ dropout: 0.1
+ tie_word_embeddings: False
configs/model/unet.yaml ADDED
@@ -0,0 +1,19 @@
+ name: unet
+ type: unet
+ ch: 128
+ num_res_blocks: 2
+ num_scales: 4
+ ch_mult: [1, 2, 2, 2]
+ input_channels: 3
+ output_channels: -1  # determined by vocab_size
+ scale_count_to_put_attn: 1  # at 16 res
+ data_min_max: [0, 255]  # No need currently
+ dropout: 0.1
+ skip_rescale: True
+ time_conditioning: True  # Whether to add in time embeddings
+ time_scale_factor: 1000
+ time_embed_dim: ${.ch}
+ fix_logistic: False
+ size: ${data.size}
+ cond_dim: ${.ch}
+ length: ${data.length}
configs/model/unet_campbell.yaml ADDED
@@ -0,0 +1,19 @@
+ name: unet
+ type: unet
+ ch: 128
+ num_res_blocks: 2
+ num_scales: 4
+ ch_mult: [1, 2, 2, 2]
+ input_channels: 3
+ output_channels: -1  # determined by input_channels * 2
+ scale_count_to_put_attn: 1  # at 16 res
+ data_min_max: [0, 255]  # No need currently, determined by [0, vocab_size]
+ dropout: 0.1
+ skip_rescale: True
+ time_conditioning: True  # Whether to add in time embeddings
+ time_scale_factor: 1000
+ time_embed_dim: ${.ch}
+ fix_logistic: False
+ size: ${data.size}
+ cond_dim: ${.ch}
+ length: ${data.length}
configs/noise/ar.yaml ADDED
@@ -0,0 +1,2 @@
+ type: ar
+ scale: 6.0
configs/noise/linear.yaml ADDED
@@ -0,0 +1,3 @@
+ type: linear
+ sigma_min: 1e-3
+ sigma_max: 7.0
configs/noise/loglinear.yaml ADDED
@@ -0,0 +1,3 @@
+ type: loglinear
+ sigma_min: 1e-4
+ sigma_max: 20
configs/noise/polynomial.yaml ADDED
@@ -0,0 +1,5 @@
+ type: polynomial
+ a: -3
+ b: 5
+ c: -4
+ eps: 1e-3
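For reference, a minimal sketch of what these schedule types commonly look like in SEDD/MDLM-style discrete-diffusion codebases. The repo's actual `noise` module is not among the 50 files shown, so treat both functions as assumptions:

# Illustrative sketch only; names and exact formulas are assumptions.
import torch

def loglinear_total_noise(t, sigma_min=1e-4):
  # sigma(t) = -log(1 - (1 - sigma_min) * t): ~0 noise at t=0,
  # growing without bound as t -> 1.
  return -torch.log1p(-(1 - sigma_min) * t)

def geometric_total_noise(t, sigma_min=1e-3, sigma_max=7.0):
  # "Linear" in log space: interpolates sigma geometrically between endpoints.
  return sigma_min ** (1 - t) * sigma_max ** t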
configs/strategy/ddp.yaml ADDED
@@ -0,0 +1,2 @@
+ _target_: lightning.pytorch.strategies.DDPStrategy
+ find_unused_parameters: false
configs/strategy/fsdp.yaml ADDED
@@ -0,0 +1,3 @@
+ # TODO(yair): Currently not compatible with grad clipping
+ _target_: lightning.pytorch.strategies.FSDPStrategy
+ sharding_strategy: SHARD_GRAD_OP
guidance_eval/__init__.py ADDED
File without changes
guidance_eval/amazon_polarity_eval.py ADDED
@@ -0,0 +1,228 @@
+ import collections
+ import json
+ import os
+
+ import hydra
+ import lightning as L
+ import omegaconf
+ import pandas as pd
+ import rdkit
+ import rich.syntax
+ import rich.tree
+ import spacy
+ import torch
+ import transformers
+ # from evaluate import load
+ from nltk.util import ngrams
+ from tqdm.auto import tqdm
+
+ import dataloader
+ import diffusion
+ import eval_utils
+
+ rdkit.rdBase.DisableLog('rdApp.error')
+
+ omegaconf.OmegaConf.register_new_resolver(
+   'cwd', os.getcwd)
+ omegaconf.OmegaConf.register_new_resolver(
+   'device_count', torch.cuda.device_count)
+ omegaconf.OmegaConf.register_new_resolver(
+   'eval', eval)
+ omegaconf.OmegaConf.register_new_resolver(
+   'div_up', lambda x, y: (x + y - 1) // y)
+ omegaconf.OmegaConf.register_new_resolver(
+   'if_then_else',
+   lambda condition, x, y: x if condition else y
+ )
+
+
+ def _print_config(
+     config: omegaconf.DictConfig,
+     resolve: bool = True) -> None:
+   """Prints content of DictConfig using Rich library and its tree structure.
+
+   Args:
+     config (DictConfig): Configuration composed by Hydra.
+     resolve (bool): Whether to resolve reference fields of DictConfig.
+   """
+
+   style = 'dim'
+   tree = rich.tree.Tree('CONFIG', style=style,
+                         guide_style=style)
+
+   fields = config.keys()
+   for field in fields:
+     branch = tree.add(field, style=style, guide_style=style)
+
+     config_section = config.get(field)
+     branch_content = str(config_section)
+     if isinstance(config_section, omegaconf.DictConfig):
+       branch_content = omegaconf.OmegaConf.to_yaml(
+         config_section, resolve=resolve)
+
+     branch.add(rich.syntax.Syntax(branch_content, 'yaml'))
+   rich.print(tree)
+
+ def compute_diversity(sentences):
+   # compute diversity
+   ngram_range = [2, 3, 4]
+
+   tokenizer = spacy.load("en_core_web_sm").tokenizer
+   token_list = []
+   for sentence in sentences:
+     token_list.append(
+       [str(token) for token in tokenizer(sentence)])
+   ngram_sets = {}
+   ngram_counts = collections.defaultdict(int)
+   n_gram_repetition = {}
+
+   for n in ngram_range:
+     ngram_sets[n] = set()
+     for tokens in token_list:
+       ngram_sets[n].update(ngrams(tokens, n))
+       ngram_counts[n] += len(list(ngrams(tokens, n)))
+     n_gram_repetition[f"{n}gram_repetition"] = (
+       1 - len(ngram_sets[n]) / ngram_counts[n])
+   diversity = 1
+   for val in n_gram_repetition.values():
+     diversity *= (1 - val)
+   return diversity
+
+
+ def compute_sentiment_classifier_score(sentences, eval_model_name_or_path):
+   tokenizer = transformers.AutoTokenizer.from_pretrained(eval_model_name_or_path)
+   eval_model = transformers.AutoModelForSequenceClassification.from_pretrained(
+     eval_model_name_or_path).to('cuda')
+   eval_model.eval()
+
+   total_pos = 0
+   total_neg = 0
+   pbar = tqdm(sentences, desc='Classifier eval')
+   for sen in pbar:
+     # Tokenize the input text
+     inputs = tokenizer(
+       sen,
+       return_tensors="pt",
+       truncation=True,
+       padding=True).to('cuda')
+
+     # Get the model predictions
+     with torch.no_grad():
+       outputs = eval_model(**inputs)
+
+     # Convert logits to probabilities
+     probs = torch.nn.functional.softmax(
+       outputs.logits, dim=-1)
+
+     # Get the predicted class
+     predicted_class = torch.argmax(probs, dim=1).item()
+     if predicted_class == 1:
+       total_pos += 1
+     else:
+       total_neg += 1
+     pbar.set_postfix(accuracy=total_pos / (total_pos + total_neg))
+   return total_pos / (total_pos + total_neg)
+
+
+ # def compute_mauve(config, tokenizer, sentences):
+ #   os.environ["TOKENIZERS_PARALLELISM"] = "false"
+ #   # compute mauve
+ #   torch.cuda.empty_cache()
+ #   mauve = load("mauve")
+ #   human_references = []
+ #
+ #   valid_loader = dataloader.get_dataloaders(
+ #     config, tokenizer, valid_seed=config.seed)
+ #
+ #   # construct reference
+ #   for batch_id in range(config.sampling.num_sample_batches):
+ #     batch = next(iter(valid_loader))
+ #     input_ids = batch['input_ids']
+ #     for i in range(config.sampling.batch_size):
+ #       idx = (
+ #         input_ids[i] == tokenizer.eos_token_id).nonzero(
+ #         as_tuple=True)
+ #       if idx[0].numel() > 0:
+ #         idx = idx[0][0].item()
+ #         input_ids[i, (idx + 1):] = 0
+ #     human_references.extend(
+ #       tokenizer.batch_decode(
+ #         input_ids, skip_special_tokens=True))
+ #
+ #   assert len(sentences) == len(human_references)
+ #
+ #   results = mauve.compute(predictions=sentences,
+ #                           references=human_references,
+ #                           featurize_model_name=config.data.mauve_model,
+ #                           max_text_length=256, device_id=0)
+ #   return results.mauve
+
+
+
+ @hydra.main(version_base=None, config_path='../configs',
+             config_name='config')
+ def main(config: omegaconf.DictConfig) -> None:
+   # Reproducibility
+   L.seed_everything(config.seed)
+   os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
+   torch.use_deterministic_algorithms(True)
+   torch.backends.cudnn.benchmark = False
+
+   _print_config(config, resolve=True)
+   print(f"Checkpoint: {config.eval.checkpoint_path}")
+
+   tokenizer = dataloader.get_tokenizer(config)
+   pretrained = diffusion.Diffusion.load_from_checkpoint(
+     config.eval.checkpoint_path,
+     tokenizer=tokenizer,
+     config=config, logger=False)
+   pretrained.eval()
+   result_dicts = []
+   samples = []
+   for _ in tqdm(
+       range(config.sampling.num_sample_batches),
+       desc='Gen. batches', leave=False):
+     sample = pretrained.sample()
+     samples.extend(
+       pretrained.tokenizer.batch_decode(sample))
+   samples = [
+     s.replace('[CLS]', '').replace('[SEP]', '').replace('[PAD]', '').replace('[MASK]', '').strip()
+     for s in samples
+   ]
+   del pretrained  # free up space for eval
+
+   diversity_score = compute_diversity(samples)
+   classifier_accuracy = compute_sentiment_classifier_score(
+     samples, eval_model_name_or_path=config.eval.classifier_model_name_or_path)
+
+   generative_ppl = eval_utils.compute_generative_ppl(
+     samples,
+     eval_model_name_or_path=config.eval.generative_ppl_model_name_or_path,
+     gen_ppl_eval_batch_size=8,
+     max_length=config.model.length)
+
+   result_dicts.append({
+     'Seed': config.seed,
+     'T': config.sampling.steps,
+     'Num Samples': config.sampling.batch_size * config.sampling.num_sample_batches,
+     'Diversity': diversity_score,
+     'Accuracy': classifier_accuracy,
+     'Gen. PPL': generative_ppl,
+   } | {k.capitalize(): v for k, v in config.guidance.items()})
+   print("Guidance:", ", ".join([f"{k.capitalize()} - {v}" for k, v in config.guidance.items()]))
+   print(f"\tDiversity: {diversity_score:0.3f} ",
+         f"Accuracy: {classifier_accuracy:0.3f} ",
+         f"Gen. PPL: {generative_ppl:0.3f}")
+   print(f"Generated {len(samples)} sentences.")
+   with open(config.eval.generated_samples_path, 'w') as f:
+     json.dump(
+       {
+         'generated_seqs': samples,
+       },
+       f, indent=4)  # type: ignore
+   results_df = pd.DataFrame.from_records(result_dicts)
+   results_df.to_csv(config.eval.results_csv_path)
+
+
+ if __name__ == '__main__':
+   main()
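For intuition, `compute_diversity` multiplies the distinct-n-gram ratios for n = 2, 3, 4, so a corpus that repeats itself scores near 0 and a fully distinct one scores 1. A hypothetical call (assumes the script is importable as a module and that spaCy's en_core_web_sm model is installed):

# Hypothetical usage of compute_diversity from the script above.
# Requires: python -m spacy download en_core_web_sm
from guidance_eval.amazon_polarity_eval import compute_diversity

sentences = ["great product, works well",
             "terrible battery, would not buy again"]
print(compute_diversity(sentences))  # 1.0 here: no 2/3/4-gram repeats across sentences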
guidance_eval/qm9_eval.py ADDED
@@ -0,0 +1,208 @@
+ import json
+ import os
+ import time
+ import typing
+
+ import datasets
+ import hydra
+ import lightning as L
+ import numpy as np
+ import omegaconf
+ import pandas as pd
+ import rdkit
+ import rich.syntax
+ import rich.tree
+ import torch
+ from rdkit import Chem as rdChem
+ from rdkit.Chem import QED
+ from tqdm.auto import tqdm
+
+ import dataloader
+ import diffusion
+
+ rdkit.rdBase.DisableLog('rdApp.error')
+
+ omegaconf.OmegaConf.register_new_resolver(
+   'cwd', os.getcwd)
+ omegaconf.OmegaConf.register_new_resolver(
+   'device_count', torch.cuda.device_count)
+ omegaconf.OmegaConf.register_new_resolver(
+   'eval', eval)
+ omegaconf.OmegaConf.register_new_resolver(
+   'div_up', lambda x, y: (x + y - 1) // y)
+ omegaconf.OmegaConf.register_new_resolver(
+   'if_then_else',
+   lambda condition, x, y: x if condition else y
+ )
+
+
+ def _print_config(
+     config: omegaconf.DictConfig,
+     resolve: bool = True) -> None:
+   """Prints content of DictConfig using Rich library and its tree structure.
+
+   Args:
+     config (DictConfig): Configuration composed by Hydra.
+     resolve (bool): Whether to resolve reference fields of DictConfig.
+   """
+
+   style = 'dim'
+   tree = rich.tree.Tree('CONFIG', style=style,
+                         guide_style=style)
+
+   fields = config.keys()
+   for field in fields:
+     branch = tree.add(field, style=style, guide_style=style)
+
+     config_section = config.get(field)
+     branch_content = str(config_section)
+     if isinstance(config_section, omegaconf.DictConfig):
+       branch_content = omegaconf.OmegaConf.to_yaml(
+         config_section, resolve=resolve)
+
+     branch.add(rich.syntax.Syntax(branch_content, 'yaml'))
+   rich.print(tree)
+
+
+ def get_mol_property_fn(
+     prop: str
+ ) -> typing.Callable[[rdChem.Mol], typing.Union[int, float]]:
+   if prop == 'qed':
+     return QED.qed
+   if prop == 'ring_count':
+     return lambda x_mol: len(rdChem.GetSymmSSSR(x_mol))
+   raise NotImplementedError(
+     f"Property function for {prop} not implemented")
+
+
+ @hydra.main(version_base=None, config_path='../configs',
+             config_name='config')
+ def main(config: omegaconf.DictConfig) -> None:
+   # Reproducibility
+   L.seed_everything(config.seed)
+   os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
+   torch.use_deterministic_algorithms(True)
+   torch.backends.cudnn.benchmark = False
+
+   _print_config(config, resolve=True)
+   print(f"Checkpoint: {config.eval.checkpoint_path}")
+
+   qm9_dataset = datasets.load_dataset(
+     'yairschiff/qm9', trust_remote_code=True,
+     split='train')
+   tokenizer = dataloader.get_tokenizer(config)
+   pretrained = diffusion.Diffusion.load_from_checkpoint(
+     config.eval.checkpoint_path,
+     tokenizer=tokenizer,
+     config=config, logger=False)
+   pretrained.eval()
+   label_col = config.data.label_col
+   pctile_threshold = config.data.label_col_pctile
+   pctile_threshold_value = np.percentile(
+     qm9_dataset[label_col], q=pctile_threshold)
+   # Wrap the column in np.array before comparing; a plain Python list
+   # cannot be compared element-wise against a scalar.
+   above_threshold = np.array(qm9_dataset[label_col])[
+     np.array(qm9_dataset[label_col]) >= pctile_threshold_value]
+   below_threshold = np.array(qm9_dataset[label_col])[
+     np.array(qm9_dataset[label_col]) < pctile_threshold_value]
+   result_dicts = []
+   mol_property_fn = get_mol_property_fn(label_col)
+
+   print(
+     f"All - {label_col.upper()} Mean: {np.mean(qm9_dataset[label_col]):0.3f}, {label_col.upper()} Median: {np.median(qm9_dataset[label_col]):0.3f}")
+   print(
+     f"Below {pctile_threshold}%ile - {label_col.upper()} Mean: {np.mean(below_threshold):0.3f}, {label_col.upper()} Median: {np.median(below_threshold):0.3f}")
+   print(
+     f"Above {pctile_threshold}%ile - {label_col.upper()} Mean: {np.mean(above_threshold):0.3f}, {label_col.upper()} Median: {np.median(above_threshold):0.3f}")
+   result_dicts.append({
+     'Seed': -1,
+     'T': -1,
+     'Num Samples': len(qm9_dataset),
+     'Valid': 1.0,
+     'Unique': 1.0,
+     'Novel': 1.0,
+     f'{label_col.upper()} Mean': np.mean(qm9_dataset[label_col]),
+     f'{label_col.upper()} 25%ile': np.percentile(qm9_dataset[label_col], q=25),
+     f'{label_col.upper()} Median': np.median(qm9_dataset[label_col]),
+     f'{label_col.upper()} 75%ile': np.percentile(qm9_dataset[label_col], q=75),
+     f'Novel {label_col.upper()} Mean': np.mean(qm9_dataset[label_col]),
+     f'Novel {label_col.upper()} 25%ile': np.percentile(qm9_dataset[label_col], q=25),
+     f'Novel {label_col.upper()} Median': np.median(qm9_dataset[label_col]),
+     f'Novel {label_col.upper()} 75%ile': np.percentile(qm9_dataset[label_col], q=75),
+   } | {k.capitalize(): -1 for k, v in config.guidance.items()})
+
+   samples = []
+   for _ in tqdm(
+       range(config.sampling.num_sample_batches),
+       desc='Gen. batches', leave=False):
+     start = time.time()
+     sample = pretrained.sample()
+     # print(f"Batch took {time.time() - start:.2f} seconds.")
+     samples.extend(
+       pretrained.tokenizer.batch_decode(sample))
+   invalids = []
+   valids = []
+   mol_property = []
+   for t in samples:
+     t = t.replace('<bos>', '').replace('<eos>', '').replace('<pad>', '')
+     try:
+       mol = rdChem.MolFromSmiles(t)
+       if mol is None or len(t) == 0:
+         invalids.append(t)
+       else:
+         valids.append(t)
+         mol_property.append(mol_property_fn(mol))
+     except rdkit.Chem.rdchem.KekulizeException as e:
+       print(e)
+       invalids.append(t)
+   valid = len(valids)
+   valid_pct = len(valids) / len(samples)
+   unique = len(set(valids))
+   novel = len(set(valids) - set(qm9_dataset['canonical_smiles']))
+   try:
+     unique_pct = unique / valid
+     novel_pct = novel / valid
+   except ZeroDivisionError:
+     unique_pct, novel_pct = 0., 0.
+   mol_property_novel = [
+     mol_property_fn(rdChem.MolFromSmiles(s))
+     for s in set(valids) - set(qm9_dataset['canonical_smiles'])
+   ]
+   result_dicts.append({
+     'Seed': config.seed,
+     'T': config.sampling.steps,
+     'Num Samples': config.sampling.batch_size * config.sampling.num_sample_batches,
+     'Valid': valid_pct,
+     'Unique': unique_pct,
+     'Novel': novel_pct,
+     f'{label_col.upper()} Mean': np.mean(mol_property) if len(mol_property) > 0 else 0.,
+     f'{label_col.upper()} 25%ile': np.percentile(mol_property, q=25) if len(mol_property) > 0 else 0.,
+     f'{label_col.upper()} Median': np.median(mol_property) if len(mol_property) > 0 else 0.,
+     f'{label_col.upper()} 75%ile': np.percentile(mol_property, q=75) if len(mol_property) > 0 else 0.,
+     f'Novel {label_col.upper()} Mean': np.mean(mol_property_novel) if len(mol_property_novel) > 0 else 0.,
+     f'Novel {label_col.upper()} 25%ile': np.percentile(mol_property_novel, q=25) if len(mol_property_novel) > 0 else 0.,
+     f'Novel {label_col.upper()} Median': np.median(mol_property_novel) if len(mol_property_novel) > 0 else 0.,
+     f'Novel {label_col.upper()} 75%ile': np.percentile(mol_property_novel, q=75) if len(mol_property_novel) > 0 else 0.,
+   } | {k.capitalize(): v for k, v in config.guidance.items()})
+   print("Guidance:", ", ".join([f"{k.capitalize()} - {v}" for k, v in config.guidance.items()]))
+   print(f"\tValid: {valid:,d} / {len(samples):,d} ({100 * valid_pct:0.2f}%) ",
+         f"Unique (of valid): {unique:,d} / {valid:,d} ({100 * unique_pct:0.2f}%) ",
+         f"Novel (of valid): {novel:,d} / {valid:,d} ({100 * novel_pct:0.2f}%)\n",
+         f"\t{label_col.upper()} Mean: {np.mean(mol_property) if len(mol_property) else 0.:0.3f}, {label_col.upper()} Median: {np.median(mol_property) if len(mol_property) else 0.:0.3f}\n",
+         f"\tNovel {label_col.upper()} Mean: {np.mean(mol_property_novel) if len(mol_property_novel) else 0.:0.3f}, Novel {label_col.upper()} Median: {np.median(mol_property_novel) if len(mol_property_novel) else 0.:0.3f}"
+         )
+   print(f"Generated {len(samples)} molecules.")
+   with open(config.eval.generated_samples_path, 'w') as f:
+     json.dump(
+       {
+         'valid': valids,
+         'novel': list(set(valids) - set(qm9_dataset['canonical_smiles'])),
+         f"{label_col}_valid": mol_property,
+         f"{label_col}_novel": mol_property_novel,
+       },
+       f, indent=4)  # type: ignore
+   results_df = pd.DataFrame.from_records(result_dicts)
+   results_df.to_csv(config.eval.results_csv_path)
+
+
+ if __name__ == '__main__':
+   main()
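As a quick illustration of the property functions dispatched by `get_mol_property_fn`, using standard RDKit calls (the SMILES string here is just an example, not from the QM9 data):

# Hypothetical usage of the qed / ring_count property functions.
from rdkit import Chem
from rdkit.Chem import QED

mol = Chem.MolFromSmiles('CCO')    # ethanol, an example molecule
print(QED.qed(mol))                # drug-likeness score in (0, 1)
print(len(Chem.GetSymmSSSR(mol)))  # ring count: 0 for ethanol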
guidance_eval/ten_species_eval.py ADDED
@@ -0,0 +1,585 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import itertools
2
+ import json
3
+ import os
4
+ import typing
5
+
6
+ import datasets
7
+ import hydra
8
+ import lightning as L
9
+ import numpy as np
10
+ import omegaconf
11
+ import pandas as pd
12
+ import rdkit
13
+ import rich.syntax
14
+ import rich.tree
15
+ import scipy
16
+ import torch
17
+ import transformers
18
+ from sklearn.metrics import (
19
+ f1_score,
20
+ matthews_corrcoef,
21
+ precision_score,
22
+ recall_score,
23
+ roc_auc_score
24
+ )
25
+ from tqdm.auto import tqdm
26
+
27
+ import classifier
28
+ import custom_datasets
29
+ import dataloader
30
+ import diffusion
31
+
32
+ rdkit.rdBase.DisableLog('rdApp.error')
33
+
34
+ omegaconf.OmegaConf.register_new_resolver(
35
+ 'cwd', os.getcwd)
36
+ omegaconf.OmegaConf.register_new_resolver(
37
+ 'device_count', torch.cuda.device_count)
38
+ omegaconf.OmegaConf.register_new_resolver(
39
+ 'eval', eval)
40
+ omegaconf.OmegaConf.register_new_resolver(
41
+ 'div_up', lambda x, y: (x + y - 1) // y)
42
+ omegaconf.OmegaConf.register_new_resolver(
43
+ 'if_then_else',
44
+ lambda condition, x, y: x if condition else y
45
+ )
46
+
47
+
48
+ def _print_config(
49
+ config: omegaconf.DictConfig,
50
+ resolve: bool = True) -> None:
51
+ """Prints content of DictConfig using Rich library and its tree structure.
52
+
53
+ Args:
54
+ config (DictConfig): Configuration composed by Hydra.
55
+ resolve (bool): Whether to resolve reference fields of DictConfig.
56
+ """
57
+
58
+ style = 'dim'
59
+ tree = rich.tree.Tree('CONFIG', style=style,
60
+ guide_style=style)
61
+
62
+ fields = config.keys()
63
+ for field in fields:
64
+ branch = tree.add(field, style=style, guide_style=style)
65
+
66
+ config_section = config.get(field)
67
+ branch_content = str(config_section)
68
+ if isinstance(config_section, omegaconf.DictConfig):
69
+ branch_content = omegaconf.OmegaConf.to_yaml(
70
+ config_section, resolve=resolve)
71
+
72
+ branch.add(rich.syntax.Syntax(branch_content, 'yaml'))
73
+ rich.print(tree)
74
+
75
+
76
+ def generate_ordered_kmers(
77
+ kmer_length: int
78
+ ) -> typing.List[str]:
79
+ """
80
+ Function that generates all kmers of a given length and orders them by their index
81
+ defined by the kmer_to_index function.
82
+
83
+ Args:
84
+ kmer_length (int): The length of the kmers to generate
85
+
86
+ Returns:
87
+ List[str]: A list of all kmers of the given length ordered by their index
88
+ """
89
+ characters = ["A", "C", "G", "T"]
90
+
91
+ kmers = ["".join(kmer) for kmer in
92
+ itertools.product(characters,
93
+ repeat=kmer_length)]
94
+ ordered_kmers = sorted(kmers, key=kmer_to_index)
95
+
96
+ return ordered_kmers
97
+
98
+
99
+ def kmer_to_index(kmer: str) -> int:
100
+ """
101
+ Function that converts a given kmer to a unique value
102
+ system.
103
+
104
+ Args:
105
+ kmer (str): The given kmer
106
+
107
+ Returns:
108
+ int: The associated unique value
109
+
110
+ Example:
111
+ >>> kmer_to_index("AAC")
112
+ 1
113
+
114
+ """
115
+ mapping = {"A": 0, "C": 1, "G": 2, "T": 3}
116
+ index = 0
117
+ for char in kmer:
118
+ index = index * 4 + mapping[char]
119
+ return index
120
+
121
+
122
+ def compute_kmer_frequencies(
123
+ seqs: typing.List[str], kmer_length: int
124
+ ) -> typing.Tuple[typing.List[float], typing.List[str]]:
125
+ """
126
+ Computes the kmer frequencies in a list of sequences.
127
+ Each element of the output array is the frequency of a given kmer over the whole
128
+ set of sequences.
129
+
130
+ Args:
131
+ seqs (List[str]): List of nucleotide sequences
132
+ kmer_length (int): Length of the kmers
133
+
134
+ Returns:
135
+ List[float]: Kmer frequencies
136
+ List[str]: The kmers
137
+
138
+ Example:
139
+ >>> sequences = ["AGCT", "AAAA"]
140
+ >>> compute_kmer_frequencies(seqs, kmer_length=1)
141
+ ([0.625, 0.125, 0.125, 0.125], ['A', 'C', 'G', 'T'])
142
+ """
143
+
144
+ kmer_counts: typing.Dict[str, int] = {}
145
+ count_kmers_occurrences = 0
146
+ for seq in seqs:
147
+ for i in range(len(seq) - kmer_length + 1):
148
+ kmer = seq[i: i + kmer_length]
149
+ if kmer in kmer_counts:
150
+ kmer_counts[kmer] += 1
151
+ else:
152
+ kmer_counts[kmer] = 1
153
+ count_kmers_occurrences += 1
154
+
155
+ kmer_list = generate_ordered_kmers(kmer_length)
156
+ kmer_frequencies = []
157
+ for kmer in kmer_list:
158
+ try:
159
+ kmer_frequencies.append(
160
+ kmer_counts[kmer] / count_kmers_occurrences)
161
+ except KeyError:
162
+ kmer_frequencies.append(0)
163
+
164
+ return kmer_frequencies, kmer_list
165
+
166
+
167
+ def run_eval_pipeline(
168
+ seqs: typing.Dict[int, typing.List[str]],
169
+ num_samples_per_class: int,
170
+ train_weights_path: str,
171
+ val_weights_path: str,
172
+ eval_classifier_checkpoint_path: str,
173
+ kmer_freqs_path: str
174
+ ):
175
+ # Eval pipeline
176
+ L.seed_everything(42)
177
+
178
+ # Load classifier
179
+ with hydra.initialize(version_base=None,
180
+ config_path='../configs/'):
181
+ classifier_config = hydra.compose(
182
+ config_name='config',
183
+ overrides=[
184
+ 'hydra.output_subdir=null',
185
+ 'hydra.job.chdir=False',
186
+ 'hydra/job_logging=disabled',
187
+ 'hydra/hydra_logging=disabled',
188
+ '+is_eval_classifier=True',
189
+ 'mode=train_classifier',
190
+ 'loader.global_batch_size=32',
191
+ 'loader.eval_global_batch_size=64',
192
+ 'loader.batch_size=2',
193
+ 'loader.eval_batch_size=4',
194
+ 'data=ten_species',
195
+ 'classifier_model=hyenadna-classifier',
196
+ 'classifier_model.hyena_model_name_or_path=LongSafari/hyenadna-small-32k-seqlen-hf',
197
+ 'classifier_backbone=hyenadna',
198
+ 'classifier_model.n_layer=8',
199
+ 'model.length=32768',
200
+ 'diffusion=null',
201
+ 'T=null',
202
+ f"eval.checkpoint_path={eval_classifier_checkpoint_path}"
203
+ ]
204
+ )
205
+ classifier_config = omegaconf.OmegaConf.create(
206
+ classifier_config)
207
+ tokenizer = transformers.AutoTokenizer.from_pretrained(
208
+ classifier_config.data.tokenizer_name_or_path,
209
+ trust_remote_code=True)
210
+ pretrained_classifier = classifier.Classifier.load_from_checkpoint(
211
+ classifier_config.eval.checkpoint_path,
212
+ tokenizer=tokenizer,
213
+ config=classifier_config, logger=False)
214
+ pretrained_classifier.eval()
215
+
216
+ tokenizer = dataloader.get_tokenizer(classifier_config)
217
+ _, val_dl = dataloader.get_dataloaders(
218
+ classifier_config, tokenizer, skip_train=True,
219
+ valid_seed=classifier_config.seed)
220
+
221
+ dataset = datasets.load_dataset(
222
+ 'yairschiff/ten_species',
223
+ split='train',
224
+ # original dataset only has `train` split
225
+ chunk_length=classifier_config.model.length,
226
+ overlap=0,
227
+ trust_remote_code=True)
228
+ dataset = dataset.train_test_split(
229
+ test_size=0.05, seed=42)
230
+ train_dataset = dataset['train']
231
+ val_dataset = dataset['test']
232
+
233
+
234
+ print(f"Len of train set {len(train_dataset) * (2 ** 15):,d}")
235
+ print(f"Len of val set {len(val_dataset) * (2 ** 15):,d}")
236
+
237
+ int_to_species = ['Homo_sapiens', 'Mus_musculus',
238
+ 'Drosophila_melanogaster',
239
+ 'Danio_rerio',
240
+ 'Caenorhabditis_elegans',
241
+ 'Gallus_gallus', 'Gorilla_gorilla',
242
+ 'Felis_catus',
243
+ 'Salmo_trutta', 'Arabidopsis_thaliana']
244
+
245
+ if os.path.exists(train_weights_path):
246
+ train_weights = torch.load(train_weights_path)
247
+ else:
248
+ train_weights = {k: 0 for k in range(10)}
249
+ for i in tqdm(train_dataset, leave=False):
250
+ train_weights[i['species_label']] += 1
251
+ train_weights = {
252
+ k: v / np.sum(list(train_weights.values())) for k, v
253
+ in train_weights.items()}
254
+ torch.save(train_weights, train_weights_path)
255
+ print('Train weights:')
256
+ for k, v in train_weights.items():
257
+ print("\t", int_to_species[k], f"{100 * v:0.2f}")
258
+
259
+ if os.path.exists(val_weights_path):
260
+ val_weights = torch.load(val_weights_path)
261
+ else:
262
+ val_weights = {k: 0 for k in range(10)}
263
+ for i in tqdm(val_dataset, leave=False):
264
+ val_weights[i['species_label']] += 1
265
+ val_weights = {k: v / np.sum(list(val_weights.values()))
266
+ for k, v in val_weights.items()}
267
+ torch.save(val_weights, val_weights_path)
268
+ print('\nVal weights:')
269
+ for k, v in val_weights.items():
270
+ print("\t", int_to_species[k], f"{100 * v:0.2f}")
271
+
272
+
273
+ result_dict = {}
274
+ test_data = []
275
+
276
+ for k, v in seqs.items():
277
+ test_data.extend(
278
+ [
279
+ {
280
+ 'sequence': s.replace('[CLS]', '').replace(
281
+ '[BOS]', '').replace('[MASK]', '').replace(
282
+ '[SEP]', '').replace('[PAD]', '').replace(
283
+ '[UNK]', ''),
284
+ 'species_label': k
285
+ }
286
+ for s in v
287
+ ]
288
+ )
289
+ test_dataset = custom_datasets.ten_species_dataset.TenSpeciesDataset(
290
+ split='test',
291
+ tokenizer=tokenizer,
292
+ max_length=classifier_config.model.length,
293
+ rc_aug=False,
294
+ add_special_tokens=classifier_config.data.add_special_tokens,
295
+ dataset=test_data
296
+ )
297
+
298
+ ## CLASSIFIER ACCURACY
299
+ test_preds = [
300
+ pretrained_classifier.forward(
301
+ test_dataset[i]['input_ids'][None, ...].to(
302
+ 'cuda')).argmax(dim=-1).detach().item()
303
+ for i in
304
+ tqdm(range(len(test_dataset)), desc='Testing')
305
+ ]
306
+ test_preds = np.array(test_preds)
307
+
308
+ test_labels = []
309
+ for k, v in seqs.items():
310
+ test_labels.extend([int(k)] * len(v))
311
+ test_labels = np.array(test_labels)
312
+
313
+ overall_accuracy_score = (test_preds == test_labels).sum() / test_preds.size
314
+ overall_f1_score = f1_score(y_pred=test_preds,
315
+ y_true=test_labels,
316
+ average="macro",
317
+ labels=list(range(classifier_config.data.num_classes)))
318
+ overall_mcc_score = matthews_corrcoef(y_pred=test_preds, y_true=test_labels)
319
+
320
+ print(f"Overall Acc: {overall_accuracy_score:0.2f}")
321
+ print(f"Overall F1: {overall_f1_score:0.2f}")
322
+ print(f"Overall MCC: {overall_mcc_score:0.2f}")
323
+ result_dict['F1'] = overall_f1_score
324
+
325
+ f1_scores = f1_score(
326
+ y_pred=test_preds,
327
+ y_true=test_labels,
328
+ average=None,
329
+ labels=list(range(classifier_config.data.num_classes)))
330
+ precision_scores = precision_score(
331
+ y_pred=test_preds,
332
+ y_true=test_labels,
333
+ average=None,
334
+ labels=list(range(classifier_config.data.num_classes)))
335
+ recall_scores = recall_score(
336
+ y_pred=test_preds,
337
+ y_true=test_labels,
338
+ average=None,
339
+ labels=list(range(classifier_config.data.num_classes)))
340
+
341
+ species_list = ['Homo_sapiens', 'Mus_musculus',
342
+ 'Drosophila_melanogaster',
343
+ 'Danio_rerio',
344
+ 'Caenorhabditis_elegans',
345
+ 'Gallus_gallus', 'Gorilla_gorilla',
346
+ 'Felis_catus',
347
+ 'Salmo_trutta',
348
+ 'Arabidopsis_thaliana']
349
+ for s in range(classifier_config.data.num_classes):
350
+ print(f"Class {s} - {species_list[s]}:")
351
+ print(f" F1: {f1_scores[s]:0.3f}")
352
+ print(f" Precision: {precision_scores[s]:0.3f}")
353
+ print(f" Recall: {recall_scores[s]:0.3f}")
354
+
355
+ ## KMER SPECTRUM
356
+ kmer_lengths = [3, 6]
357
+ kmer_results = {k: [] for k in kmer_lengths}
358
+ if os.path.exists(kmer_freqs_path):
359
+ kmer_freqs = torch.load(kmer_freqs_path)
360
+ else:
361
+ kmer_freqs = {s: {
362
+ kmer_length: {'frequencies': None,
363
+ 'kmers': None} for kmer_length in
364
+ kmer_lengths} for s in range(10)}
365
+ for s in range(10):
366
+ filter_ds = val_dataset.filter(
367
+ lambda x: x['species_label'] == s,
368
+ num_proc=len(os.sched_getaffinity(0)))
369
+ print(f"Computing kmer frequencies for species class {s}")
370
+ for kmer_length in kmer_lengths:
371
+ kmer_frequencies_gt, kmer_list = compute_kmer_frequencies(
372
+ seqs=filter_ds['sequence'],
373
+ kmer_length=kmer_length
374
+ )
375
+ kmer_freqs[s][kmer_length]['frequencies'] = kmer_frequencies_gt
376
+ kmer_freqs[s][kmer_length]['kmers'] = kmer_list
377
+ torch.save(kmer_freqs, kmer_freqs_path)
378
+ for s in range(10):
379
+ print(f"Species class {s}")
380
+ mean_js_divergence = 0
381
+ for kmer_length in kmer_lengths:
382
+ kmer_frequencies_gt = kmer_freqs[s][kmer_length]['frequencies']
383
+ kmer_frequencies_generated, kmer_list = compute_kmer_frequencies(
384
+ seqs=[i['sequence'] for i in test_data if
385
+ i['species_label'] == s],
386
+ kmer_length=kmer_length
387
+ )
388
+
389
+ js_divergence = np.sum(
390
+ scipy.spatial.distance.jensenshannon(
391
+ kmer_frequencies_gt,
392
+ kmer_frequencies_generated)
393
+ )
394
+ kmer_results[kmer_length].append(js_divergence)
395
+ mean_js_divergence += js_divergence
396
+ print(
397
+ f"\tJS divergence with k={kmer_length} : {js_divergence}")
398
+ print(
399
+ f"\tMean JS divergence : {mean_js_divergence / len(kmer_lengths):0.2f}")
400
+
401
+ for k, v in kmer_results.items():
402
+ weighted_kmer_js = (np.array(v) * np.array(
403
+ list(val_weights.values()))).sum()
404
+ print(
405
+ f"Weighted mean JS divergence across classes with k={k}: {weighted_kmer_js:0.2f}")
406
+ result_dict[f"{k}mer JS"] = weighted_kmer_js
407
+
408
+ ## DISCRIMINATOR AUROC
409
+ # Hyperparams
410
+ d_model = 128
411
+ n_layer = 2
412
+
413
+ batch_size = 8
414
+ lr = 1e-4
415
+ epochs = 5
416
+
+   disc_data = [
+     {'sequence': i['sequence'], 'species_label': 0}
+     for i in test_data]
+   for s in range(10):
+     filter_val_ds = val_dataset.filter(
+       lambda x: x['species_label'] == s,
+       num_proc=len(os.sched_getaffinity(0)))
+     indices = np.random.permutation(
+       np.arange(len(filter_val_ds)))[:num_samples_per_class]
+     disc_data.extend(
+       [{'sequence': i['sequence'], 'species_label': 1}
+        for i in filter_val_ds.select(indices)]
+     )
+   print(f"Size of discriminator dataset: {len(disc_data)}")
+   disc_dataset_hf = datasets.Dataset.from_list(
+     disc_data)
+   disc_dataset_hf = disc_dataset_hf.train_test_split(
+     test_size=0.1, seed=42)
+
+   disc_dataset_train = custom_datasets.ten_species_dataset.TenSpeciesDataset(
+     split='train',
+     tokenizer=tokenizer,
+     max_length=classifier_config.model.length,
+     rc_aug=False,
+     add_special_tokens=classifier_config.data.add_special_tokens,
+     dataset=disc_dataset_hf['train']
+   )
+
+   disc_dataset_val = custom_datasets.ten_species_dataset.TenSpeciesDataset(
+     split='test',
+     tokenizer=tokenizer,
+     max_length=classifier_config.model.length,
+     rc_aug=False,
+     add_special_tokens=classifier_config.data.add_special_tokens,
+     dataset=disc_dataset_hf['test']
+   )
+
+   disc_train_dl = torch.utils.data.DataLoader(
+     disc_dataset_train,
+     batch_size=batch_size,
+     num_workers=0,
+     pin_memory=True,
+     shuffle=True)
+
+   disc_val_dl = torch.utils.data.DataLoader(
+     disc_dataset_val,
+     batch_size=batch_size,
+     num_workers=0,
+     pin_memory=True,
+     shuffle=False)
+
+   hyena_config = transformers.AutoConfig.from_pretrained(
+     'LongSafari/hyenadna-small-32k-seqlen-hf',
+     d_model=d_model,
+     n_layer=n_layer,
+     trust_remote_code=True)
+   # Only the config is reused; the discriminator itself is trained
+   # from random initialization (from_config loads no weights).
+   disc_model = transformers.AutoModelForSequenceClassification.from_config(
+     hyena_config,
+     pretrained=False,
+     num_labels=2,
+     problem_type='single_label_classification',
+     trust_remote_code=True)
+
+   optimizer = torch.optim.AdamW(
+     disc_model.parameters(), lr=lr, weight_decay=0,
+     betas=(0.9, 0.999), eps=1e-8)
+
+   disc_model.to('cuda')
+   losses = []
+   auroc_list = []
+   for ep in tqdm(range(epochs), desc='Epochs'):
+     # Train loop:
+     disc_model.train()
+     train_pbar = tqdm(disc_train_dl, desc='Train',
+                       leave=False)
+     for batch in train_pbar:
+       labels = batch['species_label'].to('cuda')
+       logits = disc_model(
+         batch['input_ids'].to('cuda')).logits
+       loss = torch.nn.functional.cross_entropy(
+         logits.view(-1, logits.size(-1)),
+         labels,
+         ignore_index=-100,
+         reduction='mean')
+       optimizer.zero_grad()
+       loss.backward()
+       optimizer.step()
+       train_pbar.set_postfix({'loss': loss.item()})
+       losses.append(loss.item())
+     # Val loop:
+     disc_model.eval()
+     disc_labels = []
+     disc_preds = []
+     with torch.no_grad():  # no gradients needed during validation
+       for batch in disc_val_dl:
+         disc_labels.append(
+           batch['species_label'].numpy())
+         disc_preds.append(
+           disc_model(
+             batch['input_ids'].to('cuda')
+           ).logits[..., 1].to('cpu').numpy()
+         )
+     disc_labels = np.concatenate(disc_labels)
+     disc_preds = np.concatenate(disc_preds)
+     auroc = roc_auc_score(y_true=disc_labels, y_score=disc_preds)
+     auroc_list.append(auroc)
+     print(f"Ep {ep} - AUROC score {auroc}")
+   # AUROC near 0.5 means generated and real sequences are
+   # indistinguishable to the discriminator (lower is better here).
+   result_dict["Disc AUROC"] = auroc_list[-1]
+   del disc_model
+   print('*****************************')
+   return result_dict
+
+
+ @hydra.main(version_base=None, config_path='../configs',
+             config_name='config')
+ def main(config: omegaconf.DictConfig) -> None:
+   # Reproducibility
+   L.seed_everything(config.seed)
+   os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
+   torch.use_deterministic_algorithms(True)
+   torch.backends.cudnn.benchmark = False
+
+   _print_config(config, resolve=True)
+   print(f"Checkpoint: {config.eval.checkpoint_path}")
+
+   tokenizer = dataloader.get_tokenizer(config)
+   pretrained = diffusion.Diffusion.load_from_checkpoint(
+     config.eval.checkpoint_path,
+     tokenizer=tokenizer,
+     config=config, logger=False)
+   pretrained.eval()
+
+   # Generate samples
+   if not os.path.exists(config.eval.generated_samples_path):
+     samples_per_class = {}
+     classes = range(config.data.num_classes)
+     for species in classes:
+       config.guidance.condition = species
+       print("Guidance:", ", ".join(
+         [f"{k.capitalize()} - {v}" for k, v in config.guidance.items()]))
+       samples = []
+       for _ in tqdm(
+           range(config.sampling.num_sample_batches),
+           desc='Gen. batches', leave=False):
+         sample = pretrained.sample()
+         samples.extend(pretrained.tokenizer.batch_decode(sample))
+       samples_per_class[species] = samples
+     with open(config.eval.generated_samples_path, 'w') as f:
+       json.dump(samples_per_class, f, indent=4)  # type: ignore
+   else:
+     with open(config.eval.generated_samples_path, 'r') as f:
+       samples_per_class = json.load(f)
+     # JSON keys are strings; restore the integer class labels.
+     samples_per_class = {int(k): v for k, v in samples_per_class.items()}
+
+   # Run eval pipeline
+   hydra.core.global_hydra.GlobalHydra.instance().clear()
+   result_dict = run_eval_pipeline(
+     samples_per_class,
+     num_samples_per_class=(config.sampling.num_sample_batches
+                            * config.sampling.batch_size),
+     train_weights_path=config.eval.train_weights_path,
+     val_weights_path=config.eval.val_weights_path,
+     eval_classifier_checkpoint_path=config.eval.eval_classifier_checkpoint_path,
+     kmer_freqs_path=config.eval.kmer_freqs_path)
+   result_dict['Seed'] = config.seed
+   result_dict['T'] = config.sampling.steps
+   result_dict = result_dict | {
+     k.capitalize(): v for k, v in config.guidance.items()}
+   result_dict['Num Samples'] = sum(
+     len(v) for v in samples_per_class.values())
+   results_df = pd.DataFrame.from_records([result_dict])
+   results_df.to_csv(config.eval.results_csv_path)
+
+
+ if __name__ == '__main__':
+   main()
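+ # Example invocation (hypothetical paths; overrides use Hydra's
+ # key=value syntax against the config keys referenced above):
+ #   python guidance_eval/ten_species_eval.py \
+ #     eval.checkpoint_path=/path/to/last.ckpt \
+ #     sampling.steps=128 seed=1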
main.py ADDED
@@ -0,0 +1,262 @@
+ import json
+ import os
+
+ import fsspec
+ import hydra
+ import lightning as L
+ import omegaconf
+ import rich.syntax
+ import rich.tree
+ import torch
+ from tqdm import tqdm
+ from datasets import load_from_disk
+ import pdb
+
+ import classifier
+ import dataloader
+ import diffusion
+ import eval_utils
+ import utils
+
+ omegaconf.OmegaConf.register_new_resolver(
+   'cwd', os.getcwd)
+ omegaconf.OmegaConf.register_new_resolver(
+   'device_count', torch.cuda.device_count)
+ omegaconf.OmegaConf.register_new_resolver(
+   'eval', eval)
+ omegaconf.OmegaConf.register_new_resolver(
+   'div_up', lambda x, y: (x + y - 1) // y)
+ omegaconf.OmegaConf.register_new_resolver(
+   'if_then_else',
+   lambda condition, x, y: x if condition else y
+ )
+
+
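+ # The resolvers registered above can be referenced from the YAML
+ # configs; illustrative usage (keys below are placeholders, not taken
+ # from the shipped configs):
+ #   devices: ${device_count:}
+ #   grad_accum: ${div_up:${global_batch_size},${loader.batch_size}}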
+ def _load_from_checkpoint(config, tokenizer):
+   if 'hf' in config.backbone:
+     return diffusion.Diffusion(
+       config, tokenizer=tokenizer).to('cuda')
+
+   return diffusion.Diffusion.load_from_checkpoint(
+     config.eval.checkpoint_path,
+     tokenizer=tokenizer,
+     config=config, logger=False).to('cuda')
+
+
+ @L.pytorch.utilities.rank_zero_only
+ def _print_config(
+     config: omegaconf.DictConfig,
+     resolve: bool = True,
+     save_cfg: bool = True) -> None:
+   """Prints the content of a DictConfig as a tree using the Rich library.
+
+   Args:
+     config (DictConfig): Configuration composed by Hydra.
+     resolve (bool): Whether to resolve reference fields of DictConfig.
+     save_cfg (bool): Whether to save the configuration tree to a file.
+   """
+
+   style = 'dim'
+   tree = rich.tree.Tree('CONFIG', style=style, guide_style=style)
+
+   fields = config.keys()
+   for field in fields:
+     branch = tree.add(field, style=style, guide_style=style)
+
+     config_section = config.get(field)
+     branch_content = str(config_section)
+     if isinstance(config_section, omegaconf.DictConfig):
+       branch_content = omegaconf.OmegaConf.to_yaml(
+         config_section, resolve=resolve)
+
+     branch.add(rich.syntax.Syntax(branch_content, 'yaml'))
+   rich.print(tree)
+   if save_cfg:
+     with fsspec.open(
+         '{}/config_tree.txt'.format(
+           config.checkpointing.save_dir), 'w') as fp:
+       rich.print(tree, file=fp)
+
+
+ @L.pytorch.utilities.rank_zero_only
+ def _print_batch(train_ds, valid_ds, tokenizer, k=64):
+   for dl_type, dl in [
+       ('train', train_ds), ('valid', valid_ds)]:
+     print(f'Printing {dl_type} dataloader batch.')
+     batch = next(iter(dl))
+     print('Batch input_ids.shape', batch['input_ids'].shape)
+     first = batch['input_ids'][0, :k]
+     last = batch['input_ids'][0, -k:]
+     print(f'First {k} tokens:', tokenizer.decode(first))
+     print('ids:', first)
+     print(f'Last {k} tokens:', tokenizer.decode(last))
+     print('ids:', last)
+
+
+ def _train(config, logger, tokenizer,
+            train_classifier=False):
+   logger.info('Starting Training.')
+   wandb_logger = None
+   if config.get('wandb', None) is not None:
+     wandb_logger = L.pytorch.loggers.WandbLogger(
+       config=omegaconf.OmegaConf.to_object(config),
+       **config.wandb)
+
+   if (config.checkpointing.resume_from_ckpt
+       and config.checkpointing.resume_ckpt_path is not None
+       and utils.fsspec_exists(
+         config.checkpointing.resume_ckpt_path)):
+     ckpt_path = config.checkpointing.resume_ckpt_path
+   else:
+     ckpt_path = None
+
+   # Lightning callbacks
+   callbacks = []
+   if 'callbacks' in config:
+     for _, callback in config.callbacks.items():
+       callbacks.append(hydra.utils.instantiate(callback))
+
+   # train_ds, valid_ds = dataloader.get_dataloaders(
+   #   config, tokenizer)
+   train_dataset = load_from_disk(
+     '/home/tc415/discrete-diffusion-guidance/dataset/3000_400k/train')
+   val_dataset = load_from_disk(
+     '/home/tc415/discrete-diffusion-guidance/dataset/3000_400k/val')
+   test_dataset = load_from_disk(
+     '/home/tc415/discrete-diffusion-guidance/dataset/3000_400k/test')
+
+   data_module = dataloader.CustomDataModule(
+     train_dataset, val_dataset, test_dataset,
+     tokenizer, config, batch_size=config.loader.batch_size)
+   train_ds = data_module.train_dataloader()
+   valid_ds = data_module.val_dataloader()
+
+   if not config.is_vision:
+     _print_batch(train_ds, valid_ds, tokenizer)
+
+   if train_classifier:
+     # This param indicates the classifier will be used for
+     # PPLM / NOS-style guidance
+     # (see: https://arxiv.org/abs/2305.20009).
+     if getattr(config, 'is_pplm_classifier', False):
+       pretrained_model = _load_from_checkpoint(
+         config, tokenizer)
+       if (getattr(config.classifier_model, 'use_encoder_ema', True)
+           and pretrained_model.ema):
+         pretrained_model.load_ema_params()
+       pretrained_backbone = pretrained_model.backbone
+       # Remove the last layer for the classifier
+       if hasattr(pretrained_backbone, 'output_layer'):  # DiT
+         delattr(pretrained_backbone, 'output_layer')
+       # hasattr/delattr do not accept dotted attribute paths, so reach
+       # into the submodule to drop DiMamba's lm_head.
+       if (hasattr(pretrained_backbone, 'model')
+           and hasattr(pretrained_backbone.model, 'lm_head')):  # DiMamba
+         delattr(pretrained_backbone.model, 'lm_head')
+       if getattr(config.classifier_model, 'freeze_encoder', True):
+         for param in pretrained_backbone.parameters():
+           param.requires_grad = False
+     else:
+       pretrained_backbone = None
+
+     # Pass the tokenizer directly, matching the diffusion branch below.
+     model = classifier.Classifier(
+       config,
+       tokenizer=tokenizer,
+       pretrained_backbone=pretrained_backbone)
+   else:
+     model = diffusion.Diffusion(
+       config, tokenizer=tokenizer)
+     # model = diffusion.Diffusion(
+     #   config, tokenizer=valid_ds.tokenizer)
+
+   trainer = hydra.utils.instantiate(
+     config.trainer,
+     default_root_dir=os.getcwd(),
+     callbacks=callbacks,
+     strategy=hydra.utils.instantiate(config.strategy),
+     logger=wandb_logger)
+   trainer.fit(model, train_ds, valid_ds, ckpt_path=ckpt_path)
+
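+ # Minimal sketch of the frozen-encoder classifier pattern used in
+ # _train above (illustrative only; `encoder` stands in for the
+ # diffusion backbone with its output layer removed):
+ #
+ #   class GuidanceClassifier(torch.nn.Module):
+ #     def __init__(self, encoder, d_model, num_classes):
+ #       super().__init__()
+ #       self.encoder = encoder  # frozen diffusion backbone
+ #       self.head = torch.nn.Linear(d_model, num_classes)
+ #
+ #     def forward(self, x, sigma):
+ #       h = self.encoder(x, sigma)       # (batch, length, d_model)
+ #       return self.head(h.mean(dim=1))  # pool over sequence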
+ def _gen_ppl_eval(config, tokenizer):
+   pretrained = _load_from_checkpoint(
+     config=config, tokenizer=tokenizer)
+   pretrained.eval()
+   samples = []
+   for _ in tqdm(range(config.sampling.num_sample_batches),
+                 desc='Gen. batches', leave=False):
+     sample = pretrained.sample()
+     samples.extend(
+       pretrained.tokenizer.batch_decode(sample))
+
+   # Remove padding and mask tokens, then ensure each sample starts
+   # with a BOS token (falling back to CLS when no BOS is defined).
+   tok_bos_token = (tokenizer.bos_token
+                    if tokenizer.bos_token is not None
+                    else tokenizer.cls_token)
+   samples = [
+     s.replace('[PAD]', '').replace('[MASK]', '').strip()
+     for s in samples
+   ]
+   samples = [
+     s if s.startswith(tok_bos_token) else f"{tok_bos_token} {s}"
+     for s in samples
+   ]
+   del pretrained  # free up space for eval
+   print(f"Generated {len(samples)} samples.")
+
+   generative_ppl = eval_utils.compute_generative_ppl(
+     samples,
+     eval_model_name_or_path=config.eval.generative_ppl_model_name_or_path,
+     gen_ppl_eval_batch_size=8,
+     max_length=config.model.length)
+   tokens = tokenizer.batch_encode_plus(
+     samples,
+     return_tensors='pt',
+     add_special_tokens=False,
+     max_length=config.model.length,
+     padding='max_length',
+     truncation=True)['input_ids']
+   # `tokens` is already a tensor (return_tensors='pt'), so it can be
+   # passed to torch.unique directly.
+   _, counts = torch.unique(
+     tokens, return_counts=True, sorted=False)
+   entropy = torch.special.entr(
+     counts.float() / counts.sum()).sum().item()
+   with open(config.eval.generated_samples_path, 'w') as f:
+     json.dump({
+       'generative_ppl': generative_ppl,
+       'entropy': entropy,
+       'generated_seqs': samples,
+     }, f, indent=4)  # type: ignore
+   print(f"Entropy: {entropy:0.3f}")
+   print(f"Gen. PPL: {generative_ppl:0.3f}")
+
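+ # torch.special.entr(p) computes -p * ln(p), so `entropy` above is the
+ # Shannon entropy (in nats) of the empirical token distribution.
+ # Tiny worked example (assumed inputs, not from this repo):
+ #   >>> ids = torch.tensor([0, 0, 1, 2])
+ #   >>> _, counts = torch.unique(ids, return_counts=True)
+ #   >>> torch.special.entr(counts.float() / counts.sum()).sum()
+ #   tensor(1.0397)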
+
+ def _ppl_eval(config, tokenizer):
+   print(f"Evaluating perplexity on {config.data.valid}.")
+   pretrained = _load_from_checkpoint(
+     config=config, tokenizer=tokenizer)
+   pretrained.eval()
+   if not config.eval.disable_ema:
+     pretrained.load_ema_params()
+
+   _, valid_ds = dataloader.get_dataloaders(
+     config, tokenizer, skip_train=True, valid_seed=config.seed)
+   ppl = eval_utils.compute_ppl(pretrained, valid_ds)
+   print(f"PPL: {ppl:0.3f}")
+
+
+ @hydra.main(version_base=None, config_path='configs',
+             config_name='config')
+ def main(config):
+   """Main entry point for training and evaluation."""
+   L.seed_everything(config.seed)
+   _print_config(config, resolve=True, save_cfg=True)
+
+   logger = utils.get_logger(__name__)
+   tokenizer = dataloader.get_tokenizer(config)
+
+   if config.mode == 'gen_ppl_eval':
+     _gen_ppl_eval(config, tokenizer)
+   elif config.mode == 'ppl_eval':
+     _ppl_eval(config, tokenizer)
+   elif 'train' in config.mode:
+     _train(config, logger, tokenizer,
+            train_classifier='classifier' in config.mode)
+   else:
+     raise NotImplementedError(f"Mode {config.mode} not implemented.")
+
+
+ if __name__ == '__main__':
+   main()
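+ # Example invocations (hypothetical overrides; `mode` selects the
+ # branch in main above, other names refer to config groups in configs/):
+ #   python main.py mode=train model=small data=ten_species
+ #   python main.py mode=ppl_eval eval.checkpoint_path=/path/to/best.ckpt
+ #   python main.py mode=gen_ppl_eval eval.checkpoint_path=/path/to/best.ckpt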
models/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from . import dit
+ from . import dimamba
+ from . import ema
+ from . import unet
models/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (262 Bytes). View file
 
models/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (260 Bytes). View file
 
models/__pycache__/bindevaluator.cpython-310.pyc ADDED
Binary file (2.63 kB). View file
 
models/__pycache__/dimamba.cpython-310.pyc ADDED
Binary file (27.9 kB). View file
 
models/__pycache__/dimamba.cpython-39.pyc ADDED
Binary file (27.6 kB). View file
 
models/__pycache__/dit.cpython-310.pyc ADDED
Binary file (14.9 kB). View file