nvan15 committed on
Commit b816a2c · verified · 1 parent: 16e46c5

Batch upload part 19

Files changed (50)
  1. nl_tasks/expsBOFT/seed42/ft/special_tokens_map.json +24 -0
  2. nl_tasks/expsBOFT/seed42/ft/tokenizer.json +0 -0
  3. nl_tasks/expsBOFT/seed42/ft/tokenizer.model +3 -0
  4. nl_tasks/expsBOFT/seed42/ft/tokenizer_config.json +43 -0
  5. nl_tasks/expsBOFT/seed42/ft2/README.md +205 -0
  6. nl_tasks/expsBOFT/seed42/ft2/adapter_config.json +27 -0
  7. nl_tasks/expsBOFT/seed42/ft2/adapter_model.safetensors +3 -0
  8. nl_tasks/expsBOFT/seed42/trainer_state.json +218 -0
  9. nl_tasks/expsBOFT/seed43/ft/special_tokens_map.json +24 -0
  10. nl_tasks/expsBOFT/seed43/ft/tokenizer.json +0 -0
  11. nl_tasks/expsBOFT/seed43/ft/tokenizer.model +3 -0
  12. nl_tasks/expsBOFT/seed43/ft/tokenizer_config.json +43 -0
  13. nl_tasks/expsBOFT/seed43/ft2/README.md +205 -0
  14. nl_tasks/expsBOFT/seed43/ft2/adapter_config.json +27 -0
  15. nl_tasks/expsBOFT/seed43/ft2/adapter_model.safetensors +3 -0
  16. nl_tasks/expsOFT/seed42/ft/special_tokens_map.json +24 -0
  17. nl_tasks/expsOFT/seed42/ft/tokenizer.json +0 -0
  18. nl_tasks/expsOFT/seed42/ft/tokenizer.model +3 -0
  19. nl_tasks/expsOFT/seed42/ft/tokenizer_config.json +43 -0
  20. nl_tasks/expsOFT/seed42/ft2/README.md +205 -0
  21. nl_tasks/expsOFT/seed42/ft2/adapter_config.json +31 -0
  22. nl_tasks/expsOFT/seed42/ft2/adapter_model.safetensors +3 -0
  23. nl_tasks/expsOFT/seed42/trainer_state.json +218 -0
  24. nl_tasks/expsOFT/seed43/ft/special_tokens_map.json +24 -0
  25. nl_tasks/expsOFT/seed43/ft/tokenizer.json +0 -0
  26. nl_tasks/expsOFT/seed43/ft/tokenizer.model +3 -0
  27. nl_tasks/expsOFT/seed43/ft/tokenizer_config.json +43 -0
  28. nl_tasks/expsOFT/seed43/ft2/README.md +205 -0
  29. nl_tasks/expsOFT/seed43/ft2/adapter_config.json +31 -0
  30. nl_tasks/expsOFT/seed43/ft2/adapter_model.safetensors +3 -0
  31. nl_tasks/expsOFT/seed43/trainer_state.json +218 -0
  32. nl_tasks/expsOFT/seed44/ft/special_tokens_map.json +24 -0
  33. nl_tasks/expsOFT/seed44/ft/tokenizer.json +0 -0
  34. nl_tasks/expsOFT/seed44/ft/tokenizer.model +3 -0
  35. nl_tasks/expsOFT/seed44/ft/tokenizer_config.json +43 -0
  36. nl_tasks/expsOFT/seed44/ft2/README.md +205 -0
  37. nl_tasks/expsOFT/seed44/ft2/adapter_config.json +31 -0
  38. nl_tasks/expsOFT/seed44/ft2/adapter_model.safetensors +3 -0
  39. nl_tasks/expsOFT/seed44/trainer_state.json +218 -0
  40. omini/__init__.py +0 -0
  41. omini/pipeline/flux_omini.py +734 -0
  42. omini/pipeline/flux_omini_ablate_qkv.py +772 -0
  43. omini/pipeline/flux_omini_ablate_scale.py +748 -0
  44. omini/rotation/__init__.py +3 -0
  45. omini/rotation/layer.py +313 -0
  46. omini/rotation/layer_test.py +296 -0
  47. omini/rotation/model.py +390 -0
  48. omini/rotation/rotation_config.py +81 -0
  49. omini/train_flux/train_custom.py +50 -0
  50. omini/train_flux/train_multi_condition.py +160 -0
nl_tasks/expsBOFT/seed42/ft/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "<unk>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
nl_tasks/expsBOFT/seed42/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/expsBOFT/seed42/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
nl_tasks/expsBOFT/seed42/ft/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": null,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "legacy": false,
+ "model_max_length": 512,
+ "pad_token": "<unk>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false
+ }
nl_tasks/expsBOFT/seed42/ft2/README.md ADDED
@@ -0,0 +1,205 @@
+ ---
+ base_model: meta-llama/Llama-2-7b-hf
+ library_name: peft
+ tags:
+ - base_model:adapter:meta-llama/Llama-2-7b-hf
+ - transformers
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+ ### Framework versions
+
+ - PEFT 0.18.0
nl_tasks/expsBOFT/seed42/ft2/adapter_config.json ADDED
@@ -0,0 +1,27 @@
+ {
+ "auto_mapping": {
+ "base_model_class": "LlamaForCausalLM",
+ "parent_library": "transformers.models.llama.modeling_llama"
+ },
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+ "bias": "none",
+ "boft_block_num": 0,
+ "boft_block_size": 16,
+ "boft_dropout": 0.05,
+ "boft_n_butterfly_factor": 2,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "modules_to_save": null,
+ "peft_type": "BOFT",
+ "peft_version": "0.18.0",
+ "revision": null,
+ "target_modules": [
+ "v_proj",
+ "q_proj"
+ ],
+ "task_type": null
+ }
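Note: this adapter_config.json is a standard PEFT BOFT adapter configuration targeting the q_proj/v_proj modules of Llama-2-7B. A minimal loading sketch is shown below; the local paths, dtype, and device placement are assumptions for illustration and are not part of the upload itself.

```python
# Minimal sketch (assumed paths/settings): load the uploaded BOFT adapter on top of its base model.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",   # base_model_name_or_path from adapter_config.json (gated model)
    torch_dtype=torch.bfloat16,   # assumption; choose what your hardware supports
    device_map="auto",
)
# Tokenizer files in this upload sit under .../ft (assumed relative path).
tokenizer = AutoTokenizer.from_pretrained("nl_tasks/expsBOFT/seed42/ft")
# adapter_config.json and adapter_model.safetensors sit under .../ft2.
model = PeftModel.from_pretrained(base, "nl_tasks/expsBOFT/seed42/ft2")
model.eval()
```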
nl_tasks/expsBOFT/seed42/ft2/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:584526a06a1f45f2f77e6a89a7201b05aa25a3d6be60f231b255a32c48c4b261
+ size 34619504
nl_tasks/expsBOFT/seed42/trainer_state.json ADDED
@@ -0,0 +1,218 @@
+ {
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.0,
+ "eval_steps": 500,
+ "global_step": 1250,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.08,
+ "grad_norm": 0.08375173062086105,
+ "learning_rate": 0.000392,
+ "loss": 0.5193,
+ "step": 50
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.09268203377723694,
+ "learning_rate": 0.0007920000000000001,
+ "loss": 0.3316,
+ "step": 100
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 0.08198747783899307,
+ "learning_rate": 0.0007964216926581925,
+ "loss": 0.304,
+ "step": 150
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 0.0816216915845871,
+ "learning_rate": 0.0007854602918076551,
+ "loss": 0.2918,
+ "step": 200
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 0.07457849383354187,
+ "learning_rate": 0.0007673184950396212,
+ "loss": 0.274,
+ "step": 250
+ },
+ {
+ "epoch": 0.48,
+ "grad_norm": 0.07685171067714691,
+ "learning_rate": 0.0007423342497022817,
+ "loss": 0.2687,
+ "step": 300
+ },
+ {
+ "epoch": 0.56,
+ "grad_norm": 0.07849128544330597,
+ "learning_rate": 0.0007109729650142636,
+ "loss": 0.2651,
+ "step": 350
+ },
+ {
+ "epoch": 0.64,
+ "grad_norm": 0.07266736030578613,
+ "learning_rate": 0.0006738188423714755,
+ "loss": 0.2575,
+ "step": 400
+ },
+ {
+ "epoch": 0.72,
+ "grad_norm": 0.06927025318145752,
+ "learning_rate": 0.0006315639927804526,
+ "loss": 0.2525,
+ "step": 450
+ },
+ {
+ "epoch": 0.8,
+ "grad_norm": 0.08536054193973541,
+ "learning_rate": 0.00058499554413983,
+ "loss": 0.2494,
+ "step": 500
+ },
+ {
+ "epoch": 0.88,
+ "grad_norm": 0.07602768391370773,
+ "learning_rate": 0.000534980978536894,
+ "loss": 0.2429,
+ "step": 550
+ },
+ {
+ "epoch": 0.96,
+ "grad_norm": 0.07055249065160751,
+ "learning_rate": 0.00048245197269763485,
+ "loss": 0.2457,
+ "step": 600
+ },
+ {
+ "epoch": 1.04,
+ "grad_norm": 0.07144515216350555,
+ "learning_rate": 0.00042838704261214224,
+ "loss": 0.2292,
+ "step": 650
+ },
+ {
+ "epoch": 1.12,
+ "grad_norm": 0.07937044650316238,
+ "learning_rate": 0.00037379331563313267,
+ "loss": 0.2169,
+ "step": 700
+ },
+ {
+ "epoch": 1.2,
+ "grad_norm": 0.07409252226352692,
+ "learning_rate": 0.00031968776959892677,
+ "loss": 0.2098,
+ "step": 750
+ },
+ {
+ "epoch": 1.28,
+ "grad_norm": 0.07844420522451401,
+ "learning_rate": 0.00026707828846051743,
+ "loss": 0.2145,
+ "step": 800
+ },
+ {
+ "epoch": 1.3599999999999999,
+ "grad_norm": 0.07791652530431747,
+ "learning_rate": 0.00021694488731055218,
+ "loss": 0.2082,
+ "step": 850
+ },
+ {
+ "epoch": 1.44,
+ "grad_norm": 0.0782908946275711,
+ "learning_rate": 0.00017022145655641685,
+ "loss": 0.2077,
+ "step": 900
+ },
+ {
+ "epoch": 1.52,
+ "grad_norm": 0.0826650932431221,
+ "learning_rate": 0.00012777836530893536,
+ "loss": 0.2137,
+ "step": 950
+ },
+ {
+ "epoch": 1.6,
+ "grad_norm": 0.0696156919002533,
+ "learning_rate": 9.040624805263558e-05,
+ "loss": 0.2076,
+ "step": 1000
+ },
+ {
+ "epoch": 1.6800000000000002,
+ "grad_norm": 0.06966507434844971,
+ "learning_rate": 5.880127662124091e-05,
+ "loss": 0.2108,
+ "step": 1050
+ },
+ {
+ "epoch": 1.76,
+ "grad_norm": 0.08326321095228195,
+ "learning_rate": 3.355219183361582e-05,
+ "loss": 0.2106,
+ "step": 1100
+ },
+ {
+ "epoch": 1.8399999999999999,
+ "grad_norm": 0.0792745053768158,
+ "learning_rate": 1.512933636625089e-05,
+ "loss": 0.2073,
+ "step": 1150
+ },
+ {
+ "epoch": 1.92,
+ "grad_norm": 0.07648582756519318,
+ "learning_rate": 3.8758931591217575e-06,
+ "loss": 0.209,
+ "step": 1200
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 0.0787830799818039,
+ "learning_rate": 1.4925668450960217e-09,
+ "loss": 0.2124,
+ "step": 1250
+ },
+ {
+ "epoch": 2.0,
+ "step": 1250,
+ "total_flos": 1.62594677587968e+18,
+ "train_loss": 0.25041088790893556,
+ "train_runtime": 3370.9131,
+ "train_samples_per_second": 23.732,
+ "train_steps_per_second": 0.371
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 1250,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 2,
+ "save_steps": 0,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": false,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.62594677587968e+18,
+ "train_batch_size": 32,
+ "trial_name": null,
+ "trial_params": null
+ }
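Note: trainer_state.json stores the Hugging Face Trainer's logged metrics under "log_history" (one record per logging step, plus a final summary record). A minimal sketch for inspecting the loss curve is below; the file path is an assumption based on this upload's layout, and the plotting choice is illustrative only.

```python
# Minimal sketch (assumed path): read log_history from trainer_state.json and plot training loss.
import json
import matplotlib.pyplot as plt

with open("nl_tasks/expsBOFT/seed42/trainer_state.json") as f:
    state = json.load(f)

# Per-step records carry a "loss" key; the final summary record carries "train_loss" instead.
records = [entry for entry in state["log_history"] if "loss" in entry]
steps = [entry["step"] for entry in records]
losses = [entry["loss"] for entry in records]

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.title("BOFT seed42 (logging_steps = 50)")
plt.show()
```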
nl_tasks/expsBOFT/seed43/ft/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "<unk>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
nl_tasks/expsBOFT/seed43/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/expsBOFT/seed43/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
nl_tasks/expsBOFT/seed43/ft/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": null,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "legacy": false,
+ "model_max_length": 512,
+ "pad_token": "<unk>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false
+ }
nl_tasks/expsBOFT/seed43/ft2/README.md ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: meta-llama/Llama-2-7b-hf
3
+ library_name: peft
4
+ tags:
5
+ - base_model:adapter:meta-llama/Llama-2-7b-hf
6
+ - transformers
7
+ ---
8
+
9
+ # Model Card for Model ID
10
+
11
+ <!-- Provide a quick summary of what the model is/does. -->
12
+
13
+
14
+
15
+ ## Model Details
16
+
17
+ ### Model Description
18
+
19
+ <!-- Provide a longer summary of what this model is. -->
20
+
21
+
22
+
23
+ - **Developed by:** [More Information Needed]
24
+ - **Funded by [optional]:** [More Information Needed]
25
+ - **Shared by [optional]:** [More Information Needed]
26
+ - **Model type:** [More Information Needed]
27
+ - **Language(s) (NLP):** [More Information Needed]
28
+ - **License:** [More Information Needed]
29
+ - **Finetuned from model [optional]:** [More Information Needed]
30
+
31
+ ### Model Sources [optional]
32
+
33
+ <!-- Provide the basic links for the model. -->
34
+
35
+ - **Repository:** [More Information Needed]
36
+ - **Paper [optional]:** [More Information Needed]
37
+ - **Demo [optional]:** [More Information Needed]
38
+
39
+ ## Uses
40
+
41
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
42
+
43
+ ### Direct Use
44
+
45
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
46
+
47
+ [More Information Needed]
48
+
49
+ ### Downstream Use [optional]
50
+
51
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
52
+
53
+ [More Information Needed]
54
+
55
+ ### Out-of-Scope Use
56
+
57
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
58
+
59
+ [More Information Needed]
60
+
61
+ ## Bias, Risks, and Limitations
62
+
63
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
64
+
65
+ [More Information Needed]
66
+
67
+ ### Recommendations
68
+
69
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
70
+
71
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
72
+
73
+ ## How to Get Started with the Model
74
+
75
+ Use the code below to get started with the model.
76
+
77
+ [More Information Needed]
78
+
79
+ ## Training Details
80
+
81
+ ### Training Data
82
+
83
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
84
+
85
+ [More Information Needed]
86
+
87
+ ### Training Procedure
88
+
89
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
90
+
91
+ #### Preprocessing [optional]
92
+
93
+ [More Information Needed]
94
+
95
+
96
+ #### Training Hyperparameters
97
+
98
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
99
+
100
+ #### Speeds, Sizes, Times [optional]
101
+
102
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
103
+
104
+ [More Information Needed]
105
+
106
+ ## Evaluation
107
+
108
+ <!-- This section describes the evaluation protocols and provides the results. -->
109
+
110
+ ### Testing Data, Factors & Metrics
111
+
112
+ #### Testing Data
113
+
114
+ <!-- This should link to a Dataset Card if possible. -->
115
+
116
+ [More Information Needed]
117
+
118
+ #### Factors
119
+
120
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
121
+
122
+ [More Information Needed]
123
+
124
+ #### Metrics
125
+
126
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
127
+
128
+ [More Information Needed]
129
+
130
+ ### Results
131
+
132
+ [More Information Needed]
133
+
134
+ #### Summary
135
+
136
+
137
+
138
+ ## Model Examination [optional]
139
+
140
+ <!-- Relevant interpretability work for the model goes here -->
141
+
142
+ [More Information Needed]
143
+
144
+ ## Environmental Impact
145
+
146
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
147
+
148
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
149
+
150
+ - **Hardware Type:** [More Information Needed]
151
+ - **Hours used:** [More Information Needed]
152
+ - **Cloud Provider:** [More Information Needed]
153
+ - **Compute Region:** [More Information Needed]
154
+ - **Carbon Emitted:** [More Information Needed]
155
+
156
+ ## Technical Specifications [optional]
157
+
158
+ ### Model Architecture and Objective
159
+
160
+ [More Information Needed]
161
+
162
+ ### Compute Infrastructure
163
+
164
+ [More Information Needed]
165
+
166
+ #### Hardware
167
+
168
+ [More Information Needed]
169
+
170
+ #### Software
171
+
172
+ [More Information Needed]
173
+
174
+ ## Citation [optional]
175
+
176
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
177
+
178
+ **BibTeX:**
179
+
180
+ [More Information Needed]
181
+
182
+ **APA:**
183
+
184
+ [More Information Needed]
185
+
186
+ ## Glossary [optional]
187
+
188
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
189
+
190
+ [More Information Needed]
191
+
192
+ ## More Information [optional]
193
+
194
+ [More Information Needed]
195
+
196
+ ## Model Card Authors [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Contact
201
+
202
+ [More Information Needed]
203
+ ### Framework versions
204
+
205
+ - PEFT 0.18.0
nl_tasks/expsBOFT/seed43/ft2/adapter_config.json ADDED
@@ -0,0 +1,27 @@
+ {
+ "auto_mapping": {
+ "base_model_class": "LlamaForCausalLM",
+ "parent_library": "transformers.models.llama.modeling_llama"
+ },
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+ "bias": "none",
+ "boft_block_num": 0,
+ "boft_block_size": 16,
+ "boft_dropout": 0.05,
+ "boft_n_butterfly_factor": 2,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "modules_to_save": null,
+ "peft_type": "BOFT",
+ "peft_version": "0.18.0",
+ "revision": null,
+ "target_modules": [
+ "v_proj",
+ "q_proj"
+ ],
+ "task_type": null
+ }
nl_tasks/expsBOFT/seed43/ft2/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:584526a06a1f45f2f77e6a89a7201b05aa25a3d6be60f231b255a32c48c4b261
+ size 34619504
nl_tasks/expsOFT/seed42/ft/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "<unk>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
nl_tasks/expsOFT/seed42/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/expsOFT/seed42/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
nl_tasks/expsOFT/seed42/ft/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": null,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "legacy": false,
+ "model_max_length": 512,
+ "pad_token": "<unk>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false
+ }
nl_tasks/expsOFT/seed42/ft2/README.md ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: meta-llama/Llama-2-7b-hf
3
+ library_name: peft
4
+ tags:
5
+ - base_model:adapter:meta-llama/Llama-2-7b-hf
6
+ - transformers
7
+ ---
8
+
9
+ # Model Card for Model ID
10
+
11
+ <!-- Provide a quick summary of what the model is/does. -->
12
+
13
+
14
+
15
+ ## Model Details
16
+
17
+ ### Model Description
18
+
19
+ <!-- Provide a longer summary of what this model is. -->
20
+
21
+
22
+
23
+ - **Developed by:** [More Information Needed]
24
+ - **Funded by [optional]:** [More Information Needed]
25
+ - **Shared by [optional]:** [More Information Needed]
26
+ - **Model type:** [More Information Needed]
27
+ - **Language(s) (NLP):** [More Information Needed]
28
+ - **License:** [More Information Needed]
29
+ - **Finetuned from model [optional]:** [More Information Needed]
30
+
31
+ ### Model Sources [optional]
32
+
33
+ <!-- Provide the basic links for the model. -->
34
+
35
+ - **Repository:** [More Information Needed]
36
+ - **Paper [optional]:** [More Information Needed]
37
+ - **Demo [optional]:** [More Information Needed]
38
+
39
+ ## Uses
40
+
41
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
42
+
43
+ ### Direct Use
44
+
45
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
46
+
47
+ [More Information Needed]
48
+
49
+ ### Downstream Use [optional]
50
+
51
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
52
+
53
+ [More Information Needed]
54
+
55
+ ### Out-of-Scope Use
56
+
57
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
58
+
59
+ [More Information Needed]
60
+
61
+ ## Bias, Risks, and Limitations
62
+
63
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
64
+
65
+ [More Information Needed]
66
+
67
+ ### Recommendations
68
+
69
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
70
+
71
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
72
+
73
+ ## How to Get Started with the Model
74
+
75
+ Use the code below to get started with the model.
76
+
77
+ [More Information Needed]
78
+
79
+ ## Training Details
80
+
81
+ ### Training Data
82
+
83
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
84
+
85
+ [More Information Needed]
86
+
87
+ ### Training Procedure
88
+
89
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
90
+
91
+ #### Preprocessing [optional]
92
+
93
+ [More Information Needed]
94
+
95
+
96
+ #### Training Hyperparameters
97
+
98
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
99
+
100
+ #### Speeds, Sizes, Times [optional]
101
+
102
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
103
+
104
+ [More Information Needed]
105
+
106
+ ## Evaluation
107
+
108
+ <!-- This section describes the evaluation protocols and provides the results. -->
109
+
110
+ ### Testing Data, Factors & Metrics
111
+
112
+ #### Testing Data
113
+
114
+ <!-- This should link to a Dataset Card if possible. -->
115
+
116
+ [More Information Needed]
117
+
118
+ #### Factors
119
+
120
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
121
+
122
+ [More Information Needed]
123
+
124
+ #### Metrics
125
+
126
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
127
+
128
+ [More Information Needed]
129
+
130
+ ### Results
131
+
132
+ [More Information Needed]
133
+
134
+ #### Summary
135
+
136
+
137
+
138
+ ## Model Examination [optional]
139
+
140
+ <!-- Relevant interpretability work for the model goes here -->
141
+
142
+ [More Information Needed]
143
+
144
+ ## Environmental Impact
145
+
146
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
147
+
148
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
149
+
150
+ - **Hardware Type:** [More Information Needed]
151
+ - **Hours used:** [More Information Needed]
152
+ - **Cloud Provider:** [More Information Needed]
153
+ - **Compute Region:** [More Information Needed]
154
+ - **Carbon Emitted:** [More Information Needed]
155
+
156
+ ## Technical Specifications [optional]
157
+
158
+ ### Model Architecture and Objective
159
+
160
+ [More Information Needed]
161
+
162
+ ### Compute Infrastructure
163
+
164
+ [More Information Needed]
165
+
166
+ #### Hardware
167
+
168
+ [More Information Needed]
169
+
170
+ #### Software
171
+
172
+ [More Information Needed]
173
+
174
+ ## Citation [optional]
175
+
176
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
177
+
178
+ **BibTeX:**
179
+
180
+ [More Information Needed]
181
+
182
+ **APA:**
183
+
184
+ [More Information Needed]
185
+
186
+ ## Glossary [optional]
187
+
188
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
189
+
190
+ [More Information Needed]
191
+
192
+ ## More Information [optional]
193
+
194
+ [More Information Needed]
195
+
196
+ ## Model Card Authors [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Contact
201
+
202
+ [More Information Needed]
203
+ ### Framework versions
204
+
205
+ - PEFT 0.18.0
nl_tasks/expsOFT/seed42/ft2/adapter_config.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "auto_mapping": {
+ "base_model_class": "LlamaForCausalLM",
+ "parent_library": "transformers.models.llama.modeling_llama"
+ },
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+ "bias": "none",
+ "block_share": false,
+ "coft": false,
+ "eps": 6e-05,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "module_dropout": 0.05,
+ "modules_to_save": null,
+ "num_cayley_neumann_terms": 5,
+ "oft_block_size": 64,
+ "peft_type": "OFT",
+ "peft_version": "0.18.0",
+ "r": 0,
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj"
+ ],
+ "task_type": null,
+ "use_cayley_neumann": true
+ }
nl_tasks/expsOFT/seed42/ft2/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d16378461c75d46a179539ea2223803c3af83b5ebb2dcc6face78c64e3ac4f9c
+ size 33038696
nl_tasks/expsOFT/seed42/trainer_state.json ADDED
@@ -0,0 +1,218 @@
+ {
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.0,
+ "eval_steps": 500,
+ "global_step": 1250,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.08,
+ "grad_norm": 0.15338309109210968,
+ "learning_rate": 0.000392,
+ "loss": 0.4726,
+ "step": 50
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.1656411737203598,
+ "learning_rate": 0.0007920000000000001,
+ "loss": 0.3098,
+ "step": 100
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 0.161162331700325,
+ "learning_rate": 0.0007964216926581925,
+ "loss": 0.2883,
+ "step": 150
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 0.14719629287719727,
+ "learning_rate": 0.0007854602918076551,
+ "loss": 0.2773,
+ "step": 200
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 0.1362672597169876,
+ "learning_rate": 0.0007673184950396212,
+ "loss": 0.2606,
+ "step": 250
+ },
+ {
+ "epoch": 0.48,
+ "grad_norm": 0.1420401930809021,
+ "learning_rate": 0.0007423342497022817,
+ "loss": 0.2549,
+ "step": 300
+ },
+ {
+ "epoch": 0.56,
+ "grad_norm": 0.15255458652973175,
+ "learning_rate": 0.0007109729650142636,
+ "loss": 0.2516,
+ "step": 350
+ },
+ {
+ "epoch": 0.64,
+ "grad_norm": 0.13546934723854065,
+ "learning_rate": 0.0006738188423714755,
+ "loss": 0.2439,
+ "step": 400
+ },
+ {
+ "epoch": 0.72,
+ "grad_norm": 0.1296033263206482,
+ "learning_rate": 0.0006315639927804526,
+ "loss": 0.2383,
+ "step": 450
+ },
+ {
+ "epoch": 0.8,
+ "grad_norm": 0.14936736226081848,
+ "learning_rate": 0.00058499554413983,
+ "loss": 0.2348,
+ "step": 500
+ },
+ {
+ "epoch": 0.88,
+ "grad_norm": 0.12654532492160797,
+ "learning_rate": 0.000534980978536894,
+ "loss": 0.2274,
+ "step": 550
+ },
+ {
+ "epoch": 0.96,
+ "grad_norm": 0.1250297725200653,
+ "learning_rate": 0.00048245197269763485,
+ "loss": 0.2298,
+ "step": 600
+ },
+ {
+ "epoch": 1.04,
+ "grad_norm": 0.1344439834356308,
+ "learning_rate": 0.00042838704261214224,
+ "loss": 0.2065,
+ "step": 650
+ },
+ {
+ "epoch": 1.12,
+ "grad_norm": 0.12664927542209625,
+ "learning_rate": 0.00037379331563313267,
+ "loss": 0.1907,
+ "step": 700
+ },
+ {
+ "epoch": 1.2,
+ "grad_norm": 0.1543550342321396,
+ "learning_rate": 0.00031968776959892677,
+ "loss": 0.1887,
+ "step": 750
+ },
+ {
+ "epoch": 1.28,
+ "grad_norm": 0.13837428390979767,
+ "learning_rate": 0.00026707828846051743,
+ "loss": 0.185,
+ "step": 800
+ },
+ {
+ "epoch": 1.3599999999999999,
+ "grad_norm": 0.12324073910713196,
+ "learning_rate": 0.00021694488731055218,
+ "loss": 0.1787,
+ "step": 850
+ },
+ {
+ "epoch": 1.44,
+ "grad_norm": 0.14447391033172607,
+ "learning_rate": 0.00017022145655641685,
+ "loss": 0.1779,
+ "step": 900
+ },
+ {
+ "epoch": 1.52,
+ "grad_norm": 0.13559409976005554,
+ "learning_rate": 0.00012777836530893536,
+ "loss": 0.1785,
+ "step": 950
+ },
+ {
+ "epoch": 1.6,
+ "grad_norm": 0.13572397828102112,
+ "learning_rate": 9.040624805263558e-05,
+ "loss": 0.176,
+ "step": 1000
+ },
+ {
+ "epoch": 1.6800000000000002,
+ "grad_norm": 0.13348858058452606,
+ "learning_rate": 5.880127662124091e-05,
+ "loss": 0.1743,
+ "step": 1050
+ },
+ {
+ "epoch": 1.76,
+ "grad_norm": 0.1402943730354309,
+ "learning_rate": 3.355219183361582e-05,
+ "loss": 0.1755,
+ "step": 1100
+ },
+ {
+ "epoch": 1.8399999999999999,
+ "grad_norm": 0.14928816258907318,
+ "learning_rate": 1.512933636625089e-05,
+ "loss": 0.1729,
+ "step": 1150
+ },
+ {
+ "epoch": 1.92,
+ "grad_norm": 0.14678366482257843,
+ "learning_rate": 3.8758931591217575e-06,
+ "loss": 0.1785,
+ "step": 1200
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 0.13319681584835052,
+ "learning_rate": 1.4925668450960217e-09,
+ "loss": 0.1739,
+ "step": 1250
+ },
+ {
+ "epoch": 2.0,
+ "step": 1250,
+ "total_flos": 1.62585013911552e+18,
+ "train_loss": 0.2258549835205078,
+ "train_runtime": 2135.866,
+ "train_samples_per_second": 37.456,
+ "train_steps_per_second": 0.585
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 1250,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 2,
+ "save_steps": 0,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": false,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.62585013911552e+18,
+ "train_batch_size": 64,
+ "trial_name": null,
+ "trial_params": null
+ }
nl_tasks/expsOFT/seed43/ft/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "<unk>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
nl_tasks/expsOFT/seed43/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/expsOFT/seed43/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
nl_tasks/expsOFT/seed43/ft/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": null,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "legacy": false,
+ "model_max_length": 512,
+ "pad_token": "<unk>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false
+ }
nl_tasks/expsOFT/seed43/ft2/README.md ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: meta-llama/Llama-2-7b-hf
3
+ library_name: peft
4
+ tags:
5
+ - base_model:adapter:meta-llama/Llama-2-7b-hf
6
+ - transformers
7
+ ---
8
+
9
+ # Model Card for Model ID
10
+
11
+ <!-- Provide a quick summary of what the model is/does. -->
12
+
13
+
14
+
15
+ ## Model Details
16
+
17
+ ### Model Description
18
+
19
+ <!-- Provide a longer summary of what this model is. -->
20
+
21
+
22
+
23
+ - **Developed by:** [More Information Needed]
24
+ - **Funded by [optional]:** [More Information Needed]
25
+ - **Shared by [optional]:** [More Information Needed]
26
+ - **Model type:** [More Information Needed]
27
+ - **Language(s) (NLP):** [More Information Needed]
28
+ - **License:** [More Information Needed]
29
+ - **Finetuned from model [optional]:** [More Information Needed]
30
+
31
+ ### Model Sources [optional]
32
+
33
+ <!-- Provide the basic links for the model. -->
34
+
35
+ - **Repository:** [More Information Needed]
36
+ - **Paper [optional]:** [More Information Needed]
37
+ - **Demo [optional]:** [More Information Needed]
38
+
39
+ ## Uses
40
+
41
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
42
+
43
+ ### Direct Use
44
+
45
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
46
+
47
+ [More Information Needed]
48
+
49
+ ### Downstream Use [optional]
50
+
51
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
52
+
53
+ [More Information Needed]
54
+
55
+ ### Out-of-Scope Use
56
+
57
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
58
+
59
+ [More Information Needed]
60
+
61
+ ## Bias, Risks, and Limitations
62
+
63
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
64
+
65
+ [More Information Needed]
66
+
67
+ ### Recommendations
68
+
69
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
70
+
71
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
72
+
73
+ ## How to Get Started with the Model
74
+
75
+ Use the code below to get started with the model.
76
+
77
+ [More Information Needed]
78
+
79
+ ## Training Details
80
+
81
+ ### Training Data
82
+
83
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
84
+
85
+ [More Information Needed]
86
+
87
+ ### Training Procedure
88
+
89
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
90
+
91
+ #### Preprocessing [optional]
92
+
93
+ [More Information Needed]
94
+
95
+
96
+ #### Training Hyperparameters
97
+
98
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
99
+
100
+ #### Speeds, Sizes, Times [optional]
101
+
102
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
103
+
104
+ [More Information Needed]
105
+
106
+ ## Evaluation
107
+
108
+ <!-- This section describes the evaluation protocols and provides the results. -->
109
+
110
+ ### Testing Data, Factors & Metrics
111
+
112
+ #### Testing Data
113
+
114
+ <!-- This should link to a Dataset Card if possible. -->
115
+
116
+ [More Information Needed]
117
+
118
+ #### Factors
119
+
120
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
121
+
122
+ [More Information Needed]
123
+
124
+ #### Metrics
125
+
126
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
127
+
128
+ [More Information Needed]
129
+
130
+ ### Results
131
+
132
+ [More Information Needed]
133
+
134
+ #### Summary
135
+
136
+
137
+
138
+ ## Model Examination [optional]
139
+
140
+ <!-- Relevant interpretability work for the model goes here -->
141
+
142
+ [More Information Needed]
143
+
144
+ ## Environmental Impact
145
+
146
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
147
+
148
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
149
+
150
+ - **Hardware Type:** [More Information Needed]
151
+ - **Hours used:** [More Information Needed]
152
+ - **Cloud Provider:** [More Information Needed]
153
+ - **Compute Region:** [More Information Needed]
154
+ - **Carbon Emitted:** [More Information Needed]
155
+
156
+ ## Technical Specifications [optional]
157
+
158
+ ### Model Architecture and Objective
159
+
160
+ [More Information Needed]
161
+
162
+ ### Compute Infrastructure
163
+
164
+ [More Information Needed]
165
+
166
+ #### Hardware
167
+
168
+ [More Information Needed]
169
+
170
+ #### Software
171
+
172
+ [More Information Needed]
173
+
174
+ ## Citation [optional]
175
+
176
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
177
+
178
+ **BibTeX:**
179
+
180
+ [More Information Needed]
181
+
182
+ **APA:**
183
+
184
+ [More Information Needed]
185
+
186
+ ## Glossary [optional]
187
+
188
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
189
+
190
+ [More Information Needed]
191
+
192
+ ## More Information [optional]
193
+
194
+ [More Information Needed]
195
+
196
+ ## Model Card Authors [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Contact
201
+
202
+ [More Information Needed]
203
+ ### Framework versions
204
+
205
+ - PEFT 0.18.0
nl_tasks/expsOFT/seed43/ft2/adapter_config.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "auto_mapping": {
+ "base_model_class": "LlamaForCausalLM",
+ "parent_library": "transformers.models.llama.modeling_llama"
+ },
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+ "bias": "none",
+ "block_share": false,
+ "coft": false,
+ "eps": 6e-05,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "module_dropout": 0.05,
+ "modules_to_save": null,
+ "num_cayley_neumann_terms": 5,
+ "oft_block_size": 64,
+ "peft_type": "OFT",
+ "peft_version": "0.18.0",
+ "r": 0,
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj"
+ ],
+ "task_type": null,
+ "use_cayley_neumann": true
+ }
nl_tasks/expsOFT/seed43/ft2/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d16378461c75d46a179539ea2223803c3af83b5ebb2dcc6face78c64e3ac4f9c
+ size 33038696
nl_tasks/expsOFT/seed43/trainer_state.json ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1250,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.08,
14
+ "grad_norm": 0.15338309109210968,
15
+ "learning_rate": 0.000392,
16
+ "loss": 0.4726,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.16,
21
+ "grad_norm": 0.1656411737203598,
22
+ "learning_rate": 0.0007920000000000001,
23
+ "loss": 0.3098,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 0.24,
28
+ "grad_norm": 0.161162331700325,
29
+ "learning_rate": 0.0007964216926581925,
30
+ "loss": 0.2883,
31
+ "step": 150
32
+ },
33
+ {
34
+ "epoch": 0.32,
35
+ "grad_norm": 0.14719629287719727,
36
+ "learning_rate": 0.0007854602918076551,
37
+ "loss": 0.2773,
38
+ "step": 200
39
+ },
40
+ {
41
+ "epoch": 0.4,
42
+ "grad_norm": 0.1362672597169876,
43
+ "learning_rate": 0.0007673184950396212,
44
+ "loss": 0.2606,
45
+ "step": 250
46
+ },
47
+ {
48
+ "epoch": 0.48,
49
+ "grad_norm": 0.1420401930809021,
50
+ "learning_rate": 0.0007423342497022817,
51
+ "loss": 0.2549,
52
+ "step": 300
53
+ },
54
+ {
55
+ "epoch": 0.56,
56
+ "grad_norm": 0.15255458652973175,
57
+ "learning_rate": 0.0007109729650142636,
58
+ "loss": 0.2516,
59
+ "step": 350
60
+ },
61
+ {
62
+ "epoch": 0.64,
63
+ "grad_norm": 0.13546934723854065,
64
+ "learning_rate": 0.0006738188423714755,
65
+ "loss": 0.2439,
66
+ "step": 400
67
+ },
68
+ {
69
+ "epoch": 0.72,
70
+ "grad_norm": 0.1296033263206482,
71
+ "learning_rate": 0.0006315639927804526,
72
+ "loss": 0.2383,
73
+ "step": 450
74
+ },
75
+ {
76
+ "epoch": 0.8,
77
+ "grad_norm": 0.14936736226081848,
78
+ "learning_rate": 0.00058499554413983,
79
+ "loss": 0.2348,
80
+ "step": 500
81
+ },
82
+ {
83
+ "epoch": 0.88,
84
+ "grad_norm": 0.12654532492160797,
85
+ "learning_rate": 0.000534980978536894,
86
+ "loss": 0.2274,
87
+ "step": 550
88
+ },
89
+ {
90
+ "epoch": 0.96,
91
+ "grad_norm": 0.1250297725200653,
92
+ "learning_rate": 0.00048245197269763485,
93
+ "loss": 0.2298,
94
+ "step": 600
95
+ },
96
+ {
97
+ "epoch": 1.04,
98
+ "grad_norm": 0.1344439834356308,
99
+ "learning_rate": 0.00042838704261214224,
100
+ "loss": 0.2065,
101
+ "step": 650
102
+ },
103
+ {
104
+ "epoch": 1.12,
105
+ "grad_norm": 0.12664927542209625,
106
+ "learning_rate": 0.00037379331563313267,
107
+ "loss": 0.1907,
108
+ "step": 700
109
+ },
110
+ {
111
+ "epoch": 1.2,
112
+ "grad_norm": 0.1543550342321396,
113
+ "learning_rate": 0.00031968776959892677,
114
+ "loss": 0.1887,
115
+ "step": 750
116
+ },
117
+ {
118
+ "epoch": 1.28,
119
+ "grad_norm": 0.13837428390979767,
120
+ "learning_rate": 0.00026707828846051743,
121
+ "loss": 0.185,
122
+ "step": 800
123
+ },
124
+ {
125
+ "epoch": 1.3599999999999999,
126
+ "grad_norm": 0.12324073910713196,
127
+ "learning_rate": 0.00021694488731055218,
128
+ "loss": 0.1787,
129
+ "step": 850
130
+ },
131
+ {
132
+ "epoch": 1.44,
133
+ "grad_norm": 0.14447391033172607,
134
+ "learning_rate": 0.00017022145655641685,
135
+ "loss": 0.1779,
136
+ "step": 900
137
+ },
138
+ {
139
+ "epoch": 1.52,
140
+ "grad_norm": 0.13559409976005554,
141
+ "learning_rate": 0.00012777836530893536,
142
+ "loss": 0.1785,
143
+ "step": 950
144
+ },
145
+ {
146
+ "epoch": 1.6,
147
+ "grad_norm": 0.13572397828102112,
148
+ "learning_rate": 9.040624805263558e-05,
149
+ "loss": 0.176,
150
+ "step": 1000
151
+ },
152
+ {
153
+ "epoch": 1.6800000000000002,
154
+ "grad_norm": 0.13348858058452606,
155
+ "learning_rate": 5.880127662124091e-05,
156
+ "loss": 0.1743,
157
+ "step": 1050
158
+ },
159
+ {
160
+ "epoch": 1.76,
161
+ "grad_norm": 0.1402943730354309,
162
+ "learning_rate": 3.355219183361582e-05,
163
+ "loss": 0.1755,
164
+ "step": 1100
165
+ },
166
+ {
167
+ "epoch": 1.8399999999999999,
168
+ "grad_norm": 0.14928816258907318,
169
+ "learning_rate": 1.512933636625089e-05,
170
+ "loss": 0.1729,
171
+ "step": 1150
172
+ },
173
+ {
174
+ "epoch": 1.92,
175
+ "grad_norm": 0.14678366482257843,
176
+ "learning_rate": 3.8758931591217575e-06,
177
+ "loss": 0.1785,
178
+ "step": 1200
179
+ },
180
+ {
181
+ "epoch": 2.0,
182
+ "grad_norm": 0.13319681584835052,
183
+ "learning_rate": 1.4925668450960217e-09,
184
+ "loss": 0.1739,
185
+ "step": 1250
186
+ },
187
+ {
188
+ "epoch": 2.0,
189
+ "step": 1250,
190
+ "total_flos": 1.62585013911552e+18,
191
+ "train_loss": 0.2258549835205078,
192
+ "train_runtime": 2134.8975,
193
+ "train_samples_per_second": 37.473,
194
+ "train_steps_per_second": 0.586
195
+ }
196
+ ],
197
+ "logging_steps": 50,
198
+ "max_steps": 1250,
199
+ "num_input_tokens_seen": 0,
200
+ "num_train_epochs": 2,
201
+ "save_steps": 0,
202
+ "stateful_callbacks": {
203
+ "TrainerControl": {
204
+ "args": {
205
+ "should_epoch_stop": false,
206
+ "should_evaluate": false,
207
+ "should_log": false,
208
+ "should_save": false,
209
+ "should_training_stop": false
210
+ },
211
+ "attributes": {}
212
+ }
213
+ },
214
+ "total_flos": 1.62585013911552e+18,
215
+ "train_batch_size": 64,
216
+ "trial_name": null,
217
+ "trial_params": null
218
+ }
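
As a reference, a small sketch for reading a trainer_state.json like the one above and cross-checking its totals (assumes a local clone of this repo; the per-device batch size of 64 ignores any gradient accumulation):

import json

with open("nl_tasks/expsOFT/seed43/trainer_state.json") as f:
    state = json.load(f)

summary = state["log_history"][-1]   # the final entry holds the run totals
steps = state["global_step"]         # 1250 optimizer steps over 2 epochs
batch = state["train_batch_size"]    # 64 (per device)
print("examples seen (steps * batch):", steps * batch)
print("examples from throughput:",
      round(summary["train_samples_per_second"] * summary["train_runtime"]))
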
nl_tasks/expsOFT/seed44/ft/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<unk>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
nl_tasks/expsOFT/seed44/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/expsOFT/seed44/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
nl_tasks/expsOFT/seed44/ft/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 512,
37
+ "pad_token": "<unk>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
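
For reference, a minimal sketch of loading the tokenizer files saved above with transformers (assumes a local clone of this repo):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("nl_tasks/expsOFT/seed44/ft")
print(tok.pad_token, tok.padding_side, tok.model_max_length)  # "<unk>", "right", 512
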
nl_tasks/expsOFT/seed44/ft2/README.md ADDED
@@ -0,0 +1,205 @@
1
+ ---
2
+ base_model: meta-llama/Llama-2-7b-hf
3
+ library_name: peft
4
+ tags:
5
+ - base_model:adapter:meta-llama/Llama-2-7b-hf
6
+ - transformers
7
+ ---
8
+
9
+ # Model Card for Model ID
10
+
11
+ <!-- Provide a quick summary of what the model is/does. -->
12
+
13
+
14
+
15
+ ## Model Details
16
+
17
+ ### Model Description
18
+
19
+ <!-- Provide a longer summary of what this model is. -->
20
+
21
+
22
+
23
+ - **Developed by:** [More Information Needed]
24
+ - **Funded by [optional]:** [More Information Needed]
25
+ - **Shared by [optional]:** [More Information Needed]
26
+ - **Model type:** [More Information Needed]
27
+ - **Language(s) (NLP):** [More Information Needed]
28
+ - **License:** [More Information Needed]
29
+ - **Finetuned from model [optional]:** [More Information Needed]
30
+
31
+ ### Model Sources [optional]
32
+
33
+ <!-- Provide the basic links for the model. -->
34
+
35
+ - **Repository:** [More Information Needed]
36
+ - **Paper [optional]:** [More Information Needed]
37
+ - **Demo [optional]:** [More Information Needed]
38
+
39
+ ## Uses
40
+
41
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
42
+
43
+ ### Direct Use
44
+
45
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
46
+
47
+ [More Information Needed]
48
+
49
+ ### Downstream Use [optional]
50
+
51
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
52
+
53
+ [More Information Needed]
54
+
55
+ ### Out-of-Scope Use
56
+
57
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
58
+
59
+ [More Information Needed]
60
+
61
+ ## Bias, Risks, and Limitations
62
+
63
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
64
+
65
+ [More Information Needed]
66
+
67
+ ### Recommendations
68
+
69
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
70
+
71
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
72
+
73
+ ## How to Get Started with the Model
74
+
75
+ Use the code below to get started with the model.
76
+
77
+ [More Information Needed]
78
+
79
+ ## Training Details
80
+
81
+ ### Training Data
82
+
83
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
84
+
85
+ [More Information Needed]
86
+
87
+ ### Training Procedure
88
+
89
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
90
+
91
+ #### Preprocessing [optional]
92
+
93
+ [More Information Needed]
94
+
95
+
96
+ #### Training Hyperparameters
97
+
98
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
99
+
100
+ #### Speeds, Sizes, Times [optional]
101
+
102
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
103
+
104
+ [More Information Needed]
105
+
106
+ ## Evaluation
107
+
108
+ <!-- This section describes the evaluation protocols and provides the results. -->
109
+
110
+ ### Testing Data, Factors & Metrics
111
+
112
+ #### Testing Data
113
+
114
+ <!-- This should link to a Dataset Card if possible. -->
115
+
116
+ [More Information Needed]
117
+
118
+ #### Factors
119
+
120
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
121
+
122
+ [More Information Needed]
123
+
124
+ #### Metrics
125
+
126
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
127
+
128
+ [More Information Needed]
129
+
130
+ ### Results
131
+
132
+ [More Information Needed]
133
+
134
+ #### Summary
135
+
136
+
137
+
138
+ ## Model Examination [optional]
139
+
140
+ <!-- Relevant interpretability work for the model goes here -->
141
+
142
+ [More Information Needed]
143
+
144
+ ## Environmental Impact
145
+
146
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
147
+
148
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
149
+
150
+ - **Hardware Type:** [More Information Needed]
151
+ - **Hours used:** [More Information Needed]
152
+ - **Cloud Provider:** [More Information Needed]
153
+ - **Compute Region:** [More Information Needed]
154
+ - **Carbon Emitted:** [More Information Needed]
155
+
156
+ ## Technical Specifications [optional]
157
+
158
+ ### Model Architecture and Objective
159
+
160
+ [More Information Needed]
161
+
162
+ ### Compute Infrastructure
163
+
164
+ [More Information Needed]
165
+
166
+ #### Hardware
167
+
168
+ [More Information Needed]
169
+
170
+ #### Software
171
+
172
+ [More Information Needed]
173
+
174
+ ## Citation [optional]
175
+
176
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
177
+
178
+ **BibTeX:**
179
+
180
+ [More Information Needed]
181
+
182
+ **APA:**
183
+
184
+ [More Information Needed]
185
+
186
+ ## Glossary [optional]
187
+
188
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
189
+
190
+ [More Information Needed]
191
+
192
+ ## More Information [optional]
193
+
194
+ [More Information Needed]
195
+
196
+ ## Model Card Authors [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Contact
201
+
202
+ [More Information Needed]
203
+ ### Framework versions
204
+
205
+ - PEFT 0.18.0
nl_tasks/expsOFT/seed44/ft2/adapter_config.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "auto_mapping": {
3
+ "base_model_class": "LlamaForCausalLM",
4
+ "parent_library": "transformers.models.llama.modeling_llama"
5
+ },
6
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
7
+ "bias": "none",
8
+ "block_share": false,
9
+ "coft": false,
10
+ "eps": 6e-05,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_weights": true,
15
+ "layers_pattern": null,
16
+ "layers_to_transform": null,
17
+ "module_dropout": 0.05,
18
+ "modules_to_save": null,
19
+ "num_cayley_neumann_terms": 5,
20
+ "oft_block_size": 64,
21
+ "peft_type": "OFT",
22
+ "peft_version": "0.18.0",
23
+ "r": 0,
24
+ "revision": null,
25
+ "target_modules": [
26
+ "q_proj",
27
+ "v_proj"
28
+ ],
29
+ "task_type": null,
30
+ "use_cayley_neumann": true
31
+ }
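
A hedged sketch of attaching this OFT adapter to its base model with peft (assumes access to meta-llama/Llama-2-7b-hf and a local clone of this repo; the dtype and device placement are illustrative choices, not settings from this upload):

import torch
from transformers import AutoModelForCausalLM
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16, device_map="auto"
)
model = PeftModel.from_pretrained(base, "nl_tasks/expsOFT/seed44/ft2")
model.eval()  # the adapter_config.json above was saved with "inference_mode": true
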
nl_tasks/expsOFT/seed44/ft2/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d16378461c75d46a179539ea2223803c3af83b5ebb2dcc6face78c64e3ac4f9c
3
+ size 33038696
nl_tasks/expsOFT/seed44/trainer_state.json ADDED
@@ -0,0 +1,218 @@
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1250,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.08,
14
+ "grad_norm": 0.15338309109210968,
15
+ "learning_rate": 0.000392,
16
+ "loss": 0.4726,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.16,
21
+ "grad_norm": 0.1656411737203598,
22
+ "learning_rate": 0.0007920000000000001,
23
+ "loss": 0.3098,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 0.24,
28
+ "grad_norm": 0.161162331700325,
29
+ "learning_rate": 0.0007964216926581925,
30
+ "loss": 0.2883,
31
+ "step": 150
32
+ },
33
+ {
34
+ "epoch": 0.32,
35
+ "grad_norm": 0.14719629287719727,
36
+ "learning_rate": 0.0007854602918076551,
37
+ "loss": 0.2773,
38
+ "step": 200
39
+ },
40
+ {
41
+ "epoch": 0.4,
42
+ "grad_norm": 0.1362672597169876,
43
+ "learning_rate": 0.0007673184950396212,
44
+ "loss": 0.2606,
45
+ "step": 250
46
+ },
47
+ {
48
+ "epoch": 0.48,
49
+ "grad_norm": 0.1420401930809021,
50
+ "learning_rate": 0.0007423342497022817,
51
+ "loss": 0.2549,
52
+ "step": 300
53
+ },
54
+ {
55
+ "epoch": 0.56,
56
+ "grad_norm": 0.15255458652973175,
57
+ "learning_rate": 0.0007109729650142636,
58
+ "loss": 0.2516,
59
+ "step": 350
60
+ },
61
+ {
62
+ "epoch": 0.64,
63
+ "grad_norm": 0.13546934723854065,
64
+ "learning_rate": 0.0006738188423714755,
65
+ "loss": 0.2439,
66
+ "step": 400
67
+ },
68
+ {
69
+ "epoch": 0.72,
70
+ "grad_norm": 0.1296033263206482,
71
+ "learning_rate": 0.0006315639927804526,
72
+ "loss": 0.2383,
73
+ "step": 450
74
+ },
75
+ {
76
+ "epoch": 0.8,
77
+ "grad_norm": 0.14936736226081848,
78
+ "learning_rate": 0.00058499554413983,
79
+ "loss": 0.2348,
80
+ "step": 500
81
+ },
82
+ {
83
+ "epoch": 0.88,
84
+ "grad_norm": 0.12654532492160797,
85
+ "learning_rate": 0.000534980978536894,
86
+ "loss": 0.2274,
87
+ "step": 550
88
+ },
89
+ {
90
+ "epoch": 0.96,
91
+ "grad_norm": 0.1250297725200653,
92
+ "learning_rate": 0.00048245197269763485,
93
+ "loss": 0.2298,
94
+ "step": 600
95
+ },
96
+ {
97
+ "epoch": 1.04,
98
+ "grad_norm": 0.1344439834356308,
99
+ "learning_rate": 0.00042838704261214224,
100
+ "loss": 0.2065,
101
+ "step": 650
102
+ },
103
+ {
104
+ "epoch": 1.12,
105
+ "grad_norm": 0.12664927542209625,
106
+ "learning_rate": 0.00037379331563313267,
107
+ "loss": 0.1907,
108
+ "step": 700
109
+ },
110
+ {
111
+ "epoch": 1.2,
112
+ "grad_norm": 0.1543550342321396,
113
+ "learning_rate": 0.00031968776959892677,
114
+ "loss": 0.1887,
115
+ "step": 750
116
+ },
117
+ {
118
+ "epoch": 1.28,
119
+ "grad_norm": 0.13837428390979767,
120
+ "learning_rate": 0.00026707828846051743,
121
+ "loss": 0.185,
122
+ "step": 800
123
+ },
124
+ {
125
+ "epoch": 1.3599999999999999,
126
+ "grad_norm": 0.12324073910713196,
127
+ "learning_rate": 0.00021694488731055218,
128
+ "loss": 0.1787,
129
+ "step": 850
130
+ },
131
+ {
132
+ "epoch": 1.44,
133
+ "grad_norm": 0.14447391033172607,
134
+ "learning_rate": 0.00017022145655641685,
135
+ "loss": 0.1779,
136
+ "step": 900
137
+ },
138
+ {
139
+ "epoch": 1.52,
140
+ "grad_norm": 0.13559409976005554,
141
+ "learning_rate": 0.00012777836530893536,
142
+ "loss": 0.1785,
143
+ "step": 950
144
+ },
145
+ {
146
+ "epoch": 1.6,
147
+ "grad_norm": 0.13572397828102112,
148
+ "learning_rate": 9.040624805263558e-05,
149
+ "loss": 0.176,
150
+ "step": 1000
151
+ },
152
+ {
153
+ "epoch": 1.6800000000000002,
154
+ "grad_norm": 0.13348858058452606,
155
+ "learning_rate": 5.880127662124091e-05,
156
+ "loss": 0.1743,
157
+ "step": 1050
158
+ },
159
+ {
160
+ "epoch": 1.76,
161
+ "grad_norm": 0.1402943730354309,
162
+ "learning_rate": 3.355219183361582e-05,
163
+ "loss": 0.1755,
164
+ "step": 1100
165
+ },
166
+ {
167
+ "epoch": 1.8399999999999999,
168
+ "grad_norm": 0.14928816258907318,
169
+ "learning_rate": 1.512933636625089e-05,
170
+ "loss": 0.1729,
171
+ "step": 1150
172
+ },
173
+ {
174
+ "epoch": 1.92,
175
+ "grad_norm": 0.14678366482257843,
176
+ "learning_rate": 3.8758931591217575e-06,
177
+ "loss": 0.1785,
178
+ "step": 1200
179
+ },
180
+ {
181
+ "epoch": 2.0,
182
+ "grad_norm": 0.13319681584835052,
183
+ "learning_rate": 1.4925668450960217e-09,
184
+ "loss": 0.1739,
185
+ "step": 1250
186
+ },
187
+ {
188
+ "epoch": 2.0,
189
+ "step": 1250,
190
+ "total_flos": 1.62585013911552e+18,
191
+ "train_loss": 0.2258549835205078,
192
+ "train_runtime": 2124.0047,
193
+ "train_samples_per_second": 37.665,
194
+ "train_steps_per_second": 0.589
195
+ }
196
+ ],
197
+ "logging_steps": 50,
198
+ "max_steps": 1250,
199
+ "num_input_tokens_seen": 0,
200
+ "num_train_epochs": 2,
201
+ "save_steps": 0,
202
+ "stateful_callbacks": {
203
+ "TrainerControl": {
204
+ "args": {
205
+ "should_epoch_stop": false,
206
+ "should_evaluate": false,
207
+ "should_log": false,
208
+ "should_save": false,
209
+ "should_training_stop": false
210
+ },
211
+ "attributes": {}
212
+ }
213
+ },
214
+ "total_flos": 1.62585013911552e+18,
215
+ "train_batch_size": 64,
216
+ "trial_name": null,
217
+ "trial_params": null
218
+ }
omini/__init__.py ADDED
File without changes
omini/pipeline/flux_omini.py ADDED
@@ -0,0 +1,734 @@
1
+ import torch
2
+ from typing import List, Union, Optional, Dict, Any, Callable, Type, Tuple
3
+
4
+ from diffusers.pipelines import FluxPipeline
5
+ from diffusers.pipelines.flux.pipeline_flux import (
6
+ FluxPipelineOutput,
7
+ FluxTransformer2DModel,
8
+ calculate_shift,
9
+ retrieve_timesteps,
10
+ np,
11
+ )
12
+ from diffusers.models.attention_processor import Attention, F
13
+ from diffusers.models.embeddings import apply_rotary_emb
14
+ from transformers import pipeline
15
+
16
+ from peft.tuners.tuners_utils import BaseTunerLayer
17
+ from accelerate.utils import is_torch_version
18
+
19
+ from contextlib import contextmanager
20
+
21
+ import cv2
22
+
23
+ from PIL import Image, ImageFilter
24
+
25
+
26
+ def seed_everything(seed: int = 42):
27
+ torch.backends.cudnn.deterministic = True
28
+ torch.manual_seed(seed)
29
+ np.random.seed(seed)
30
+
31
+
32
+ def clip_hidden_states(hidden_states: torch.FloatTensor) -> torch.FloatTensor:
33
+ if hidden_states.dtype == torch.float16:
34
+ hidden_states = hidden_states.clip(-65504, 65504)
35
+ return hidden_states
36
+
37
+
38
+ def encode_images(pipeline: FluxPipeline, images: torch.Tensor):
39
+ """
40
+ Encodes the images into tokens and ids for FLUX pipeline.
41
+ """
42
+ images = pipeline.image_processor.preprocess(images)
43
+ images = images.to(pipeline.device).to(pipeline.dtype)
44
+ images = pipeline.vae.encode(images).latent_dist.sample()
45
+ images = (
46
+ images - pipeline.vae.config.shift_factor
47
+ ) * pipeline.vae.config.scaling_factor
48
+ images_tokens = pipeline._pack_latents(images, *images.shape)
49
+ images_ids = pipeline._prepare_latent_image_ids(
50
+ images.shape[0],
51
+ images.shape[2],
52
+ images.shape[3],
53
+ pipeline.device,
54
+ pipeline.dtype,
55
+ )
56
+ if images_tokens.shape[1] != images_ids.shape[0]:
57
+ images_ids = pipeline._prepare_latent_image_ids(
58
+ images.shape[0],
59
+ images.shape[2] // 2,
60
+ images.shape[3] // 2,
61
+ pipeline.device,
62
+ pipeline.dtype,
63
+ )
64
+ return images_tokens, images_ids
65
+
66
+
67
+ depth_pipe = None
68
+
69
+
70
+ def convert_to_condition(
71
+ condition_type: str,
72
+ raw_img: Union[Image.Image, torch.Tensor],
73
+ blur_radius: Optional[int] = 5,
74
+ ) -> Union[Image.Image, torch.Tensor]:
75
+ if condition_type == "depth":
76
+ global depth_pipe
77
+ depth_pipe = depth_pipe or pipeline(
78
+ task="depth-estimation",
79
+ model="LiheYoung/depth-anything-small-hf",
80
+ device="cpu", # Use "cpu" to enable parallel processing
81
+ )
82
+ source_image = raw_img.convert("RGB")
83
+ condition_img = depth_pipe(source_image)["depth"].convert("RGB")
84
+ return condition_img
85
+ elif condition_type == "canny":
86
+ img = np.array(raw_img)
87
+ edges = cv2.Canny(img, 100, 200)
88
+ edges = Image.fromarray(edges).convert("RGB")
89
+ return edges
90
+ elif condition_type == "coloring":
91
+ return raw_img.convert("L").convert("RGB")
92
+ elif condition_type == "deblurring":
93
+ condition_image = (
94
+ raw_img.convert("RGB")
95
+ .filter(ImageFilter.GaussianBlur(blur_radius))
96
+ .convert("RGB")
97
+ )
98
+ return condition_image
99
+ else:
100
+ print("Warning: Returning the raw image.")
101
+ return raw_img.convert("RGB")
102
+
103
+
104
+ class Condition(object):
105
+ def __init__(
106
+ self,
107
+ condition: Union[Image.Image, torch.Tensor],
108
+ adapter_setting: Union[str, dict],
109
+ position_delta=None,
110
+ position_scale=1.0,
111
+ latent_mask=None,
112
+ is_complement=False,
113
+ ) -> None:
114
+ self.condition = condition
115
+ self.adapter = adapter_setting
116
+ self.position_delta = position_delta
117
+ self.position_scale = position_scale
118
+ self.latent_mask = (
119
+ latent_mask.T.reshape(-1) if latent_mask is not None else None
120
+ )
121
+ self.is_complement = is_complement
122
+
123
+ def encode(
124
+ self, pipe: FluxPipeline, empty: bool = False
125
+ ) -> Tuple[torch.Tensor, torch.Tensor, int]:
126
+ condition_empty = Image.new("RGB", self.condition.size, (0, 0, 0))
127
+ tokens, ids = encode_images(pipe, condition_empty if empty else self.condition)
128
+
129
+ if self.position_delta is not None:
130
+ ids[:, 1] += self.position_delta[0]
131
+ ids[:, 2] += self.position_delta[1]
132
+
133
+ if self.position_scale != 1.0:
134
+ scale_bias = (self.position_scale - 1.0) / 2
135
+ ids[:, 1:] *= self.position_scale
136
+ ids[:, 1:] += scale_bias
137
+
138
+ if self.latent_mask is not None:
139
+ tokens = tokens[:, self.latent_mask]
140
+ ids = ids[self.latent_mask]
141
+
142
+ return tokens, ids
143
+
144
+
145
+ @contextmanager
146
+ def specify_lora(lora_modules: List[BaseTunerLayer], specified_lora):
147
+ # Filter valid lora modules
148
+ valid_lora_modules = [m for m in lora_modules if isinstance(m, BaseTunerLayer)]
149
+ # Save original scales
150
+ original_scales = [
151
+ {
152
+ adapter: module.scaling[adapter]
153
+ for adapter in module.active_adapters
154
+ if adapter in module.scaling
155
+ }
156
+ for module in valid_lora_modules
157
+ ]
158
+ # Enter context: adjust scaling
159
+ for module in valid_lora_modules:
160
+ for adapter in module.active_adapters:
161
+ if adapter in module.scaling:
162
+ module.scaling[adapter] = 1 if adapter == specified_lora else 0
163
+ try:
164
+ yield
165
+ finally:
166
+ # Exit context: restore original scales
167
+ for module, scales in zip(valid_lora_modules, original_scales):
168
+ for adapter in module.active_adapters:
169
+ if adapter in module.scaling:
170
+ module.scaling[adapter] = scales[adapter]
171
+
172
+
173
+ def attn_forward(
174
+ attn: Attention,
175
+ hidden_states: List[torch.FloatTensor],
176
+ adapters: List[str],
177
+ hidden_states2: Optional[List[torch.FloatTensor]] = [],
178
+ position_embs: Optional[List[torch.Tensor]] = None,
179
+ group_mask: Optional[torch.Tensor] = None,
180
+ cache_mode: Optional[str] = None,
181
+ # to determine whether to cache the keys and values for this branch
182
+ to_cache: Optional[List[torch.Tensor]] = None,
183
+ cache_storage: Optional[List[torch.Tensor]] = None,
184
+ **kwargs: dict,
185
+ ) -> torch.FloatTensor:
186
+ bs, _, _ = hidden_states[0].shape
187
+ h2_n = len(hidden_states2)
188
+
189
+ queries, keys, values = [], [], []
190
+
191
+ # Prepare query, key, value for each encoder hidden state (text branch)
192
+ for i, hidden_state in enumerate(hidden_states2):
193
+ query = attn.add_q_proj(hidden_state)
194
+ key = attn.add_k_proj(hidden_state)
195
+ value = attn.add_v_proj(hidden_state)
196
+
197
+ head_dim = key.shape[-1] // attn.heads
198
+ reshape_fn = lambda x: x.view(bs, -1, attn.heads, head_dim).transpose(1, 2)
199
+
200
+ query, key, value = map(reshape_fn, (query, key, value))
201
+ query, key = attn.norm_added_q(query), attn.norm_added_k(key)
202
+
203
+ queries.append(query)
204
+ keys.append(key)
205
+ values.append(value)
206
+
207
+ # Prepare query, key, value for each hidden state (image branch)
208
+ for i, hidden_state in enumerate(hidden_states):
209
+ with specify_lora((attn.to_q, attn.to_k, attn.to_v), adapters[i + h2_n]):
210
+ query = attn.to_q(hidden_state)
211
+ key = attn.to_k(hidden_state)
212
+ value = attn.to_v(hidden_state)
213
+
214
+ head_dim = key.shape[-1] // attn.heads
215
+ reshape_fn = lambda x: x.view(bs, -1, attn.heads, head_dim).transpose(1, 2)
216
+
217
+ query, key, value = map(reshape_fn, (query, key, value))
218
+ query, key = attn.norm_q(query), attn.norm_k(key)
219
+
220
+ queries.append(query)
221
+ keys.append(key)
222
+ values.append(value)
223
+
224
+ # Apply rotary embedding
225
+ if position_embs is not None:
226
+ queries = [apply_rotary_emb(q, position_embs[i]) for i, q in enumerate(queries)]
227
+ keys = [apply_rotary_emb(k, position_embs[i]) for i, k in enumerate(keys)]
228
+
229
+ if cache_mode == "write":
230
+ for i, (k, v) in enumerate(zip(keys, values)):
231
+ if to_cache[i]:
232
+ cache_storage[attn.cache_idx][0].append(k)
233
+ cache_storage[attn.cache_idx][1].append(v)
234
+
235
+ attn_outputs = []
236
+ for i, query in enumerate(queries):
237
+ keys_, values_ = [], []
238
+ # Add keys and values from other branches
239
+ for j, (k, v) in enumerate(zip(keys, values)):
240
+ if (group_mask is not None) and not (group_mask[i][j].item()):
241
+ continue
242
+ keys_.append(k)
243
+ values_.append(v)
244
+ if cache_mode == "read":
245
+ keys_.extend(cache_storage[attn.cache_idx][0])
246
+ values_.extend(cache_storage[attn.cache_idx][1])
247
+ # Add keys and values from cache TODO
248
+ # Attention computation
249
+ attn_output = F.scaled_dot_product_attention(
250
+ query, torch.cat(keys_, dim=2), torch.cat(values_, dim=2)
251
+ ).to(query.dtype)
252
+ attn_output = attn_output.transpose(1, 2).reshape(bs, -1, attn.heads * head_dim)
253
+ attn_outputs.append(attn_output)
254
+
255
+ # Reshape attention output to match the original hidden states
256
+ h_out, h2_out = [], []
257
+
258
+ for i, hidden_state in enumerate(hidden_states2):
259
+ h2_out.append(attn.to_add_out(attn_outputs[i]))
260
+
261
+ for i, hidden_state in enumerate(hidden_states):
262
+ h = attn_outputs[i + h2_n]
263
+ if getattr(attn, "to_out", None) is not None:
264
+ with specify_lora((attn.to_out[0],), adapters[i + h2_n]):
265
+ h = attn.to_out[0](h)
266
+ h_out.append(h)
267
+
268
+ return (h_out, h2_out) if h2_n else h_out
269
+
270
+
271
+ def block_forward(
272
+ self,
273
+ image_hidden_states: List[torch.FloatTensor],
274
+ text_hidden_states: List[torch.FloatTensor],
275
+ tembs: List[torch.FloatTensor],
276
+ adapters: List[str],
277
+ position_embs=None,
278
+ attn_forward=attn_forward,
279
+ **kwargs: dict,
280
+ ):
281
+ txt_n = len(text_hidden_states)
282
+
283
+ img_variables, txt_variables = [], []
284
+
285
+ for i, text_h in enumerate(text_hidden_states):
286
+ txt_variables.append(self.norm1_context(text_h, emb=tembs[i]))
287
+
288
+ for i, image_h in enumerate(image_hidden_states):
289
+ with specify_lora((self.norm1.linear,), adapters[i + txt_n]):
290
+ img_variables.append(self.norm1(image_h, emb=tembs[i + txt_n]))
291
+
292
+ # Attention.
293
+ img_attn_output, txt_attn_output = attn_forward(
294
+ self.attn,
295
+ hidden_states=[each[0] for each in img_variables],
296
+ hidden_states2=[each[0] for each in txt_variables],
297
+ position_embs=position_embs,
298
+ adapters=adapters,
299
+ **kwargs,
300
+ )
301
+
302
+ text_out = []
303
+ for i in range(len(text_hidden_states)):
304
+ _, gate_msa, shift_mlp, scale_mlp, gate_mlp = txt_variables[i]
305
+ text_h = text_hidden_states[i] + txt_attn_output[i] * gate_msa.unsqueeze(1)
306
+ norm_h = (
307
+ self.norm2_context(text_h) * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
308
+ )
309
+ text_h = self.ff_context(norm_h) * gate_mlp.unsqueeze(1) + text_h
310
+ text_out.append(clip_hidden_states(text_h))
311
+
312
+ image_out = []
313
+ for i in range(len(image_hidden_states)):
314
+ _, gate_msa, shift_mlp, scale_mlp, gate_mlp = img_variables[i]
315
+ image_h = (
316
+ image_hidden_states[i] + img_attn_output[i] * gate_msa.unsqueeze(1)
317
+ ).to(image_hidden_states[i].dtype)
318
+ norm_h = self.norm2(image_h) * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
319
+ with specify_lora((self.ff.net[2],), adapters[i + txt_n]):
320
+ image_h = image_h + self.ff(norm_h) * gate_mlp.unsqueeze(1)
321
+ image_out.append(clip_hidden_states(image_h))
322
+ return image_out, text_out
323
+
324
+
325
+ def single_block_forward(
326
+ self,
327
+ hidden_states: List[torch.FloatTensor],
328
+ tembs: List[torch.FloatTensor],
329
+ adapters: List[str],
330
+ position_embs=None,
331
+ attn_forward=attn_forward,
332
+ **kwargs: dict,
333
+ ):
334
+ mlp_hidden_states, gates = [[None for _ in hidden_states] for _ in range(2)]
335
+
336
+ hidden_state_norm = []
337
+ for i, hidden_state in enumerate(hidden_states):
338
+ # [NOTE]!: This function's output is slightly DIFFERENT from the original
339
+ # FLUX version. In the original implementation, the gates were computed using
340
+ # the combined hidden states from both the image and text branches. Here, each
341
+ # branch computes its gate using only its own hidden state.
342
+ with specify_lora((self.norm.linear, self.proj_mlp), adapters[i]):
343
+ h_norm, gates[i] = self.norm(hidden_state, emb=tembs[i])
344
+ mlp_hidden_states[i] = self.act_mlp(self.proj_mlp(h_norm))
345
+ hidden_state_norm.append(h_norm)
346
+
347
+ attn_outputs = attn_forward(
348
+ self.attn, hidden_state_norm, adapters, position_embs=position_embs, **kwargs
349
+ )
350
+
351
+ h_out = []
352
+ for i in range(len(hidden_states)):
353
+ with specify_lora((self.proj_out,), adapters[i]):
354
+ h = torch.cat([attn_outputs[i], mlp_hidden_states[i]], dim=2)
355
+ h = gates[i].unsqueeze(1) * self.proj_out(h) + hidden_states[i]
356
+ h_out.append(clip_hidden_states(h))
357
+
358
+ return h_out
359
+
360
+
361
+ def transformer_forward(
362
+ transformer: FluxTransformer2DModel,
363
+ image_features: List[torch.Tensor],
364
+ text_features: List[torch.Tensor] = None,
365
+ img_ids: List[torch.Tensor] = None,
366
+ txt_ids: List[torch.Tensor] = None,
367
+ pooled_projections: List[torch.Tensor] = None,
368
+ timesteps: List[torch.LongTensor] = None,
369
+ guidances: List[torch.Tensor] = None,
370
+ adapters: List[str] = None,
371
+ # Assign the function to be used for the forward pass
372
+ single_block_forward=single_block_forward,
373
+ block_forward=block_forward,
374
+ attn_forward=attn_forward,
375
+ **kwargs: dict,
376
+ ):
377
+ self = transformer
378
+ txt_n = len(text_features) if text_features is not None else 0
379
+
380
+ adapters = adapters or [None] * (txt_n + len(image_features))
381
+ assert len(adapters) == len(timesteps)
382
+
383
+ # Preprocess the image_features
384
+ image_hidden_states = []
385
+ for i, image_feature in enumerate(image_features):
386
+ with specify_lora((self.x_embedder,), adapters[i + txt_n]):
387
+ image_hidden_states.append(self.x_embedder(image_feature))
388
+
389
+ # Preprocess the text_features
390
+ text_hidden_states = []
391
+ for text_feature in text_features:
392
+ text_hidden_states.append(self.context_embedder(text_feature))
393
+
394
+ # Prepare embeddings of (timestep, guidance, pooled_projections)
395
+ assert len(timesteps) == len(image_features) + len(text_features)
396
+
397
+ def get_temb(timestep, guidance, pooled_projection):
398
+ timestep = timestep.to(image_hidden_states[0].dtype) * 1000
399
+ if guidance is not None:
400
+ guidance = guidance.to(image_hidden_states[0].dtype) * 1000
401
+ return self.time_text_embed(timestep, guidance, pooled_projection)
402
+ else:
403
+ return self.time_text_embed(timestep, pooled_projection)
404
+
405
+ tembs = [get_temb(*each) for each in zip(timesteps, guidances, pooled_projections)]
406
+
407
+ # Prepare position embeddings for each token
408
+ position_embs = [self.pos_embed(each) for each in (*txt_ids, *img_ids)]
409
+
410
+ # Prepare the gradient checkpointing kwargs
411
+ gckpt_kwargs: Dict[str, Any] = (
412
+ {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
413
+ )
414
+
415
+ # dual branch blocks
416
+ for block in self.transformer_blocks:
417
+ block_kwargs = {
418
+ "self": block,
419
+ "image_hidden_states": image_hidden_states,
420
+ "text_hidden_states": text_hidden_states,
421
+ "tembs": tembs,
422
+ "position_embs": position_embs,
423
+ "adapters": adapters,
424
+ "attn_forward": attn_forward,
425
+ **kwargs,
426
+ }
427
+ if self.training and self.gradient_checkpointing:
428
+ image_hidden_states, text_hidden_states = torch.utils.checkpoint.checkpoint(
429
+ block_forward, **block_kwargs, **gckpt_kwargs
430
+ )
431
+ else:
432
+ image_hidden_states, text_hidden_states = block_forward(**block_kwargs)
433
+
434
+ # combine image and text hidden states then pass through the single transformer blocks
435
+ all_hidden_states = [*text_hidden_states, *image_hidden_states]
436
+ for block in self.single_transformer_blocks:
437
+ block_kwargs = {
438
+ "self": block,
439
+ "hidden_states": all_hidden_states,
440
+ "tembs": tembs,
441
+ "position_embs": position_embs,
442
+ "adapters": adapters,
443
+ "attn_forward": attn_forward,
444
+ **kwargs,
445
+ }
446
+ if self.training and self.gradient_checkpointing:
447
+ all_hidden_states = torch.utils.checkpoint.checkpoint(
448
+ single_block_forward, **block_kwargs, **gckpt_kwargs
449
+ )
450
+ else:
451
+ all_hidden_states = single_block_forward(**block_kwargs)
452
+
453
+ image_hidden_states = self.norm_out(all_hidden_states[txt_n], tembs[txt_n])
454
+ output = self.proj_out(image_hidden_states)
455
+
456
+ return (output,)
457
+
458
+
459
+ @torch.no_grad()
460
+ def generate(
461
+ pipeline: FluxPipeline,
462
+ prompt: Union[str, List[str]] = None,
463
+ prompt_2: Optional[Union[str, List[str]]] = None,
464
+ height: Optional[int] = 512,
465
+ width: Optional[int] = 512,
466
+ num_inference_steps: int = 28,
467
+ timesteps: List[int] = None,
468
+ guidance_scale: float = 3.5,
469
+ num_images_per_prompt: Optional[int] = 1,
470
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
471
+ latents: Optional[torch.FloatTensor] = None,
472
+ prompt_embeds: Optional[torch.FloatTensor] = None,
473
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
474
+ output_type: Optional[str] = "pil",
475
+ return_dict: bool = True,
476
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
477
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
478
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
479
+ max_sequence_length: int = 512,
480
+ # Condition Parameters (Optional)
481
+ main_adapter: Optional[List[str]] = None,
482
+ conditions: List[Condition] = [],
483
+ image_guidance_scale: float = 1.0,
484
+ transformer_kwargs: Optional[Dict[str, Any]] = {},
485
+ kv_cache=False,
486
+ latent_mask=None,
487
+ **params: dict,
488
+ ):
489
+ self = pipeline
490
+
491
+ height = height or self.default_sample_size * self.vae_scale_factor
492
+ width = width or self.default_sample_size * self.vae_scale_factor
493
+
494
+ # Check inputs. Raise error if not correct
495
+ self.check_inputs(
496
+ prompt,
497
+ prompt_2,
498
+ height,
499
+ width,
500
+ prompt_embeds=prompt_embeds,
501
+ pooled_prompt_embeds=pooled_prompt_embeds,
502
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
503
+ max_sequence_length=max_sequence_length,
504
+ )
505
+
506
+ self._guidance_scale = guidance_scale
507
+ self._joint_attention_kwargs = joint_attention_kwargs
508
+
509
+ # Define call parameters
510
+ if prompt is not None and isinstance(prompt, str):
511
+ batch_size = 1
512
+ elif prompt is not None and isinstance(prompt, list):
513
+ batch_size = len(prompt)
514
+ else:
515
+ batch_size = prompt_embeds.shape[0]
516
+
517
+ device = self._execution_device
518
+
519
+ # Prepare prompt embeddings
520
+ (
521
+ prompt_embeds,
522
+ pooled_prompt_embeds,
523
+ text_ids,
524
+ ) = self.encode_prompt(
525
+ prompt=prompt,
526
+ prompt_2=prompt_2,
527
+ prompt_embeds=prompt_embeds,
528
+ pooled_prompt_embeds=pooled_prompt_embeds,
529
+ device=device,
530
+ num_images_per_prompt=num_images_per_prompt,
531
+ max_sequence_length=max_sequence_length,
532
+ )
533
+
534
+ # Prepare latent variables
535
+ num_channels_latents = self.transformer.config.in_channels // 4
536
+ latents, latent_image_ids = self.prepare_latents(
537
+ batch_size * num_images_per_prompt,
538
+ num_channels_latents,
539
+ height,
540
+ width,
541
+ prompt_embeds.dtype,
542
+ device,
543
+ generator,
544
+ latents,
545
+ )
546
+
547
+ if latent_mask is not None:
548
+ latent_mask = latent_mask.T.reshape(-1)
549
+ latents = latents[:, latent_mask]
550
+ latent_image_ids = latent_image_ids[latent_mask]
551
+
552
+ # Prepare conditions
553
+ c_latents, uc_latents, c_ids, c_timesteps = ([], [], [], [])
554
+ c_projections, c_guidances, c_adapters = ([], [], [])
555
+ complement_cond = None
556
+ for condition in conditions:
557
+ tokens, ids = condition.encode(self)
558
+ c_latents.append(tokens) # [batch_size, token_n, token_dim]
559
+ # Empty condition for unconditioned image
560
+ if image_guidance_scale != 1.0:
561
+ uc_latents.append(condition.encode(self, empty=True)[0])
562
+ c_ids.append(ids) # [token_n, id_dim(3)]
563
+ c_timesteps.append(torch.zeros([1], device=device))
564
+ c_projections.append(pooled_prompt_embeds)
565
+ c_guidances.append(torch.ones([1], device=device))
566
+ c_adapters.append(condition.adapter)
567
+ # This complement_condition will be combined with the original image.
568
+ # See the token integration of OminiControl2 [https://arxiv.org/abs/2503.08280]
569
+ if condition.is_complement:
570
+ complement_cond = (tokens, ids)
571
+
572
+ # Prepare timesteps
573
+ sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
574
+ image_seq_len = latents.shape[1]
575
+ mu = calculate_shift(
576
+ image_seq_len,
577
+ self.scheduler.config.base_image_seq_len,
578
+ self.scheduler.config.max_image_seq_len,
579
+ self.scheduler.config.base_shift,
580
+ self.scheduler.config.max_shift,
581
+ )
582
+ timesteps, num_inference_steps = retrieve_timesteps(
583
+ self.scheduler, num_inference_steps, device, timesteps, sigmas, mu=mu
584
+ )
585
+ num_warmup_steps = max(
586
+ len(timesteps) - num_inference_steps * self.scheduler.order, 0
587
+ )
588
+ self._num_timesteps = len(timesteps)
589
+
590
+ if kv_cache:
591
+ attn_counter = 0
592
+ for module in self.transformer.modules():
593
+ if isinstance(module, Attention):
594
+ setattr(module, "cache_idx", attn_counter)
595
+ attn_counter += 1
596
+ kv_cond = [[[], []] for _ in range(attn_counter)]
597
+ kv_uncond = [[[], []] for _ in range(attn_counter)]
598
+
599
+ def clear_cache():
600
+ for storage in [kv_cond, kv_uncond]:
601
+ for keys, values in storage:
602
+ keys.clear()
603
+ values.clear()
604
+
605
+ branch_n = len(conditions) + 2
606
+ group_mask = torch.ones([branch_n, branch_n], dtype=torch.bool)
607
+ # Disable attention across different condition branches
608
+ group_mask[2:, 2:] = torch.diag(torch.tensor([1] * len(conditions)))
609
+ # Disable attention from the condition branches to the image and text branches
610
+ if kv_cache:
611
+ group_mask[2:, :2] = False
612
+
613
+ # Denoising loop
614
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
615
+ for i, t in enumerate(timesteps):
616
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
617
+ timestep = t.expand(latents.shape[0]).to(latents.dtype) / 1000
618
+
619
+ # handle guidance
620
+ if self.transformer.config.guidance_embeds:
621
+ guidance = torch.tensor([guidance_scale], device=device)
622
+ guidance = guidance.expand(latents.shape[0])
623
+ else:
624
+ guidance, c_guidances = None, [None for _ in c_guidances]
625
+
626
+ if kv_cache:
627
+ mode = "write" if i == 0 else "read"
628
+ if mode == "write":
629
+ clear_cache()
630
+ use_cond = not (kv_cache) or mode == "write"
631
+
632
+ noise_pred = transformer_forward(
633
+ self.transformer,
634
+ image_features=[latents] + (c_latents if use_cond else []),
635
+ text_features=[prompt_embeds],
636
+ img_ids=[latent_image_ids] + (c_ids if use_cond else []),
637
+ txt_ids=[text_ids],
638
+ timesteps=[timestep, timestep] + (c_timesteps if use_cond else []),
639
+ pooled_projections=[pooled_prompt_embeds] * 2
640
+ + (c_projections if use_cond else []),
641
+ guidances=[guidance] * 2 + (c_guidances if use_cond else []),
642
+ return_dict=False,
643
+ adapters=[main_adapter] * 2 + (c_adapters if use_cond else []),
644
+ cache_mode=mode if kv_cache else None,
645
+ cache_storage=kv_cond if kv_cache else None,
646
+ to_cache=[False, False, *[True] * len(c_latents)],
647
+ group_mask=group_mask,
648
+ **transformer_kwargs,
649
+ )[0]
650
+
651
+ if image_guidance_scale != 1.0:
652
+ unc_pred = transformer_forward(
653
+ self.transformer,
654
+ image_features=[latents] + (uc_latents if use_cond else []),
655
+ text_features=[prompt_embeds],
656
+ img_ids=[latent_image_ids] + (c_ids if use_cond else []),
657
+ txt_ids=[text_ids],
658
+ timesteps=[timestep, timestep] + (c_timesteps if use_cond else []),
659
+ pooled_projections=[pooled_prompt_embeds] * 2
660
+ + (c_projections if use_cond else []),
661
+ guidances=[guidance] * 2 + (c_guidances if use_cond else []),
662
+ return_dict=False,
663
+ adapters=[main_adapter] * 2 + (c_adapters if use_cond else []),
664
+ cache_mode=mode if kv_cache else None,
665
+ cache_storage=kv_uncond if kv_cache else None,
666
+ to_cache=[False, False, *[True] * len(c_latents)],
667
+ **transformer_kwargs,
668
+ )[0]
669
+
670
+ noise_pred = unc_pred + image_guidance_scale * (noise_pred - unc_pred)
671
+
672
+ # compute the previous noisy sample x_t -> x_t-1
673
+ latents_dtype = latents.dtype
674
+ latents = self.scheduler.step(noise_pred, t, latents)[0]
675
+
676
+ if latents.dtype != latents_dtype:
677
+ if torch.backends.mps.is_available():
678
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
679
+ latents = latents.to(latents_dtype)
680
+
681
+ if callback_on_step_end is not None:
682
+ callback_kwargs = {}
683
+ for k in callback_on_step_end_tensor_inputs:
684
+ callback_kwargs[k] = locals()[k]
685
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
686
+
687
+ latents = callback_outputs.pop("latents", latents)
688
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
689
+
690
+ # call the callback, if provided
691
+ if i == len(timesteps) - 1 or (
692
+ (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
693
+ ):
694
+ progress_bar.update()
695
+
696
+ if latent_mask is not None:
697
+ # Combine the generated latents and the complement condition
698
+ assert complement_cond is not None
699
+ comp_latent, comp_ids = complement_cond
700
+ all_ids = torch.cat([latent_image_ids, comp_ids], dim=0) # (Ta+Tc,3)
701
+ shape = (all_ids.max(dim=0).values + 1).to(torch.long) # (3,)
702
+ H, W = shape[1].item(), shape[2].item()
703
+ B, _, C = latents.shape
704
+ # Create an empty canvas
705
+ canvas = latents.new_zeros(B, H * W, C) # (B,H*W,C)
706
+
707
+ # Stash the latents and the complement condition
708
+ def _stash(canvas, tokens, ids, H, W) -> None:
709
+ B, T, C = tokens.shape
710
+ ids = ids.to(torch.long)
711
+ flat_idx = (ids[:, 1] * W + ids[:, 2]).to(torch.long)
712
+ canvas.view(B, -1, C).index_copy_(1, flat_idx, tokens)
713
+
714
+ _stash(canvas, latents, latent_image_ids, H, W)
715
+ _stash(canvas, comp_latent, comp_ids, H, W)
716
+ latents = canvas.view(B, H * W, C)
717
+
718
+ if output_type == "latent":
719
+ image = latents
720
+ else:
721
+ latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
722
+ latents = (
723
+ latents / self.vae.config.scaling_factor
724
+ ) + self.vae.config.shift_factor
725
+ image = self.vae.decode(latents, return_dict=False)[0]
726
+ image = self.image_processor.postprocess(image, output_type=output_type)
727
+
728
+ # Offload all models
729
+ self.maybe_free_model_hooks()
730
+
731
+ if not return_dict:
732
+ return (image,)
733
+
734
+ return FluxPipelineOutput(images=image)
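
A hedged usage sketch for the helpers defined in flux_omini.py above (Condition, convert_to_condition, generate). The FLUX checkpoint id, the input image, and the adapter name "canny" are assumptions for illustration, not artifacts of this upload:

import torch
from PIL import Image
from diffusers import FluxPipeline
from omini.pipeline.flux_omini import Condition, convert_to_condition, generate

pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
).to("cuda")

raw = Image.open("example.jpg").resize((512, 512))
condition = Condition(
    convert_to_condition("canny", raw),  # edge map fed to the condition branch
    adapter_setting="canny",             # name of a LoRA adapter assumed to be loaded on the transformer
    position_delta=(0, 0),
)
out = generate(
    pipe,
    prompt="a photo of a cat",
    conditions=[condition],
    height=512,
    width=512,
    num_inference_steps=28,
)
out.images[0].save("result.png")
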
omini/pipeline/flux_omini_ablate_qkv.py ADDED
@@ -0,0 +1,772 @@
1
+ """
2
+ This version is for an ablation study of the rotation applied to the Q/K/V projections.
3
+
4
+ The module-level variables `T_Q`, `T_K`, and `T_V` override the rotation matrix `T`
5
+ passed to `specify_lora` for the query, key, and value projections of the image branch.
6
+ """
7
+
8
+ import torch
9
+ from typing import List, Union, Optional, Dict, Any, Callable, Type, Tuple
10
+
11
+ from diffusers.pipelines import FluxPipeline
12
+ from diffusers.pipelines.flux.pipeline_flux import (
13
+ FluxPipelineOutput,
14
+ FluxTransformer2DModel,
15
+ calculate_shift,
16
+ retrieve_timesteps,
17
+ np,
18
+ )
19
+ from diffusers.models.attention_processor import Attention, F
20
+ from diffusers.models.embeddings import apply_rotary_emb
21
+ from transformers import pipeline
22
+
23
+ from peft.tuners.tuners_utils import BaseTunerLayer
24
+ from accelerate.utils import is_torch_version
25
+
26
+ from contextlib import contextmanager
27
+
28
+ import cv2
29
+
30
+ from PIL import Image, ImageFilter
31
+
32
+ T_Q = None
33
+ T_K = None
34
+ T_V = None
35
+
36
+ def seed_everything(seed: int = 42):
37
+ torch.backends.cudnn.deterministic = True
38
+ torch.manual_seed(seed)
39
+ np.random.seed(seed)
40
+
41
+
42
+ def clip_hidden_states(hidden_states: torch.FloatTensor) -> torch.FloatTensor:
43
+ if hidden_states.dtype == torch.float16:
44
+ hidden_states = hidden_states.clip(-65504, 65504)
45
+ return hidden_states
46
+
47
+
48
+ def encode_images(pipeline: FluxPipeline, images: torch.Tensor):
49
+ """
50
+ Encodes the images into tokens and ids for FLUX pipeline.
51
+ """
52
+ images = pipeline.image_processor.preprocess(images)
53
+ images = images.to(pipeline.device).to(pipeline.dtype)
54
+ images = pipeline.vae.encode(images).latent_dist.sample()
55
+ images = (
56
+ images - pipeline.vae.config.shift_factor
57
+ ) * pipeline.vae.config.scaling_factor
58
+ images_tokens = pipeline._pack_latents(images, *images.shape)
59
+ images_ids = pipeline._prepare_latent_image_ids(
60
+ images.shape[0],
61
+ images.shape[2],
62
+ images.shape[3],
63
+ pipeline.device,
64
+ pipeline.dtype,
65
+ )
66
+ if images_tokens.shape[1] != images_ids.shape[0]:
67
+ images_ids = pipeline._prepare_latent_image_ids(
68
+ images.shape[0],
69
+ images.shape[2] // 2,
70
+ images.shape[3] // 2,
71
+ pipeline.device,
72
+ pipeline.dtype,
73
+ )
74
+ return images_tokens, images_ids
75
+
76
+
77
+ depth_pipe = None
78
+
79
+
80
+ def convert_to_condition(
81
+ condition_type: str,
82
+ raw_img: Union[Image.Image, torch.Tensor],
83
+ blur_radius: Optional[int] = 5,
84
+ ) -> Union[Image.Image, torch.Tensor]:
85
+ if condition_type == "depth":
86
+ global depth_pipe
87
+ depth_pipe = depth_pipe or pipeline(
88
+ task="depth-estimation",
89
+ model="LiheYoung/depth-anything-small-hf",
90
+ device="cpu", # Use "cpu" to enable parallel processing
91
+ )
92
+ source_image = raw_img.convert("RGB")
93
+ condition_img = depth_pipe(source_image)["depth"].convert("RGB")
94
+ return condition_img
95
+ elif condition_type == "canny":
96
+ img = np.array(raw_img)
97
+ edges = cv2.Canny(img, 100, 200)
98
+ edges = Image.fromarray(edges).convert("RGB")
99
+ return edges
100
+ elif condition_type == "coloring":
101
+ return raw_img.convert("L").convert("RGB")
102
+ elif condition_type == "deblurring":
103
+ condition_image = (
104
+ raw_img.convert("RGB")
105
+ .filter(ImageFilter.GaussianBlur(blur_radius))
106
+ .convert("RGB")
107
+ )
108
+ return condition_image
109
+ else:
110
+ print("Warning: Returning the raw image.")
111
+ return raw_img.convert("RGB")
112
+
113
+
114
+ class Condition(object):
115
+ def __init__(
116
+ self,
117
+ condition: Union[Image.Image, torch.Tensor],
118
+ adapter_setting: Union[str, dict],
119
+ position_delta=None,
120
+ position_scale=1.0,
121
+ latent_mask=None,
122
+ is_complement=False,
123
+ ) -> None:
124
+ self.condition = condition
125
+ self.adapter = adapter_setting
126
+ self.position_delta = position_delta
127
+ self.position_scale = position_scale
128
+ self.latent_mask = (
129
+ latent_mask.T.reshape(-1) if latent_mask is not None else None
130
+ )
131
+ self.is_complement = is_complement
132
+
133
+ def encode(
134
+ self, pipe: FluxPipeline, empty: bool = False
135
+ ) -> Tuple[torch.Tensor, torch.Tensor, int]:
136
+ condition_empty = Image.new("RGB", self.condition.size, (0, 0, 0))
137
+ tokens, ids = encode_images(pipe, condition_empty if empty else self.condition)
138
+
139
+ if self.position_delta is not None:
140
+ ids[:, 1] += self.position_delta[0]
141
+ ids[:, 2] += self.position_delta[1]
142
+
143
+ if self.position_scale != 1.0:
144
+ scale_bias = (self.position_scale - 1.0) / 2
145
+ ids[:, 1:] *= self.position_scale
146
+ ids[:, 1:] += scale_bias
147
+
148
+ if self.latent_mask is not None:
149
+ tokens = tokens[:, self.latent_mask]
150
+ ids = ids[self.latent_mask]
151
+
152
+ return tokens, ids
153
+
154
+
155
+ @contextmanager
156
+ def specify_lora(lora_modules: List[BaseTunerLayer], specified_lora, T=None):
157
+ # Filter valid lora modules
158
+ valid_lora_modules = [m for m in lora_modules if isinstance(m, BaseTunerLayer)]
159
+ # Save original scales
160
+ original_scales = [
161
+ {
162
+ adapter: module.scaling[adapter]
163
+ for adapter in module.active_adapters
164
+ if adapter in module.scaling
165
+ }
166
+ for module in valid_lora_modules
167
+ ]
168
+ # Enter context: adjust scaling
169
+ for module in valid_lora_modules:
170
+ for adapter in module.active_adapters:
171
+ if adapter in module.scaling:
172
+ module.scaling[adapter] = 1. if adapter == specified_lora else 0
173
+
174
+ if hasattr(module, 'rotation') and T is not None:
175
+ # alter T if specified
176
+ if adapter in module.rotation:
177
+ # print("FOR DEBUG:entering specify_lora context: setting T")
178
+ module.rotation[adapter].T = T
179
+
180
+ try:
181
+ yield
182
+ finally:
183
+ # Exit context: restore original scales
184
+ for module, scales in zip(valid_lora_modules, original_scales):
185
+ for adapter in module.active_adapters:
186
+ if adapter in module.scaling:
187
+ module.scaling[adapter] = scales[adapter]
188
+
189
+
190
+ def attn_forward(
191
+ attn: Attention,
192
+ hidden_states: List[torch.FloatTensor],
193
+ adapters: List[str],
194
+ hidden_states2: Optional[List[torch.FloatTensor]] = [],
195
+ position_embs: Optional[List[torch.Tensor]] = None,
196
+ group_mask: Optional[torch.Tensor] = None,
197
+ cache_mode: Optional[str] = None,
198
+ # to determine whether to cache the keys and values for this branch
199
+ to_cache: Optional[List[torch.Tensor]] = None,
200
+ cache_storage: Optional[List[torch.Tensor]] = None,
201
+ **kwargs: dict,
202
+ ) -> torch.FloatTensor:
203
+ bs, _, _ = hidden_states[0].shape
204
+ h2_n = len(hidden_states2)
205
+
206
+ queries, keys, values = [], [], []
207
+
208
+ # Prepare query, key, value for each encoder hidden state (text branch)
209
+ for i, hidden_state in enumerate(hidden_states2):
210
+ query = attn.add_q_proj(hidden_state)
211
+ key = attn.add_k_proj(hidden_state)
212
+ value = attn.add_v_proj(hidden_state)
213
+
214
+ head_dim = key.shape[-1] // attn.heads
215
+ reshape_fn = lambda x: x.view(bs, -1, attn.heads, head_dim).transpose(1, 2)
216
+
217
+ query, key, value = map(reshape_fn, (query, key, value))
218
+ query, key = attn.norm_added_q(query), attn.norm_added_k(key)
219
+
220
+ queries.append(query)
221
+ keys.append(key)
222
+ values.append(value)
223
+
224
+
225
+ ## THIS IS THE MODIFIED PART TO ABLATE QKV ROTATION T ##
226
+ # Prepare query, key, value for each hidden state (image branch)
227
+ for i, hidden_state in enumerate(hidden_states):
228
+ with specify_lora((attn.to_q,), adapters[i + h2_n], T=T_Q):
229
+ query = attn.to_q(hidden_state)
230
+
231
+ with specify_lora((attn.to_k,), adapters[i + h2_n], T=T_K):
232
+ key = attn.to_k(hidden_state)
233
+
234
+ with specify_lora((attn.to_v,), adapters[i + h2_n], T=T_V):
235
+ value = attn.to_v(hidden_state)
236
+
237
+ head_dim = key.shape[-1] // attn.heads
238
+ reshape_fn = lambda x: x.view(bs, -1, attn.heads, head_dim).transpose(1, 2)
239
+
240
+ query, key, value = map(reshape_fn, (query, key, value))
241
+ query, key = attn.norm_q(query), attn.norm_k(key)
242
+
243
+ queries.append(query)
244
+ keys.append(key)
245
+ values.append(value)
246
+
247
+ # Apply rotary embedding
248
+ if position_embs is not None:
249
+ queries = [apply_rotary_emb(q, position_embs[i]) for i, q in enumerate(queries)]
250
+ keys = [apply_rotary_emb(k, position_embs[i]) for i, k in enumerate(keys)]
251
+
252
+ if cache_mode == "write":
253
+ for i, (k, v) in enumerate(zip(keys, values)):
254
+ if to_cache[i]:
255
+ cache_storage[attn.cache_idx][0].append(k)
256
+ cache_storage[attn.cache_idx][1].append(v)
257
+
258
+ attn_outputs = []
259
+ for i, query in enumerate(queries):
260
+ keys_, values_ = [], []
261
+ # Add keys and values from other branches
262
+ for j, (k, v) in enumerate(zip(keys, values)):
263
+ if (group_mask is not None) and not (group_mask[i][j].item()):
264
+ continue
265
+ keys_.append(k)
266
+ values_.append(v)
267
+ if cache_mode == "read":
268
+ keys_.extend(cache_storage[attn.cache_idx][0])
269
+ values_.extend(cache_storage[attn.cache_idx][1])
270
+ # Add keys and values from cache TODO
271
+ # Attention computation
272
+ attn_output = F.scaled_dot_product_attention(
273
+ query, torch.cat(keys_, dim=2), torch.cat(values_, dim=2)
274
+ ).to(query.dtype)
275
+ attn_output = attn_output.transpose(1, 2).reshape(bs, -1, attn.heads * head_dim)
276
+ attn_outputs.append(attn_output)
277
+
278
+ # Reshape attention output to match the original hidden states
279
+ h_out, h2_out = [], []
280
+
281
+ for i, hidden_state in enumerate(hidden_states2):
282
+ h2_out.append(attn.to_add_out(attn_outputs[i]))
283
+
284
+ for i, hidden_state in enumerate(hidden_states):
285
+ h = attn_outputs[i + h2_n]
286
+ if getattr(attn, "to_out", None) is not None:
287
+ with specify_lora((attn.to_out[0],), adapters[i + h2_n]):
288
+ h = attn.to_out[0](h)
289
+ h_out.append(h)
290
+
291
+ return (h_out, h2_out) if h2_n else h_out
292
+
293
+
294
+ def block_forward(
295
+ self,
296
+ image_hidden_states: List[torch.FloatTensor],
297
+ text_hidden_states: List[torch.FloatTensor],
298
+ tembs: List[torch.FloatTensor],
299
+ adapters: List[str],
300
+ position_embs=None,
301
+ attn_forward=attn_forward,
302
+ **kwargs: dict,
303
+ ):
304
+ txt_n = len(text_hidden_states)
305
+
306
+ img_variables, txt_variables = [], []
307
+
308
+ for i, text_h in enumerate(text_hidden_states):
309
+ txt_variables.append(self.norm1_context(text_h, emb=tembs[i]))
310
+
311
+ for i, image_h in enumerate(image_hidden_states):
312
+ with specify_lora((self.norm1.linear,), adapters[i + txt_n]):
313
+ img_variables.append(self.norm1(image_h, emb=tembs[i + txt_n]))
314
+
315
+ # Attention.
316
+ img_attn_output, txt_attn_output = attn_forward(
317
+ self.attn,
318
+ hidden_states=[each[0] for each in img_variables],
319
+ hidden_states2=[each[0] for each in txt_variables],
320
+ position_embs=position_embs,
321
+ adapters=adapters,
322
+ **kwargs,
323
+ )
324
+
325
+ text_out = []
326
+ for i in range(len(text_hidden_states)):
327
+ _, gate_msa, shift_mlp, scale_mlp, gate_mlp = txt_variables[i]
328
+ text_h = text_hidden_states[i] + txt_attn_output[i] * gate_msa.unsqueeze(1)
329
+ norm_h = (
330
+ self.norm2_context(text_h) * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
331
+ )
332
+ text_h = self.ff_context(norm_h) * gate_mlp.unsqueeze(1) + text_h
333
+ text_out.append(clip_hidden_states(text_h))
334
+
335
+ image_out = []
336
+ for i in range(len(image_hidden_states)):
337
+ _, gate_msa, shift_mlp, scale_mlp, gate_mlp = img_variables[i]
338
+ image_h = (
339
+ image_hidden_states[i] + img_attn_output[i] * gate_msa.unsqueeze(1)
340
+ ).to(image_hidden_states[i].dtype)
341
+ norm_h = self.norm2(image_h) * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
342
+ with specify_lora((self.ff.net[2],), adapters[i + txt_n]):
343
+ image_h = image_h + self.ff(norm_h) * gate_mlp.unsqueeze(1)
344
+ image_out.append(clip_hidden_states(image_h))
345
+ return image_out, text_out
346
+
347
+
348
+ def single_block_forward(
349
+ self,
350
+ hidden_states: List[torch.FloatTensor],
351
+ tembs: List[torch.FloatTensor],
352
+ adapters: List[str],
353
+ position_embs=None,
354
+ attn_forward=attn_forward,
355
+ **kwargs: dict,
356
+ ):
357
+ mlp_hidden_states, gates = [[None for _ in hidden_states] for _ in range(2)]
358
+
359
+ hidden_state_norm = []
360
+ for i, hidden_state in enumerate(hidden_states):
361
+ # [NOTE]!: This function's output is slightly DIFFERENT from the original
362
+ # FLUX version. In the original implementation, the gates were computed using
363
+ # the combined hidden states from both the image and text branches. Here, each
364
+ # branch computes its gate using only its own hidden state.
365
+ with specify_lora((self.norm.linear, self.proj_mlp), adapters[i]):
366
+ h_norm, gates[i] = self.norm(hidden_state, emb=tembs[i])
367
+ mlp_hidden_states[i] = self.act_mlp(self.proj_mlp(h_norm))
368
+ hidden_state_norm.append(h_norm)
369
+
370
+ attn_outputs = attn_forward(
371
+ self.attn, hidden_state_norm, adapters, position_embs=position_embs, **kwargs
372
+ )
373
+
374
+ h_out = []
375
+ for i in range(len(hidden_states)):
376
+ with specify_lora((self.proj_out,), adapters[i]):
377
+ h = torch.cat([attn_outputs[i], mlp_hidden_states[i]], dim=2)
378
+ h = gates[i].unsqueeze(1) * self.proj_out(h) + hidden_states[i]
379
+ h_out.append(clip_hidden_states(h))
380
+
381
+ return h_out
382
+
383
+
384
+ def transformer_forward(
385
+ transformer: FluxTransformer2DModel,
386
+ image_features: List[torch.Tensor],
387
+ text_features: List[torch.Tensor] = None,
388
+ img_ids: List[torch.Tensor] = None,
389
+ txt_ids: List[torch.Tensor] = None,
390
+ pooled_projections: List[torch.Tensor] = None,
391
+ timesteps: List[torch.LongTensor] = None,
392
+ guidances: List[torch.Tensor] = None,
393
+ adapters: List[str] = None,
394
+ # Assign the function to be used for the forward pass
395
+ single_block_forward=single_block_forward,
396
+ block_forward=block_forward,
397
+ attn_forward=attn_forward,
398
+ **kwargs: dict,
399
+ ):
400
+ self = transformer
401
+ txt_n = len(text_features) if text_features is not None else 0
402
+
403
+ adapters = adapters or [None] * (txt_n + len(image_features))
404
+ assert len(adapters) == len(timesteps)
405
+
406
+ # Preprocess the image_features
407
+ image_hidden_states = []
408
+ for i, image_feature in enumerate(image_features):
409
+ with specify_lora((self.x_embedder,), adapters[i + txt_n]):
410
+ image_hidden_states.append(self.x_embedder(image_feature))
411
+
412
+ # Preprocess the text_features
413
+ text_hidden_states = []
414
+ for text_feature in text_features:
415
+ text_hidden_states.append(self.context_embedder(text_feature))
416
+
417
+ # Prepare embeddings of (timestep, guidance, pooled_projections)
418
+ assert len(timesteps) == len(image_features) + len(text_features)
419
+
420
+ def get_temb(timestep, guidance, pooled_projection):
421
+ timestep = timestep.to(image_hidden_states[0].dtype) * 1000
422
+ if guidance is not None:
423
+ guidance = guidance.to(image_hidden_states[0].dtype) * 1000
424
+ return self.time_text_embed(timestep, guidance, pooled_projection)
425
+ else:
426
+ return self.time_text_embed(timestep, pooled_projection)
427
+
428
+ tembs = [get_temb(*each) for each in zip(timesteps, guidances, pooled_projections)]
429
+
430
+ # Prepare position embeddings for each token
431
+ position_embs = [self.pos_embed(each) for each in (*txt_ids, *img_ids)]
432
+
433
+ # Prepare the gradient checkpointing kwargs
434
+ gckpt_kwargs: Dict[str, Any] = (
435
+ {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
436
+ )
437
+
438
+ # dual branch blocks
439
+ for block in self.transformer_blocks:
440
+ block_kwargs = {
441
+ "self": block,
442
+ "image_hidden_states": image_hidden_states,
443
+ "text_hidden_states": text_hidden_states,
444
+ "tembs": tembs,
445
+ "position_embs": position_embs,
446
+ "adapters": adapters,
447
+ "attn_forward": attn_forward,
448
+ **kwargs,
449
+ }
450
+ if self.training and self.gradient_checkpointing:
451
+ image_hidden_states, text_hidden_states = torch.utils.checkpoint.checkpoint(
452
+ block_forward, **block_kwargs, **gckpt_kwargs
453
+ )
454
+ else:
455
+ image_hidden_states, text_hidden_states = block_forward(**block_kwargs)
456
+
457
+ # combine image and text hidden states then pass through the single transformer blocks
458
+ all_hidden_states = [*text_hidden_states, *image_hidden_states]
459
+ for block in self.single_transformer_blocks:
460
+ block_kwargs = {
461
+ "self": block,
462
+ "hidden_states": all_hidden_states,
463
+ "tembs": tembs,
464
+ "position_embs": position_embs,
465
+ "adapters": adapters,
466
+ "attn_forward": attn_forward,
467
+ **kwargs,
468
+ }
469
+ if self.training and self.gradient_checkpointing:
470
+ all_hidden_states = torch.utils.checkpoint.checkpoint(
471
+ single_block_forward, **block_kwargs, **gckpt_kwargs
472
+ )
473
+ else:
474
+ all_hidden_states = single_block_forward(**block_kwargs)
475
+
476
+ image_hidden_states = self.norm_out(all_hidden_states[txt_n], tembs[txt_n])
477
+ output = self.proj_out(image_hidden_states)
478
+
479
+ return (output,)
480
+
481
+
482
+ @torch.no_grad()
483
+ def generate(
484
+ pipeline: FluxPipeline,
485
+ prompt: Union[str, List[str]] = None,
486
+ prompt_2: Optional[Union[str, List[str]]] = None,
487
+ height: Optional[int] = 512,
488
+ width: Optional[int] = 512,
489
+ num_inference_steps: int = 28,
490
+ timesteps: List[int] = None,
491
+ guidance_scale: float = 3.5,
492
+ num_images_per_prompt: Optional[int] = 1,
493
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
494
+ latents: Optional[torch.FloatTensor] = None,
495
+ prompt_embeds: Optional[torch.FloatTensor] = None,
496
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
497
+ output_type: Optional[str] = "pil",
498
+ return_dict: bool = True,
499
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
500
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
501
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
502
+ max_sequence_length: int = 512,
503
+ # Condition Parameters (Optional)
504
+ main_adapter: Optional[List[str]] = None,
505
+ conditions: List[Condition] = [],
506
+ image_guidance_scale: float = 1.0,
507
+ transformer_kwargs: Optional[Dict[str, Any]] = {},
508
+ kv_cache=False,
509
+ latent_mask=None,
510
+ global_T_Q=None,
511
+ global_T_K=None,
512
+ global_T_V=None,
513
+ **params: dict,
514
+ ):
515
+
516
+ # Set global T_Q, T_K, T_V if provided
517
+ if global_T_Q is not None:
518
+ global T_Q
519
+ T_Q = global_T_Q
520
+ if global_T_K is not None:
521
+ global T_K
522
+ T_K = global_T_K
523
+ if global_T_V is not None:
524
+ global T_V
525
+ T_V = global_T_V
526
+
527
+ self = pipeline
528
+
529
+ height = height or self.default_sample_size * self.vae_scale_factor
530
+ width = width or self.default_sample_size * self.vae_scale_factor
531
+
532
+ # Check inputs. Raise error if not correct
533
+ self.check_inputs(
534
+ prompt,
535
+ prompt_2,
536
+ height,
537
+ width,
538
+ prompt_embeds=prompt_embeds,
539
+ pooled_prompt_embeds=pooled_prompt_embeds,
540
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
541
+ max_sequence_length=max_sequence_length,
542
+ )
543
+
544
+ self._guidance_scale = guidance_scale
545
+ self._joint_attention_kwargs = joint_attention_kwargs
546
+
547
+ # Define call parameters
548
+ if prompt is not None and isinstance(prompt, str):
549
+ batch_size = 1
550
+ elif prompt is not None and isinstance(prompt, list):
551
+ batch_size = len(prompt)
552
+ else:
553
+ batch_size = prompt_embeds.shape[0]
554
+
555
+ device = self._execution_device
556
+
557
+ # Prepare prompt embeddings
558
+ (
559
+ prompt_embeds,
560
+ pooled_prompt_embeds,
561
+ text_ids,
562
+ ) = self.encode_prompt(
563
+ prompt=prompt,
564
+ prompt_2=prompt_2,
565
+ prompt_embeds=prompt_embeds,
566
+ pooled_prompt_embeds=pooled_prompt_embeds,
567
+ device=device,
568
+ num_images_per_prompt=num_images_per_prompt,
569
+ max_sequence_length=max_sequence_length,
570
+ )
571
+
572
+ # Prepare latent variables
573
+ num_channels_latents = self.transformer.config.in_channels // 4
574
+ latents, latent_image_ids = self.prepare_latents(
575
+ batch_size * num_images_per_prompt,
576
+ num_channels_latents,
577
+ height,
578
+ width,
579
+ prompt_embeds.dtype,
580
+ device,
581
+ generator,
582
+ latents,
583
+ )
584
+
585
+ if latent_mask is not None:
586
+ latent_mask = latent_mask.T.reshape(-1)
587
+ latents = latents[:, latent_mask]
588
+ latent_image_ids = latent_image_ids[latent_mask]
589
+
590
+ # Prepare conditions
591
+ c_latents, uc_latents, c_ids, c_timesteps = ([], [], [], [])
592
+ c_projections, c_guidances, c_adapters = ([], [], [])
593
+ complement_cond = None
594
+ for condition in conditions:
595
+ tokens, ids = condition.encode(self)
596
+ c_latents.append(tokens) # [batch_size, token_n, token_dim]
597
+ # Empty condition for unconditioned image
598
+ if image_guidance_scale != 1.0:
599
+ uc_latents.append(condition.encode(self, empty=True)[0])
600
+ c_ids.append(ids) # [token_n, id_dim(3)]
601
+ c_timesteps.append(torch.zeros([1], device=device))
602
+ c_projections.append(pooled_prompt_embeds)
603
+ c_guidances.append(torch.ones([1], device=device))
604
+ c_adapters.append(condition.adapter)
605
+ # This complement_condition will be combined with the original image.
606
+ # See the token integration of OminiControl2 [https://arxiv.org/abs/2503.08280]
607
+ if condition.is_complement:
608
+ complement_cond = (tokens, ids)
609
+
610
+ # Prepare timesteps
611
+ sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
612
+ image_seq_len = latents.shape[1]
613
+ mu = calculate_shift(
614
+ image_seq_len,
615
+ self.scheduler.config.base_image_seq_len,
616
+ self.scheduler.config.max_image_seq_len,
617
+ self.scheduler.config.base_shift,
618
+ self.scheduler.config.max_shift,
619
+ )
620
+ timesteps, num_inference_steps = retrieve_timesteps(
621
+ self.scheduler, num_inference_steps, device, timesteps, sigmas, mu=mu
622
+ )
623
+ num_warmup_steps = max(
624
+ len(timesteps) - num_inference_steps * self.scheduler.order, 0
625
+ )
626
+ self._num_timesteps = len(timesteps)
627
+
628
+ if kv_cache:
629
+ attn_counter = 0
630
+ for module in self.transformer.modules():
631
+ if isinstance(module, Attention):
632
+ setattr(module, "cache_idx", attn_counter)
633
+ attn_counter += 1
634
+ kv_cond = [[[], []] for _ in range(attn_counter)]
635
+ kv_uncond = [[[], []] for _ in range(attn_counter)]
636
+
637
+ def clear_cache():
638
+ for storage in [kv_cond, kv_uncond]:
639
+ for keys, values in storage:
640
+ keys.clear()
641
+ values.clear()
642
+
643
+ branch_n = len(conditions) + 2
644
+ group_mask = torch.ones([branch_n, branch_n], dtype=torch.bool)
645
+ # Disable the attention across different condition branches
646
+ group_mask[2:, 2:] = torch.diag(torch.tensor([1] * len(conditions)))
647
+ # Disable the attention from condition branches to image branch and text branch
648
+ if kv_cache:
649
+ group_mask[2:, :2] = False
650
+
651
+ # Denoising loop
652
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
653
+ for i, t in enumerate(timesteps):
654
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
655
+ timestep = t.expand(latents.shape[0]).to(latents.dtype) / 1000
656
+
657
+ # handle guidance
658
+ if self.transformer.config.guidance_embeds:
659
+ guidance = torch.tensor([guidance_scale], device=device)
660
+ guidance = guidance.expand(latents.shape[0])
661
+ else:
662
+ guidance, c_guidances = None, [None for _ in c_guidances]
663
+
664
+ if kv_cache:
665
+ mode = "write" if i == 0 else "read"
666
+ if mode == "write":
667
+ clear_cache()
668
+ use_cond = not (kv_cache) or mode == "write"
669
+
670
+ noise_pred = transformer_forward(
671
+ self.transformer,
672
+ image_features=[latents] + (c_latents if use_cond else []),
673
+ text_features=[prompt_embeds],
674
+ img_ids=[latent_image_ids] + (c_ids if use_cond else []),
675
+ txt_ids=[text_ids],
676
+ timesteps=[timestep, timestep] + (c_timesteps if use_cond else []),
677
+ pooled_projections=[pooled_prompt_embeds] * 2
678
+ + (c_projections if use_cond else []),
679
+ guidances=[guidance] * 2 + (c_guidances if use_cond else []),
680
+ return_dict=False,
681
+ adapters=[main_adapter] * 2 + (c_adapters if use_cond else []),
682
+ cache_mode=mode if kv_cache else None,
683
+ cache_storage=kv_cond if kv_cache else None,
684
+ to_cache=[False, False, *[True] * len(c_latents)],
685
+ group_mask=group_mask,
686
+ **transformer_kwargs,
687
+ )[0]
688
+
689
+ if image_guidance_scale != 1.0:
690
+ unc_pred = transformer_forward(
691
+ self.transformer,
692
+ image_features=[latents] + (uc_latents if use_cond else []),
693
+ text_features=[prompt_embeds],
694
+ img_ids=[latent_image_ids] + (c_ids if use_cond else []),
695
+ txt_ids=[text_ids],
696
+ timesteps=[timestep, timestep] + (c_timesteps if use_cond else []),
697
+ pooled_projections=[pooled_prompt_embeds] * 2
698
+ + (c_projections if use_cond else []),
699
+ guidances=[guidance] * 2 + (c_guidances if use_cond else []),
700
+ return_dict=False,
701
+ adapters=[main_adapter] * 2 + (c_adapters if use_cond else []),
702
+ cache_mode=mode if kv_cache else None,
703
+ cache_storage=kv_uncond if kv_cache else None,
704
+ to_cache=[False, False, *[True] * len(c_latents)],
705
+ **transformer_kwargs,
706
+ )[0]
707
+
708
+ noise_pred = unc_pred + image_guidance_scale * (noise_pred - unc_pred)
709
+
710
+ # compute the previous noisy sample x_t -> x_t-1
711
+ latents_dtype = latents.dtype
712
+ latents = self.scheduler.step(noise_pred, t, latents)[0]
713
+
714
+ if latents.dtype != latents_dtype:
715
+ if torch.backends.mps.is_available():
716
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
717
+ latents = latents.to(latents_dtype)
718
+
719
+ if callback_on_step_end is not None:
720
+ callback_kwargs = {}
721
+ for k in callback_on_step_end_tensor_inputs:
722
+ callback_kwargs[k] = locals()[k]
723
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
724
+
725
+ latents = callback_outputs.pop("latents", latents)
726
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
727
+
728
+ # call the callback, if provided
729
+ if i == len(timesteps) - 1 or (
730
+ (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
731
+ ):
732
+ progress_bar.update()
733
+
734
+ if latent_mask is not None:
735
+ # Combine the generated latents and the complement condition
736
+ assert complement_cond is not None
737
+ comp_latent, comp_ids = complement_cond
738
+ all_ids = torch.cat([latent_image_ids, comp_ids], dim=0) # (Ta+Tc,3)
739
+ shape = (all_ids.max(dim=0).values + 1).to(torch.long) # (3,)
740
+ H, W = shape[1].item(), shape[2].item()
741
+ B, _, C = latents.shape
742
+ # Create an empty canvas
743
+ canvas = latents.new_zeros(B, H * W, C) # (B,H*W,C)
744
+
745
+ # Stash the latents and the complement condition
746
+ def _stash(canvas, tokens, ids, H, W) -> None:
747
+ B, T, C = tokens.shape
748
+ ids = ids.to(torch.long)
749
+ flat_idx = (ids[:, 1] * W + ids[:, 2]).to(torch.long)
750
+ canvas.view(B, -1, C).index_copy_(1, flat_idx, tokens)
751
+
752
+ _stash(canvas, latents, latent_image_ids, H, W)
753
+ _stash(canvas, comp_latent, comp_ids, H, W)
754
+ latents = canvas.view(B, H * W, C)
755
+
756
+ if output_type == "latent":
757
+ image = latents
758
+ else:
759
+ latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
760
+ latents = (
761
+ latents / self.vae.config.scaling_factor
762
+ ) + self.vae.config.shift_factor
763
+ image = self.vae.decode(latents, return_dict=False)[0]
764
+ image = self.image_processor.postprocess(image, output_type=output_type)
765
+
766
+ # Offload all models
767
+ self.maybe_free_model_hooks()
768
+
769
+ if not return_dict:
770
+ return (image,)
771
+
772
+ return FluxPipelineOutput(images=image)
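
A minimal usage sketch for the QKV-rotation ablation `generate` above, assuming the FLUX pipeline and the trained rotation/LoRA adapters have already been attached to `pipe.transformer` elsewhere in the repo; the checkpoint id, the adapter name "subject", and the image path are illustrative placeholders. Setting `global_T_Q` / `global_T_K` / `global_T_V` to `0.0` collapses the corresponding rotation to the identity, since `T` scales the `Y` factor of the Cayley parametrization.

```python
# Hedged sketch: drive the QKV-rotation ablation `generate` defined above.
# The checkpoint id, adapter name "subject", and image path are assumptions.
import torch
from PIL import Image
from diffusers import FluxPipeline

from omini.pipeline.flux_omini_ablate_qkv import Condition, generate

pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
).to("cuda")
# ... attach the trained rotation / LoRA adapters to pipe.transformer here ...

condition = Condition(
    Image.open("subject.png").resize((512, 512)),
    adapter_setting="subject",  # assumed adapter name
)

image = generate(
    pipe,
    prompt="A photo of the subject on a wooden desk",
    conditions=[condition],
    main_adapter="subject",  # assumed adapter name
    num_inference_steps=28,
    # Ablate only the query-side rotation; keep key/value rotations active.
    global_T_Q=0.0,
    global_T_K=1.0,
    global_T_V=1.0,
).images[0]
image.save("ablate_q.png")
```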
omini/pipeline/flux_omini_ablate_scale.py ADDED
@@ -0,0 +1,748 @@
1
+ """
2
+ This version is for an ablation study on the effect of scaling in LoRA adapters.
3
+
4
+ The `generate` function is modified to accept a `global_scale` parameter;
5
+ the `SCALE` variable is set globally at the start of generation.
6
+ """
7
+
8
+ import torch
9
+ from typing import List, Union, Optional, Dict, Any, Callable, Type, Tuple
10
+
11
+ from diffusers.pipelines import FluxPipeline
12
+ from diffusers.pipelines.flux.pipeline_flux import (
13
+ FluxPipelineOutput,
14
+ FluxTransformer2DModel,
15
+ calculate_shift,
16
+ retrieve_timesteps,
17
+ np,
18
+ )
19
+ from diffusers.models.attention_processor import Attention, F
20
+ from diffusers.models.embeddings import apply_rotary_emb
21
+ from transformers import pipeline
22
+
23
+ from peft.tuners.tuners_utils import BaseTunerLayer
24
+ from accelerate.utils import is_torch_version
25
+
26
+ from contextlib import contextmanager
27
+
28
+ import cv2
29
+
30
+ from PIL import Image, ImageFilter
31
+
32
+ SCALE = 1.0
33
+
34
+ def seed_everything(seed: int = 42):
35
+ torch.backends.cudnn.deterministic = True
36
+ torch.manual_seed(seed)
37
+ np.random.seed(seed)
38
+
39
+
40
+ def clip_hidden_states(hidden_states: torch.FloatTensor) -> torch.FloatTensor:
41
+ if hidden_states.dtype == torch.float16:
42
+ hidden_states = hidden_states.clip(-65504, 65504)
43
+ return hidden_states
44
+
45
+
46
+ def encode_images(pipeline: FluxPipeline, images: torch.Tensor):
47
+ """
48
+ Encodes the images into tokens and ids for FLUX pipeline.
49
+ """
50
+ images = pipeline.image_processor.preprocess(images)
51
+ images = images.to(pipeline.device).to(pipeline.dtype)
52
+ images = pipeline.vae.encode(images).latent_dist.sample()
53
+ images = (
54
+ images - pipeline.vae.config.shift_factor
55
+ ) * pipeline.vae.config.scaling_factor
56
+ images_tokens = pipeline._pack_latents(images, *images.shape)
57
+ images_ids = pipeline._prepare_latent_image_ids(
58
+ images.shape[0],
59
+ images.shape[2],
60
+ images.shape[3],
61
+ pipeline.device,
62
+ pipeline.dtype,
63
+ )
64
+ if images_tokens.shape[1] != images_ids.shape[0]:
65
+ images_ids = pipeline._prepare_latent_image_ids(
66
+ images.shape[0],
67
+ images.shape[2] // 2,
68
+ images.shape[3] // 2,
69
+ pipeline.device,
70
+ pipeline.dtype,
71
+ )
72
+ return images_tokens, images_ids
73
+
74
+
75
+ depth_pipe = None
76
+
77
+
78
+ def convert_to_condition(
79
+ condition_type: str,
80
+ raw_img: Union[Image.Image, torch.Tensor],
81
+ blur_radius: Optional[int] = 5,
82
+ ) -> Union[Image.Image, torch.Tensor]:
83
+ if condition_type == "depth":
84
+ global depth_pipe
85
+ depth_pipe = depth_pipe or pipeline(
86
+ task="depth-estimation",
87
+ model="LiheYoung/depth-anything-small-hf",
88
+ device="cpu", # Use "cpu" to enable parallel processing
89
+ )
90
+ source_image = raw_img.convert("RGB")
91
+ condition_img = depth_pipe(source_image)["depth"].convert("RGB")
92
+ return condition_img
93
+ elif condition_type == "canny":
94
+ img = np.array(raw_img)
95
+ edges = cv2.Canny(img, 100, 200)
96
+ edges = Image.fromarray(edges).convert("RGB")
97
+ return edges
98
+ elif condition_type == "coloring":
99
+ return raw_img.convert("L").convert("RGB")
100
+ elif condition_type == "deblurring":
101
+ condition_image = (
102
+ raw_img.convert("RGB")
103
+ .filter(ImageFilter.GaussianBlur(blur_radius))
104
+ .convert("RGB")
105
+ )
106
+ return condition_image
107
+ else:
108
+ print("Warning: Returning the raw image.")
109
+ return raw_img.convert("RGB")
110
+
111
+
112
+ class Condition(object):
113
+ def __init__(
114
+ self,
115
+ condition: Union[Image.Image, torch.Tensor],
116
+ adapter_setting: Union[str, dict],
117
+ position_delta=None,
118
+ position_scale=1.0,
119
+ latent_mask=None,
120
+ is_complement=False,
121
+ ) -> None:
122
+ self.condition = condition
123
+ self.adapter = adapter_setting
124
+ self.position_delta = position_delta
125
+ self.position_scale = position_scale
126
+ self.latent_mask = (
127
+ latent_mask.T.reshape(-1) if latent_mask is not None else None
128
+ )
129
+ self.is_complement = is_complement
130
+
131
+ def encode(
132
+ self, pipe: FluxPipeline, empty: bool = False
133
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
134
+ condition_empty = Image.new("RGB", self.condition.size, (0, 0, 0))
135
+ tokens, ids = encode_images(pipe, condition_empty if empty else self.condition)
136
+
137
+ if self.position_delta is not None:
138
+ ids[:, 1] += self.position_delta[0]
139
+ ids[:, 2] += self.position_delta[1]
140
+
141
+ if self.position_scale != 1.0:
142
+ scale_bias = (self.position_scale - 1.0) / 2
143
+ ids[:, 1:] *= self.position_scale
144
+ ids[:, 1:] += scale_bias
145
+
146
+ if self.latent_mask is not None:
147
+ tokens = tokens[:, self.latent_mask]
148
+ ids = ids[self.latent_mask]
149
+
150
+ return tokens, ids
151
+
152
+
153
+ @contextmanager
154
+ def specify_lora(lora_modules: List[BaseTunerLayer], specified_lora):
155
+ # Filter valid lora modules
156
+ valid_lora_modules = [m for m in lora_modules if isinstance(m, BaseTunerLayer)]
157
+ # Save original scales
158
+ original_scales = [
159
+ {
160
+ adapter: module.scaling[adapter]
161
+ for adapter in module.active_adapters
162
+ if adapter in module.scaling
163
+ }
164
+ for module in valid_lora_modules
165
+ ]
166
+ # Enter context: adjust scaling
167
+ for module in valid_lora_modules:
168
+ for adapter in module.active_adapters:
169
+ if adapter in module.scaling:
170
+ module.scaling[adapter] = SCALE if adapter == specified_lora else 0
171
+ try:
172
+ yield
173
+ finally:
174
+ # Exit context: restore original scales
175
+ for module, scales in zip(valid_lora_modules, original_scales):
176
+ for adapter in module.active_adapters:
177
+ if adapter in module.scaling:
178
+ module.scaling[adapter] = scales[adapter]
179
+
180
+
181
+ def attn_forward(
182
+ attn: Attention,
183
+ hidden_states: List[torch.FloatTensor],
184
+ adapters: List[str],
185
+ hidden_states2: Optional[List[torch.FloatTensor]] = [],
186
+ position_embs: Optional[List[torch.Tensor]] = None,
187
+ group_mask: Optional[torch.Tensor] = None,
188
+ cache_mode: Optional[str] = None,
189
+ # to determine whether to cache the keys and values for this branch
190
+ to_cache: Optional[List[torch.Tensor]] = None,
191
+ cache_storage: Optional[List[torch.Tensor]] = None,
192
+ **kwargs: dict,
193
+ ) -> torch.FloatTensor:
194
+ bs, _, _ = hidden_states[0].shape
195
+ h2_n = len(hidden_states2)
196
+
197
+ queries, keys, values = [], [], []
198
+
199
+ # Prepare query, key, value for each encoder hidden state (text branch)
200
+ for i, hidden_state in enumerate(hidden_states2):
201
+ query = attn.add_q_proj(hidden_state)
202
+ key = attn.add_k_proj(hidden_state)
203
+ value = attn.add_v_proj(hidden_state)
204
+
205
+ head_dim = key.shape[-1] // attn.heads
206
+ reshape_fn = lambda x: x.view(bs, -1, attn.heads, head_dim).transpose(1, 2)
207
+
208
+ query, key, value = map(reshape_fn, (query, key, value))
209
+ query, key = attn.norm_added_q(query), attn.norm_added_k(key)
210
+
211
+ queries.append(query)
212
+ keys.append(key)
213
+ values.append(value)
214
+
215
+ # Prepare query, key, value for each hidden state (image branch)
216
+ for i, hidden_state in enumerate(hidden_states):
217
+ with specify_lora((attn.to_q, attn.to_k, attn.to_v), adapters[i + h2_n]):
218
+ query = attn.to_q(hidden_state)
219
+ key = attn.to_k(hidden_state)
220
+ value = attn.to_v(hidden_state)
221
+
222
+ head_dim = key.shape[-1] // attn.heads
223
+ reshape_fn = lambda x: x.view(bs, -1, attn.heads, head_dim).transpose(1, 2)
224
+
225
+ query, key, value = map(reshape_fn, (query, key, value))
226
+ query, key = attn.norm_q(query), attn.norm_k(key)
227
+
228
+ queries.append(query)
229
+ keys.append(key)
230
+ values.append(value)
231
+
232
+ # Apply rotary embedding
233
+ if position_embs is not None:
234
+ queries = [apply_rotary_emb(q, position_embs[i]) for i, q in enumerate(queries)]
235
+ keys = [apply_rotary_emb(k, position_embs[i]) for i, k in enumerate(keys)]
236
+
237
+ if cache_mode == "write":
238
+ for i, (k, v) in enumerate(zip(keys, values)):
239
+ if to_cache[i]:
240
+ cache_storage[attn.cache_idx][0].append(k)
241
+ cache_storage[attn.cache_idx][1].append(v)
242
+
243
+ attn_outputs = []
244
+ for i, query in enumerate(queries):
245
+ keys_, values_ = [], []
246
+ # Add keys and values from other branches
247
+ for j, (k, v) in enumerate(zip(keys, values)):
248
+ if (group_mask is not None) and not (group_mask[i][j].item()):
249
+ continue
250
+ keys_.append(k)
251
+ values_.append(v)
252
+ if cache_mode == "read":
253
+ keys_.extend(cache_storage[attn.cache_idx][0])
254
+ values_.extend(cache_storage[attn.cache_idx][1])
255
+ # Add keys and values from cache TODO
256
+ # Attention computation
257
+ attn_output = F.scaled_dot_product_attention(
258
+ query, torch.cat(keys_, dim=2), torch.cat(values_, dim=2)
259
+ ).to(query.dtype)
260
+ attn_output = attn_output.transpose(1, 2).reshape(bs, -1, attn.heads * head_dim)
261
+ attn_outputs.append(attn_output)
262
+
263
+ # Reshape attention output to match the original hidden states
264
+ h_out, h2_out = [], []
265
+
266
+ for i, hidden_state in enumerate(hidden_states2):
267
+ h2_out.append(attn.to_add_out(attn_outputs[i]))
268
+
269
+ for i, hidden_state in enumerate(hidden_states):
270
+ h = attn_outputs[i + h2_n]
271
+ if getattr(attn, "to_out", None) is not None:
272
+ with specify_lora((attn.to_out[0],), adapters[i + h2_n]):
273
+ h = attn.to_out[0](h)
274
+ h_out.append(h)
275
+
276
+ return (h_out, h2_out) if h2_n else h_out
277
+
278
+
279
+ def block_forward(
280
+ self,
281
+ image_hidden_states: List[torch.FloatTensor],
282
+ text_hidden_states: List[torch.FloatTensor],
283
+ tembs: List[torch.FloatTensor],
284
+ adapters: List[str],
285
+ position_embs=None,
286
+ attn_forward=attn_forward,
287
+ **kwargs: dict,
288
+ ):
289
+ txt_n = len(text_hidden_states)
290
+
291
+ img_variables, txt_variables = [], []
292
+
293
+ for i, text_h in enumerate(text_hidden_states):
294
+ txt_variables.append(self.norm1_context(text_h, emb=tembs[i]))
295
+
296
+ for i, image_h in enumerate(image_hidden_states):
297
+ with specify_lora((self.norm1.linear,), adapters[i + txt_n]):
298
+ img_variables.append(self.norm1(image_h, emb=tembs[i + txt_n]))
299
+
300
+ # Attention.
301
+ img_attn_output, txt_attn_output = attn_forward(
302
+ self.attn,
303
+ hidden_states=[each[0] for each in img_variables],
304
+ hidden_states2=[each[0] for each in txt_variables],
305
+ position_embs=position_embs,
306
+ adapters=adapters,
307
+ **kwargs,
308
+ )
309
+
310
+ text_out = []
311
+ for i in range(len(text_hidden_states)):
312
+ _, gate_msa, shift_mlp, scale_mlp, gate_mlp = txt_variables[i]
313
+ text_h = text_hidden_states[i] + txt_attn_output[i] * gate_msa.unsqueeze(1)
314
+ norm_h = (
315
+ self.norm2_context(text_h) * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
316
+ )
317
+ text_h = self.ff_context(norm_h) * gate_mlp.unsqueeze(1) + text_h
318
+ text_out.append(clip_hidden_states(text_h))
319
+
320
+ image_out = []
321
+ for i in range(len(image_hidden_states)):
322
+ _, gate_msa, shift_mlp, scale_mlp, gate_mlp = img_variables[i]
323
+ image_h = (
324
+ image_hidden_states[i] + img_attn_output[i] * gate_msa.unsqueeze(1)
325
+ ).to(image_hidden_states[i].dtype)
326
+ norm_h = self.norm2(image_h) * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
327
+ with specify_lora((self.ff.net[2],), adapters[i + txt_n]):
328
+ image_h = image_h + self.ff(norm_h) * gate_mlp.unsqueeze(1)
329
+ image_out.append(clip_hidden_states(image_h))
330
+ return image_out, text_out
331
+
332
+
333
+ def single_block_forward(
334
+ self,
335
+ hidden_states: List[torch.FloatTensor],
336
+ tembs: List[torch.FloatTensor],
337
+ adapters: List[str],
338
+ position_embs=None,
339
+ attn_forward=attn_forward,
340
+ **kwargs: dict,
341
+ ):
342
+ mlp_hidden_states, gates = [[None for _ in hidden_states] for _ in range(2)]
343
+
344
+ hidden_state_norm = []
345
+ for i, hidden_state in enumerate(hidden_states):
346
+ # [NOTE]!: This function's output is slightly DIFFERENT from the original
347
+ # FLUX version. In the original implementation, the gates were computed using
348
+ # the combined hidden states from both the image and text branches. Here, each
349
+ # branch computes its gate using only its own hidden state.
350
+ with specify_lora((self.norm.linear, self.proj_mlp), adapters[i]):
351
+ h_norm, gates[i] = self.norm(hidden_state, emb=tembs[i])
352
+ mlp_hidden_states[i] = self.act_mlp(self.proj_mlp(h_norm))
353
+ hidden_state_norm.append(h_norm)
354
+
355
+ attn_outputs = attn_forward(
356
+ self.attn, hidden_state_norm, adapters, position_embs=position_embs, **kwargs
357
+ )
358
+
359
+ h_out = []
360
+ for i in range(len(hidden_states)):
361
+ with specify_lora((self.proj_out,), adapters[i]):
362
+ h = torch.cat([attn_outputs[i], mlp_hidden_states[i]], dim=2)
363
+ h = gates[i].unsqueeze(1) * self.proj_out(h) + hidden_states[i]
364
+ h_out.append(clip_hidden_states(h))
365
+
366
+ return h_out
367
+
368
+
369
+ def transformer_forward(
370
+ transformer: FluxTransformer2DModel,
371
+ image_features: List[torch.Tensor],
372
+ text_features: List[torch.Tensor] = None,
373
+ img_ids: List[torch.Tensor] = None,
374
+ txt_ids: List[torch.Tensor] = None,
375
+ pooled_projections: List[torch.Tensor] = None,
376
+ timesteps: List[torch.LongTensor] = None,
377
+ guidances: List[torch.Tensor] = None,
378
+ adapters: List[str] = None,
379
+ # Assign the function to be used for the forward pass
380
+ single_block_forward=single_block_forward,
381
+ block_forward=block_forward,
382
+ attn_forward=attn_forward,
383
+ **kwargs: dict,
384
+ ):
385
+ self = transformer
386
+ txt_n = len(text_features) if text_features is not None else 0
387
+
388
+ adapters = adapters or [None] * (txt_n + len(image_features))
389
+ assert len(adapters) == len(timesteps)
390
+
391
+ # Preprocess the image_features
392
+ image_hidden_states = []
393
+ for i, image_feature in enumerate(image_features):
394
+ with specify_lora((self.x_embedder,), adapters[i + txt_n]):
395
+ image_hidden_states.append(self.x_embedder(image_feature))
396
+
397
+ # Preprocess the text_features
398
+ text_hidden_states = []
399
+ for text_feature in text_features:
400
+ text_hidden_states.append(self.context_embedder(text_feature))
401
+
402
+ # Prepare embeddings of (timestep, guidance, pooled_projections)
403
+ assert len(timesteps) == len(image_features) + len(text_features)
404
+
405
+ def get_temb(timestep, guidance, pooled_projection):
406
+ timestep = timestep.to(image_hidden_states[0].dtype) * 1000
407
+ if guidance is not None:
408
+ guidance = guidance.to(image_hidden_states[0].dtype) * 1000
409
+ return self.time_text_embed(timestep, guidance, pooled_projection)
410
+ else:
411
+ return self.time_text_embed(timestep, pooled_projection)
412
+
413
+ tembs = [get_temb(*each) for each in zip(timesteps, guidances, pooled_projections)]
414
+
415
+ # Prepare position embeddings for each token
416
+ position_embs = [self.pos_embed(each) for each in (*txt_ids, *img_ids)]
417
+
418
+ # Prepare the gradient checkpointing kwargs
419
+ gckpt_kwargs: Dict[str, Any] = (
420
+ {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
421
+ )
422
+
423
+ # dual branch blocks
424
+ for block in self.transformer_blocks:
425
+ block_kwargs = {
426
+ "self": block,
427
+ "image_hidden_states": image_hidden_states,
428
+ "text_hidden_states": text_hidden_states,
429
+ "tembs": tembs,
430
+ "position_embs": position_embs,
431
+ "adapters": adapters,
432
+ "attn_forward": attn_forward,
433
+ **kwargs,
434
+ }
435
+ if self.training and self.gradient_checkpointing:
436
+ image_hidden_states, text_hidden_states = torch.utils.checkpoint.checkpoint(
437
+ block_forward, **block_kwargs, **gckpt_kwargs
438
+ )
439
+ else:
440
+ image_hidden_states, text_hidden_states = block_forward(**block_kwargs)
441
+
442
+ # combine image and text hidden states then pass through the single transformer blocks
443
+ all_hidden_states = [*text_hidden_states, *image_hidden_states]
444
+ for block in self.single_transformer_blocks:
445
+ block_kwargs = {
446
+ "self": block,
447
+ "hidden_states": all_hidden_states,
448
+ "tembs": tembs,
449
+ "position_embs": position_embs,
450
+ "adapters": adapters,
451
+ "attn_forward": attn_forward,
452
+ **kwargs,
453
+ }
454
+ if self.training and self.gradient_checkpointing:
455
+ all_hidden_states = torch.utils.checkpoint.checkpoint(
456
+ single_block_forward, **block_kwargs, **gckpt_kwargs
457
+ )
458
+ else:
459
+ all_hidden_states = single_block_forward(**block_kwargs)
460
+
461
+ image_hidden_states = self.norm_out(all_hidden_states[txt_n], tembs[txt_n])
462
+ output = self.proj_out(image_hidden_states)
463
+
464
+ return (output,)
465
+
466
+
467
+ @torch.no_grad()
468
+ def generate(
469
+ pipeline: FluxPipeline,
470
+ prompt: Union[str, List[str]] = None,
471
+ prompt_2: Optional[Union[str, List[str]]] = None,
472
+ height: Optional[int] = 512,
473
+ width: Optional[int] = 512,
474
+ num_inference_steps: int = 28,
475
+ timesteps: List[int] = None,
476
+ guidance_scale: float = 3.5,
477
+ num_images_per_prompt: Optional[int] = 1,
478
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
479
+ latents: Optional[torch.FloatTensor] = None,
480
+ prompt_embeds: Optional[torch.FloatTensor] = None,
481
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
482
+ output_type: Optional[str] = "pil",
483
+ return_dict: bool = True,
484
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
485
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
486
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
487
+ max_sequence_length: int = 512,
488
+ # Condition Parameters (Optional)
489
+ main_adapter: Optional[List[str]] = None,
490
+ conditions: List[Condition] = [],
491
+ image_guidance_scale: float = 1.0,
492
+ transformer_kwargs: Optional[Dict[str, Any]] = {},
493
+ kv_cache=False,
494
+ latent_mask=None,
495
+ global_scale=None,
496
+ **params: dict,
497
+ ):
498
+
499
+ if global_scale is not None:
500
+ global SCALE
501
+ SCALE = global_scale
502
+
503
+ self = pipeline
504
+
505
+ height = height or self.default_sample_size * self.vae_scale_factor
506
+ width = width or self.default_sample_size * self.vae_scale_factor
507
+
508
+ # Check inputs. Raise error if not correct
509
+ self.check_inputs(
510
+ prompt,
511
+ prompt_2,
512
+ height,
513
+ width,
514
+ prompt_embeds=prompt_embeds,
515
+ pooled_prompt_embeds=pooled_prompt_embeds,
516
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
517
+ max_sequence_length=max_sequence_length,
518
+ )
519
+
520
+ self._guidance_scale = guidance_scale
521
+ self._joint_attention_kwargs = joint_attention_kwargs
522
+
523
+ # Define call parameters
524
+ if prompt is not None and isinstance(prompt, str):
525
+ batch_size = 1
526
+ elif prompt is not None and isinstance(prompt, list):
527
+ batch_size = len(prompt)
528
+ else:
529
+ batch_size = prompt_embeds.shape[0]
530
+
531
+ device = self._execution_device
532
+
533
+ # Prepare prompt embeddings
534
+ (
535
+ prompt_embeds,
536
+ pooled_prompt_embeds,
537
+ text_ids,
538
+ ) = self.encode_prompt(
539
+ prompt=prompt,
540
+ prompt_2=prompt_2,
541
+ prompt_embeds=prompt_embeds,
542
+ pooled_prompt_embeds=pooled_prompt_embeds,
543
+ device=device,
544
+ num_images_per_prompt=num_images_per_prompt,
545
+ max_sequence_length=max_sequence_length,
546
+ )
547
+
548
+ # Prepare latent variables
549
+ num_channels_latents = self.transformer.config.in_channels // 4
550
+ latents, latent_image_ids = self.prepare_latents(
551
+ batch_size * num_images_per_prompt,
552
+ num_channels_latents,
553
+ height,
554
+ width,
555
+ prompt_embeds.dtype,
556
+ device,
557
+ generator,
558
+ latents,
559
+ )
560
+
561
+ if latent_mask is not None:
562
+ latent_mask = latent_mask.T.reshape(-1)
563
+ latents = latents[:, latent_mask]
564
+ latent_image_ids = latent_image_ids[latent_mask]
565
+
566
+ # Prepare conditions
567
+ c_latents, uc_latents, c_ids, c_timesteps = ([], [], [], [])
568
+ c_projections, c_guidances, c_adapters = ([], [], [])
569
+ complement_cond = None
570
+ for condition in conditions:
571
+ tokens, ids = condition.encode(self)
572
+ c_latents.append(tokens) # [batch_size, token_n, token_dim]
573
+ # Empty condition for unconditioned image
574
+ if image_guidance_scale != 1.0:
575
+ uc_latents.append(condition.encode(self, empty=True)[0])
576
+ c_ids.append(ids) # [token_n, id_dim(3)]
577
+ c_timesteps.append(torch.zeros([1], device=device))
578
+ c_projections.append(pooled_prompt_embeds)
579
+ c_guidances.append(torch.ones([1], device=device))
580
+ c_adapters.append(condition.adapter)
581
+ # This complement_condition will be combined with the original image.
582
+ # See the token integration of OminiControl2 [https://arxiv.org/abs/2503.08280]
583
+ if condition.is_complement:
584
+ complement_cond = (tokens, ids)
585
+
586
+ # Prepare timesteps
587
+ sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
588
+ image_seq_len = latents.shape[1]
589
+ mu = calculate_shift(
590
+ image_seq_len,
591
+ self.scheduler.config.base_image_seq_len,
592
+ self.scheduler.config.max_image_seq_len,
593
+ self.scheduler.config.base_shift,
594
+ self.scheduler.config.max_shift,
595
+ )
596
+ timesteps, num_inference_steps = retrieve_timesteps(
597
+ self.scheduler, num_inference_steps, device, timesteps, sigmas, mu=mu
598
+ )
599
+ num_warmup_steps = max(
600
+ len(timesteps) - num_inference_steps * self.scheduler.order, 0
601
+ )
602
+ self._num_timesteps = len(timesteps)
603
+
604
+ if kv_cache:
605
+ attn_counter = 0
606
+ for module in self.transformer.modules():
607
+ if isinstance(module, Attention):
608
+ setattr(module, "cache_idx", attn_counter)
609
+ attn_counter += 1
610
+ kv_cond = [[[], []] for _ in range(attn_counter)]
611
+ kv_uncond = [[[], []] for _ in range(attn_counter)]
612
+
613
+ def clear_cache():
614
+ for storage in [kv_cond, kv_uncond]:
615
+ for kesy, values in storage:
616
+ kesy.clear()
617
+ values.clear()
618
+
619
+ branch_n = len(conditions) + 2
620
+ group_mask = torch.ones([branch_n, branch_n], dtype=torch.bool)
621
+ # Disable the attention cross different condition branches
622
+ group_mask[2:, 2:] = torch.diag(torch.tensor([1] * len(conditions)))
623
+ # Disable the attention from condition branches to image branch and text branch
624
+ if kv_cache:
625
+ group_mask[2:, :2] = False
626
+
627
+ # Denoising loop
628
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
629
+ for i, t in enumerate(timesteps):
630
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
631
+ timestep = t.expand(latents.shape[0]).to(latents.dtype) / 1000
632
+
633
+ # handle guidance
634
+ if self.transformer.config.guidance_embeds:
635
+ guidance = torch.tensor([guidance_scale], device=device)
636
+ guidance = guidance.expand(latents.shape[0])
637
+ else:
638
+ guidance, c_guidances = None, [None for _ in c_guidances]
639
+
640
+ if kv_cache:
641
+ mode = "write" if i == 0 else "read"
642
+ if mode == "write":
643
+ clear_cache()
644
+ use_cond = not (kv_cache) or mode == "write"
645
+
646
+ noise_pred = transformer_forward(
647
+ self.transformer,
648
+ image_features=[latents] + (c_latents if use_cond else []),
649
+ text_features=[prompt_embeds],
650
+ img_ids=[latent_image_ids] + (c_ids if use_cond else []),
651
+ txt_ids=[text_ids],
652
+ timesteps=[timestep, timestep] + (c_timesteps if use_cond else []),
653
+ pooled_projections=[pooled_prompt_embeds] * 2
654
+ + (c_projections if use_cond else []),
655
+ guidances=[guidance] * 2 + (c_guidances if use_cond else []),
656
+ return_dict=False,
657
+ adapters=[main_adapter] * 2 + (c_adapters if use_cond else []),
658
+ cache_mode=mode if kv_cache else None,
659
+ cache_storage=kv_cond if kv_cache else None,
660
+ to_cache=[False, False, *[True] * len(c_latents)],
661
+ group_mask=group_mask,
662
+ **transformer_kwargs,
663
+ )[0]
664
+
665
+ if image_guidance_scale != 1.0:
666
+ unc_pred = transformer_forward(
667
+ self.transformer,
668
+ image_features=[latents] + (uc_latents if use_cond else []),
669
+ text_features=[prompt_embeds],
670
+ img_ids=[latent_image_ids] + (c_ids if use_cond else []),
671
+ txt_ids=[text_ids],
672
+ timesteps=[timestep, timestep] + (c_timesteps if use_cond else []),
673
+ pooled_projections=[pooled_prompt_embeds] * 2
674
+ + (c_projections if use_cond else []),
675
+ guidances=[guidance] * 2 + (c_guidances if use_cond else []),
676
+ return_dict=False,
677
+ adapters=[main_adapter] * 2 + (c_adapters if use_cond else []),
678
+ cache_mode=mode if kv_cache else None,
679
+ cache_storage=kv_uncond if kv_cache else None,
680
+ to_cache=[False, False, *[True] * len(c_latents)],
681
+ **transformer_kwargs,
682
+ )[0]
683
+
684
+ noise_pred = unc_pred + image_guidance_scale * (noise_pred - unc_pred)
685
+
686
+ # compute the previous noisy sample x_t -> x_t-1
687
+ latents_dtype = latents.dtype
688
+ latents = self.scheduler.step(noise_pred, t, latents)[0]
689
+
690
+ if latents.dtype != latents_dtype:
691
+ if torch.backends.mps.is_available():
692
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
693
+ latents = latents.to(latents_dtype)
694
+
695
+ if callback_on_step_end is not None:
696
+ callback_kwargs = {}
697
+ for k in callback_on_step_end_tensor_inputs:
698
+ callback_kwargs[k] = locals()[k]
699
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
700
+
701
+ latents = callback_outputs.pop("latents", latents)
702
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
703
+
704
+ # call the callback, if provided
705
+ if i == len(timesteps) - 1 or (
706
+ (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
707
+ ):
708
+ progress_bar.update()
709
+
710
+ if latent_mask is not None:
711
+ # Combine the generated latents and the complement condition
712
+ assert complement_cond is not None
713
+ comp_latent, comp_ids = complement_cond
714
+ all_ids = torch.cat([latent_image_ids, comp_ids], dim=0) # (Ta+Tc,3)
715
+ shape = (all_ids.max(dim=0).values + 1).to(torch.long) # (3,)
716
+ H, W = shape[1].item(), shape[2].item()
717
+ B, _, C = latents.shape
718
+ # Create a empty canvas
719
+ canvas = latents.new_zeros(B, H * W, C) # (B,H*W,C)
720
+
721
+ # Stash the latents and the complement condition
722
+ def _stash(canvas, tokens, ids, H, W) -> None:
723
+ B, T, C = tokens.shape
724
+ ids = ids.to(torch.long)
725
+ flat_idx = (ids[:, 1] * W + ids[:, 2]).to(torch.long)
726
+ canvas.view(B, -1, C).index_copy_(1, flat_idx, tokens)
727
+
728
+ _stash(canvas, latents, latent_image_ids, H, W)
729
+ _stash(canvas, comp_latent, comp_ids, H, W)
730
+ latents = canvas.view(B, H * W, C)
731
+
732
+ if output_type == "latent":
733
+ image = latents
734
+ else:
735
+ latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
736
+ latents = (
737
+ latents / self.vae.config.scaling_factor
738
+ ) + self.vae.config.shift_factor
739
+ image = self.vae.decode(latents, return_dict=False)[0]
740
+ image = self.image_processor.postprocess(image, output_type=output_type)
741
+
742
+ # Offload all models
743
+ self.maybe_free_model_hooks()
744
+
745
+ if not return_dict:
746
+ return (image,)
747
+
748
+ return FluxPipelineOutput(images=image)
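
For the scale ablation above, a small sweep over `global_scale` is the natural driver: `SCALE` replaces the fixed factor that `specify_lora` applies to the active adapter, so `global_scale=0.0` switches the adapter off and intermediate values interpolate its contribution. A minimal sketch, reusing the `pipe` and `condition` objects from the previous example (the two modules' `Condition` classes are interchangeable here, since `generate` only calls `condition.encode` and reads `condition.adapter`); the adapter name is again an assumption.

```python
# Hedged sketch: sweep the global adapter scaling used by specify_lora above.
from omini.pipeline.flux_omini_ablate_scale import generate as generate_scale

for scale in (0.0, 0.25, 0.5, 0.75, 1.0):
    image = generate_scale(
        pipe,
        prompt="A photo of the subject on a wooden desk",
        conditions=[condition],
        main_adapter="subject",  # assumed adapter name
        num_inference_steps=28,
        global_scale=scale,
    ).images[0]
    image.save(f"ablate_scale_{scale:.2f}.png")
```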
omini/rotation/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .rotation_config import RotationConfig
2
+ from .layer import RotationLayer
3
+ from .model import RotationTuner
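
A quick numerical check of the Cayley-parametrized `Rotation` module defined in `omini/rotation/layer.py` (added below). This is a sketch under a few assumptions: the package is importable from this path, a single rotation block is used (summing several blocks breaks exact orthogonality), and `forward` applies the full `(I - YX^T)^{-1}` factor as in `get_delta_weight`. `U` and `V` are re-randomized only to make the check non-trivial, since their default initialization is (near) zero.

```python
# Hedged sanity check for the Cayley rotation layer (single rotation block).
import torch
from omini.rotation.layer import Rotation

torch.manual_seed(0)
rot = Rotation(r=2, dim=16, T=1.0, num_rotations=1)
with torch.no_grad():
    rot.U.normal_(0.0, 0.1)  # default init is 0.002 * randn
    rot.V.normal_(0.0, 0.1)  # default init is exactly zero

R = torch.eye(16) + rot.get_delta_weight()  # (dim, dim)
# The Cayley transform of the skew-symmetric A = X^T Y is orthogonal.
print(torch.allclose(R.T @ R, torch.eye(16), atol=1e-4))  # expected: True

# forward(x) should agree with applying R to each row of x.
x = torch.randn(4, 16)
print(torch.allclose(rot(x), x @ R.T, atol=1e-4))  # expected: True
```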
omini/rotation/layer.py ADDED
@@ -0,0 +1,313 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from typing import Optional, Set
4
+
5
+ from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge
6
+
7
+ def inverse_2x2(matrices):
8
+
9
+ # Extract matrix elements
10
+ # matrices[..., 0, 0] corresponds to 'a' in [[a, b], [c, d]]
11
+ a = matrices[..., 0, 0]
12
+ b = matrices[..., 0, 1]
13
+ c = matrices[..., 1, 0]
14
+ d = matrices[..., 1, 1]
15
+
16
+ # Compute determinant
17
+ det = a * d - b * c
18
+
19
+ # Compute inverse using the formula:
20
+ # inv = (1/det) * [[d, -b], [-c, a]]
21
+ inv_det = 1.0 / det
22
+
23
+ # Create output tensor
24
+ inv_matrices = torch.empty_like(matrices)
25
+ inv_matrices[..., 0, 0] = d * inv_det
26
+ inv_matrices[..., 0, 1] = -b * inv_det
27
+ inv_matrices[..., 1, 0] = -c * inv_det
28
+ inv_matrices[..., 1, 1] = a * inv_det
29
+
30
+ return inv_matrices
31
+
32
+ class Rotation(nn.Module):
33
+ """
34
+ Rotation layer based on Cayley transformation for parameter-efficient fine-tuning.
35
+
36
+ This layer implements orthogonal fine-tuning through Cayley transformation:
37
+ h(x) = (I - A)^{-1} (I + A) x
38
+
39
+ where A = XY^T with X = [U; -V] and Y = [V; U]
40
+ """
41
+
42
+ def __init__(self, r, dim, T=1.0, num_rotations=4):
43
+ super().__init__()
44
+ self.r = r
45
+ self.T = T
46
+ self.U = nn.Parameter(torch.randn(num_rotations, r, dim) * 0.002, requires_grad=True)
47
+ self.V = nn.Parameter(torch.randn(num_rotations, r, dim) * 0.0, requires_grad=True)
48
+ self.num_rotations = num_rotations
49
+
50
+
51
+ def forward(self, x):
52
+ """
53
+ Apply Cayley transformation to input x.
54
+
55
+ A = XY^T where X = [U; -V], Y = [V; U]
56
+ Cayley transformation: h(x) = (I - A)^{-1} (I + A) x
57
+
58
+ Uses Woodbury identity for efficient computation:
59
+ (I - XY^T)^{-1} = I + X (I - Y^T X)^{-1} Y^T
60
+
61
+ Args:
62
+ x: Input tensor of shape (..., dim)
63
+
64
+ Returns:
65
+ Transformed tensor of shape (..., dim)
66
+ """
67
+ x_dtype = x.dtype
68
+ X = torch.cat([self.U, -self.V], dim=1) # Shape: (num_rotations, 2r, dim)
69
+ Y = torch.cat([self.V, self.U], dim=1) * self.T # Shape: (num_rotations, 2r, dim)
70
+
71
+ Y_T_X = torch.matmul(Y, X.transpose(1, 2)) # Shape: (num_rotations, 2r, 2r)
72
+ I_2r = torch.eye(2 * self.r, device=x.device, dtype=x.dtype).repeat(self.num_rotations, 1, 1)
73
+ I_minus_YX = I_2r - Y_T_X
74
+
75
+ if self.r == 1:
76
+ I_minus_YX_inv = inverse_2x2(I_minus_YX)
77
+ else:
78
+ # make it float32
79
+ I_minus_YX = I_minus_YX.to(torch.float32)
80
+ I_minus_YX_inv = torch.linalg.inv(I_minus_YX) # Shape: (num_rotations, 2r, 2r)
81
+ I_minus_YX_inv = I_minus_YX_inv.to(x_dtype)
82
+
83
+ Yx = torch.einsum("...d,nrd->...nr", x, Y) # Shape: (batch*seq_len, num_rotations, 2r)
84
+ I_minus_YX_inv_Yx = torch.einsum("nRr,...nr->...nR", I_minus_YX_inv, Yx)  # Shape: (batch*seq_len, num_rotations, 2r)
85
+
86
+ second_term = torch.einsum("...nr,nrd->...nd", I_minus_YX_inv_Yx, X) # Shape: (batch*seq_len, num_rotations, dim)
87
+ second_term = second_term.sum(dim=-2) # Sum over rotations
88
+
89
+ output = x + 2 * second_term # Shape: (batch*seq_len, dim)
90
+
91
+ return output
92
+
93
+ def get_delta_weight(self):
94
+ """
95
+ Compute the delta weight matrix induced by the rotation layer.
96
+
97
+ Returns:
98
+ Delta weight matrix of shape (dim, dim)
99
+ """
100
+ X = torch.cat([self.U, -self.V], dim=1) # Shape: (num_rotations, 2r, dim)
101
+ Y = torch.cat([self.V, self.U], dim=1) * self.T # Shape: (num_rotations, 2r, dim)
102
+
103
+ Y_T_X = torch.matmul(Y, X.transpose(1, 2)) # Shape: (num_rotations, 2r, 2r)
104
+ I_2r = torch.eye(2 * self.r, device=X.device, dtype=X.dtype).repeat(self.num_rotations, 1, 1)
105
+ I_minus_YX = I_2r - Y_T_X
106
+
107
+ if self.r == 1:
108
+ I_minus_YX_inv = inverse_2x2(I_minus_YX)
109
+ I_minus_YX_inv_Y = torch.einsum("nRr,nrd->nRd", I_minus_YX_inv, Y) # Shape: (num_rotations, 2r, dim)
110
+ else:
111
+ I_minus_YX_inv_Y = torch.linalg.solve(I_minus_YX.to(torch.float32), Y.to(torch.float32)) # Shape: (num_rotations, 2r, dim)
112
+ I_minus_YX_inv_Y = I_minus_YX_inv_Y.to(X.dtype)
113
+
114
+ # I_minus_YX_float = I_minus_YX.float()
115
+ # I_minus_YX_inv = torch.linalg.inv(I_minus_YX_float) # Shape: (num_rotations, 2r, 2r)
116
+ # I_minus_YX_inv = I_minus_YX_inv.to(X.dtype)
117
+
118
+
119
+ # I_minus_YX_inv_Y = torch.einsum("nRr,nrd->nRd", I_minus_YX_inv, Y) # Shape: (num_rotations, 2r, dim)
120
+ second_term = torch.einsum("nrd,nrD->ndD", X, I_minus_YX_inv_Y) # Shape: (num_rotations, dim, dim)
121
+ second_term = second_term.sum(dim=0)
122
+ total_delta_weight = 2 * second_term
123
+ return total_delta_weight
124
+
125
+
126
+ class RotationLayer(BaseTunerLayer):
127
+ """
128
+ Adapter-like wrapper that attaches Rotation modules to a base linear layer.
129
+ """
130
+
131
+ adapter_layer_names: tuple[str, ...] = ("rotation",)
132
+ other_param_names: tuple[str, ...] = ("r", "T", "num_rotations", "scaling")
133
+
134
+ def __init__(self, base_layer: nn.Module, **kwargs):
135
+ # Let BaseTunerLayer do its init (it usually subclasses nn.Module)
136
+ super().__init__()
137
+ # store base layer and adapter containers
138
+ self.base_layer = base_layer
139
+ self.rotation = nn.ModuleDict() # mapping adapter_name -> Rotation module
140
+ self.scaling={} # default scaling per adapter
141
+ self._adapter_config = {} # store r, T, num_rotations per adapter
142
+
143
+ # flags (exposed in a simple way)
144
+ self._disable_adapters = False
145
+ self.merged_adapters: list[str] = []
146
+ self._cast_input_dtype_enabled = True
147
+ self.kwargs = kwargs
148
+
149
+ if isinstance(base_layer, nn.Linear):
150
+ self.in_features = base_layer.in_features
151
+ self.out_features = base_layer.out_features
152
+ else:
153
+ raise NotImplementedError("RotationLayer only supports nn.Linear base layers for now.")
154
+
155
+ @property
156
+ def _available_adapters(self) -> set[str]:
157
+ return set(self.rotation.keys())
158
+
159
+ @property
160
+ def disable_adapters(self) -> bool:
161
+ return self._disable_adapters
162
+
163
+ @property
164
+ def merged(self) -> bool:
165
+ return bool(self.merged_adapters)
166
+
167
+ @property
168
+ def active_adapters(self) -> list[str]:
169
+ # If some external mechanism sets active adapters, prefer it; else use all added adapters.
170
+ return getattr(self, "_active_adapters", list(self.rotation.keys()))
171
+
172
+ def get_base_layer(self) -> nn.Module:
173
+ return self.base_layer
174
+
175
+ def _cast_input_dtype(self, x: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
176
+ if not self._cast_input_dtype_enabled:
177
+ return x
178
+ return x.to(dtype)
179
+
180
+ def update_layer(
181
+ self,
182
+ adapter_name: str,
183
+ r: int,
184
+ T: float,
185
+ num_rotations: int,
186
+ **kwargs,
187
+ ):
188
+ """
189
+ Add / update a rotation adapter for this layer.
190
+ """
191
+
192
+ if r <= 0:
193
+ raise ValueError(f"r must be positive, got {r}")
194
+ if num_rotations <= 0:
195
+ raise ValueError(f"num_rotations must be positive, got {num_rotations}")
196
+
197
+ rot = Rotation(r=r, dim=self.in_features, T=T, num_rotations=num_rotations)
198
+ self.rotation[adapter_name] = rot
199
+ self.scaling[adapter_name] = 1.0
200
+ self._adapter_config[adapter_name] = {"r": r, "T": T, "num_rotations": num_rotations}
201
+
202
+ # (optional) helper to set currently active adapters externally
203
+ def set_active_adapters(self, adapters: Optional[list[str]]):
204
+ if adapters is None:
205
+ if hasattr(self, "_active_adapters"):
206
+ delattr(self, "_active_adapters")
207
+ else:
208
+ self._active_adapters = adapters
209
+
210
+
211
+ class Linear(nn.Module, RotationLayer):
212
+ """
213
+ A linear layer with an integrated rotation layer for parameter-efficient fine-tuning.
214
+ """
215
+
216
+ def __init__(self,
217
+ base_layer: nn.Linear,
218
+ adapter_name: str,
219
+ r: int,
220
+ T: float,
221
+ num_rotations: int,
222
+ **kwargs):
223
+
224
+ super().__init__()
225
+ RotationLayer.__init__(self, base_layer=base_layer, **kwargs)
226
+
227
+ self._active_adapter = adapter_name
228
+
229
+ self.update_layer(
230
+ adapter_name=adapter_name,
231
+ r=r,
232
+ T=T,
233
+ num_rotations=num_rotations,
234
+ **kwargs,
235
+ )
236
+
237
+ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None):
238
+ """
239
+ Merge the adapter effect into the base layer weights:
240
+ W_merged = W @ R
241
+ where R = I + delta (delta returned by get_delta_weight()).
242
+ """
243
+ adapter_names = check_adapters_to_merge(self, adapter_names)
244
+
245
+ if not adapter_names:
246
+ return
247
+
248
+ base_layer = self.get_base_layer()
249
+ orig_dtype = base_layer.weight.dtype
250
+ # base_layer.weight shape: (out_features, in_features)
251
+ W = base_layer.weight.data # (out, in)
252
+
253
+ for active_adapter in adapter_names:
254
+ if active_adapter not in self._available_adapters:
255
+ continue
256
+ delta_R = self.rotation[active_adapter].get_delta_weight() # (in, in)
257
+ R = torch.eye(delta_R.size(0), device=delta_R.device, dtype=delta_R.dtype) + delta_R # (in, in)
258
+ # merged W = W @ R
259
+ merged_W = W.to(R.dtype) @ R
260
+ if safe_merge and not torch.isfinite(merged_W).all():
261
+ raise ValueError("Merging resulted in non-finite weights. Aborting merge.")
262
+
263
+ base_layer.weight.data = merged_W.contiguous().to(orig_dtype)
+ W = base_layer.weight.data # keep W in sync so merging several adapters composes (matches the LIFO unmerge below)
264
+ # mark merged (so unmerge can restore by inverse)
265
+ self.merged_adapters.append(active_adapter)
266
+
267
+
268
+ def unmerge(self):
269
+ """
270
+ Reverse merges in LIFO order (pop merged adapters and invert R).
271
+ """
272
+ base_layer = self.get_base_layer()
273
+ orig_dtype = base_layer.weight.dtype
274
+
275
+ while self.merged_adapters:
276
+ active_adapter = self.merged_adapters.pop()
277
+ if active_adapter not in self._available_adapters:
278
+ continue
279
+ delta_R = self.rotation[active_adapter].get_delta_weight() # (in, in)
280
+ R = torch.eye(delta_R.size(0), device=delta_R.device, dtype=delta_R.dtype) + delta_R
281
+ R_inv = torch.linalg.inv(R)
282
+ merged_W = base_layer.weight.data.to(R.dtype)
283
+ unmerged_W = merged_W @ R_inv
284
+ base_layer.weight.data = unmerged_W.contiguous().to(orig_dtype)
285
+
286
+
287
+ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
288
+ x_dtype = x.dtype
289
+ base_layer = self.get_base_layer()
290
+
291
+ if self.disable_adapters:
292
+ # if merged, unmerge to ensure base_layer produces original behavior
293
+ if self.merged:
294
+ self.unmerge()
295
+ return base_layer(x, *args, **kwargs).to(x_dtype)
296
+
297
+ if self.merged:
298
+ # if merged into base layer, just forward
299
+ return base_layer(x, *args, **kwargs).to(x_dtype)
300
+
301
+ # otherwise apply active adapters (transform inputs) then call base layer
302
+ for active_adapter in self.active_adapters:
303
+ if active_adapter not in self.rotation:
304
+ continue
305
+ rotation = self.rotation[active_adapter]
306
+ x = self._cast_input_dtype(x, rotation.U.dtype)
307
+ x = rotation(x)
308
+
309
+ return base_layer(x, *args, **kwargs).to(x_dtype)
310
+
311
+ def __repr__(self):
312
+ return f"rotation.{super().__repr__()}"
313
+
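Before the dedicated test file below, here is a minimal sketch of registering a second adapter on the same wrapped layer and restricting which adapters the forward pass applies (assumptions: the `Rotation` forward accepts `(batch, seq, in_features)` inputs, as the tests also assume, and both adapters keep the default scaling of 1.0):

```python
import torch
import torch.nn as nn
from omini.rotation.layer import Linear

base = nn.Linear(128, 128)
layer = Linear(base_layer=base, adapter_name="taskA", r=2, T=1.0, num_rotations=2)

# A second adapter can be added to the same base layer ...
layer.update_layer(adapter_name="taskB", r=4, T=1.0, num_rotations=2)

# ... and the adapters used in forward() can be restricted explicitly.
layer.set_active_adapters(["taskB"])

x = torch.randn(2, 8, 128)
with torch.no_grad():
    y = layer(x)
print(y.shape, layer.active_adapters)   # torch.Size([2, 8, 128]) ['taskB']
```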
omini/rotation/layer_test.py ADDED
@@ -0,0 +1,296 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from omini.rotation.layer import Linear, Rotation
4
+
5
+ def test_rotation_merge():
6
+ """
7
+ Test that merging rotation adapter produces the same output as the unmerged version.
8
+ """
9
+ print("="*60)
10
+ print("Testing Rotation Layer Merge")
11
+ print("="*60)
12
+
13
+ # Set random seed for reproducibility
14
+ torch.manual_seed(42)
15
+
16
+ # Configuration
17
+ in_features = 512
18
+ out_features = 1024
19
+ r = 4
20
+ num_rotations = 4
21
+ T = 1.0
22
+ batch_size = 8
23
+ seq_len = 16
24
+
25
+ # Create base linear layer
26
+ base_layer = nn.Linear(in_features, out_features, bias=True)
27
+
28
+ # Create rotation layer
29
+ rotation_layer = Linear(
30
+ base_layer=base_layer,
31
+ adapter_name="default",
32
+ r=r,
33
+ T=T,
34
+ num_rotations=num_rotations
35
+ )
36
+
37
+ # Create random input
38
+ x = torch.randn(batch_size, seq_len, in_features)
39
+
40
+ # Test 1: Forward pass before merge
41
+ print("\n" + "-"*60)
42
+ print("Test 1: Computing output BEFORE merge")
43
+ print("-"*60)
44
+ rotation_layer.eval()
45
+ with torch.no_grad():
46
+ output_before = rotation_layer(x)
47
+
48
+ print(f"Output shape: {output_before.shape}")
49
+ print(f"Output mean: {output_before.mean().item():.6f}")
50
+ print(f"Output std: {output_before.std().item():.6f}")
51
+ print(f"Output min: {output_before.min().item():.6f}")
52
+ print(f"Output max: {output_before.max().item():.6f}")
53
+
54
+ # Save original weight for verification
55
+ original_weight = base_layer.weight.data.clone()
56
+
57
+ # Test 2: Merge adapter
58
+ print("\n" + "-"*60)
59
+ print("Test 2: Merging adapter")
60
+ print("-"*60)
61
+ rotation_layer.merge(safe_merge=True, adapter_names=["default"])
62
+ print(f"✓ Adapter merged successfully")
63
+ print(f"✓ Merged adapters: {rotation_layer.merged_adapters}")
64
+
65
+ # Check that weights have changed
66
+ weight_diff = (base_layer.weight.data - original_weight).abs().max().item()
67
+ print(f"Max weight change: {weight_diff:.6e}")
68
+
69
+ # Test 3: Forward pass after merge
70
+ print("\n" + "-"*60)
71
+ print("Test 3: Computing output AFTER merge")
72
+ print("-"*60)
73
+ with torch.no_grad():
74
+ output_after = rotation_layer(x)
75
+
76
+ print(f"Output shape: {output_after.shape}")
77
+ print(f"Output mean: {output_after.mean().item():.6f}")
78
+ print(f"Output std: {output_after.std().item():.6f}")
79
+ print(f"Output min: {output_after.min().item():.6f}")
80
+ print(f"Output max: {output_after.max().item():.6f}")
81
+
82
+ # Test 4: Compare outputs
83
+ print("\n" + "-"*60)
84
+ print("Test 4: Comparing outputs")
85
+ print("-"*60)
86
+
87
+ # Compute differences
88
+ abs_diff = (output_after - output_before).abs()
89
+ rel_diff = abs_diff / (output_before.abs() + 1e-8)
90
+
91
+ max_abs_diff = abs_diff.max().item()
92
+ mean_abs_diff = abs_diff.mean().item()
93
+ max_rel_diff = rel_diff.max().item()
94
+ mean_rel_diff = rel_diff.mean().item()
95
+
96
+ print(f"Max absolute difference: {max_abs_diff:.6e}")
97
+ print(f"Mean absolute difference: {mean_abs_diff:.6e}")
98
+ print(f"Max relative difference: {max_rel_diff:.6e}")
99
+ print(f"Mean relative difference: {mean_rel_diff:.6e}")
100
+
101
+ # Check if outputs are close
102
+ atol = 1e-4 # Absolute tolerance
103
+ rtol = 1e-3 # Relative tolerance
104
+
105
+ are_close = torch.allclose(output_before, output_after, atol=atol, rtol=rtol)
106
+
107
+ if are_close:
108
+ print(f"\n✅ PASS: Outputs are identical (within atol={atol}, rtol={rtol})")
109
+ else:
110
+ print(f"\n❌ FAIL: Outputs differ significantly")
111
+ print(f" Expected: atol < {atol}, rtol < {rtol}")
112
+ print(f" Got: max_abs_diff = {max_abs_diff:.6e}, max_rel_diff = {max_rel_diff:.6e}")
113
+
114
+ # Test 5: Unmerge and verify
115
+ print("\n" + "-"*60)
116
+ print("Test 5: Testing unmerge")
117
+ print("-"*60)
118
+ rotation_layer.unmerge()
119
+ print(f"✓ Adapter unmerged")
120
+ print(f"✓ Merged adapters: {rotation_layer.merged_adapters}")
121
+
122
+ with torch.no_grad():
123
+ output_unmerged = rotation_layer(x)
124
+
125
+ unmerge_diff = (output_unmerged - output_before).abs().max().item()
126
+ print(f"Max difference after unmerge: {unmerge_diff:.6e}")
127
+
128
+ unmerge_close = torch.allclose(output_before, output_unmerged, atol=atol, rtol=rtol)
129
+ if unmerge_close:
130
+ print(f"✅ PASS: Unmerge restored original behavior")
131
+ else:
132
+ print(f"❌ FAIL: Unmerge did not restore original behavior")
133
+
134
+ # Test 6: Verify weight restoration
135
+ weight_restored_diff = (base_layer.weight.data - original_weight).abs().max().item()
136
+ print(f"Max weight difference after unmerge: {weight_restored_diff:.6e}")
137
+
138
+ weight_restored = torch.allclose(base_layer.weight.data, original_weight, atol=1e-5)
139
+ if weight_restored:
140
+ print(f"✅ PASS: Original weights restored")
141
+ else:
142
+ print(f"❌ FAIL: Original weights not fully restored")
143
+
144
+ print("\n" + "="*60)
145
+ print("Test Summary")
146
+ print("="*60)
147
+ return are_close and unmerge_close and weight_restored
148
+
149
+
150
+ def test_multiple_merges():
151
+ """
152
+ Test merging and unmerging multiple times.
153
+ """
154
+ print("\n" + "="*60)
155
+ print("Testing Multiple Merge/Unmerge Cycles")
156
+ print("="*60)
157
+
158
+ torch.manual_seed(42)
159
+
160
+ in_features = 256
161
+ out_features = 512
162
+ r = 4
163
+ num_rotations = 4
164
+
165
+ base_layer = nn.Linear(in_features, out_features, bias=True)
166
+ rotation_layer = Linear(
167
+ base_layer=base_layer,
168
+ adapter_name="default",
169
+ r=r,
170
+ T=1.0,
171
+ num_rotations=num_rotations
172
+ )
173
+
174
+ x = torch.randn(4, 8, in_features)
175
+ rotation_layer.eval()
176
+
177
+ # Get original output
178
+ with torch.no_grad():
179
+ original_output = rotation_layer(x)
180
+
181
+ # Test multiple cycles
182
+ all_passed = True
183
+ for cycle in range(3):
184
+ print(f"\nCycle {cycle + 1}:")
185
+
186
+ # Merge
187
+ rotation_layer.merge(safe_merge=True)
188
+ with torch.no_grad():
189
+ merged_output = rotation_layer(x)
190
+
191
+ merge_close = torch.allclose(original_output, merged_output, atol=1e-4, rtol=1e-3)
192
+ print(f" Merge: {'✅ PASS' if merge_close else '❌ FAIL'}")
193
+
194
+ # Unmerge
195
+ rotation_layer.unmerge()
196
+ with torch.no_grad():
197
+ unmerged_output = rotation_layer(x)
198
+
199
+ unmerge_close = torch.allclose(original_output, unmerged_output, atol=1e-4, rtol=1e-3)
200
+ print(f" Unmerge: {'✅ PASS' if unmerge_close else '❌ FAIL'}")
201
+
202
+ all_passed = all_passed and merge_close and unmerge_close
203
+
204
+ return all_passed
205
+
206
+
207
+ def test_with_different_dtypes():
208
+ """
209
+ Test merging with different data types.
210
+ """
211
+ print("\n" + "="*60)
212
+ print("Testing Different Data Types")
213
+ print("="*60)
214
+
215
+ torch.manual_seed(42)
216
+
217
+ dtypes = [torch.float32, torch.float16, torch.bfloat16]
218
+ all_passed = True
219
+
220
+ for dtype in dtypes:
221
+ print(f"\nTesting with dtype: {dtype}")
222
+
223
+ in_features = 256
224
+ out_features = 512
225
+ r = 4
226
+ num_rotations = 4
227
+
228
+ base_layer = nn.Linear(in_features, out_features, bias=True)
229
+ base_layer = base_layer.to(dtype)
230
+
231
+ rotation_layer = Linear(
232
+ base_layer=base_layer,
233
+ adapter_name="default",
234
+ r=r,
235
+ T=1.0,
236
+ num_rotations=num_rotations
237
+ )
238
+ rotation_layer = rotation_layer.to(dtype)
239
+
240
+ x = torch.randn(4, 8, in_features, dtype=dtype)
241
+ rotation_layer.eval()
242
+
243
+ with torch.no_grad():
244
+ output_before = rotation_layer(x)
245
+ rotation_layer.merge(safe_merge=True)
246
+ output_after = rotation_layer(x)
247
+
248
+ # Adjust tolerances based on dtype
249
+ if dtype == torch.float32:
250
+ atol, rtol = 1e-5, 1e-4
251
+ elif dtype == torch.float16:
252
+ atol, rtol = 1e-2, 1e-2
253
+ else: # bfloat16
254
+ atol, rtol = 1e-2, 1e-2
255
+
256
+ are_close = torch.allclose(output_before, output_after, atol=atol, rtol=rtol)
257
+
258
+ if are_close:
259
+ print(f" ✅ PASS")
260
+ else:
261
+ max_diff = (output_after - output_before).abs().max().item()
262
+ print(f" ❌ FAIL (max diff: {max_diff:.6e})")
263
+
264
+ all_passed = all_passed and are_close
265
+
266
+ return all_passed
267
+
268
+
269
+ if __name__ == "__main__":
270
+ print("\n" + "="*60)
271
+ print("ROTATION LAYER MERGE TEST SUITE")
272
+ print("="*60)
273
+
274
+ results = {}
275
+
276
+ # Run all tests
277
+ results["basic_merge"] = test_rotation_merge()
278
+ results["multiple_cycles"] = test_multiple_merges()
279
+ results["different_dtypes"] = test_with_different_dtypes()
280
+
281
+ # Print summary
282
+ print("\n" + "="*60)
283
+ print("FINAL SUMMARY")
284
+ print("="*60)
285
+
286
+ for test_name, passed in results.items():
287
+ status = "✅ PASS" if passed else "❌ FAIL"
288
+ print(f"{test_name}: {status}")
289
+
290
+ all_passed = all(results.values())
291
+ print("\n" + "="*60)
292
+ if all_passed:
293
+ print("🎉 ALL TESTS PASSED!")
294
+ else:
295
+ print("⚠️ SOME TESTS FAILED")
296
+ print("="*60)
omini/rotation/model.py ADDED
@@ -0,0 +1,390 @@
1
+ from typing import Optional
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from enum import Enum
6
+ from dataclasses import asdict
7
+ from tqdm import tqdm
8
+
9
+
10
+ from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer, check_target_module_exists, onload_layer
11
+
12
+ from peft.utils import TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING, ModulesToSaveWrapper, _get_submodules
13
+
14
+ from .layer import RotationLayer, Linear
15
+
16
+ TRANSFORMERS_MODELS_TO_ROTATION_TARGET_MODULES_MAPPING = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.copy()
17
+
18
+ class RotationTuner(BaseTuner):
19
+
20
+ prefix: str = "rotation_"
21
+ tuner_layer_class = RotationLayer
22
+ target_module_mapping = TRANSFORMERS_MODELS_TO_ROTATION_TARGET_MODULES_MAPPING
23
+
24
+
25
+ @staticmethod
26
+ def _check_target_module_exists(rotation_config, key: str) -> bool:
27
+ return check_target_module_exists(rotation_config, key)
28
+
29
+ def _create_and_replace(
30
+ self,
31
+ rotation_config,
32
+ adapter_name: str,
33
+ target: nn.Module,
34
+ target_name: str,
35
+ parent: nn.Module,
36
+ current_key: str,
37
+ **optional_kwargs,
38
+ ) -> None:
39
+ """
40
+ Create and replace a target module with a rotation-augmented version.
41
+
42
+ This method is called when an existing module is already a RotationLayer
43
+ and needs to have a new adapter added to it.
44
+
45
+ Args:
46
+ rotation_config: Configuration for the rotation adapter
47
+ adapter_name: Name of the adapter to add
48
+ target: The target module to augment
49
+ target_name: Name of the target module
50
+ parent: Parent module containing the target
51
+ current_key: Full key path to the current module
52
+ **optional_kwargs: Additional optional arguments
53
+
54
+ Raises:
55
+ ValueError: If current_key is not provided
56
+ """
57
+
58
+ if current_key is None:
59
+ raise ValueError("current_key must be provided to create Rotation layer")
60
+
61
+ # Check if target is already a RotationLayer
62
+ if isinstance(target, RotationLayer):
63
+ target.update_layer(
64
+ adapter_name=adapter_name,
65
+ r=rotation_config.r,
66
+ T=rotation_config.T,
67
+ num_rotations=rotation_config.num_rotations,
68
+ )
69
+ else:
70
+ # Create new rotation layer
71
+ new_module = self._create_new_module(
72
+ rotation_config=rotation_config,
73
+ adapter_name=adapter_name,
74
+ target=target,
75
+ **optional_kwargs,
76
+ )
77
+ if new_module is not None:
78
+ self._replace_module(parent, target_name, new_module, target)
79
+
80
+ def _replace_module(self, parent, child_name, new_module, child):
81
+
82
+ setattr(parent, child_name, new_module)
83
+
84
+ # child layer wraps the original module, unpack it
85
+ if hasattr(child, "base_layer"):
86
+ child = child.base_layer
87
+
88
+ meta = torch.device("meta")
89
+ # dispatch to correct device
90
+ for name, module in new_module.named_modules():
91
+ if (self.prefix in name) or ("ranknum" in name):
92
+ if hasattr(child, "qweight"):
93
+ weight = child.qweight
94
+ elif hasattr(child, "W_q"):
95
+ weight = child.W_q
96
+ elif hasattr(child, "weight"):
97
+ weight = child.weight
98
+ elif getattr(child, "in_proj_weight", None) is not None: # MHA
99
+ weight = child.in_proj_weight
100
+ else:
101
+ weight = next(child.parameters())
102
+ if not any(p.device == meta for p in module.parameters()):
103
+ module.to(weight.device)
104
+
105
+ def _mark_only_adapters_as_trainable(self, model):
106
+
107
+ # First, freeze all parameters
108
+ for n, p in model.named_parameters():
109
+ if self.prefix not in n:
110
+ p.requires_grad = False
111
+ else:
112
+ p.requires_grad = True
113
+
114
+ # Handle bias parameters based on config
115
+ for active_adapter in self.active_adapters:
116
+ bias_config = self.peft_config[active_adapter].bias
117
+
118
+ if bias_config == "none":
119
+ continue
120
+ elif bias_config == "all":
121
+ # Enable all bias parameters
122
+ for n, p in model.named_parameters():
123
+ if "bias" in n:
124
+ p.requires_grad = True
125
+ elif bias_config == "rotation_only":
126
+ # Enable only bias in rotation layers
127
+ for name, m in model.named_modules():
128
+ if isinstance(m, RotationLayer):
129
+ if hasattr(m, "bias") and m.bias is not None:
130
+ m.bias.requires_grad = True
131
+ else:
132
+ raise NotImplementedError(
133
+ f"Requested bias configuration '{bias_config}' is not implemented. "
134
+ f"Supported values: 'none', 'all', 'rotation_only'"
135
+ )
136
+
137
+ @staticmethod
138
+ def _create_new_module(
139
+ rotation_config,
140
+ adapter_name: str,
141
+ target: nn.Module,
142
+ **kwargs,
143
+ ) -> Optional[nn.Module]:
144
+ """
145
+ Create a new rotation-augmented module.
146
+
147
+ Args:
148
+ rotation_config: Configuration for the rotation adapter
149
+ adapter_name: Name of the adapter
150
+ target: Base module to augment
151
+ **kwargs: Additional arguments
152
+
153
+ Returns:
154
+ New RotationLayer module wrapping the target, or None if unsupported
155
+ """
156
+ if isinstance(target, nn.Linear):
157
+ return Linear(
158
+ base_layer=target,
159
+ adapter_name=adapter_name,
160
+ r=rotation_config.r,
161
+ T=rotation_config.T,
162
+ num_rotations=rotation_config.num_rotations,
163
+ **kwargs,
164
+ )
165
+ else:
166
+ # Unsupported layer type
167
+ print(
168
+ f"Rotation layer does not support {type(target).__name__} yet. "
169
+ f"Skipping this module."
170
+ )
171
+ return None
172
+
173
+
174
+ def __getattr__(self, name: str):
175
+ """Forward missing attributes to the wrapped module."""
176
+ try:
177
+ return super().__getattr__(name) # defer to nn.Module's logic
178
+ except AttributeError:
179
+ if name == "model": # see #1892: prevent infinite recursion if class is not initialized
180
+ raise
181
+ return getattr(self.model, name)
182
+
183
+ def get_peft_config_as_dict(self, inference: bool = False):
184
+ config_dict = {}
185
+ for key, value in self.peft_config.items():
186
+ config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()}
187
+ if inference:
188
+ config["inference_mode"] = True
189
+ config_dict[key] = config
190
+ return config_dict
191
+
192
+
193
+ def _set_adapter_layers(self, enabled=True):
194
+ for module in self.model.modules():
195
+ if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)):
196
+ module.enable_adapters(enabled)
197
+
198
+ def enable_adapter_layers(self) -> None:
199
+ """Enable all adapters.
200
+
201
+ Call this if you have previously disabled all adapters and want to re-enable them.
202
+ """
203
+ self._set_adapter_layers(enabled=True)
204
+
205
+ def disable_adapter_layers(self):
206
+ for active_adapter in self.active_adapters:
207
+ val = self.peft_config[active_adapter].bias
208
+ if val != "none":
209
+ msg = (
210
+ f"Careful, disabling adapter layers with bias configured to be '{val}' does not produce the same "
211
+ "output as the base model would without adaption."
212
+ )
213
+ print(msg)
214
+ self._set_adapter_layers(enabled=False)
215
+
216
+ def set_adapter(self, adapter_name):
217
+ """Set the active adapter(s).
218
+
219
+ Additionally, this function will set the specified adapters to trainable (i.e., requires_grad=True). If this is
220
+ not desired, use the following code.
221
+
222
+ ```py
223
+ >>> for name, param in model_peft.named_parameters():
224
+ ... if ...: # some check on name (ex. if 'rotation' in name)
225
+ ... param.requires_grad = False
226
+ ```
227
+
228
+ Args:
229
+ adapter_name (`str` or `list[str]`): Name of the adapter(s) to be activated.
230
+ """
231
+ for module in self.model.modules():
232
+ if isinstance(module, RotationLayer):
233
+ if module.merged:
234
+ print("Adapter cannot be set when the model is merged. Unmerging the model first.")
235
+ module.unmerge()
236
+ module.set_adapter(adapter_name)
237
+ self.active_adapter = adapter_name
238
+
239
+ def merge_adapter(self, adapter_names: Optional[list[str]] = None) -> None:
240
+ """
241
+ Merge adapter weights into the base model weights.
242
+
243
+ This can speed up inference by eliminating the need for runtime
244
+ rotation computations.
245
+
246
+ Args:
247
+ adapter_names: List of adapter names to merge. If None, merges all
248
+ active adapters.
249
+ """
250
+ for module in self.model.modules():
251
+ if isinstance(module, RotationLayer):
252
+ module.merge(safe_merge=False, adapter_names=adapter_names)
253
+
254
+
255
+ def unmerge_adapter(self) -> None:
256
+ """
257
+ Unmerge adapter weights from the base model weights.
258
+
259
+ This reverses the merge operation, restoring dynamic adapter behavior.
260
+ """
261
+ for module in self.model.modules():
262
+ if isinstance(module, RotationLayer):
263
+ module.unmerge()
264
+
265
+ @staticmethod
266
+ def _prepare_adapter_config(peft_config, model_config):
267
+
268
+ if peft_config.target_modules is None:
269
+ if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_ROTATION_TARGET_MODULES_MAPPING:
270
+ raise ValueError("Please specify `target_modules` in `peft_config`")
271
+ peft_config.target_modules = set(
272
+ TRANSFORMERS_MODELS_TO_ROTATION_TARGET_MODULES_MAPPING[model_config["model_type"]]
273
+ )
274
+
275
+ return peft_config
276
+
277
+
278
+ def _check_new_adapter_config(self, config) -> None:
279
+ """
280
+ Check the validity of a new adapter configuration.
281
+
282
+ Args:
283
+ config: Configuration to validate
284
+
285
+ Raises:
286
+ ValueError: If configuration is invalid
287
+ """
288
+ # Validate rank
289
+ if config.r <= 0:
290
+ raise ValueError(f"r must be positive, got {config.r}")
291
+
292
+ # Validate num_rotations
293
+ if config.num_rotations <= 0:
294
+ raise ValueError(
295
+ f"num_rotations must be positive, got {config.num_rotations}"
296
+ )
297
+
298
+
299
+ # Validate bias configuration
300
+ valid_bias_configs = ["none", "all", "rotation_only"]
301
+ if hasattr(config, "bias") and config.bias not in valid_bias_configs:
302
+ raise ValueError(
303
+ f"Invalid bias configuration '{config.bias}'. "
304
+ f"Must be one of {valid_bias_configs}"
305
+ )
306
+
307
+
308
+ def _unload_and_optionally_merge(
309
+ self,
310
+ merge=True,
311
+ progressbar: bool = False,
312
+ safe_merge: bool = False,
313
+ adapter_names: Optional[list[str]] = None,
314
+ ):
315
+ if merge:
316
+ self._check_merge_allowed()
317
+
318
+ key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key]
319
+ desc = "Unloading " + ("and merging " if merge else "") + "model"
320
+ for key in tqdm(key_list, disable=not progressbar, desc=desc):
321
+ try:
322
+ parent, target, target_name = _get_submodules(self.model, key)
323
+ except AttributeError:
324
+ continue
325
+ with onload_layer(target):
326
+ if hasattr(target, "unload_and_optionally_merge_module"):
327
+ # if layers have special unloading method, like MultiheadAttention, use that
328
+ unloaded_module = target.unload_and_optionally_merge_module(
329
+ merge=merge, safe_merge=safe_merge, adapter_names=adapter_names
330
+ )
331
+ self._replace_module(parent, target_name, unloaded_module, target)
332
+ elif hasattr(target, "base_layer"):
333
+ if merge:
334
+ target.merge(safe_merge=safe_merge, adapter_names=adapter_names)
335
+ self._replace_module(parent, target_name, target.get_base_layer(), target)
336
+
337
+ return self.model
338
+
339
+ def delete_adapter(self, adapter_name: str) -> None:
340
+ """
341
+ Deletes an existing adapter.
342
+
343
+ Args:
344
+ adapter_name (str): Name of the adapter to be deleted.
345
+ """
346
+ if adapter_name not in list(self.peft_config.keys()):
347
+ raise ValueError(f"Adapter {adapter_name} does not exist")
348
+ del self.peft_config[adapter_name]
349
+
350
+ key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key]
351
+ new_adapter = None
352
+ for key in key_list:
353
+ _, target, _ = _get_submodules(self.model, key)
354
+ if isinstance(target, RotationLayer):
355
+ target.delete_adapter(adapter_name)
356
+ if new_adapter is None:
357
+ new_adapter = target.active_adapters[:]
358
+
359
+ self.active_adapter = new_adapter or []
360
+ self._delete_auxiliary_adapter(adapter_name, new_active_adapters=new_adapter)
361
+
362
+ def merge_and_unload(
363
+ self, progressbar: bool = False, safe_merge: bool = False, adapter_names: Optional[list[str]] = None
364
+ ) -> torch.nn.Module:
365
+ r"""
366
+ This method merges the rotation layers into the base model. This is needed if someone wants to use the base model as
367
+ a standalone model.
368
+
369
+ Args:
370
+ progressbar (`bool`):
371
+ whether to show a progressbar indicating the unload and merge process
372
+ safe_merge (`bool`):
373
+ whether to activate the safe merging check to check if there is any potential Nan in the adapter
374
+ weights
375
+ adapter_names (`List[str]`, *optional*):
376
+ The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults
377
+ to `None`.
378
+
379
+ """
380
+ return self._unload_and_optionally_merge(
381
+ progressbar=progressbar, safe_merge=safe_merge, adapter_names=adapter_names
382
+ )
383
+
384
+ def unload(self) -> torch.nn.Module:
385
+ """
386
+ Gets back the base model by removing all the rotation modules without merging. This gives back the original base
387
+ model.
388
+ """
389
+ return self._unload_and_optionally_merge(merge=False)
390
+
omini/rotation/rotation_config.py ADDED
@@ -0,0 +1,81 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import List, Optional
3
+ from peft.config import PeftConfig
4
+
5
+
6
+ @dataclass
7
+ class RotationConfig(PeftConfig):
8
+ """
9
+ Configuration class for Rotation-based Parameter-Efficient Fine-Tuning.
10
+
11
+ This configuration stores all parameters needed to apply the Rotation method
12
+ (based on Cayley transformation) to a model's linear layers.
13
+
14
+ Args:
15
+ r (`int`):
16
+ The rank parameter for the low-rank approximation in rotation matrices.
17
+ T (`float`, *optional*, defaults to 1.0):
18
+ Temperature parameter for the transformation.
19
+ num_rotations (`int`, *optional*, defaults to 4):
20
+ Number of rotation matrices to use in parallel.
21
+ target_modules (`Union[List[str], str]`):
22
+ Module names to apply rotation to (e.g., ["q_proj", "v_proj"]).
23
+ target_modules_to_skip (`Union[List[str], str]`, *optional*):
24
+ Module names to skip when applying rotation.
25
+ modules_to_save (`Union[List[str], str]`, *optional*):
26
+ Modules to save in addition to rotation parameters.
27
+ layers_to_transform (`Union[List[int], int]`, *optional*):
28
+ Layers to transform. If None, all layers matching target_modules are transformed.
29
31
+ """
32
+
33
+ peft_type: str = field(default="ROTATION", init=False)
34
+ target_modules: Optional[List[str]] = field(
35
+ default=None,
36
+ metadata={
37
+ "help": "List of module names to apply rotation to (e.g., ['q_proj', 'v_proj', 'linear'])"
38
+ },
39
+ )
40
+ target_modules_to_skip: Optional[List[str]] = field(
41
+ default=None,
42
+ metadata={"help": "List of module names to skip when applying rotation"},
43
+ )
44
+ modules_to_save: Optional[List[str]] = field(
45
+ default=None,
46
+ metadata={"help": "List of modules to save in addition to rotation parameters"},
47
+ )
48
+ r: int = field(
49
+ default=8,
50
+ metadata={"help": "Rank parameter for low-rank approximation"},
51
+ )
52
+ T: float = field(
53
+ default=1.0,
54
+ metadata={"help": "Temperature parameter for Cayley transformation"},
55
+ )
56
+ num_rotations: int = field(
57
+ default=4,
58
+ metadata={"help": "Number of rotation matrices to use in parallel"},
59
+ )
60
+
61
+ bias: str = field(
62
+ default="none",
63
+ metadata={
64
+ "help": "Bias training configuration. Options: 'none', 'all', 'rotation_only'"
65
+ }
66
+ )
67
+ layers_to_transform: Optional[List[int]] = field(
68
+ default=None,
69
+ metadata={"help": "Layers to transform. If None, all matching layers are transformed"},
70
+ )
71
+
72
+ def __post_init__(self):
73
+ self.peft_type = "ROTATION"
74
+ self.target_modules = (
75
+ set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
76
+ )
77
+ self.target_modules_to_skip = (
78
+ set(self.target_modules_to_skip)
79
+ if isinstance(self.target_modules_to_skip, list)
80
+ else self.target_modules_to_skip
81
+ )
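Putting the tuner and this configuration together, a minimal end-to-end sketch on a toy model might look as follows (untested; it assumes the installed `peft` version's `BaseTuner` accepts the `(model, config, adapter_name)` constructor used here, and the module names are purely illustrative):

```python
import torch.nn as nn
from omini.rotation.model import RotationTuner
from omini.rotation.rotation_config import RotationConfig

# Toy model whose sub-module names match target_modules.
model = nn.ModuleDict({"q_proj": nn.Linear(64, 64), "v_proj": nn.Linear(64, 64)})

config = RotationConfig(r=4, T=1.0, num_rotations=2, target_modules=["q_proj", "v_proj"])
tuner = RotationTuner(model, config, adapter_name="default")

# Inspect what the tuner left trainable after wrapping the targeted layers.
print([n for n, p in tuner.model.named_parameters() if p.requires_grad])

# Rotations can be folded into the wrapped weights for inference and restored later ...
tuner.merge_adapter()
tuner.unmerge_adapter()

# ... or merged permanently, returning the plain base model without adapter wrappers.
base_model = tuner.merge_and_unload(safe_merge=True)
```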
omini/train_flux/train_custom.py ADDED
@@ -0,0 +1,50 @@
1
+ import torch
2
+ import os
3
+ import random
4
+ from torch.utils.data import DataLoader, Dataset
5
+
6
+ from PIL import Image
7
+
8
+ from datasets import load_dataset
9
+
10
+ from .trainer import OminiModel, get_config, train
11
+ from ..pipeline.flux_omini import Condition, generate
12
+
13
+
14
+ class CustomDataset(Dataset):
15
+ def __getitem__(self, idx):
16
+ # TODO: Implement the logic to load your custom dataset
17
+ raise NotImplementedError("Custom dataset loading not implemented")
18
+
19
+
20
+ @torch.no_grad()
21
+ def test_function(model, save_path, file_name):
22
+ # TODO: Implement the logic to generate a sample using the model
23
+ raise NotImplementedError("Sample generation not implemented")
24
+
25
+
26
+ def main():
27
+ # Initialize
28
+ config = get_config()
29
+ training_config = config["train"]
30
+ torch.cuda.set_device(int(os.environ.get("LOCAL_RANK", 0)))
31
+
32
+ # Initialize custom dataset
33
+ dataset = CustomDataset()
34
+
35
+ # Initialize model
36
+ trainable_model = OminiModel(
37
+ flux_pipe_id=config["flux_path"],
38
+ lora_config=training_config["lora_config"],
39
+ device=f"cuda",
40
+ dtype=getattr(torch, config["dtype"]),
41
+ optimizer_config=training_config["optimizer"],
42
+ model_config=config.get("model", {}),
43
+ gradient_checkpointing=training_config.get("gradient_checkpointing", False),
44
+ )
45
+
46
+ train(dataset, trainable_model, config, test_function)
47
+
48
+
49
+ if __name__ == "__main__":
50
+ main()
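The two `NotImplementedError` stubs above are the only parts a user needs to supply. A hypothetical sketch of such a dataset (the class and attribute names `FolderCaptionDataset`, `image_paths`, `prompts` are made up here; the returned `"image"`/`"description"` keys simply mirror `ImageMultiConditionDataset` in `train_multi_condition.py`, while the exact keys required depend on `OminiModel`'s training step in the trainer module, which is not part of this commit):

```python
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms


class FolderCaptionDataset(Dataset):
    """Hypothetical stand-in for CustomDataset: images on disk plus caption strings."""

    def __init__(self, image_paths, prompts, target_size=(512, 512)):
        self.image_paths = image_paths
        self.prompts = prompts
        self.target_size = target_size
        self.to_tensor = transforms.ToTensor()

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image = Image.open(self.image_paths[idx]).convert("RGB").resize(self.target_size)
        return {
            "image": self.to_tensor(image),
            "description": self.prompts[idx],
        }
```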
omini/train_flux/train_multi_condition.py ADDED
@@ -0,0 +1,160 @@
1
+ import torch
2
+ import os
3
+ import random
4
+
5
+ from PIL import Image, ImageDraw
6
+
7
+ from datasets import load_dataset
8
+
9
+ from .trainer import OminiModel, get_config, train
10
+ from ..pipeline.flux_omini import Condition, convert_to_condition, generate
11
+ from .train_spatial_alignment import ImageConditionDataset
12
+
13
+
14
+ class ImageMultiConditionDataset(ImageConditionDataset):
15
+ def __getitem__(self, idx):
16
+ image = self.base_dataset[idx]["jpg"]
17
+ image = image.resize(self.target_size).convert("RGB")
18
+ description = self.base_dataset[idx]["json"]["prompt"]
19
+
20
+ condition_size = self.condition_size
21
+ position_scale = self.position_scale
22
+
23
+ condition_imgs, position_deltas = [], []
24
+ for c_type in self.condition_type:
25
+ condition_img, position_delta = self.__get_condition__(image, c_type)
26
+ condition_imgs.append(condition_img.convert("RGB"))
27
+ position_deltas.append(position_delta)
28
+
29
+ # Randomly drop text or image (for training)
30
+ drop_text = random.random() < self.drop_text_prob
31
+ drop_image = random.random() < self.drop_image_prob
32
+
33
+ if drop_text:
34
+ description = ""
35
+ if drop_image:
36
+ condition_imgs = [
37
+ Image.new("RGB", condition_size)
38
+ for _ in range(len(self.condition_type))
39
+ ]
40
+
41
+ return_dict = {
42
+ "image": self.to_tensor(image),
43
+ "description": description,
44
+ **({"pil_image": [image, condition_img]} if self.return_pil_image else {}),
45
+ }
46
+
47
+ for i, c_type in enumerate(self.condition_type):
48
+ return_dict[f"condition_{i}"] = self.to_tensor(condition_imgs[i])
49
+ return_dict[f"condition_type_{i}"] = self.condition_type[i]
50
+ return_dict[f"position_delta_{i}"] = position_deltas[i]
51
+ return_dict[f"position_scale_{i}"] = position_scale
52
+
53
+ return return_dict
54
+
55
+
56
+ @torch.no_grad()
57
+ def test_function(model, save_path, file_name):
58
+ condition_size = model.training_config["dataset"]["condition_size"]
59
+ target_size = model.training_config["dataset"]["target_size"]
60
+
61
+ position_delta = model.training_config["dataset"].get("position_delta", [0, 0])
62
+ position_scale = model.training_config["dataset"].get("position_scale", 1.0)
63
+
64
+ condition_type = model.training_config["condition_type"]
65
+ test_list = []
66
+
67
+ condition_list = []
68
+ for i, c_type in enumerate(condition_type):
69
+ if c_type in ["canny", "coloring", "deblurring", "depth"]:
70
+ image = Image.open("assets/vase_hq.jpg")
71
+ image = image.resize(condition_size)
72
+ condition_img = convert_to_condition(c_type, image, 5)
73
+ elif c_type == "fill":
74
+ condition_img = image.resize(condition_size).convert("RGB")
75
+ w, h = image.size
76
+ x1, x2 = sorted([random.randint(0, w), random.randint(0, w)])
77
+ y1, y2 = sorted([random.randint(0, h), random.randint(0, h)])
78
+ mask = Image.new("L", image.size, 0)
79
+ draw = ImageDraw.Draw(mask)
80
+ draw.rectangle([x1, y1, x2, y2], fill=255)
81
+ if random.random() > 0.5:
82
+ mask = Image.eval(mask, lambda a: 255 - a)
83
+ condition_img = Image.composite(
84
+ image, Image.new("RGB", image.size, (0, 0, 0)), mask
85
+ )
86
+ else:
87
+ raise NotImplementedError
88
+ condition = Condition(
89
+ condition_img,
90
+ model.adapter_names[i + 2],
91
+ position_delta,
92
+ position_scale,
93
+ )
94
+ condition_list.append(condition)
95
+ test_list.append((condition_list, "A beautiful vase on a table."))
96
+ os.makedirs(save_path, exist_ok=True)
97
+ for i, (condition, prompt) in enumerate(test_list):
98
+ generator = torch.Generator(device=model.device)
99
+ generator.manual_seed(42)
100
+
101
+ res = generate(
102
+ model.flux_pipe,
103
+ prompt=prompt,
104
+ conditions=condition_list,
105
+ height=target_size[0],
106
+ width=target_size[1],
107
+ generator=generator,
108
+ model_config=model.model_config,
109
+ kv_cache=model.model_config.get("independent_condition", False),
110
+ )
111
+ file_path = os.path.join(
112
+ save_path, f"{file_name}_{'|'.join(condition_type)}_{i}.jpg"
113
+ )
114
+ res.images[0].save(file_path)
115
+
116
+
117
+ def main():
118
+ # Initialize
119
+ config = get_config()
120
+ training_config = config["train"]
121
+ torch.cuda.set_device(int(os.environ.get("LOCAL_RANK", 0)))
122
+
123
+ # Initialize dataset
124
+ dataset = load_dataset(
125
+ "webdataset",
126
+ data_files={"train": training_config["dataset"]["urls"]},
127
+ split="train",
128
+ cache_dir="cache/t2i2m",
129
+ num_proc=32,
130
+ )
131
+ dataset = ImageMultiConditionDataset(
132
+ dataset,
133
+ condition_size=training_config["dataset"]["condition_size"],
134
+ target_size=training_config["dataset"]["target_size"],
135
+ condition_type=training_config["condition_type"],
136
+ drop_text_prob=training_config["dataset"]["drop_text_prob"],
137
+ drop_image_prob=training_config["dataset"]["drop_image_prob"],
138
+ position_scale=training_config["dataset"].get("position_scale", 1.0),
139
+ )
140
+
141
+ cond_n = len(training_config["condition_type"])
142
+
143
+ # Initialize model
144
+ trainable_model = OminiModel(
145
+ flux_pipe_id=config["flux_path"],
146
+ lora_config=training_config["lora_config"],
147
+ device=f"cuda",
148
+ dtype=getattr(torch, config["dtype"]),
149
+ optimizer_config=training_config["optimizer"],
150
+ model_config=config.get("model", {}),
151
+ gradient_checkpointing=training_config.get("gradient_checkpointing", False),
152
+ adapter_names=[None, None, *["default"] * cond_n],
153
+ # In this setting, all the conditions are using the same LoRA adapter
154
+ )
155
+
156
+ train(dataset, trainable_model, config, test_function)
157
+
158
+
159
+ if __name__ == "__main__":
160
+ main()
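For orientation, the configuration keys that `main()` and `test_function()` above actually read can be collected from the code; the sketch below lays them out as a plain Python dict (all values are illustrative placeholders, `position_delta` is optional, and `get_config()` itself lives in the trainer module, which is not part of this commit):

```python
config_sketch = {
    "flux_path": "path-or-hub-id-of-the-FLUX-pipeline",
    "dtype": "bfloat16",                        # resolved via getattr(torch, config["dtype"])
    "model": {"independent_condition": False},
    "train": {
        "condition_type": ["canny", "depth"],   # one condition slot per entry
        "lora_config": {},                      # forwarded to OminiModel (contents not shown here)
        "optimizer": {},                        # forwarded to OminiModel
        "gradient_checkpointing": False,
        "dataset": {
            "urls": "webdataset-shard-urls",
            "condition_size": (512, 512),
            "target_size": (512, 512),
            "drop_text_prob": 0.1,
            "drop_image_prob": 0.1,
            "position_scale": 1.0,
        },
    },
}
```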