Niklas Schulte committed on
Commit
6ac38d4
•
1 Parent(s): 477f8c0

Delete unused model files and adapter configurations

Browse files
README.md DELETED
@@ -1,45 +0,0 @@
1
- ---
2
- license: mit
3
- language:
4
- - en
5
- ---
6
-
7
- # Master Thesis: High-Fidelity Video Background Music Generation using Transformers
8
- This is the corresponding GitLab repository of my Master Thesis. The goal of this thesis is to generate video background
9
- music by adapting MusicGen (https://arxiv.org/pdf/2306.05284.pdf) to video input as an additional input modality.
10
- This is accomplished by mapping video information into the T5 text embedding space on which MusicGen usually
11
- operates. To this end, a Transformer encoder network, called the Video Encoder, is trained to perform this mapping. Two options are
12
- supported within the training loop for the Video Encoder:
13
-
14
- - freezing the weights within the MusicGen Audio Decoder
15
- adjusting the weights of the MusicGen Audio Decoder with Parameter-Efficient Fine-Tuning (PEFT) using LoRA (https://arxiv.org/abs/2106.09685), as sketched below
16
-
17
-
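The second option mirrors the LoRA adapter configuration (`adapter_config.json`) that is removed later in this commit. Below is a minimal sketch of how such an adapter could be attached to the MusicGen language model with PEFT; the base model id and the final `print_trainable_parameters()` call are illustrative assumptions, not the exact thesis training code.

```python
# Hedged sketch: attach a LoRA adapter to MusicGen's language model for PEFT training.
# r, lora_alpha, lora_dropout and target_modules mirror the adapter_config.json
# deleted in this commit; everything else is an assumption.
import torch
from peft import LoraConfig, get_peft_model
from audiocraft.models import MusicGen

device = "cuda" if torch.cuda.is_available() else "cpu"

musicgen = MusicGen.get_pretrained("facebook/musicgen-small")  # assumed base model
musicgen.lm.to(device)

lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["out_proj"],  # adapt only the attention output projections
)
musicgen.lm = get_peft_model(musicgen.lm, lora_config)
musicgen.lm.print_trainable_parameters()  # only the LoRA matrices remain trainable
```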
18
- # Installation
19
- - create a Python virtual environment with `Python 3.11`
20
- - check https://pytorch.org/get-started/previous-versions/ to install `PyTorch 2.1.0` with `CUDA` on your machine
21
- - install the local fork of audiocraft: `cd audiocraft; pip install -e .`
22
- - install the other requirements: `pip install -r requirements.txt`
23
-
24
-
25
- # Folder Structure
26
- - `audiocraft` contains a local fork of the audiocraft library (https://github.com/facebookresearch/audiocraft) with
27
- minor changes to the generation method; further information can be found in `code/code_adaptations_audiocraft`.
28
- - `code` contains the code for model `training` and `inference` of video background music
29
- - `datasets` contains the code to create the datasets used for training within `data_preparation` and video examples
30
- used for the evaluation in `example_videos`
31
- - `evaluation` contains the code used to evaluate the datasets and created video embeddings
32
- `gradio_app` contains the code for the user interface used to generate video background music
33
-
34
- # Training
35
- To train the models, set the training parameters in `training/training_conf.yml` and start training with
36
- `python training/training.py`. The model weights will be stored under `training/models_audiocraft` or
37
- `training/models_peft`, respectively; a sketch of the configuration fields read back at inference time follows below.
38
-
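The exact schema of `training/training_conf.yml` is not part of this commit. As a rough orientation, the sketch below lists the configuration fields that `inference.py` later reads back from the saved `configuration.yml`; the concrete values are placeholders, not the thesis defaults.

```python
# Hedged sketch of the configuration fields consumed at inference time.
# All values below are illustrative assumptions.
from omegaconf import OmegaConf

conf = OmegaConf.create({
    "musicgen_model_id": "facebook/musicgen-stereo-small",
    "video_extraction_framerate": 1,   # frames per second fed to the ResNet feature extractor
    "encoder_input_dimension": 2048,   # ResNet-50 feature size
    "encoder_heads": 8,
    "encoder_dim_feedforward": 2048,
    "encoder_layers": 4,
})
OmegaConf.save(conf, "training/training_conf.yml")
```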
39
- # Inference
40
- - start the user interface by running `python gradio_app/app.py`
41
- inside the interface, select a video and set the generation parameters
42
- click on "Submit" to start the generation (a programmatic alternative without the UI is sketched below)
43
-
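For batch or headless use, the same pipeline can be driven without the interface by calling `generate_background_music` from `inference.py` directly, mirroring the call made in `gradio_app.py`; the input path below is a placeholder.

```python
# Hedged sketch: generate background music for one clip without the Gradio UI.
# Parameter values mirror the defaults used in gradio_app.py; the input path is assumed.
import torch
import inference  # the inference.py module from this repository

device = "cuda" if torch.cuda.is_available() else "cpu"

result_video = inference.generate_background_music(
    video_path="example.mp4",   # hypothetical 5-20 s input clip
    dataset="nature",           # Video Encoder variant: "nature" or "symmv"
    musicgen_size="small",      # "small", "medium" or "large"
    use_stereo=True,
    use_peft=True,              # use the LoRA-adapted MusicGen Audio Decoder
    musicgen_temperature=1.0,
    musicgen_guidance_scale=3.0,
    top_k_sampling=250,
    device=device,
)
print(f"Rendered video with generated soundtrack: {result_video}")
```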
44
- # Contact
45
- For any questions contact me at [niklas.schulte@rwth-aachen.de](mailto:niklas.schulte@rwth-aachen.de)
gradio_app.py DELETED
@@ -1,78 +0,0 @@
1
- import gradio as gr
2
- import os
3
- import sys
4
- sys.path.insert(1, '../training_audiocraft/inference')
5
- import inference
6
- import torch
7
-
8
- device = "cuda" if torch.cuda.is_available() else "cpu"
9
-
10
- def generate_background_music(video_path, dataset, use_peft, musicgen_size):
11
- print(f"Start generating background music for {video_path} with model \"{'peft' if use_peft else 'audiocraft'}_{dataset}_{musicgen_size}\"")
12
-
13
- new_video_path = inference.generate_background_music(
14
- video_path=video_path,
15
- dataset=dataset,
16
- musicgen_size=musicgen_size,
17
- use_stereo=True,
18
- use_peft=use_peft,
19
- musicgen_temperature=1.0,
20
- musicgen_guidance_scale=3.0,
21
- top_k_sampling=250,
22
- device=device
23
- )
24
- return gr.Video(new_video_path)
25
-
26
-
27
- interface = gr.Interface(fn=generate_background_music,
28
- inputs=[
29
- gr.Video(
30
- label="video input",
31
- min_length=5,
32
- max_length=20,
33
- sources=['upload'],
34
- show_download_button=True,
35
- include_audio=True
36
- ),
37
- gr.Radio(["nature", "symmv"],
38
- label="Video Encoder Version",
39
- value="nature",
40
- info="Choose one of the available Video Encoders."),
41
- gr.Radio([False, True],
42
- label="Use MusicGen Audio Decoder Model trained with PEFT",
43
- value=False,
44
- info="If set to 'True' the MusicGen Audio Decoder models trained with LoRA "
45
- "(Low Rank Adaptation) are used. If set to 'False', the original "
46
- "MusicGen models are used."),
47
- gr.Radio(["small", "medium", "large"],
48
- label="MusicGen Audio Decoder Size",
49
- value="small",
50
- info="Choose the size of the MusicGen audio decoder."),
51
- ],
52
-
53
- outputs=[gr.Video(label="video output")],
54
- examples=[
55
- [os.path.abspath("../videos/study/n_1.mp4"), "nature", True, "small"],
56
- [os.path.abspath("../videos/study/n_2.mp4"), "nature", True, "small"],
57
- [os.path.abspath("../videos/study/n_3.mp4"), "nature", True, "small"],
58
- [os.path.abspath("../videos/study/n_4.mp4"), "nature", True, "small"],
59
- [os.path.abspath("../videos/study/n_5.mp4"), "nature", True, "small"],
60
- [os.path.abspath("../videos/study/n_6.mp4"), "nature", True, "small"],
61
- [os.path.abspath("../videos/study/n_7.mp4"), "nature", True, "small"],
62
- [os.path.abspath("../videos/study/n_8.mp4"), "nature", True, "small"],
63
- [os.path.abspath("../videos/study/s_1.mp4"), "nature", True, "small"],
64
- [os.path.abspath("../videos/study/s_2.mp4"), "nature", True, "small"],
65
- [os.path.abspath("../videos/study/s_3.mp4"), "nature", True, "small"],
66
- [os.path.abspath("../videos/study/s_4.mp4"), "nature", True, "small"],
67
- [os.path.abspath("../videos/study/s_5.mp4"), "nature", True, "small"],
68
- [os.path.abspath("../videos/study/s_6.mp4"), "nature", True, "small"],
69
- [os.path.abspath("../videos/study/s_7.mp4"), "nature", True, "small"],
70
- [os.path.abspath("../videos/study/s_8.mp4"), "nature", True, "small"],
71
- ],
72
- cache_examples=False
73
- )
74
-
75
- if __name__ == "__main__":
76
- interface.launch(
77
- share=False
78
- )
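If the app needs to be reached from another machine (for example when it runs on a remote GPU server), Gradio's built-in tunnelling can be enabled instead; a small variant of the launch call, using standard `launch()` arguments:

```python
# Hedged alternative launch configuration for remote access.
if __name__ == "__main__":
    interface.launch(
        share=True,             # create a temporary public *.gradio.live URL
        server_name="0.0.0.0",  # listen on all interfaces instead of localhost only
    )
```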
inference.py DELETED
@@ -1,118 +0,0 @@
1
- from omegaconf import OmegaConf
2
- from peft import PeftConfig, get_peft_model
3
-
4
- from audiocraft.models import MusicGen
5
- from moviepy.editor import AudioFileClip
6
- from training_utils import *
7
- import re
8
- import time
9
-
10
- re_file_name = re.compile('([^/]+$)')
11
-
12
-
13
- def generate_background_music(video_path: str,
14
- dataset: str,
15
- musicgen_size: str,
16
- use_stereo: bool,
17
- use_peft: bool,
18
- device: str,
19
- musicgen_temperature: float = 1.0,
20
- musicgen_guidance_scale: float = 3.0,
21
- top_k_sampling: int = 250) -> str:
22
- start = time.time()
23
- model_path = "../training_audiocraft/training/"
24
- model_path += "models_peft" if use_peft else "models_audiocraft"
25
- model_path += f"/{dataset}" + f"_{musicgen_size}"
26
-
27
- conf = OmegaConf.load(model_path + '/configuration.yml')
28
- use_sampling = True if top_k_sampling > 0 else False
29
- video = mpe.VideoFileClip(video_path)
30
-
31
- musicgen_model_id = "facebook/musicgen-" + "stereo-" if use_stereo else ""
32
- musicgen_model_id += musicgen_size
33
-
34
- result_dir = "./results"
35
- os.makedirs(result_dir, exist_ok=True)
36
-
37
- encoder_output_dimension = None
38
- if "small" in conf.musicgen_model_id:
39
- encoder_output_dimension = 1024
40
- elif "medium" in conf.musicgen_model_id:
41
- encoder_output_dimension = 1536
42
- elif "large" in conf.musicgen_model_id:
43
- encoder_output_dimension = 2048
44
- assert encoder_output_dimension, f"Video Encoder output dimension could not be determined by {conf.musicgen_model_id}"
45
-
46
- musicgen_model = MusicGen.get_pretrained(musicgen_model_id)
47
- musicgen_model.lm.to(device)
48
- musicgen_model.compression_model.to(device)
49
- if use_peft:
50
- peft_path = model_path + "/musicgen_peft_final"
51
- peft_config = PeftConfig.from_pretrained(peft_path)
52
- musicgen_model.lm = get_peft_model(musicgen_model.lm, peft_config)
53
- musicgen_model.lm.load_adapter(peft_path, "default")
54
-
55
- print("MusicGen Model loaded.")
56
-
57
- video_to_t5 = VideoToT5(
58
- video_extraction_framerate=conf.video_extraction_framerate,
59
- encoder_input_dimension=conf.encoder_input_dimension,
60
- encoder_output_dimension=encoder_output_dimension,
61
- encoder_heads=conf.encoder_heads,
62
- encoder_dim_feedforward=conf.encoder_dim_feedforward,
63
- encoder_layers=conf.encoder_layers,
64
- device=device
65
- )
66
-
67
- video_to_t5.load_state_dict(torch.load(model_path + "/lm_final.pt", map_location=device))
68
- print("Video Encoder Model loaded.")
69
-
70
- print("Starting Video Feature Extraction.")
71
- video_embedding_t5 = video_to_t5(video_paths=[video_path])
72
-
73
- condition_tensors = create_condition_tensors(
74
- video_embeddings=video_embedding_t5,
75
- batch_size=1,
76
- video_extraction_framerate=video_to_t5.video_extraction_framerate,
77
- device=device
78
- )
79
-
80
- musicgen_model.generation_params = {
81
- 'max_gen_len': int(video.duration * musicgen_model.frame_rate),
82
- 'use_sampling': use_sampling,
83
- 'temp': musicgen_temperature,
84
- 'cfg_coef': musicgen_guidance_scale,
85
- 'two_step_cfg': False,
86
- }
87
- if use_sampling:
88
- musicgen_model.generation_params['top_k'] = 250
89
-
90
- print("Starting Audio Generation.")
91
- prompt_tokens = None
92
- with torch.no_grad():
93
- with musicgen_model.autocast:
94
- gen_tokens = musicgen_model.lm.generate(prompt_tokens, [], condition_tensors, callback=None,
95
- **musicgen_model.generation_params)
96
- gen_audio = musicgen_model.compression_model.decode(gen_tokens)
97
-
98
- end = time.time()
99
- print("Elapsed time for generation: " + str(end - start))
100
-
101
- _, video_file_name = os.path.split(video_path)
102
- video_file_name = video_file_name[:-4] # remove .mp4
103
-
104
- re_result = re_file_name.search(video_file_name) # get video file name
105
- result_path = f"{'peft' if use_peft else 'audiocraft'}_{dataset}_{musicgen_size}_{re_result.group(1)}"
106
- audio_result_path = f"{result_dir}/tmp.wav"
107
- video_result_path = f"{result_dir}/{result_path}_video.mp4"
108
-
109
- gen_audio = torch.squeeze(gen_audio.detach().cpu()) # remove mini-batch dimension, move to CPU for saving
110
- sample_rate = musicgen_model.sample_rate
111
- torchaudio.save(audio_result_path, gen_audio, sample_rate)
112
- audio_file_clip = AudioFileClip(audio_result_path)
113
- video.audio = audio_file_clip
114
-
115
- print("Rendering Video.")
116
- video.write_videofile(video_result_path)
117
-
118
- return video_result_path
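One detail worth spelling out from `generate_background_music` is how the token budget is derived: `max_gen_len` scales the clip duration by the compression model's frame rate. A small worked example, assuming the usual 50 Hz frame rate of MusicGen's 32 kHz EnCodec:

```python
# Hedged worked example for max_gen_len = int(video.duration * musicgen_model.frame_rate).
video_duration_s = 20.0   # assumed input clip length in seconds
frame_rate = 50           # assumed musicgen_model.frame_rate (tokens per second per codebook)
max_gen_len = int(video_duration_s * frame_rate)
assert max_gen_len == 1000  # 20 s of audio -> 1000 tokens per codebook
```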
lm_final(10).pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d79ea467294e53dcf48c54186cd2831c8625c10ea82beaa257b73ccc65fcdd3
3
- size 4176171365
lm_final(11).pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ecb2679c5b0e222cb12e3c4ed2d01e5f86c05a698b8d8f6cc6fe882c0a02ef4b
3
- size 14652654385
lm_final(7).pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e20117d416847702d33e46ee7c4c1f814cd2a1bea64066490e275a480c9c6148
3
- size 4176171365
lm_final(8).pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4bbdb2a036f0561fdc74e80b3c1e1a4e6043a3ee647323b1a5dac69499855684
3
- size 14652654385
lm_final(9).pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:af67752913cba1ccae33d804168a1befa503e1c9933f4ae3c254d78b4b172d96
3
- size 1352342381
lm_final(2).pt β†’ models_frozen_decoder/nature_large/lm_final(2).pt RENAMED
File without changes
lm_final(1).pt β†’ models_frozen_decoder/nature_medium/lm_final(1).pt RENAMED
File without changes
lm_final(5).pt β†’ models_frozen_decoder/symmv_large/lm_final(5).pt RENAMED
File without changes
lm_final(4).pt β†’ models_frozen_decoder/symmv_medium/lm_final(4).pt RENAMED
File without changes
lm_final(3).pt β†’ models_frozen_decoder/symmv_small/lm_final(3).pt RENAMED
File without changes
lm_final(6).pt β†’ models_peft/nature_small/lm_final(6).pt RENAMED
File without changes
musicgen_peft_final 2/README.md DELETED
@@ -1,203 +0,0 @@
1
- ---
2
- library_name: peft
3
- ---
4
-
5
- # Model Card for Model ID
6
-
7
- <!-- Provide a quick summary of what the model is/does. -->
8
-
9
-
10
-
11
- ## Model Details
12
-
13
- ### Model Description
14
-
15
- <!-- Provide a longer summary of what this model is. -->
16
-
17
-
18
-
19
- - **Developed by:** [More Information Needed]
20
- - **Funded by [optional]:** [More Information Needed]
21
- - **Shared by [optional]:** [More Information Needed]
22
- - **Model type:** [More Information Needed]
23
- - **Language(s) (NLP):** [More Information Needed]
24
- - **License:** [More Information Needed]
25
- - **Finetuned from model [optional]:** [More Information Needed]
26
-
27
- ### Model Sources [optional]
28
-
29
- <!-- Provide the basic links for the model. -->
30
-
31
- - **Repository:** [More Information Needed]
32
- - **Paper [optional]:** [More Information Needed]
33
- - **Demo [optional]:** [More Information Needed]
34
-
35
- ## Uses
36
-
37
- <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
38
-
39
- ### Direct Use
40
-
41
- <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
42
-
43
- [More Information Needed]
44
-
45
- ### Downstream Use [optional]
46
-
47
- <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
48
-
49
- [More Information Needed]
50
-
51
- ### Out-of-Scope Use
52
-
53
- <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
54
-
55
- [More Information Needed]
56
-
57
- ## Bias, Risks, and Limitations
58
-
59
- <!-- This section is meant to convey both technical and sociotechnical limitations. -->
60
-
61
- [More Information Needed]
62
-
63
- ### Recommendations
64
-
65
- <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
66
-
67
- Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
68
-
69
- ## How to Get Started with the Model
70
-
71
- Use the code below to get started with the model.
72
-
73
- [More Information Needed]
74
-
75
- ## Training Details
76
-
77
- ### Training Data
78
-
79
- <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
80
-
81
- [More Information Needed]
82
-
83
- ### Training Procedure
84
-
85
- <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
86
-
87
- #### Preprocessing [optional]
88
-
89
- [More Information Needed]
90
-
91
-
92
- #### Training Hyperparameters
93
-
94
- - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
95
-
96
- #### Speeds, Sizes, Times [optional]
97
-
98
- <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
99
-
100
- [More Information Needed]
101
-
102
- ## Evaluation
103
-
104
- <!-- This section describes the evaluation protocols and provides the results. -->
105
-
106
- ### Testing Data, Factors & Metrics
107
-
108
- #### Testing Data
109
-
110
- <!-- This should link to a Dataset Card if possible. -->
111
-
112
- [More Information Needed]
113
-
114
- #### Factors
115
-
116
- <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
117
-
118
- [More Information Needed]
119
-
120
- #### Metrics
121
-
122
- <!-- These are the evaluation metrics being used, ideally with a description of why. -->
123
-
124
- [More Information Needed]
125
-
126
- ### Results
127
-
128
- [More Information Needed]
129
-
130
- #### Summary
131
-
132
-
133
-
134
- ## Model Examination [optional]
135
-
136
- <!-- Relevant interpretability work for the model goes here -->
137
-
138
- [More Information Needed]
139
-
140
- ## Environmental Impact
141
-
142
- <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
143
-
144
- Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
145
-
146
- - **Hardware Type:** [More Information Needed]
147
- - **Hours used:** [More Information Needed]
148
- - **Cloud Provider:** [More Information Needed]
149
- - **Compute Region:** [More Information Needed]
150
- - **Carbon Emitted:** [More Information Needed]
151
-
152
- ## Technical Specifications [optional]
153
-
154
- ### Model Architecture and Objective
155
-
156
- [More Information Needed]
157
-
158
- ### Compute Infrastructure
159
-
160
- [More Information Needed]
161
-
162
- #### Hardware
163
-
164
- [More Information Needed]
165
-
166
- #### Software
167
-
168
- [More Information Needed]
169
-
170
- ## Citation [optional]
171
-
172
- <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
173
-
174
- **BibTeX:**
175
-
176
- [More Information Needed]
177
-
178
- **APA:**
179
-
180
- [More Information Needed]
181
-
182
- ## Glossary [optional]
183
-
184
- <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
185
-
186
- [More Information Needed]
187
-
188
- ## More Information [optional]
189
-
190
- [More Information Needed]
191
-
192
- ## Model Card Authors [optional]
193
-
194
- [More Information Needed]
195
-
196
- ## Model Card Contact
197
-
198
- [More Information Needed]
199
-
200
-
201
- ### Framework versions
202
-
203
- - PEFT 0.8.2
musicgen_peft_final 2/adapter_config.json DELETED
@@ -1,31 +0,0 @@
1
- {
2
- "alpha_pattern": {},
3
- "auto_mapping": {
4
- "base_model_class": "LMModel",
5
- "parent_library": "audiocraft.models.lm"
6
- },
7
- "base_model_name_or_path": null,
8
- "bias": "none",
9
- "fan_in_fan_out": false,
10
- "inference_mode": true,
11
- "init_lora_weights": true,
12
- "layers_pattern": null,
13
- "layers_to_transform": null,
14
- "loftq_config": {},
15
- "lora_alpha": 16,
16
- "lora_dropout": 0.1,
17
- "megatron_config": null,
18
- "megatron_core": "megatron.core",
19
- "modules_to_save": [
20
- "classifier"
21
- ],
22
- "peft_type": "LORA",
23
- "r": 16,
24
- "rank_pattern": {},
25
- "revision": null,
26
- "target_modules": [
27
- "out_proj"
28
- ],
29
- "task_type": null,
30
- "use_rslora": false
31
- }
musicgen_peft_final 2/adapter_model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a56c5b78dc0be771429c038f27bfe5a9a1fe1460778bdeb45213308b7c4c0f4e
3
- size 9464784
musicgen_peft_final 3/README.md DELETED
@@ -1,203 +0,0 @@
1
- ---
2
- library_name: peft
3
- ---
4
-
5
- # Model Card for Model ID
6
-
7
- <!-- Provide a quick summary of what the model is/does. -->
8
-
9
-
10
-
11
- ## Model Details
12
-
13
- ### Model Description
14
-
15
- <!-- Provide a longer summary of what this model is. -->
16
-
17
-
18
-
19
- - **Developed by:** [More Information Needed]
20
- - **Funded by [optional]:** [More Information Needed]
21
- - **Shared by [optional]:** [More Information Needed]
22
- - **Model type:** [More Information Needed]
23
- - **Language(s) (NLP):** [More Information Needed]
24
- - **License:** [More Information Needed]
25
- - **Finetuned from model [optional]:** [More Information Needed]
26
-
27
- ### Model Sources [optional]
28
-
29
- <!-- Provide the basic links for the model. -->
30
-
31
- - **Repository:** [More Information Needed]
32
- - **Paper [optional]:** [More Information Needed]
33
- - **Demo [optional]:** [More Information Needed]
34
-
35
- ## Uses
36
-
37
- <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
38
-
39
- ### Direct Use
40
-
41
- <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
42
-
43
- [More Information Needed]
44
-
45
- ### Downstream Use [optional]
46
-
47
- <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
48
-
49
- [More Information Needed]
50
-
51
- ### Out-of-Scope Use
52
-
53
- <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
54
-
55
- [More Information Needed]
56
-
57
- ## Bias, Risks, and Limitations
58
-
59
- <!-- This section is meant to convey both technical and sociotechnical limitations. -->
60
-
61
- [More Information Needed]
62
-
63
- ### Recommendations
64
-
65
- <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
66
-
67
- Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
68
-
69
- ## How to Get Started with the Model
70
-
71
- Use the code below to get started with the model.
72
-
73
- [More Information Needed]
74
-
75
- ## Training Details
76
-
77
- ### Training Data
78
-
79
- <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
80
-
81
- [More Information Needed]
82
-
83
- ### Training Procedure
84
-
85
- <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
86
-
87
- #### Preprocessing [optional]
88
-
89
- [More Information Needed]
90
-
91
-
92
- #### Training Hyperparameters
93
-
94
- - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
95
-
96
- #### Speeds, Sizes, Times [optional]
97
-
98
- <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
99
-
100
- [More Information Needed]
101
-
102
- ## Evaluation
103
-
104
- <!-- This section describes the evaluation protocols and provides the results. -->
105
-
106
- ### Testing Data, Factors & Metrics
107
-
108
- #### Testing Data
109
-
110
- <!-- This should link to a Dataset Card if possible. -->
111
-
112
- [More Information Needed]
113
-
114
- #### Factors
115
-
116
- <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
117
-
118
- [More Information Needed]
119
-
120
- #### Metrics
121
-
122
- <!-- These are the evaluation metrics being used, ideally with a description of why. -->
123
-
124
- [More Information Needed]
125
-
126
- ### Results
127
-
128
- [More Information Needed]
129
-
130
- #### Summary
131
-
132
-
133
-
134
- ## Model Examination [optional]
135
-
136
- <!-- Relevant interpretability work for the model goes here -->
137
-
138
- [More Information Needed]
139
-
140
- ## Environmental Impact
141
-
142
- <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
143
-
144
- Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
145
-
146
- - **Hardware Type:** [More Information Needed]
147
- - **Hours used:** [More Information Needed]
148
- - **Cloud Provider:** [More Information Needed]
149
- - **Compute Region:** [More Information Needed]
150
- - **Carbon Emitted:** [More Information Needed]
151
-
152
- ## Technical Specifications [optional]
153
-
154
- ### Model Architecture and Objective
155
-
156
- [More Information Needed]
157
-
158
- ### Compute Infrastructure
159
-
160
- [More Information Needed]
161
-
162
- #### Hardware
163
-
164
- [More Information Needed]
165
-
166
- #### Software
167
-
168
- [More Information Needed]
169
-
170
- ## Citation [optional]
171
-
172
- <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
173
-
174
- **BibTeX:**
175
-
176
- [More Information Needed]
177
-
178
- **APA:**
179
-
180
- [More Information Needed]
181
-
182
- ## Glossary [optional]
183
-
184
- <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
185
-
186
- [More Information Needed]
187
-
188
- ## More Information [optional]
189
-
190
- [More Information Needed]
191
-
192
- ## Model Card Authors [optional]
193
-
194
- [More Information Needed]
195
-
196
- ## Model Card Contact
197
-
198
- [More Information Needed]
199
-
200
-
201
- ### Framework versions
202
-
203
- - PEFT 0.8.2
musicgen_peft_final 3/adapter_config.json DELETED
@@ -1,31 +0,0 @@
1
- {
2
- "alpha_pattern": {},
3
- "auto_mapping": {
4
- "base_model_class": "LMModel",
5
- "parent_library": "audiocraft.models.lm"
6
- },
7
- "base_model_name_or_path": null,
8
- "bias": "none",
9
- "fan_in_fan_out": false,
10
- "inference_mode": true,
11
- "init_lora_weights": true,
12
- "layers_pattern": null,
13
- "layers_to_transform": null,
14
- "loftq_config": {},
15
- "lora_alpha": 16,
16
- "lora_dropout": 0.1,
17
- "megatron_config": null,
18
- "megatron_core": "megatron.core",
19
- "modules_to_save": [
20
- "classifier"
21
- ],
22
- "peft_type": "LORA",
23
- "r": 16,
24
- "rank_pattern": {},
25
- "revision": null,
26
- "target_modules": [
27
- "out_proj"
28
- ],
29
- "task_type": null,
30
- "use_rslora": false
31
- }
musicgen_peft_final 3/adapter_model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:50885374bd3299e8335820c113650f2038fb5286f3d13415ce38b7cb2bb3bedb
3
- size 12610608
musicgen_peft_final 4/README.md DELETED
@@ -1,203 +0,0 @@
1
- ---
2
- library_name: peft
3
- ---
4
-
5
- # Model Card for Model ID
6
-
7
- <!-- Provide a quick summary of what the model is/does. -->
8
-
9
-
10
-
11
- ## Model Details
12
-
13
- ### Model Description
14
-
15
- <!-- Provide a longer summary of what this model is. -->
16
-
17
-
18
-
19
- - **Developed by:** [More Information Needed]
20
- - **Funded by [optional]:** [More Information Needed]
21
- - **Shared by [optional]:** [More Information Needed]
22
- - **Model type:** [More Information Needed]
23
- - **Language(s) (NLP):** [More Information Needed]
24
- - **License:** [More Information Needed]
25
- - **Finetuned from model [optional]:** [More Information Needed]
26
-
27
- ### Model Sources [optional]
28
-
29
- <!-- Provide the basic links for the model. -->
30
-
31
- - **Repository:** [More Information Needed]
32
- - **Paper [optional]:** [More Information Needed]
33
- - **Demo [optional]:** [More Information Needed]
34
-
35
- ## Uses
36
-
37
- <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
38
-
39
- ### Direct Use
40
-
41
- <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
42
-
43
- [More Information Needed]
44
-
45
- ### Downstream Use [optional]
46
-
47
- <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
48
-
49
- [More Information Needed]
50
-
51
- ### Out-of-Scope Use
52
-
53
- <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
54
-
55
- [More Information Needed]
56
-
57
- ## Bias, Risks, and Limitations
58
-
59
- <!-- This section is meant to convey both technical and sociotechnical limitations. -->
60
-
61
- [More Information Needed]
62
-
63
- ### Recommendations
64
-
65
- <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
66
-
67
- Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
68
-
69
- ## How to Get Started with the Model
70
-
71
- Use the code below to get started with the model.
72
-
73
- [More Information Needed]
74
-
75
- ## Training Details
76
-
77
- ### Training Data
78
-
79
- <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
80
-
81
- [More Information Needed]
82
-
83
- ### Training Procedure
84
-
85
- <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
86
-
87
- #### Preprocessing [optional]
88
-
89
- [More Information Needed]
90
-
91
-
92
- #### Training Hyperparameters
93
-
94
- - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
95
-
96
- #### Speeds, Sizes, Times [optional]
97
-
98
- <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
99
-
100
- [More Information Needed]
101
-
102
- ## Evaluation
103
-
104
- <!-- This section describes the evaluation protocols and provides the results. -->
105
-
106
- ### Testing Data, Factors & Metrics
107
-
108
- #### Testing Data
109
-
110
- <!-- This should link to a Dataset Card if possible. -->
111
-
112
- [More Information Needed]
113
-
114
- #### Factors
115
-
116
- <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
117
-
118
- [More Information Needed]
119
-
120
- #### Metrics
121
-
122
- <!-- These are the evaluation metrics being used, ideally with a description of why. -->
123
-
124
- [More Information Needed]
125
-
126
- ### Results
127
-
128
- [More Information Needed]
129
-
130
- #### Summary
131
-
132
-
133
-
134
- ## Model Examination [optional]
135
-
136
- <!-- Relevant interpretability work for the model goes here -->
137
-
138
- [More Information Needed]
139
-
140
- ## Environmental Impact
141
-
142
- <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
143
-
144
- Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
145
-
146
- - **Hardware Type:** [More Information Needed]
147
- - **Hours used:** [More Information Needed]
148
- - **Cloud Provider:** [More Information Needed]
149
- - **Compute Region:** [More Information Needed]
150
- - **Carbon Emitted:** [More Information Needed]
151
-
152
- ## Technical Specifications [optional]
153
-
154
- ### Model Architecture and Objective
155
-
156
- [More Information Needed]
157
-
158
- ### Compute Infrastructure
159
-
160
- [More Information Needed]
161
-
162
- #### Hardware
163
-
164
- [More Information Needed]
165
-
166
- #### Software
167
-
168
- [More Information Needed]
169
-
170
- ## Citation [optional]
171
-
172
- <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
173
-
174
- **BibTeX:**
175
-
176
- [More Information Needed]
177
-
178
- **APA:**
179
-
180
- [More Information Needed]
181
-
182
- ## Glossary [optional]
183
-
184
- <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
185
-
186
- [More Information Needed]
187
-
188
- ## More Information [optional]
189
-
190
- [More Information Needed]
191
-
192
- ## Model Card Authors [optional]
193
-
194
- [More Information Needed]
195
-
196
- ## Model Card Contact
197
-
198
- [More Information Needed]
199
-
200
-
201
- ### Framework versions
202
-
203
- - PEFT 0.8.2
musicgen_peft_final 4/adapter_config.json DELETED
@@ -1,31 +0,0 @@
1
- {
2
- "alpha_pattern": {},
3
- "auto_mapping": {
4
- "base_model_class": "LMModel",
5
- "parent_library": "audiocraft.models.lm"
6
- },
7
- "base_model_name_or_path": null,
8
- "bias": "none",
9
- "fan_in_fan_out": false,
10
- "inference_mode": true,
11
- "init_lora_weights": true,
12
- "layers_pattern": null,
13
- "layers_to_transform": null,
14
- "loftq_config": {},
15
- "lora_alpha": 16,
16
- "lora_dropout": 0.1,
17
- "megatron_config": null,
18
- "megatron_core": "megatron.core",
19
- "modules_to_save": [
20
- "classifier"
21
- ],
22
- "peft_type": "LORA",
23
- "r": 16,
24
- "rank_pattern": {},
25
- "revision": null,
26
- "target_modules": [
27
- "out_proj"
28
- ],
29
- "task_type": null,
30
- "use_rslora": false
31
- }
musicgen_peft_final 4/adapter_model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:47f3accb461e386e4032239789cc8aed985ae4e9dc7205e9339b5c42daf788cf
3
- size 9464784
musicgen_peft_final 5/README.md DELETED
@@ -1,203 +0,0 @@
1
- ---
2
- library_name: peft
3
- ---
4
-
5
- # Model Card for Model ID
6
-
7
- <!-- Provide a quick summary of what the model is/does. -->
8
-
9
-
10
-
11
- ## Model Details
12
-
13
- ### Model Description
14
-
15
- <!-- Provide a longer summary of what this model is. -->
16
-
17
-
18
-
19
- - **Developed by:** [More Information Needed]
20
- - **Funded by [optional]:** [More Information Needed]
21
- - **Shared by [optional]:** [More Information Needed]
22
- - **Model type:** [More Information Needed]
23
- - **Language(s) (NLP):** [More Information Needed]
24
- - **License:** [More Information Needed]
25
- - **Finetuned from model [optional]:** [More Information Needed]
26
-
27
- ### Model Sources [optional]
28
-
29
- <!-- Provide the basic links for the model. -->
30
-
31
- - **Repository:** [More Information Needed]
32
- - **Paper [optional]:** [More Information Needed]
33
- - **Demo [optional]:** [More Information Needed]
34
-
35
- ## Uses
36
-
37
- <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
38
-
39
- ### Direct Use
40
-
41
- <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
42
-
43
- [More Information Needed]
44
-
45
- ### Downstream Use [optional]
46
-
47
- <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
48
-
49
- [More Information Needed]
50
-
51
- ### Out-of-Scope Use
52
-
53
- <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
54
-
55
- [More Information Needed]
56
-
57
- ## Bias, Risks, and Limitations
58
-
59
- <!-- This section is meant to convey both technical and sociotechnical limitations. -->
60
-
61
- [More Information Needed]
62
-
63
- ### Recommendations
64
-
65
- <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
66
-
67
- Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
68
-
69
- ## How to Get Started with the Model
70
-
71
- Use the code below to get started with the model.
72
-
73
- [More Information Needed]
74
-
75
- ## Training Details
76
-
77
- ### Training Data
78
-
79
- <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
80
-
81
- [More Information Needed]
82
-
83
- ### Training Procedure
84
-
85
- <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
86
-
87
- #### Preprocessing [optional]
88
-
89
- [More Information Needed]
90
-
91
-
92
- #### Training Hyperparameters
93
-
94
- - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
95
-
96
- #### Speeds, Sizes, Times [optional]
97
-
98
- <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
99
-
100
- [More Information Needed]
101
-
102
- ## Evaluation
103
-
104
- <!-- This section describes the evaluation protocols and provides the results. -->
105
-
106
- ### Testing Data, Factors & Metrics
107
-
108
- #### Testing Data
109
-
110
- <!-- This should link to a Dataset Card if possible. -->
111
-
112
- [More Information Needed]
113
-
114
- #### Factors
115
-
116
- <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
117
-
118
- [More Information Needed]
119
-
120
- #### Metrics
121
-
122
- <!-- These are the evaluation metrics being used, ideally with a description of why. -->
123
-
124
- [More Information Needed]
125
-
126
- ### Results
127
-
128
- [More Information Needed]
129
-
130
- #### Summary
131
-
132
-
133
-
134
- ## Model Examination [optional]
135
-
136
- <!-- Relevant interpretability work for the model goes here -->
137
-
138
- [More Information Needed]
139
-
140
- ## Environmental Impact
141
-
142
- <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
143
-
144
- Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
145
-
146
- - **Hardware Type:** [More Information Needed]
147
- - **Hours used:** [More Information Needed]
148
- - **Cloud Provider:** [More Information Needed]
149
- - **Compute Region:** [More Information Needed]
150
- - **Carbon Emitted:** [More Information Needed]
151
-
152
- ## Technical Specifications [optional]
153
-
154
- ### Model Architecture and Objective
155
-
156
- [More Information Needed]
157
-
158
- ### Compute Infrastructure
159
-
160
- [More Information Needed]
161
-
162
- #### Hardware
163
-
164
- [More Information Needed]
165
-
166
- #### Software
167
-
168
- [More Information Needed]
169
-
170
- ## Citation [optional]
171
-
172
- <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
173
-
174
- **BibTeX:**
175
-
176
- [More Information Needed]
177
-
178
- **APA:**
179
-
180
- [More Information Needed]
181
-
182
- ## Glossary [optional]
183
-
184
- <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
185
-
186
- [More Information Needed]
187
-
188
- ## More Information [optional]
189
-
190
- [More Information Needed]
191
-
192
- ## Model Card Authors [optional]
193
-
194
- [More Information Needed]
195
-
196
- ## Model Card Contact
197
-
198
- [More Information Needed]
199
-
200
-
201
- ### Framework versions
202
-
203
- - PEFT 0.8.2
musicgen_peft_final 5/adapter_config.json DELETED
@@ -1,31 +0,0 @@
1
- {
2
- "alpha_pattern": {},
3
- "auto_mapping": {
4
- "base_model_class": "LMModel",
5
- "parent_library": "audiocraft.models.lm"
6
- },
7
- "base_model_name_or_path": null,
8
- "bias": "none",
9
- "fan_in_fan_out": false,
10
- "inference_mode": true,
11
- "init_lora_weights": true,
12
- "layers_pattern": null,
13
- "layers_to_transform": null,
14
- "loftq_config": {},
15
- "lora_alpha": 16,
16
- "lora_dropout": 0.1,
17
- "megatron_config": null,
18
- "megatron_core": "megatron.core",
19
- "modules_to_save": [
20
- "classifier"
21
- ],
22
- "peft_type": "LORA",
23
- "r": 16,
24
- "rank_pattern": {},
25
- "revision": null,
26
- "target_modules": [
27
- "out_proj"
28
- ],
29
- "task_type": null,
30
- "use_rslora": false
31
- }
musicgen_peft_final 5/adapter_model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:af0156f331d3b5c5fcfc329a5a724d4abf75d8574adafb691f8a3b3bbfa55021
3
- size 3159480
musicgen_peft_final/README.md DELETED
@@ -1,203 +0,0 @@
1
- ---
2
- library_name: peft
3
- ---
4
-
5
- # Model Card for Model ID
6
-
7
- <!-- Provide a quick summary of what the model is/does. -->
8
-
9
-
10
-
11
- ## Model Details
12
-
13
- ### Model Description
14
-
15
- <!-- Provide a longer summary of what this model is. -->
16
-
17
-
18
-
19
- - **Developed by:** [More Information Needed]
20
- - **Funded by [optional]:** [More Information Needed]
21
- - **Shared by [optional]:** [More Information Needed]
22
- - **Model type:** [More Information Needed]
23
- - **Language(s) (NLP):** [More Information Needed]
24
- - **License:** [More Information Needed]
25
- - **Finetuned from model [optional]:** [More Information Needed]
26
-
27
- ### Model Sources [optional]
28
-
29
- <!-- Provide the basic links for the model. -->
30
-
31
- - **Repository:** [More Information Needed]
32
- - **Paper [optional]:** [More Information Needed]
33
- - **Demo [optional]:** [More Information Needed]
34
-
35
- ## Uses
36
-
37
- <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
38
-
39
- ### Direct Use
40
-
41
- <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
42
-
43
- [More Information Needed]
44
-
45
- ### Downstream Use [optional]
46
-
47
- <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
48
-
49
- [More Information Needed]
50
-
51
- ### Out-of-Scope Use
52
-
53
- <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
54
-
55
- [More Information Needed]
56
-
57
- ## Bias, Risks, and Limitations
58
-
59
- <!-- This section is meant to convey both technical and sociotechnical limitations. -->
60
-
61
- [More Information Needed]
62
-
63
- ### Recommendations
64
-
65
- <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
66
-
67
- Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
68
-
69
- ## How to Get Started with the Model
70
-
71
- Use the code below to get started with the model.
72
-
73
- [More Information Needed]
74
-
75
- ## Training Details
76
-
77
- ### Training Data
78
-
79
- <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
80
-
81
- [More Information Needed]
82
-
83
- ### Training Procedure
84
-
85
- <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
86
-
87
- #### Preprocessing [optional]
88
-
89
- [More Information Needed]
90
-
91
-
92
- #### Training Hyperparameters
93
-
94
- - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
95
-
96
- #### Speeds, Sizes, Times [optional]
97
-
98
- <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
99
-
100
- [More Information Needed]
101
-
102
- ## Evaluation
103
-
104
- <!-- This section describes the evaluation protocols and provides the results. -->
105
-
106
- ### Testing Data, Factors & Metrics
107
-
108
- #### Testing Data
109
-
110
- <!-- This should link to a Dataset Card if possible. -->
111
-
112
- [More Information Needed]
113
-
114
- #### Factors
115
-
116
- <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
117
-
118
- [More Information Needed]
119
-
120
- #### Metrics
121
-
122
- <!-- These are the evaluation metrics being used, ideally with a description of why. -->
123
-
124
- [More Information Needed]
125
-
126
- ### Results
127
-
128
- [More Information Needed]
129
-
130
- #### Summary
131
-
132
-
133
-
134
- ## Model Examination [optional]
135
-
136
- <!-- Relevant interpretability work for the model goes here -->
137
-
138
- [More Information Needed]
139
-
140
- ## Environmental Impact
141
-
142
- <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
143
-
144
- Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
145
-
146
- - **Hardware Type:** [More Information Needed]
147
- - **Hours used:** [More Information Needed]
148
- - **Cloud Provider:** [More Information Needed]
149
- - **Compute Region:** [More Information Needed]
150
- - **Carbon Emitted:** [More Information Needed]
151
-
152
- ## Technical Specifications [optional]
153
-
154
- ### Model Architecture and Objective
155
-
156
- [More Information Needed]
157
-
158
- ### Compute Infrastructure
159
-
160
- [More Information Needed]
161
-
162
- #### Hardware
163
-
164
- [More Information Needed]
165
-
166
- #### Software
167
-
168
- [More Information Needed]
169
-
170
- ## Citation [optional]
171
-
172
- <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
173
-
174
- **BibTeX:**
175
-
176
- [More Information Needed]
177
-
178
- **APA:**
179
-
180
- [More Information Needed]
181
-
182
- ## Glossary [optional]
183
-
184
- <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
185
-
186
- [More Information Needed]
187
-
188
- ## More Information [optional]
189
-
190
- [More Information Needed]
191
-
192
- ## Model Card Authors [optional]
193
-
194
- [More Information Needed]
195
-
196
- ## Model Card Contact
197
-
198
- [More Information Needed]
199
-
200
-
201
- ### Framework versions
202
-
203
- - PEFT 0.8.2
musicgen_peft_final/adapter_config.json DELETED
@@ -1,31 +0,0 @@
1
- {
2
- "alpha_pattern": {},
3
- "auto_mapping": {
4
- "base_model_class": "LMModel",
5
- "parent_library": "audiocraft.models.lm"
6
- },
7
- "base_model_name_or_path": null,
8
- "bias": "none",
9
- "fan_in_fan_out": false,
10
- "inference_mode": true,
11
- "init_lora_weights": true,
12
- "layers_pattern": null,
13
- "layers_to_transform": null,
14
- "loftq_config": {},
15
- "lora_alpha": 16,
16
- "lora_dropout": 0.1,
17
- "megatron_config": null,
18
- "megatron_core": "megatron.core",
19
- "modules_to_save": [
20
- "classifier"
21
- ],
22
- "peft_type": "LORA",
23
- "r": 16,
24
- "rank_pattern": {},
25
- "revision": null,
26
- "target_modules": [
27
- "out_proj"
28
- ],
29
- "task_type": null,
30
- "use_rslora": false
31
- }
musicgen_peft_final/adapter_model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:cbeeb0b56335e300eeae46dd9e0d6df01b33d6d34a6f347cfab3cf70370e326b
3
- size 3159480
training_utils.py DELETED
@@ -1,278 +0,0 @@
1
- from torch.utils.data import Dataset
2
- import torch
3
- from torch import nn, Tensor
4
- import torch.nn.functional as F
5
- import torchaudio
6
- import os
7
- import logging
8
- from torchvision.models import resnet50, ResNet50_Weights, resnet152, resnet18, resnet34, ResNet152_Weights
9
- from PIL import Image
10
- from time import strftime
11
- import math
12
- import numpy as np
13
- from torch.utils.data.sampler import SubsetRandomSampler
14
- import moviepy.editor as mpe
15
-
16
-
17
- class VideoDataset(Dataset):
18
- def __init__(self, data_dir):
19
- self.data_dir = data_dir
20
- self.data_map = []
21
-
22
- dir_map = os.listdir(data_dir)
23
- for d in dir_map:
24
- name, extension = os.path.splitext(d)
25
- if extension == ".mp4":
26
- self.data_map.append({"video": os.path.join(data_dir, d)})
27
-
28
- def __len__(self):
29
- return len(self.data_map)
30
-
31
- def __getitem__(self, idx):
32
- return self.data_map[idx]["video"]
33
-
34
-
35
- # input: video_path, output: wav_music
36
- class VideoToT5(nn.Module):
37
- def __init__(self,
38
- device: str,
39
- video_extraction_framerate: int,
40
- encoder_input_dimension: int,
41
- encoder_output_dimension: int,
42
- encoder_heads: int,
43
- encoder_dim_feedforward: int,
44
- encoder_layers: int
45
- ):
46
- super().__init__()
47
- self.video_extraction_framerate = video_extraction_framerate
48
- self.video_feature_extractor = VideoFeatureExtractor(video_extraction_framerate=video_extraction_framerate,
49
- device=device)
50
- self.video_encoder = VideoEncoder(
51
- device,
52
- encoder_input_dimension,
53
- encoder_output_dimension,
54
- encoder_heads,
55
- encoder_dim_feedforward,
56
- encoder_layers
57
- )
58
-
59
- def forward(self, video_paths: [str]):
60
- video_embeddings = []
61
- for video_path in video_paths:
62
- video = mpe.VideoFileClip(video_path)
63
- video_embedding = self.video_feature_extractor(video)
64
- video_embeddings.append(video_embedding)
65
- video_embeddings = torch.stack(video_embeddings)  # resulting shape: [batch_size, num_frames, resnet_output_dimension]
66
- # not used, gives worse results!
67
- #video_embeddings = torch.mean(video_embeddings, 0, True) # average out all image embedding to one video embedding
68
-
69
- t5_embeddings = self.video_encoder(video_embeddings) # T5 output: [batch_size, num_tokens,
70
- # t5_embedding_size]
71
- return t5_embeddings
72
-
73
-
74
- class VideoEncoder(nn.Module):
75
- def __init__(self,
76
- device: str,
77
- encoder_input_dimension: int,
78
- encoder_output_dimension: int,
79
- encoder_heads: int,
80
- encoder_dim_feedforward: int,
81
- encoder_layers: int
82
- ):
83
- super().__init__()
84
- self.device = device
85
- self.encoder = (nn.TransformerEncoder(
86
- nn.TransformerEncoderLayer(
87
- d_model=encoder_input_dimension,
88
- nhead=encoder_heads,
89
- dim_feedforward=encoder_dim_feedforward
90
- ),
91
- num_layers=encoder_layers,
92
- )
93
- ).to(device)
94
-
95
- # linear layer to match T5 embedding dimension
96
- self.linear = (nn.Linear(
97
- in_features=encoder_input_dimension,
98
- out_features=encoder_output_dimension)
99
- .to(device))
100
-
101
- def forward(self, x):
102
- assert x.dim() == 3
103
- x = torch.transpose(x, 0, 1) # encoder expects [sequence_length, batch_size, embedding_dimension]
104
- x = self.encoder(x) # encoder forward pass
105
- x = self.linear(x) # forward pass through the linear layer
106
- x = torch.transpose(x, 0, 1) # shape: [batch_size, sequence_length, embedding_dimension]
107
- return x
108
-
109
-
110
- class VideoFeatureExtractor(nn.Module):
111
- def __init__(self,
112
- device: str,
113
- video_extraction_framerate: int = 1,
114
- resnet_input_dimension: int = 2048):
115
- super().__init__()
116
- self.device = device
117
-
118
- # using a ResNet trained on ImageNet
119
- #self.resnet = resnet152(weights="IMAGENET1K_V2").eval()
120
- self.resnet = resnet50(weights="IMAGENET1K_V2").eval()
121
- self.resnet = torch.nn.Sequential(*(list(self.resnet.children())[:-1])).to(device)  # drop the final classification layer
122
- #self.resnet_preprocessor = ResNet152_Weights.DEFAULT.transforms().to(device) # ResNet image preprocessor
123
- self.resnet_preprocessor = ResNet50_Weights.DEFAULT.transforms().to(device)
124
- self.video_extraction_framerate = video_extraction_framerate # setting the fps at which the video is processed
125
- self.positional_encoder = PositionalEncoding(resnet_input_dimension).to(device)
126
-
127
- def forward(self, video: mpe.VideoFileClip):
128
- embeddings = []
129
- for i in range(0, 30 * self.video_extraction_framerate):
130
- i = video.get_frame(i) # get frame as numpy array
131
- i = Image.fromarray(i) # create PIL image from numpy array
132
- i = self.resnet_preprocessor(i) # preprocess image
133
- i = i.to(self.device)
134
- i = i.unsqueeze(0) # adding a batch dimension
135
- i = self.resnet(i).squeeze() # ResNet forward pass
136
- i = i.squeeze()
137
- embeddings.append(i) # collect embeddings
138
-
139
- embeddings = torch.stack(embeddings) # concatenate all frame embeddings into one video embedding
140
- embeddings = embeddings.unsqueeze(1)
141
- embeddings = self.positional_encoder(embeddings) # apply positional encoding with a sequence length of 30
142
- embeddings = embeddings.squeeze()
143
- return embeddings
144
-
145
-
146
- # from https://pytorch.org/tutorials/beginner/transformer_tutorial.html
147
- class PositionalEncoding(nn.Module):
148
- def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
149
- super().__init__()
150
- self.dropout = nn.Dropout(p=dropout)
151
-
152
- position = torch.arange(max_len).unsqueeze(1)
153
- div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
154
- pe = torch.zeros(max_len, 1, d_model)
155
- pe[:, 0, 0::2] = torch.sin(position * div_term)
156
- pe[:, 0, 1::2] = torch.cos(position * div_term)
157
- self.register_buffer('pe', pe)
158
-
159
- def forward(self, x: Tensor) -> Tensor:
160
- """
161
- Arguments:
162
- x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
163
- """
164
- x = x + self.pe[:x.size(0)]
165
- return self.dropout(x)
166
-
167
-
168
- def freeze_model(model: nn.Module):
169
- for param in model.parameters():
170
- param.requires_grad = False
171
- model.eval()
172
-
173
-
174
- def split_dataset_randomly(dataset, validation_split: float, seed: int=None):
175
- dataset_size = len(dataset)
176
- indices = list(range(dataset_size))
177
- split = int(np.floor(validation_split * dataset_size))
178
-
179
- if seed:
180
- np.random.seed(seed)
181
-
182
- np.random.shuffle(indices) # in-place operation
183
- return indices[split:], indices[:split]
184
-
185
-
186
- ### from audiocraft.solver.musicgen.py => _compute_cross_entropy
187
- def compute_cross_entropy(logits: torch.Tensor, targets: torch.Tensor, mask: torch.Tensor):
188
- """Compute cross entropy between multi-codebook targets and model's logits.
189
- The cross entropy is computed per codebook to provide codebook-level cross entropy.
190
- Valid timesteps for each of the codebook are pulled from the mask, where invalid
191
- timesteps are set to 0.
192
-
193
- Args:
194
- logits (torch.Tensor): Model's logits of shape [B, K, T, card].
195
- targets (torch.Tensor): Target codes, of shape [B, K, T].
196
- mask (torch.Tensor): Mask for valid target codes, of shape [B, K, T].
197
- Returns:
198
- ce (torch.Tensor): Cross entropy averaged over the codebooks
199
- ce_per_codebook (list of torch.Tensor): Cross entropy per codebook (detached).
200
- """
201
- B, K, T = targets.shape
202
- assert logits.shape[:-1] == targets.shape
203
- assert mask.shape == targets.shape
204
- ce = torch.zeros([], device=targets.device)
205
- ce_per_codebook = []
206
- for k in range(K):
207
- logits_k = logits[:, k, ...].contiguous().view(-1, logits.size(-1)) # [B x T, card]
208
- targets_k = targets[:, k, ...].contiguous().view(-1) # [B x T]
209
- mask_k = mask[:, k, ...].contiguous().view(-1) # [B x T]
210
- ce_targets = targets_k[mask_k]
211
- ce_logits = logits_k[mask_k]
212
- q_ce = F.cross_entropy(ce_logits, ce_targets)
213
- ce += q_ce
214
- ce_per_codebook.append(q_ce.detach())
215
- # average cross entropy across codebooks
216
- ce = ce / K
217
- return ce, ce_per_codebook
218
-
219
-
220
- def generate_audio_codes(audio_paths: [str],
221
- audiocraft_compression_model: torch.nn.Module,
222
- device: str) -> torch.Tensor:
223
- audio_duration = 30
224
- encodec_sample_rate = audiocraft_compression_model.sample_rate
225
-
226
- torch_audios = []
227
- for audio_path in audio_paths:
228
- wav, original_sample_rate = torchaudio.load(audio_path) # load audio from file
229
- wav = torchaudio.functional.resample(wav, original_sample_rate,
230
- encodec_sample_rate) # cast audio to model sample rate
231
- wav = wav[:, :encodec_sample_rate * audio_duration] # enforce an exact audio length of 30 seconds
232
-
233
- assert len(wav.shape) == 2, f"audio data is not of shape [channels, duration]"
234
- assert wav.shape[0] == 2, "audio data should be stereo, but does not have 2 channels"
235
-
236
- torch_audios.append(wav)
237
-
238
- torch_audios = torch.stack(torch_audios)
239
- torch_audios = torch_audios.to(device)
240
-
241
- with torch.no_grad():
242
- gen_audio = audiocraft_compression_model.encode(torch_audios)
243
-
244
- codes, scale = gen_audio
245
- assert scale is None
246
-
247
- return codes
248
-
249
-
250
- def create_condition_tensors(
251
- video_embeddings: torch.Tensor,
252
- batch_size: int,
253
- video_extraction_framerate: int,
254
- device: str
255
- ):
256
- # TODO: create T5 mask properly instead of using torch.ones()
257
- mask = torch.ones((batch_size, video_extraction_framerate * 30), dtype=torch.int).to(device)
258
-
259
- condition_tensors = {
260
- 'description': (video_embeddings, mask)
261
- }
262
- return condition_tensors
263
-
264
-
265
- def get_current_timestamp():
266
- return strftime("%Y_%m_%d___%H_%M_%S")
267
-
268
-
269
- def configure_logging(output_dir: str, filename: str, log_level):
270
- # create logs folder, if not existing
271
- os.makedirs(output_dir, exist_ok=True)
272
- level = getattr(logging, log_level)
273
- file_path = output_dir + "/" + filename
274
- logging.basicConfig(filename=file_path, encoding='utf-8', level=level)
275
- logger = logging.getLogger()
276
- # only add a StreamHandler if it is not present yet
277
- if len(logger.handlers) <= 1:
278
- logger.addHandler(logging.StreamHandler())
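As a quick sanity check of the tensor shapes `compute_cross_entropy` expects, the sketch below feeds it random data; the codebook count, token length and vocabulary size are assumptions chosen to roughly match a 30-second MusicGen clip.

```python
# Hedged shape check for compute_cross_entropy(): MusicGen predicts K parallel codebooks,
# so logits are [B, K, T, card] while targets and mask are [B, K, T].
import torch

B, K, T, card = 2, 4, 1500, 2048  # assumed: batch of 2, 4 codebooks, 30 s at 50 Hz, 2048-way vocab
logits = torch.randn(B, K, T, card)
targets = torch.randint(0, card, (B, K, T))
mask = torch.ones(B, K, T, dtype=torch.bool)

ce, ce_per_codebook = compute_cross_entropy(logits, targets, mask)
print(ce.item(), len(ce_per_codebook))  # scalar average loss and K per-codebook terms
```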