Niklas Schulte committed on
Commit
6ac38d4
•
1 Parent(s): 477f8c0

Delete unused model files and adapter configurations

Browse files
README.md DELETED
@@ -1,45 +0,0 @@
1
- ---
2
- license: mit
3
- language:
4
- - en
5
- ---
6
-
7
- # Master Thesis: High-Fidelity Video Background Music Generation using Transformers
8
- This is the corresponding GitLab repository of my Master Thesis. The goal of this thesis is to generate video background
9
- music by adapting MusicGen (https://arxiv.org/pdf/2306.05284.pdf) to video input as an additional input modality.
10
- This is accomplished by mapping video information into the T5 text embedding space on which MusicGen usually
11
- operates. To this end, a Transformer encoder network, called the Video Encoder, is trained to perform this mapping. Two options are
12
- supported within the training loop for the Video Encoder:
13
-
14
- - freezing the weights within the MusicGen Audio Decoder
15
- adjusting the weights of the MusicGen Audio Decoder with Parameter-Efficient Fine-Tuning (PEFT) using LoRA (https://arxiv.org/abs/2106.09685), as sketched below
16
-
17
-
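The second option mirrors the LoRA adapter configuration (`adapter_config.json`) that is removed later in this commit. Below is a minimal sketch of how such an adapter could be attached to the MusicGen language model with PEFT; the base model id and the final `print_trainable_parameters()` call are illustrative assumptions, not the exact thesis training code.

```python
# Hedged sketch: attach a LoRA adapter to MusicGen's language model for PEFT training.
# r, lora_alpha, lora_dropout and target_modules mirror the adapter_config.json
# deleted in this commit; everything else is an assumption.
import torch
from peft import LoraConfig, get_peft_model
from audiocraft.models import MusicGen

device = "cuda" if torch.cuda.is_available() else "cpu"

musicgen = MusicGen.get_pretrained("facebook/musicgen-small")  # assumed base model
musicgen.lm.to(device)

lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["out_proj"],  # adapt only the attention output projections
)
musicgen.lm = get_peft_model(musicgen.lm, lora_config)
musicgen.lm.print_trainable_parameters()  # only the LoRA matrices remain trainable
```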
18
- # Installation
19
- - create a Python virtual environment with `Python 3.11`
20
- - check https://pytorch.org/get-started/previous-versions/ to install `PyTorch 2.1.0` with `CUDA` on your machine
21
- - install the local fork of audiocraft: `cd audiocraft; pip install -e .`
22
- - install the other requirements: `pip install -r requirements.txt`
23
-
24
-
25
- # Folder Structure
26
- - `audiocraft` contains a local fork of the audiocraft library (https://github.com/facebookresearch/audiocraft) with
27
- minor changes to the generation method; further information can be found in `code/code_adaptations_audiocraft`.
28
- - `code` contains the code for model `training` and `inference` of video background music
29
- - `datasets` contains the code to create the datasets used for training within `data_preparation` and video examples
30
- used for the evaluation in `example_videos`
31
- - `evaluation` contains the code used to evaluate the datasets and created video embeddings
32
- `gradio_app` contains the code for the user interface used to generate video background music
33
-
34
- # Training
35
- To train the models, set the training parameters in `training/training_conf.yml` and start training with
36
- `python training/training.py`. The model weights will be stored under `training/models_audiocraft` or
37
- `training/models_peft`, respectively; a sketch of the configuration fields read back at inference time follows below.
38
-
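The exact schema of `training/training_conf.yml` is not part of this commit. As a rough orientation, the sketch below lists the configuration fields that `inference.py` later reads back from the saved `configuration.yml`; the concrete values are placeholders, not the thesis defaults.

```python
# Hedged sketch of the configuration fields consumed at inference time.
# All values below are illustrative assumptions.
from omegaconf import OmegaConf

conf = OmegaConf.create({
    "musicgen_model_id": "facebook/musicgen-stereo-small",
    "video_extraction_framerate": 1,   # frames per second fed to the ResNet feature extractor
    "encoder_input_dimension": 2048,   # ResNet-50 feature size
    "encoder_heads": 8,
    "encoder_dim_feedforward": 2048,
    "encoder_layers": 4,
})
OmegaConf.save(conf, "training/training_conf.yml")
```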
39
- # Inference
40
- - start the user interface by running `python gradio_app/app.py`
41
- inside the interface, select a video and set the generation parameters
42
- click on "Submit" to start the generation (a programmatic alternative without the UI is sketched below)
43
-
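For batch or headless use, the same pipeline can be driven without the interface by calling `generate_background_music` from `inference.py` directly, mirroring the call made in `gradio_app.py`; the input path below is a placeholder.

```python
# Hedged sketch: generate background music for one clip without the Gradio UI.
# Parameter values mirror the defaults used in gradio_app.py; the input path is assumed.
import torch
import inference  # the inference.py module from this repository

device = "cuda" if torch.cuda.is_available() else "cpu"

result_video = inference.generate_background_music(
    video_path="example.mp4",   # hypothetical 5-20 s input clip
    dataset="nature",           # Video Encoder variant: "nature" or "symmv"
    musicgen_size="small",      # "small", "medium" or "large"
    use_stereo=True,
    use_peft=True,              # use the LoRA-adapted MusicGen Audio Decoder
    musicgen_temperature=1.0,
    musicgen_guidance_scale=3.0,
    top_k_sampling=250,
    device=device,
)
print(f"Rendered video with generated soundtrack: {result_video}")
```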
44
- # Contact
45
- For any questions contact me at [niklas.schulte@rwth-aachen.de](mailto:niklas.schulte@rwth-aachen.de)
gradio_app.py DELETED
@@ -1,78 +0,0 @@
1
- import gradio as gr
2
- import os
3
- import sys
4
- sys.path.insert(1, '../training_audiocraft/inference')
5
- import inference
6
- import torch
7
-
8
- device = "cuda" if torch.cuda.is_available() else "cpu"
9
-
10
- def generate_background_music(video_path, dataset, use_peft, musicgen_size):
11
- print(f"Start generating background music for {video_path} with model \"{'peft' if use_peft else 'audiocraft'}_{dataset}_{musicgen_size}\"")
12
-
13
- new_video_path = inference.generate_background_music(
14
- video_path=video_path,
15
- dataset=dataset,
16
- musicgen_size=musicgen_size,
17
- use_stereo=True,
18
- use_peft=use_peft,
19
- musicgen_temperature=1.0,
20
- musicgen_guidance_scale=3.0,
21
- top_k_sampling=250,
22
- device=device
23
- )
24
- return gr.Video(new_video_path)
25
-
26
-
27
- interface = gr.Interface(fn=generate_background_music,
28
- inputs=[
29
- gr.Video(
30
- label="video input",
31
- min_length=5,
32
- max_length=20,
33
- sources=['upload'],
34
- show_download_button=True,
35
- include_audio=True
36
- ),
37
- gr.Radio(["nature", "symmv"],
38
- label="Video Encoder Version",
39
- value="nature",
40
- info="Choose one of the available Video Encoders."),
41
- gr.Radio([False, True],
42
- label="Use MusicGen Audio Decoder Model trained with PEFT",
43
- value=False,
44
- info="If set to 'True' the MusicGen Audio Decoder models trained with LoRA "
45
- "(Low Rank Adaptation) are used. If set to 'False', the original "
46
- "MusicGen models are used."),
47
- gr.Radio(["small", "medium", "large"],
48
- label="MusicGen Audio Decoder Size",
49
- value="small",
50
- info="Choose the size of the MusicGen audio decoder."),
51
- ],
52
-
53
- outputs=[gr.Video(label="video output")],
54
- examples=[
55
- [os.path.abspath("../videos/study/n_1.mp4"), "nature", True, "small"],
56
- [os.path.abspath("../videos/study/n_2.mp4"), "nature", True, "small"],
57
- [os.path.abspath("../videos/study/n_3.mp4"), "nature", True, "small"],
58
- [os.path.abspath("../videos/study/n_4.mp4"), "nature", True, "small"],
59
- [os.path.abspath("../videos/study/n_5.mp4"), "nature", True, "small"],
60
- [os.path.abspath("../videos/study/n_6.mp4"), "nature", True, "small"],
61
- [os.path.abspath("../videos/study/n_7.mp4"), "nature", True, "small"],
62
- [os.path.abspath("../videos/study/n_8.mp4"), "nature", True, "small"],
63
- [os.path.abspath("../videos/study/s_1.mp4"), "nature", True, "small"],
64
- [os.path.abspath("../videos/study/s_2.mp4"), "nature", True, "small"],
65
- [os.path.abspath("../videos/study/s_3.mp4"), "nature", True, "small"],
66
- [os.path.abspath("../videos/study/s_4.mp4"), "nature", True, "small"],
67
- [os.path.abspath("../videos/study/s_5.mp4"), "nature", True, "small"],
68
- [os.path.abspath("../videos/study/s_6.mp4"), "nature", True, "small"],
69
- [os.path.abspath("../videos/study/s_7.mp4"), "nature", True, "small"],
70
- [os.path.abspath("../videos/study/s_8.mp4"), "nature", True, "small"],
71
- ],
72
- cache_examples=False
73
- )
74
-
75
- if __name__ == "__main__":
76
- interface.launch(
77
- share=False
78
- )
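If the app needs to be reached from another machine (for example when it runs on a remote GPU server), Gradio's built-in tunnelling can be enabled instead; a small variant of the launch call, using standard `launch()` arguments:

```python
# Hedged alternative launch configuration for remote access.
if __name__ == "__main__":
    interface.launch(
        share=True,             # create a temporary public *.gradio.live URL
        server_name="0.0.0.0",  # listen on all interfaces instead of localhost only
    )
```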
inference.py DELETED
@@ -1,118 +0,0 @@
1
- from omegaconf import OmegaConf
2
- from peft import PeftConfig, get_peft_model
3
-
4
- from audiocraft.models import MusicGen
5
- from moviepy.editor import AudioFileClip
6
- from training_utils import *
7
- import re
8
- import time
9
-
10
- re_file_name = re.compile('([^/]+$)')
11
-
12
-
13
- def generate_background_music(video_path: str,
14
- dataset: str,
15
- musicgen_size: str,
16
- use_stereo: bool,
17
- use_peft: bool,
18
- device: str,
19
- musicgen_temperature: float = 1.0,
20
- musicgen_guidance_scale: float = 3.0,
21
- top_k_sampling: int = 250) -> str:
22
- start = time.time()
23
- model_path = "../training_audiocraft/training/"
24
- model_path += "models_peft" if use_peft else "models_audiocraft"
25
- model_path += f"/{dataset}" + f"_{musicgen_size}"
26
-
27
- conf = OmegaConf.load(model_path + '/configuration.yml')
28
- use_sampling = True if top_k_sampling > 0 else False
29
- video = mpe.VideoFileClip(video_path)
30
-
31
- musicgen_model_id = "facebook/musicgen-" + "stereo-" if use_stereo else ""
32
- musicgen_model_id += musicgen_size
33
-
34
- result_dir = "./results"
35
- os.makedirs(result_dir, exist_ok=True)
36
-
37
- encoder_output_dimension = None
38
- if "small" in conf.musicgen_model_id:
39
- encoder_output_dimension = 1024
40
- elif "medium" in conf.musicgen_model_id:
41
- encoder_output_dimension = 1536
42
- elif "large" in conf.musicgen_model_id:
43
- encoder_output_dimension = 2048
44
- assert encoder_output_dimension, f"Video Encoder output dimension could not be determined by {conf.musicgen_model_id}"
45
-
46
- musicgen_model = MusicGen.get_pretrained(musicgen_model_id)
47
- musicgen_model.lm.to(device)
48
- musicgen_model.compression_model.to(device)
49
- if use_peft:
50
- peft_path = model_path + "/musicgen_peft_final"
51
- peft_config = PeftConfig.from_pretrained(peft_path)
52
- musicgen_model.lm = get_peft_model(musicgen_model.lm, peft_config)
53
- musicgen_model.lm.load_adapter(peft_path, "default")
54
-
55
- print("MusicGen Model loaded.")
56
-
57
- video_to_t5 = VideoToT5(
58
- video_extraction_framerate=conf.video_extraction_framerate,
59
- encoder_input_dimension=conf.encoder_input_dimension,
60
- encoder_output_dimension=encoder_output_dimension,
61
- encoder_heads=conf.encoder_heads,
62
- encoder_dim_feedforward=conf.encoder_dim_feedforward,
63
- encoder_layers=conf.encoder_layers,
64
- device=device
65
- )
66
-
67
- video_to_t5.load_state_dict(torch.load(model_path + "/lm_final.pt", map_location=device))
68
- print("Video Encoder Model loaded.")
69
-
70
- print("Starting Video Feature Extraction.")
71
- video_embedding_t5 = video_to_t5(video_paths=[video_path])
72
-
73
- condition_tensors = create_condition_tensors(
74
- video_embeddings=video_embedding_t5,
75
- batch_size=1,
76
- video_extraction_framerate=video_to_t5.video_extraction_framerate,
77
- device=device
78
- )
79
-
80
- musicgen_model.generation_params = {
81
- 'max_gen_len': int(video.duration * musicgen_model.frame_rate),
82
- 'use_sampling': use_sampling,
83
- 'temp': musicgen_temperature,
84
- 'cfg_coef': musicgen_guidance_scale,
85
- 'two_step_cfg': False,
86
- }
87
- if use_sampling:
88
- musicgen_model.generation_params['top_k'] = 250
89
-
90
- print("Starting Audio Generation.")
91
- prompt_tokens = None
92
- with torch.no_grad():
93
- with musicgen_model.autocast:
94
- gen_tokens = musicgen_model.lm.generate(prompt_tokens, [], condition_tensors, callback=None,
95
- **musicgen_model.generation_params)
96
- gen_audio = musicgen_model.compression_model.decode(gen_tokens)
97
-
98
- end = time.time()
99
- print("Elapsed time for generation: " + str(end - start))
100
-
101
- _, video_file_name = os.path.split(video_path)
102
- video_file_name = video_file_name[:-4] # remove .mp4
103
-
104
- re_result = re_file_name.search(video_file_name) # get video file name
105
- result_path = f"{'peft' if use_peft else 'audiocraft'}_{dataset}_{musicgen_size}_{re_result.group(1)}"
106
- audio_result_path = f"{result_dir}/tmp.wav"
107
- video_result_path = f"{result_dir}/{result_path}_video.mp4"
108
-
109
- gen_audio = torch.squeeze(gen_audio.detach().cpu()) # remove mini-batch dimension, move to CPU for saving
110
- sample_rate = musicgen_model.sample_rate
111
- torchaudio.save(audio_result_path, gen_audio, sample_rate)
112
- audio_file_clip = AudioFileClip(audio_result_path)
113
- video.audio = audio_file_clip
114
-
115
- print("Rendering Video.")
116
- video.write_videofile(video_result_path)
117
-
118
- return video_result_path
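One detail worth spelling out from `generate_background_music` is how the token budget is derived: `max_gen_len` scales the clip duration by the compression model's frame rate. A small worked example, assuming the usual 50 Hz frame rate of MusicGen's 32 kHz EnCodec:

```python
# Hedged worked example for max_gen_len = int(video.duration * musicgen_model.frame_rate).
video_duration_s = 20.0   # assumed input clip length in seconds
frame_rate = 50           # assumed musicgen_model.frame_rate (tokens per second per codebook)
max_gen_len = int(video_duration_s * frame_rate)
assert max_gen_len == 1000  # 20 s of audio -> 1000 tokens per codebook
```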
lm_final(10).pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d79ea467294e53dcf48c54186cd2831c8625c10ea82beaa257b73ccc65fcdd3
3
- size 4176171365
lm_final(11).pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ecb2679c5b0e222cb12e3c4ed2d01e5f86c05a698b8d8f6cc6fe882c0a02ef4b
3
- size 14652654385
lm_final(7).pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e20117d416847702d33e46ee7c4c1f814cd2a1bea64066490e275a480c9c6148
3
- size 4176171365
lm_final(8).pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4bbdb2a036f0561fdc74e80b3c1e1a4e6043a3ee647323b1a5dac69499855684
3
- size 14652654385
lm_final(9).pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:af67752913cba1ccae33d804168a1befa503e1c9933f4ae3c254d78b4b172d96
3
- size 1352342381
lm_final(2).pt β†’ models_frozen_decoder/nature_large/lm_final(2).pt RENAMED
File without changes
lm_final(1).pt β†’ models_frozen_decoder/nature_medium/lm_final(1).pt RENAMED
File without changes
lm_final(5).pt β†’ models_frozen_decoder/symmv_large/lm_final(5).pt RENAMED
File without changes
lm_final(4).pt β†’ models_frozen_decoder/symmv_medium/lm_final(4).pt RENAMED
File without changes
lm_final(3).pt β†’ models_frozen_decoder/symmv_small/lm_final(3).pt RENAMED
File without changes
lm_final(6).pt β†’ models_peft/nature_small/lm_final(6).pt RENAMED
File without changes
musicgen_peft_final 2/README.md DELETED
@@ -1,203 +0,0 @@
1
- ---
2
- library_name: peft
3
- ---
4
-
5
- # Model Card for Model ID
6
-
7
- <!-- Provide a quick summary of what the model is/does. -->
8
-
9
-
10
-
11
- ## Model Details
12
-
13
- ### Model Description
14
-
15
- <!-- Provide a longer summary of what this model is. -->
16
-
17
-
18
-
19
- - **Developed by:** [More Information Needed]
20
- - **Funded by [optional]:** [More Information Needed]
21
- - **Shared by [optional]:** [More Information Needed]
22
- - **Model type:** [More Information Needed]
23
- - **Language(s) (NLP):** [More Information Needed]
24
- - **License:** [More Information Needed]
25
- - **Finetuned from model [optional]:** [More Information Needed]
26
-
27
- ### Model Sources [optional]
28
-
29
- <!-- Provide the basic links for the model. -->
30
-
31
- - **Repository:** [More Information Needed]
32
- - **Paper [optional]:** [More Information Needed]
33
- - **Demo [optional]:** [More Information Needed]
34
-
35
- ## Uses
36
-
37
- <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
38
-
39
- ### Direct Use
40
-
41
- <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
42
-
43
- [More Information Needed]
44
-
45
- ### Downstream Use [optional]
46
-
47
- <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
48
-
49
- [More Information Needed]
50
-
51
- ### Out-of-Scope Use
52
-
53
- <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
54
-
55
- [More Information Needed]
56
-
57
- ## Bias, Risks, and Limitations
58
-
59
- <!-- This section is meant to convey both technical and sociotechnical limitations. -->
60
-
61
- [More Information Needed]
62
-
63
- ### Recommendations
64
-
65
- <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
66
-
67
- Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
68
-
69
- ## How to Get Started with the Model
70
-
71
- Use the code below to get started with the model.
72
-
73
- [More Information Needed]
74
-
75
- ## Training Details
76
-
77
- ### Training Data
78
-
79
- <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
80
-
81
- [More Information Needed]
82
-
83
- ### Training Procedure
84
-
85
- <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
86
-
87
- #### Preprocessing [optional]
88
-
89
- [More Information Needed]
90
-
91
-
92
- #### Training Hyperparameters
93
-
94
- - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
95
-
96
- #### Speeds, Sizes, Times [optional]
97
-
98
- <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
99
-
100
- [More Information Needed]
101
-
102
- ## Evaluation
103
-
104
- <!-- This section describes the evaluation protocols and provides the results. -->
105
-
106
- ### Testing Data, Factors & Metrics
107
-
108
- #### Testing Data
109
-
110
- <!-- This should link to a Dataset Card if possible. -->
111
-
112
- [More Information Needed]
113
-
114
- #### Factors
115
-
116
- <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
117
-
118
- [More Information Needed]
119
-
120
- #### Metrics
121
-
122
- <!-- These are the evaluation metrics being used, ideally with a description of why. -->
123
-
124
- [More Information Needed]
125
-
126
- ### Results
127
-
128
- [More Information Needed]
129
-
130
- #### Summary
131
-
132
-
133
-
134
- ## Model Examination [optional]
135
-
136
- <!-- Relevant interpretability work for the model goes here -->
137
-
138
- [More Information Needed]
139
-
140
- ## Environmental Impact
141
-
142
- <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
143
-
144
- Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
145
-
146
- - **Hardware Type:** [More Information Needed]
147
- - **Hours used:** [More Information Needed]
148
- - **Cloud Provider:** [More Information Needed]
149
- - **Compute Region:** [More Information Needed]
150
- - **Carbon Emitted:** [More Information Needed]
151
-
152
- ## Technical Specifications [optional]
153
-
154
- ### Model Architecture and Objective
155
-
156
- [More Information Needed]
157
-
158
- ### Compute Infrastructure
159
-
160
- [More Information Needed]
161
-
162
- #### Hardware
163
-
164
- [More Information Needed]
165
-
166
- #### Software
167
-
168
- [More Information Needed]
169
-
170
- ## Citation [optional]
171
-
172
- <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
173
-
174
- **BibTeX:**
175
-
176
- [More Information Needed]
177
-
178
- **APA:**
179
-
180
- [More Information Needed]
181
-
182
- ## Glossary [optional]
183
-
184
- <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
185
-
186
- [More Information Needed]
187
-
188
- ## More Information [optional]
189
-
190
- [More Information Needed]
191
-
192
- ## Model Card Authors [optional]
193
-
194
- [More Information Needed]
195
-
196
- ## Model Card Contact
197
-
198
- [More Information Needed]
199
-
200
-
201
- ### Framework versions
202
-
203
- - PEFT 0.8.2
musicgen_peft_final 2/adapter_config.json DELETED
@@ -1,31 +0,0 @@
1
- {
2
- "alpha_pattern": {},
3
- "auto_mapping": {
4
- "base_model_class": "LMModel",
5
- "parent_library": "audiocraft.models.lm"
6
- },
7
- "base_model_name_or_path": null,
8
- "bias": "none",
9
- "fan_in_fan_out": false,
10
- "inference_mode": true,
11
- "init_lora_weights": true,
12
- "layers_pattern": null,
13
- "layers_to_transform": null,
14
- "loftq_config": {},
15
- "lora_alpha": 16,
16
- "lora_dropout": 0.1,
17
- "megatron_config": null,
18
- "megatron_core": "megatron.core",
19
- "modules_to_save": [
20
- "classifier"
21
- ],
22
- "peft_type": "LORA",
23
- "r": 16,
24
- "rank_pattern": {},
25
- "revision": null,
26
- "target_modules": [
27
- "out_proj"
28
- ],
29
- "task_type": null,
30
- "use_rslora": false
31
- }
musicgen_peft_final 2/adapter_model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a56c5b78dc0be771429c038f27bfe5a9a1fe1460778bdeb45213308b7c4c0f4e
3
- size 9464784
musicgen_peft_final 3/README.md DELETED
@@ -1,203 +0,0 @@
1
- ---
2
- library_name: peft
3
- ---
4
-
5
- # Model Card for Model ID
6
-
7
- <!-- Provide a quick summary of what the model is/does. -->
8
-
9
-
10
-
11
- ## Model Details
12
-
13
- ### Model Description
14
-
15
- <!-- Provide a longer summary of what this model is. -->
16
-
17
-
18
-
19
- - **Developed by:** [More Information Needed]
20
- - **Funded by [optional]:** [More Information Needed]
21
- - **Shared by [optional]:** [More Information Needed]
22
- - **Model type:** [More Information Needed]
23
- - **Language(s) (NLP):** [More Information Needed]
24
- - **License:** [More Information Needed]
25
- - **Finetuned from model [optional]:** [More Information Needed]
26
-
27
- ### Model Sources [optional]
28
-
29
- <!-- Provide the basic links for the model. -->
30
-
31
- - **Repository:** [More Information Needed]
32
- - **Paper [optional]:** [More Information Needed]
33
- - **Demo [optional]:** [More Information Needed]
34
-
35
- ## Uses
36
-
37
- <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
38
-
39
- ### Direct Use
40
-
41
- <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
42
-
43
- [More Information Needed]
44
-
45
- ### Downstream Use [optional]
46
-
47
- <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
48
-
49
- [More Information Needed]
50
-
51
- ### Out-of-Scope Use
52
-
53
- <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
54
-
55
- [More Information Needed]
56
-
57
- ## Bias, Risks, and Limitations
58
-
59
- <!-- This section is meant to convey both technical and sociotechnical limitations. -->
60
-
61
- [More Information Needed]
62
-
63
- ### Recommendations
64
-
65
- <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
66
-
67
- Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
68
-
69
- ## How to Get Started with the Model
70
-
71
- Use the code below to get started with the model.
72
-
73
- [More Information Needed]
74
-
75
- ## Training Details
76
-
77
- ### Training Data
78
-
79
- <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
80
-
81
- [More Information Needed]
82
-
83
- ### Training Procedure
84
-
85
- <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
86
-
87
- #### Preprocessing [optional]
88
-
89
- [More Information Needed]
90
-
91
-
92
- #### Training Hyperparameters
93
-
94
- - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
95
-
96
- #### Speeds, Sizes, Times [optional]
97
-
98
- <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
99
-
100
- [More Information Needed]
101
-
102
- ## Evaluation
103
-
104
- <!-- This section describes the evaluation protocols and provides the results. -->
105
-
106
- ### Testing Data, Factors & Metrics
107
-
108
- #### Testing Data
109
-
110
- <!-- This should link to a Dataset Card if possible. -->
111
-
112
- [More Information Needed]
113
-
114
- #### Factors
115
-
116
- <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
117
-
118
- [More Information Needed]
119
-
120
- #### Metrics
121
-
122
- <!-- These are the evaluation metrics being used, ideally with a description of why. -->
123
-
124
- [More Information Needed]
125
-
126
- ### Results
127
-
128
- [More Information Needed]
129
-
130
- #### Summary
131
-
132
-
133
-
134
- ## Model Examination [optional]
135
-
136
- <!-- Relevant interpretability work for the model goes here -->
137
-
138
- [More Information Needed]
139
-
140
- ## Environmental Impact
141
-
142
- <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
143
-
144
- Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
145
-
146
- - **Hardware Type:** [More Information Needed]
147
- - **Hours used:** [More Information Needed]
148
- - **Cloud Provider:** [More Information Needed]
149
- - **Compute Region:** [More Information Needed]
150
- - **Carbon Emitted:** [More Information Needed]
151
-
152
- ## Technical Specifications [optional]
153
-
154
- ### Model Architecture and Objective
155
-
156
- [More Information Needed]
157
-
158
- ### Compute Infrastructure
159
-
160
- [More Information Needed]
161
-
162
- #### Hardware
163
-
164
- [More Information Needed]
165
-
166
- #### Software
167
-
168
- [More Information Needed]
169
-
170
- ## Citation [optional]
171
-
172
- <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
173
-
174
- **BibTeX:**
175
-
176
- [More Information Needed]
177
-
178
- **APA:**
179
-
180
- [More Information Needed]
181
-
182
- ## Glossary [optional]
183
-
184
- <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
185
-
186
- [More Information Needed]
187
-
188
- ## More Information [optional]
189
-
190
- [More Information Needed]
191
-
192
- ## Model Card Authors [optional]
193
-
194
- [More Information Needed]
195
-
196
- ## Model Card Contact
197
-
198
- [More Information Needed]
199
-
200
-
201
- ### Framework versions
202
-
203
- - PEFT 0.8.2
musicgen_peft_final 3/adapter_config.json DELETED
@@ -1,31 +0,0 @@
1
- {
2
- "alpha_pattern": {},
3
- "auto_mapping": {
4
- "base_model_class": "LMModel",
5
- "parent_library": "audiocraft.models.lm"
6
- },
7
- "base_model_name_or_path": null,
8
- "bias": "none",
9
- "fan_in_fan_out": false,
10
- "inference_mode": true,
11
- "init_lora_weights": true,
12
- "layers_pattern": null,
13
- "layers_to_transform": null,
14
- "loftq_config": {},
15
- "lora_alpha": 16,
16
- "lora_dropout": 0.1,
17
- "megatron_config": null,
18
- "megatron_core": "megatron.core",
19
- "modules_to_save": [
20
- "classifier"
21
- ],
22
- "peft_type": "LORA",
23
- "r": 16,
24
- "rank_pattern": {},
25
- "revision": null,
26
- "target_modules": [
27
- "out_proj"
28
- ],
29
- "task_type": null,
30
- "use_rslora": false
31
- }
musicgen_peft_final 3/adapter_model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:50885374bd3299e8335820c113650f2038fb5286f3d13415ce38b7cb2bb3bedb
3
- size 12610608
musicgen_peft_final 4/README.md DELETED
@@ -1,203 +0,0 @@
1
- ---
2
- library_name: peft
3
- ---
4
-
5
- # Model Card for Model ID
6
-
7
- <!-- Provide a quick summary of what the model is/does. -->
8
-
9
-
10
-
11
- ## Model Details
12
-
13
- ### Model Description
14
-
15
- <!-- Provide a longer summary of what this model is. -->
16
-
17
-
18
-
19
- - **Developed by:** [More Information Needed]
20
- - **Funded by [optional]:** [More Information Needed]
21
- - **Shared by [optional]:** [More Information Needed]
22
- - **Model type:** [More Information Needed]
23
- - **Language(s) (NLP):** [More Information Needed]
24
- - **License:** [More Information Needed]
25
- - **Finetuned from model [optional]:** [More Information Needed]
26
-
27
- ### Model Sources [optional]
28
-
29
- <!-- Provide the basic links for the model. -->
30
-
31
- - **Repository:** [More Information Needed]
32
- - **Paper [optional]:** [More Information Needed]
33
- - **Demo [optional]:** [More Information Needed]
34
-
35
- ## Uses
36
-
37
- <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
38
-
39
- ### Direct Use
40
-
41
- <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
42
-
43
- [More Information Needed]
44
-
45
- ### Downstream Use [optional]
46
-
47
- <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
48
-
49
- [More Information Needed]
50
-
51
- ### Out-of-Scope Use
52
-
53
- <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
54
-
55
- [More Information Needed]
56
-
57
- ## Bias, Risks, and Limitations
58
-
59
- <!-- This section is meant to convey both technical and sociotechnical limitations. -->
60
-
61
- [More Information Needed]
62
-
63
- ### Recommendations
64
-
65
- <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
66
-
67
- Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
68
-
69
- ## How to Get Started with the Model
70
-
71
- Use the code below to get started with the model.
72
-
73
- [More Information Needed]
74
-
75
- ## Training Details
76
-
77
- ### Training Data
78
-
79
- <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
80
-
81
- [More Information Needed]
82
-
83
- ### Training Procedure
84
-
85
- <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
86
-
87
- #### Preprocessing [optional]
88
-
89
- [More Information Needed]
90
-
91
-
92
- #### Training Hyperparameters
93
-
94
- - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
95
-
96
- #### Speeds, Sizes, Times [optional]
97
-
98
- <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
99
-
100
- [More Information Needed]
101
-
102
- ## Evaluation
103
-
104
- <!-- This section describes the evaluation protocols and provides the results. -->
105
-
106
- ### Testing Data, Factors & Metrics
107
-
108
- #### Testing Data
109
-
110
- <!-- This should link to a Dataset Card if possible. -->
111
-
112
- [More Information Needed]
113
-
114
- #### Factors
115
-
116
- <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
117
-
118
- [More Information Needed]
119
-
120
- #### Metrics
121
-
122
- <!-- These are the evaluation metrics being used, ideally with a description of why. -->
123
-
124
- [More Information Needed]
125
-
126
- ### Results
127
-
128
- [More Information Needed]
129
-
130
- #### Summary
131
-
132
-
133
-
134
- ## Model Examination [optional]
135
-
136
- <!-- Relevant interpretability work for the model goes here -->
137
-
138
- [More Information Needed]
139
-
140
- ## Environmental Impact
141
-
142
- <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
143
-
144
- Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
145
-
146
- - **Hardware Type:** [More Information Needed]
147
- - **Hours used:** [More Information Needed]
148
- - **Cloud Provider:** [More Information Needed]
149
- - **Compute Region:** [More Information Needed]
150
- - **Carbon Emitted:** [More Information Needed]
151
-
152
- ## Technical Specifications [optional]
153
-
154
- ### Model Architecture and Objective
155
-
156
- [More Information Needed]
157
-
158
- ### Compute Infrastructure
159
-
160
- [More Information Needed]
161
-
162
- #### Hardware
163
-
164
- [More Information Needed]
165
-
166
- #### Software
167
-
168
- [More Information Needed]
169
-
170
- ## Citation [optional]
171
-
172
- <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
173
-
174
- **BibTeX:**
175
-
176
- [More Information Needed]
177
-
178
- **APA:**
179
-
180
- [More Information Needed]
181
-
182
- ## Glossary [optional]
183
-
184
- <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
185
-
186
- [More Information Needed]
187
-
188
- ## More Information [optional]
189
-
190
- [More Information Needed]
191
-
192
- ## Model Card Authors [optional]
193
-
194
- [More Information Needed]
195
-
196
- ## Model Card Contact
197
-
198
- [More Information Needed]
199
-
200
-
201
- ### Framework versions
202
-
203
- - PEFT 0.8.2
musicgen_peft_final 4/adapter_config.json DELETED
@@ -1,31 +0,0 @@
1
- {
2
- "alpha_pattern": {},
3
- "auto_mapping": {
4
- "base_model_class": "LMModel",
5
- "parent_library": "audiocraft.models.lm"
6
- },
7
- "base_model_name_or_path": null,
8
- "bias": "none",
9
- "fan_in_fan_out": false,
10
- "inference_mode": true,
11
- "init_lora_weights": true,
12
- "layers_pattern": null,
13
- "layers_to_transform": null,
14
- "loftq_config": {},
15
- "lora_alpha": 16,
16
- "lora_dropout": 0.1,
17
- "megatron_config": null,
18
- "megatron_core": "megatron.core",
19
- "modules_to_save": [
20
- "classifier"
21
- ],
22
- "peft_type": "LORA",
23
- "r": 16,
24
- "rank_pattern": {},
25
- "revision": null,
26
- "target_modules": [
27
- "out_proj"
28
- ],
29
- "task_type": null,
30
- "use_rslora": false
31
- }
musicgen_peft_final 4/adapter_model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:47f3accb461e386e4032239789cc8aed985ae4e9dc7205e9339b5c42daf788cf
3
- size 9464784
musicgen_peft_final 5/README.md DELETED
@@ -1,203 +0,0 @@
1
- ---
2
- library_name: peft
3
- ---
4
-
5
- # Model Card for Model ID
6
-
7
- <!-- Provide a quick summary of what the model is/does. -->
8
-
9
-
10
-
11
- ## Model Details
12
-
13
- ### Model Description
14
-
15
- <!-- Provide a longer summary of what this model is. -->
16
-
17
-
18
-
19
- - **Developed by:** [More Information Needed]
20
- - **Funded by [optional]:** [More Information Needed]
21
- - **Shared by [optional]:** [More Information Needed]
22
- - **Model type:** [More Information Needed]
23
- - **Language(s) (NLP):** [More Information Needed]
24
- - **License:** [More Information Needed]
25
- - **Finetuned from model [optional]:** [More Information Needed]
26
-
27
- ### Model Sources [optional]
28
-
29
- <!-- Provide the basic links for the model. -->
30
-
31
- - **Repository:** [More Information Needed]
32
- - **Paper [optional]:** [More Information Needed]
33
- - **Demo [optional]:** [More Information Needed]
34
-
35
- ## Uses
36
-
37
- <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
38
-
39
- ### Direct Use
40
-
41
- <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
42
-
43
- [More Information Needed]
44
-
45
- ### Downstream Use [optional]
46
-
47
- <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
48
-
49
- [More Information Needed]
50
-
51
- ### Out-of-Scope Use
52
-
53
- <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
54
-
55
- [More Information Needed]
56
-
57
- ## Bias, Risks, and Limitations
58
-
59
- <!-- This section is meant to convey both technical and sociotechnical limitations. -->
60
-
61
- [More Information Needed]
62
-
63
- ### Recommendations
64
-
65
- <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
66
-
67
- Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
68
-
69
- ## How to Get Started with the Model
70
-
71
- Use the code below to get started with the model.
72
-
73
- [More Information Needed]
74
-
75
- ## Training Details
76
-
77
- ### Training Data
78
-
79
- <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
80
-
81
- [More Information Needed]
82
-
83
- ### Training Procedure
84
-
85
- <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
86
-
87
- #### Preprocessing [optional]
88
-
89
- [More Information Needed]
90
-
91
-
92
- #### Training Hyperparameters
93
-
94
- - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
95
-
96
- #### Speeds, Sizes, Times [optional]
97
-
98
- <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
99
-
100
- [More Information Needed]
101
-
102
- ## Evaluation
103
-
104
- <!-- This section describes the evaluation protocols and provides the results. -->
105
-
106
- ### Testing Data, Factors & Metrics
107
-
108
- #### Testing Data
109
-
110
- <!-- This should link to a Dataset Card if possible. -->
111
-
112
- [More Information Needed]
113
-
114
- #### Factors
115
-
116
- <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
117
-
118
- [More Information Needed]
119
-
120
- #### Metrics
121
-
122
- <!-- These are the evaluation metrics being used, ideally with a description of why. -->
123
-
124
- [More Information Needed]
125
-
126
- ### Results
127
-
128
- [More Information Needed]
129
-
130
- #### Summary
131
-
132
-
133
-
134
- ## Model Examination [optional]
135
-
136
- <!-- Relevant interpretability work for the model goes here -->
137
-
138
- [More Information Needed]
139
-
140
- ## Environmental Impact
141
-
142
- <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
143
-
144
- Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
145
-
146
- - **Hardware Type:** [More Information Needed]
147
- - **Hours used:** [More Information Needed]
148
- - **Cloud Provider:** [More Information Needed]
149
- - **Compute Region:** [More Information Needed]
150
- - **Carbon Emitted:** [More Information Needed]
151
-
152
- ## Technical Specifications [optional]
153
-
154
- ### Model Architecture and Objective
155
-
156
- [More Information Needed]
157
-
158
- ### Compute Infrastructure
159
-
160
- [More Information Needed]
161
-
162
- #### Hardware
163
-
164
- [More Information Needed]
165
-
166
- #### Software
167
-
168
- [More Information Needed]
169
-
170
- ## Citation [optional]
171
-
172
- <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
173
-
174
- **BibTeX:**
175
-
176
- [More Information Needed]
177
-
178
- **APA:**
179
-
180
- [More Information Needed]
181
-
182
- ## Glossary [optional]
183
-
184
- <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
185
-
186
- [More Information Needed]
187
-
188
- ## More Information [optional]
189
-
190
- [More Information Needed]
191
-
192
- ## Model Card Authors [optional]
193
-
194
- [More Information Needed]
195
-
196
- ## Model Card Contact
197
-
198
- [More Information Needed]
199
-
200
-
201
- ### Framework versions
202
-
203
- - PEFT 0.8.2
musicgen_peft_final 5/adapter_config.json DELETED
@@ -1,31 +0,0 @@
1
- {
2
- "alpha_pattern": {},
3
- "auto_mapping": {
4
- "base_model_class": "LMModel",
5
- "parent_library": "audiocraft.models.lm"
6
- },
7
- "base_model_name_or_path": null,
8
- "bias": "none",
9
- "fan_in_fan_out": false,
10
- "inference_mode": true,
11
- "init_lora_weights": true,
12
- "layers_pattern": null,
13
- "layers_to_transform": null,
14
- "loftq_config": {},
15
- "lora_alpha": 16,
16
- "lora_dropout": 0.1,
17
- "megatron_config": null,
18
- "megatron_core": "megatron.core",
19
- "modules_to_save": [
20
- "classifier"
21
- ],
22
- "peft_type": "LORA",
23
- "r": 16,
24
- "rank_pattern": {},
25
- "revision": null,
26
- "target_modules": [
27
- "out_proj"
28
- ],
29
- "task_type": null,
30
- "use_rslora": false
31
- }
musicgen_peft_final 5/adapter_model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:af0156f331d3b5c5fcfc329a5a724d4abf75d8574adafb691f8a3b3bbfa55021
3
- size 3159480
musicgen_peft_final/README.md DELETED
@@ -1,203 +0,0 @@
1
- ---
2
- library_name: peft
3
- ---
4
-
5
- # Model Card for Model ID
6
-
7
- <!-- Provide a quick summary of what the model is/does. -->
8
-
9
-
10
-
11
- ## Model Details
12
-
13
- ### Model Description
14
-
15
- <!-- Provide a longer summary of what this model is. -->
16
-
17
-
18
-
19
- - **Developed by:** [More Information Needed]
20
- - **Funded by [optional]:** [More Information Needed]
21
- - **Shared by [optional]:** [More Information Needed]
22
- - **Model type:** [More Information Needed]
23
- - **Language(s) (NLP):** [More Information Needed]
24
- - **License:** [More Information Needed]
25
- - **Finetuned from model [optional]:** [More Information Needed]
26
-
27
- ### Model Sources [optional]
28
-
29
- <!-- Provide the basic links for the model. -->
30
-
31
- - **Repository:** [More Information Needed]
32
- - **Paper [optional]:** [More Information Needed]
33
- - **Demo [optional]:** [More Information Needed]
34
-
35
- ## Uses
36
-
37
- <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
38
-
39
- ### Direct Use
40
-
41
- <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
42
-
43
- [More Information Needed]
44
-
45
- ### Downstream Use [optional]
46
-
47
- <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
48
-
49
- [More Information Needed]
50
-
51
- ### Out-of-Scope Use
52
-
53
- <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
54
-
55
- [More Information Needed]
56
-
57
- ## Bias, Risks, and Limitations
58
-
59
- <!-- This section is meant to convey both technical and sociotechnical limitations. -->
60
-
61
- [More Information Needed]
62
-
63
- ### Recommendations
64
-
65
- <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
66
-
67
- Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
68
-
69
- ## How to Get Started with the Model
70
-
71
- Use the code below to get started with the model.
72
-
73
- [More Information Needed]
74
-
75
- ## Training Details
76
-
77
- ### Training Data
78
-
79
- <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
80
-
81
- [More Information Needed]
82
-
83
- ### Training Procedure
84
-
85
- <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
86
-
87
- #### Preprocessing [optional]
88
-
89
- [More Information Needed]
90
-
91
-
92
- #### Training Hyperparameters
93
-
94
- - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
95
-
96
- #### Speeds, Sizes, Times [optional]
97
-
98
- <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
99
-
100
- [More Information Needed]
101
-
102
- ## Evaluation
103
-
104
- <!-- This section describes the evaluation protocols and provides the results. -->
105
-
106
- ### Testing Data, Factors & Metrics
107
-
108
- #### Testing Data
109
-
110
- <!-- This should link to a Dataset Card if possible. -->
111
-
112
- [More Information Needed]
113
-
114
- #### Factors
115
-
116
- <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
117
-
118
- [More Information Needed]
119
-
120
- #### Metrics
121
-
122
- <!-- These are the evaluation metrics being used, ideally with a description of why. -->
123
-
124
- [More Information Needed]
125
-
126
- ### Results
127
-
128
- [More Information Needed]
129
-
130
- #### Summary
131
-
132
-
133
-
134
- ## Model Examination [optional]
135
-
136
- <!-- Relevant interpretability work for the model goes here -->
137
-
138
- [More Information Needed]
139
-
140
- ## Environmental Impact
141
-
142
- <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
143
-
144
- Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
145
-
146
- - **Hardware Type:** [More Information Needed]
147
- - **Hours used:** [More Information Needed]
148
- - **Cloud Provider:** [More Information Needed]
149
- - **Compute Region:** [More Information Needed]
150
- - **Carbon Emitted:** [More Information Needed]
151
-
152
- ## Technical Specifications [optional]
153
-
154
- ### Model Architecture and Objective
155
-
156
- [More Information Needed]
157
-
158
- ### Compute Infrastructure
159
-
160
- [More Information Needed]
161
-
162
- #### Hardware
163
-
164
- [More Information Needed]
165
-
166
- #### Software
167
-
168
- [More Information Needed]
169
-
170
- ## Citation [optional]
171
-
172
- <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
173
-
174
- **BibTeX:**
175
-
176
- [More Information Needed]
177
-
178
- **APA:**
179
-
180
- [More Information Needed]
181
-
182
- ## Glossary [optional]
183
-
184
- <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
185
-
186
- [More Information Needed]
187
-
188
- ## More Information [optional]
189
-
190
- [More Information Needed]
191
-
192
- ## Model Card Authors [optional]
193
-
194
- [More Information Needed]
195
-
196
- ## Model Card Contact
197
-
198
- [More Information Needed]
199
-
200
-
201
- ### Framework versions
202
-
203
- - PEFT 0.8.2
musicgen_peft_final/adapter_config.json DELETED
@@ -1,31 +0,0 @@
1
- {
2
- "alpha_pattern": {},
3
- "auto_mapping": {
4
- "base_model_class": "LMModel",
5
- "parent_library": "audiocraft.models.lm"
6
- },
7
- "base_model_name_or_path": null,
8
- "bias": "none",
9
- "fan_in_fan_out": false,
10
- "inference_mode": true,
11
- "init_lora_weights": true,
12
- "layers_pattern": null,
13
- "layers_to_transform": null,
14
- "loftq_config": {},
15
- "lora_alpha": 16,
16
- "lora_dropout": 0.1,
17
- "megatron_config": null,
18
- "megatron_core": "megatron.core",
19
- "modules_to_save": [
20
- "classifier"
21
- ],
22
- "peft_type": "LORA",
23
- "r": 16,
24
- "rank_pattern": {},
25
- "revision": null,
26
- "target_modules": [
27
- "out_proj"
28
- ],
29
- "task_type": null,
30
- "use_rslora": false
31
- }
musicgen_peft_final/adapter_model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:cbeeb0b56335e300eeae46dd9e0d6df01b33d6d34a6f347cfab3cf70370e326b
3
- size 3159480
training_utils.py DELETED
@@ -1,278 +0,0 @@
1
- from torch.utils.data import Dataset
2
- import torch
3
- from torch import nn, Tensor
4
- import torch.nn.functional as F
5
- import torchaudio
6
- import os
7
- import logging
8
- from torchvision.models import resnet50, ResNet50_Weights, resnet152, resnet18, resnet34, ResNet152_Weights
9
- from PIL import Image
10
- from time import strftime
11
- import math
12
- import numpy as np
13
- from torch.utils.data.sampler import SubsetRandomSampler
14
- import moviepy.editor as mpe
15
-
16
-
17
- class VideoDataset(Dataset):
18
- def __init__(self, data_dir):
19
- self.data_dir = data_dir
20
- self.data_map = []
21
-
22
- dir_map = os.listdir(data_dir)
23
- for d in dir_map:
24
- name, extension = os.path.splitext(d)
25
- if extension == ".mp4":
26
- self.data_map.append({"video": os.path.join(data_dir, d)})
27
-
28
- def __len__(self):
29
- return len(self.data_map)
30
-
31
- def __getitem__(self, idx):
32
- return self.data_map[idx]["video"]
33
-
34
-
35
- # input: video_path, output: wav_music
36
- class VideoToT5(nn.Module):
37
- def __init__(self,
38
- device: str,
39
- video_extraction_framerate: int,
40
- encoder_input_dimension: int,
41
- encoder_output_dimension: int,
42
- encoder_heads: int,
43
- encoder_dim_feedforward: int,
44
- encoder_layers: int
45
- ):
46
- super().__init__()
47
- self.video_extraction_framerate = video_extraction_framerate
48
- self.video_feature_extractor = VideoFeatureExtractor(video_extraction_framerate=video_extraction_framerate,
49
- device=device)
50
- self.video_encoder = VideoEncoder(
51
- device,
52
- encoder_input_dimension,
53
- encoder_output_dimension,
54
- encoder_heads,
55
- encoder_dim_feedforward,
56
- encoder_layers
57
- )
58
-
59
- def forward(self, video_paths: [str]):
60
- video_embeddings = []
61
- for video_path in video_paths:
62
- video = mpe.VideoFileClip(video_path)
63
- video_embedding = self.video_feature_extractor(video)
64
- video_embeddings.append(video_embedding)
65
- video_embeddings = torch.stack(video_embeddings)  # resulting shape: [batch_size, num_frames, resnet_output_dimension]
66
- # not used, gives worse results!
67
- #video_embeddings = torch.mean(video_embeddings, 0, True) # average out all image embedding to one video embedding
68
-
69
- t5_embeddings = self.video_encoder(video_embeddings) # T5 output: [batch_size, num_tokens,
70
- # t5_embedding_size]
71
- return t5_embeddings
72
-
73
-
74
- class VideoEncoder(nn.Module):
75
- def __init__(self,
76
- device: str,
77
- encoder_input_dimension: int,
78
- encoder_output_dimension: int,
79
- encoder_heads: int,
80
- encoder_dim_feedforward: int,
81
- encoder_layers: int
82
- ):
83
- super().__init__()
84
- self.device = device
85
- self.encoder = (nn.TransformerEncoder(
86
- nn.TransformerEncoderLayer(
87
- d_model=encoder_input_dimension,
88
- nhead=encoder_heads,
89
- dim_feedforward=encoder_dim_feedforward
90
- ),
91
- num_layers=encoder_layers,
92
- )
93
- ).to(device)
94
-
95
- # linear layer to match T5 embedding dimension
96
- self.linear = (nn.Linear(
97
- in_features=encoder_input_dimension,
98
- out_features=encoder_output_dimension)
99
- .to(device))
100
-
101
- def forward(self, x):
102
- assert x.dim() == 3
103
- x = torch.transpose(x, 0, 1) # encoder expects [sequence_length, batch_size, embedding_dimension]
104
- x = self.encoder(x) # encoder forward pass
105
- x = self.linear(x) # forward pass through the linear layer
106
- x = torch.transpose(x, 0, 1) # shape: [batch_size, sequence_length, embedding_dimension]
107
- return x
108
-
109
-
110
- class VideoFeatureExtractor(nn.Module):
111
- def __init__(self,
112
- device: str,
113
- video_extraction_framerate: int = 1,
114
- resnet_input_dimension: int = 2048):
115
- super().__init__()
116
- self.device = device
117
-
118
- # using a ResNet trained on ImageNet
119
- #self.resnet = resnet152(weights="IMAGENET1K_V2").eval()
120
- self.resnet = resnet50(weights="IMAGENET1K_V2").eval()
121
- self.resnet = torch.nn.Sequential(*(list(self.resnet.children())[:-1])).to(device)  # drop the final classification layer
122
- #self.resnet_preprocessor = ResNet152_Weights.DEFAULT.transforms().to(device) # ResNet image preprocessor
123
- self.resnet_preprocessor = ResNet50_Weights.DEFAULT.transforms().to(device)
124
- self.video_extraction_framerate = video_extraction_framerate # setting the fps at which the video is processed
125
- self.positional_encoder = PositionalEncoding(resnet_input_dimension).to(device)
126
-
127
- def forward(self, video: mpe.VideoFileClip):
128
- embeddings = []
129
- for i in range(0, 30 * self.video_extraction_framerate):
130
- i = video.get_frame(i) # get frame as numpy array
131
- i = Image.fromarray(i) # create PIL image from numpy array
132
- i = self.resnet_preprocessor(i) # preprocess image
133
- i = i.to(self.device)
134
- i = i.unsqueeze(0) # adding a batch dimension
135
- i = self.resnet(i).squeeze() # ResNet forward pass
136
- i = i.squeeze()
137
- embeddings.append(i) # collect embeddings
138
-
139
- embeddings = torch.stack(embeddings) # concatenate all frame embeddings into one video embedding
140
- embeddings = embeddings.unsqueeze(1)
141
- embeddings = self.positional_encoder(embeddings) # apply positional encoding with a sequence length of 30
142
- embeddings = embeddings.squeeze()
143
- return embeddings
144
-
145
-
146
- # from https://pytorch.org/tutorials/beginner/transformer_tutorial.html
147
- class PositionalEncoding(nn.Module):
148
- def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
149
- super().__init__()
150
- self.dropout = nn.Dropout(p=dropout)
151
-
152
- position = torch.arange(max_len).unsqueeze(1)
153
- div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
154
- pe = torch.zeros(max_len, 1, d_model)
155
- pe[:, 0, 0::2] = torch.sin(position * div_term)
156
- pe[:, 0, 1::2] = torch.cos(position * div_term)
157
- self.register_buffer('pe', pe)
158
-
159
- def forward(self, x: Tensor) -> Tensor:
160
- """
161
- Arguments:
162
- x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
163
- """
164
- x = x + self.pe[:x.size(0)]
165
- return self.dropout(x)
166
-
167
-
168
- def freeze_model(model: nn.Module):
169
- for param in model.parameters():
170
- param.requires_grad = False
171
- model.eval()
172
-
173
-
174
- def split_dataset_randomly(dataset, validation_split: float, seed: int=None):
175
- dataset_size = len(dataset)
176
- indices = list(range(dataset_size))
177
- split = int(np.floor(validation_split * dataset_size))
178
-
179
- if seed:
180
- np.random.seed(seed)
181
-
182
- np.random.shuffle(indices) # in-place operation
183
- return indices[split:], indices[:split]
184
-
185
-
186
- ### from audiocraft.solver.musicgen.py => _compute_cross_entropy
187
- def compute_cross_entropy(logits: torch.Tensor, targets: torch.Tensor, mask: torch.Tensor):
188
- """Compute cross entropy between multi-codebook targets and model's logits.
189
- The cross entropy is computed per codebook to provide codebook-level cross entropy.
190
- Valid timesteps for each of the codebook are pulled from the mask, where invalid
191
- timesteps are set to 0.
192
-
193
- Args:
194
- logits (torch.Tensor): Model's logits of shape [B, K, T, card].
195
- targets (torch.Tensor): Target codes, of shape [B, K, T].
196
- mask (torch.Tensor): Mask for valid target codes, of shape [B, K, T].
197
- Returns:
198
- ce (torch.Tensor): Cross entropy averaged over the codebooks
199
- ce_per_codebook (list of torch.Tensor): Cross entropy per codebook (detached).
200
- """
201
- B, K, T = targets.shape
202
- assert logits.shape[:-1] == targets.shape
203
- assert mask.shape == targets.shape
204
- ce = torch.zeros([], device=targets.device)
205
- ce_per_codebook = []
206
- for k in range(K):
207
- logits_k = logits[:, k, ...].contiguous().view(-1, logits.size(-1)) # [B x T, card]
208
- targets_k = targets[:, k, ...].contiguous().view(-1) # [B x T]
209
- mask_k = mask[:, k, ...].contiguous().view(-1) # [B x T]
210
- ce_targets = targets_k[mask_k]
211
- ce_logits = logits_k[mask_k]
212
- q_ce = F.cross_entropy(ce_logits, ce_targets)
213
- ce += q_ce
214
- ce_per_codebook.append(q_ce.detach())
215
- # average cross entropy across codebooks
216
- ce = ce / K
217
- return ce, ce_per_codebook
218
-
219
-
220
- def generate_audio_codes(audio_paths: [str],
221
- audiocraft_compression_model: torch.nn.Module,
222
- device: str) -> torch.Tensor:
223
- audio_duration = 30
224
- encodec_sample_rate = audiocraft_compression_model.sample_rate
225
-
226
- torch_audios = []
227
- for audio_path in audio_paths:
228
- wav, original_sample_rate = torchaudio.load(audio_path) # load audio from file
229
- wav = torchaudio.functional.resample(wav, original_sample_rate,
230
- encodec_sample_rate) # cast audio to model sample rate
231
- wav = wav[:, :encodec_sample_rate * audio_duration] # enforce an exact audio length of 30 seconds
232
-
233
- assert len(wav.shape) == 2, f"audio data is not of shape [channels, duration]"
234
- assert wav.shape[0] == 2, "audio data should be stereo, but does not have 2 channels"
235
-
236
- torch_audios.append(wav)
237
-
238
- torch_audios = torch.stack(torch_audios)
239
- torch_audios = torch_audios.to(device)
240
-
241
- with torch.no_grad():
242
- gen_audio = audiocraft_compression_model.encode(torch_audios)
243
-
244
- codes, scale = gen_audio
245
- assert scale is None
246
-
247
- return codes
248
-
249
-
250
- def create_condition_tensors(
251
- video_embeddings: torch.Tensor,
252
- batch_size: int,
253
- video_extraction_framerate: int,
254
- device: str
255
- ):
256
- # TODO: create T5 mask properly instead of using torch.ones()
257
- mask = torch.ones((batch_size, video_extraction_framerate * 30), dtype=torch.int).to(device)
258
-
259
- condition_tensors = {
260
- 'description': (video_embeddings, mask)
261
- }
262
- return condition_tensors
263
-
264
-
265
- def get_current_timestamp():
266
- return strftime("%Y_%m_%d___%H_%M_%S")
267
-
268
-
269
- def configure_logging(output_dir: str, filename: str, log_level):
270
- # create logs folder, if not existing
271
- os.makedirs(output_dir, exist_ok=True)
272
- level = getattr(logging, log_level)
273
- file_path = output_dir + "/" + filename
274
- logging.basicConfig(filename=file_path, encoding='utf-8', level=level)
275
- logger = logging.getLogger()
276
- # only add a StreamHandler if it is not present yet
277
- if len(logger.handlers) <= 1:
278
- logger.addHandler(logging.StreamHandler())
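As a quick sanity check of the tensor shapes `compute_cross_entropy` expects, the sketch below feeds it random data; the codebook count, token length and vocabulary size are assumptions chosen to roughly match a 30-second MusicGen clip.

```python
# Hedged shape check for compute_cross_entropy(): MusicGen predicts K parallel codebooks,
# so logits are [B, K, T, card] while targets and mask are [B, K, T].
import torch

B, K, T, card = 2, 4, 1500, 2048  # assumed: batch of 2, 4 codebooks, 30 s at 50 Hz, 2048-way vocab
logits = torch.randn(B, K, T, card)
targets = torch.randint(0, card, (B, K, T))
mask = torch.ones(B, K, T, dtype=torch.bool)

ce, ce_per_codebook = compute_cross_entropy(logits, targets, mask)
print(ce.item(), len(ce_per_codebook))  # scalar average loss and K per-codebook terms
```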