Aspik101 committed
Commit 0c212b0
0 Parent(s)

Duplicate from Lajonbot/Marketplace-audio
.gitattributes ADDED
@@ -0,0 +1,37 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ muzyka_AI.mp4 filter=lfs diff=lfs merge=lfs -text
+ voice_cloning_fraud.mp4 filter=lfs diff=lfs merge=lfs -text
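
Entries in this format are what `git lfs track` writes into .gitattributes; as a sketch (assuming Git LFS is installed), the two video-specific rules at the end could have been produced with:

    git lfs track "muzyka_AI.mp4"
    git lfs track "voice_cloning_fraud.mp4"

Each call appends the corresponding `filter=lfs diff=lfs merge=lfs -text` line, so the videos are stored as LFS pointers rather than raw blobs.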
README.md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: Lp Music Caps
+ emoji: 🎵🎵🎵
+ colorFrom: purple
+ colorTo: indigo
+ sdk: gradio
+ sdk_version: 3.33.1
+ app_file: app.py
+ pinned: false
+ license: mit
+ duplicated_from: Lajonbot/Marketplace-audio
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,300 @@
+ from transformers import VitsModel, AutoTokenizer, MusicgenForConditionalGeneration, AutoProcessor, set_seed
+ import soundfile as sf
+ import torch
+ from datetime import datetime
+ import random
+ import time
+ from ctransformers import AutoModelForCausalLM
+ import whisper
+ import numpy as np
+ import os
+ import argparse
+ import gradio as gr
+ from timeit import default_timer as timer
+ import pandas as pd
+ from huggingface_hub import hf_hub_download
+ from model.bart import BartCaptionModel
+ from utils.audio_utils import load_audio, STR_CH_FIRST
+ from diffusers import DiffusionPipeline
+ from PIL import Image
+
+
+ def image_grid(imgs, rows, cols):
+     """Paste rows*cols PIL images into a single grid image."""
+     assert len(imgs) == rows * cols
+
+     w, h = imgs[0].size
+     grid = Image.new('RGB', size=(cols * w, rows * h))
+     grid_w, grid_h = grid.size
+
+     for i, img in enumerate(imgs):
+         grid.paste(img, box=(i % cols * w, i // cols * h))
+     return grid
+
+
+ def save_to_txt(text_to_save):
+     with open('prompt.txt', 'w', encoding='utf-8') as f:
+         f.write(text_to_save)
+
+
+ def read_txt():
+     with open('prompt.txt') as f:
+         lines = f.readlines()
+     return lines
+
+
+ ##### Chat with LLAMA ####
+ params = {
+     "max_new_tokens": 512,
+     "stop": ["<end>", "<|endoftext|>", "[", "<user>"],
+     "temperature": 0.7,
+     "top_p": 0.8,
+     "stream": True,
+     "batch_size": 8}
+
+
+ whisper_model = whisper.load_model("medium").to("cuda")
+ print("Whisper Loaded!")
+ llm = AutoModelForCausalLM.from_pretrained("Aspik101/trurl-2-7b-pl-instruct_GGML", model_type="llama")
+ print("LLM Loaded!")
+ tts_model = VitsModel.from_pretrained("facebook/mms-tts-pol")
+ tts_model.to("cuda")
+ print("TTS Loaded!")
+ tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-pol")
+
+ pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0",
+                                          torch_dtype=torch.float16,
+                                          use_safetensors=True,
+                                          variant="fp16").to("cuda")
+ print("DiffusionPipeline Loaded!")
+
+ model_audio_gen = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small").to("cuda")
+ processor_audio_gen = AutoProcessor.from_pretrained("facebook/musicgen-small")
+
+ with gr.Blocks() as chat_demo:
+     chatbot = gr.Chatbot()
+     audio_input = gr.Audio(source="microphone", type="filepath", show_label=False)
+     submit_audio = gr.Button("Submit Audio")
+     clear = gr.Button("Clear")
+     audio_output = gr.Audio('temp_file.wav', label="Generated Audio (wav)", type='filepath', autoplay=False)
+
+     def translate(audio):
+         print("__Sending the recording to whisper!")
+         transcription = whisper_model.transcribe(audio, language="pl")
+         return transcription["text"]
+
+     def read_text(text):
+         print("Text to read aloud:", text[-1][-1])
+         inputs = tokenizer(text[-1][-1], return_tensors="pt").to("cuda")
+         with torch.no_grad():
+             output = tts_model(**inputs).waveform.squeeze().cpu().numpy()
+         sf.write('temp_file.wav', output, tts_model.config.sampling_rate)
+         return 'temp_file.wav'
+
+     def user(audio_data, history):
+         if audio_data:
+             user_message = translate(audio_data)
+             print("USER!:")
+             print("", history + [[user_message, None]])
+             return history + [[user_message, None]]
+
+     def parse_history(hist):
+         history_ = ""
+         for q, a in hist:
+             history_ += f"<user>: {q} \n"
+             if a:
+                 history_ += f"<assistant>: {a} \n"
+         return history_
+
+     def bot(history):
+         print(f"When: {datetime.today().strftime('%Y-%m-%d %H:%M:%S')}")
+         # System prompt (Polish): "You are an AI assistant. Answer briefly and in Polish."
+         prompt = f"Jesteś AI assystentem. Odpowiadaj krótko i po polsku. {parse_history(history)}. <assistant>:"
+         stream = llm(prompt, **params)
+         history[-1][1] = ""
+         answer_save = ""
+         for character in stream:
+             history[-1][1] += character
+             answer_save += character
+             time.sleep(0.005)
+             yield history
+
+     submit_audio.click(user, [audio_input, chatbot], [chatbot], queue=False).then(bot, chatbot, chatbot).then(read_text, chatbot, audio_output)
+     clear.click(lambda: None, None, chatbot, queue=False)
+
+
+ ##### Audio Gen ####
+ sampling_rate = model_audio_gen.audio_encoder.config.sampling_rate
+ frame_rate = model_audio_gen.audio_encoder.config.frame_rate
+ text_encoder = model_audio_gen.get_text_encoder()
+
+
+ def generate_audio(decade, genre, instrument, guidance_scale=8, audio_length_in_s=20, seed=0):
+     prompt = " ".join([decade, genre, 'track with ', instrument])
+     save_to_txt(prompt)
+     # Two texts are encoded: the prompt and "drums", which serves as the negative
+     # prompt under classifier-free guidance. `device` is defined further down in the
+     # captioning section and is already set by the time this callback runs.
+     inputs = processor_audio_gen(
+         text=[prompt, "drums"],
+         padding=True,
+         return_tensors="pt",
+     ).to(device)
+
+     with torch.no_grad():
+         encoder_outputs = text_encoder(**inputs)
+
+     max_new_tokens = int(frame_rate * audio_length_in_s)
+
+     set_seed(seed)
+     audio_values = model_audio_gen.generate(inputs.input_ids[0][None, :], attention_mask=inputs.attention_mask, encoder_outputs=encoder_outputs, do_sample=True, guidance_scale=guidance_scale, max_new_tokens=max_new_tokens)
+     sf.write('generated_audio.wav', audio_values.cpu()[0][0], 32_000)
+     audio_values = (audio_values.cpu().numpy() * 32767).astype(np.int16)
+     return (sampling_rate, audio_values)
+
+
+ audio_gen = gr.Interface(
+     fn=generate_audio,
+     inputs=[
+         # gr.Text(label="Negative prompt", value="drums"),
+         gr.Radio(["50s", "60s", "70s", "80s", "90s"], label="decade", info=""),
+         gr.Radio(["classic", "rock", "pop", "metal", "jazz", "synth"], label="genre", info=""),
+         gr.Radio(["acoustic guitar", "electric guitar", "drums", "saxophone", "keyboard", "accordion", "fiddle"], label="instrument", info=""),
+         gr.Slider(1.5, 10, value=8, step=0.5, label="Guidance scale"),
+         gr.Slider(5, 30, value=20, step=5, label="Audio length in s"),
+         # gr.Slider(0, 10, value=0, step=1, label="Seed"),
+     ],
+     outputs=[
+         gr.Audio(label="Generated Music", type="numpy"),
+     ]#,
+     # examples=EXAMPLES,
+ )
+
+
+ #### Audio desc and Stable ###
+
+ if not os.path.isfile("transfer.pth"):
+     torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/transfer.pth', 'transfer.pth')
+     torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/folk.wav', 'folk.wav')
+     torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/electronic.mp3', 'electronic.mp3')
+     torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/orchestra.wav', 'orchestra.wav')
+
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+ example_list = ['folk.wav', 'electronic.mp3', 'orchestra.wav']
+ model = BartCaptionModel(max_length=128)
+ pretrained_object = torch.load('./transfer.pth', map_location='cpu')
+ state_dict = pretrained_object['state_dict']
+ model.load_state_dict(state_dict)
+ if torch.cuda.is_available():
+     torch.cuda.set_device(device)
+     model = model.cuda(device)
+ model.eval()
+
+
+ def get_audio(audio_path, duration=10, target_sr=16000):
+     n_samples = int(duration * target_sr)
+     audio, sr = load_audio(
+         path=audio_path,
+         ch_format=STR_CH_FIRST,
+         sample_rate=target_sr,
+         downmix_to_mono=True,
+     )
+     if len(audio.shape) == 2:
+         audio = audio.mean(0, False)  # to mono
+     input_size = int(n_samples)
+     if audio.shape[-1] < input_size:  # pad sequence
+         pad = np.zeros(input_size)
+         pad[: audio.shape[-1]] = audio
+         audio = pad
+     ceil = int(audio.shape[-1] // n_samples)
+     audio = torch.from_numpy(np.stack(np.split(audio[:ceil * n_samples], ceil)).astype('float32'))
+     return audio
+
+
+ def captioning(audio_path):
+     audio_tensor = get_audio(audio_path=audio_path)
+     if torch.cuda.is_available():
+         audio_tensor = audio_tensor.to(device)
+     with torch.no_grad():
+         output = model.generate(
+             samples=audio_tensor,
+             num_beams=5,
+         )
+     inference = ""
+     number_of_chunks = range(audio_tensor.shape[0])
+     for chunk, text in zip(number_of_chunks, output):
+         time = f"[{chunk * 10}:00-{(chunk + 1) * 10}:00]"
+         inference += f"{time}\n{text} \n \n"
+     return inference
+
+
+ title = ""
+ description = ""
+ article = ""
+
+
+ # NOTE: this parameterless captioning() overrides the path-based version above;
+ # the Interface below always captions the last generated clip, 'generated_audio.wav'.
+ def captioning():
+     audio_path = 'generated_audio.wav'
+     audio_tensor = get_audio(audio_path=audio_path)
+
+     if torch.cuda.is_available():
+         audio_tensor = audio_tensor.to(device)
+
+     with torch.no_grad():
+         output = model.generate(
+             samples=audio_tensor,
+             num_beams=5)
+
+     inference = ""
+     number_of_chunks = range(audio_tensor.shape[0])
+     for chunk, text in zip(number_of_chunks, output):
+         time = f"[{chunk * 10}:00-{(chunk + 1) * 10}:00]"
+         inference += f"{time}\n{text} \n \n"
+
+     prompt = read_txt()
+     print(prompt[0])
+     # Generate an image from the text
+     # generated_images = pipe(prompt=prompt[0]*5 + inference + prompt[0]*5).images
+     # image = generated_images[0]
+
+     num_images = 3
+     prompt = [prompt[0] * 5 + inference + prompt[0] * 5] * num_images
+     images = pipe(prompt, height=768, width=768).images
+     grid = image_grid(images, rows=1, cols=3)
+
+     return inference, grid
+
+
+ audio_desc = gr.Interface(fn=captioning,
+                           inputs=None,
+                           outputs=[
+                               gr.Textbox(label="Caption generated by LP-MusicCaps Transfer Model"),
+                               gr.Image(label="Generated Image")  # extra output for the image grid
+                           ],
+                           title=title,
+                           description=description,
+                           article=article,
+                           cache_examples=False
+                           )
+
+ music = gr.Video("muzyka_AI.mp4")
+ voice_cloning = gr.Video("voice_cloning_fraud.mp4")
+
+
+ ##### Run All #######
+ demo_all = gr.TabbedInterface([music, audio_gen, audio_desc, voice_cloning, chat_demo], ["1.Music", "2.Audio Generation", "3.Image Generation", "4.Voice Cloning", "5.Chat with LLama"])
+
+ demo_all.queue()
+ demo_all.launch()
model/__pycache__/bart.cpython-310.pyc ADDED
Binary file (4.53 kB).
 
model/__pycache__/modules.cpython-310.pyc ADDED
Binary file (3.24 kB).
 
model/bart.py ADDED
@@ -0,0 +1,151 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import numpy as np
+ from .modules import AudioEncoder
+ from transformers import BartForConditionalGeneration, BartTokenizer, BartConfig
+
+
+ class BartCaptionModel(nn.Module):
+     def __init__(self, n_mels=128, num_of_conv=6, sr=16000, duration=10, max_length=128, label_smoothing=0.1, bart_type="facebook/bart-base", audio_dim=768):
+         super(BartCaptionModel, self).__init__()
+         # non-finetuning case
+         bart_config = BartConfig.from_pretrained(bart_type)
+         self.tokenizer = BartTokenizer.from_pretrained(bart_type)
+         self.bart = BartForConditionalGeneration(bart_config)
+
+         self.n_sample = sr * duration
+         self.hop_length = int(0.01 * sr)  # hard-coded hop size
+         self.n_frames = int(self.n_sample // self.hop_length)
+         self.num_of_stride_conv = num_of_conv - 1
+         self.n_ctx = int(self.n_frames // 2**self.num_of_stride_conv) + 1
+         self.audio_encoder = AudioEncoder(
+             n_mels=n_mels,  # hard-coded n_mels
+             n_ctx=self.n_ctx,
+             audio_dim=audio_dim,
+             text_dim=self.bart.config.hidden_size,
+             num_of_stride_conv=self.num_of_stride_conv
+         )
+
+         self.max_length = max_length
+         self.loss_fct = nn.CrossEntropyLoss(label_smoothing=label_smoothing, ignore_index=-100)
+
+     @property
+     def device(self):
+         return list(self.parameters())[0].device
+
+     def shift_tokens_right(self, input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
+         """
+         Shift input ids one token to the right.
+         """
+         shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+         shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
+         shifted_input_ids[:, 0] = decoder_start_token_id
+
+         if pad_token_id is None:
+             raise ValueError("self.model.config.pad_token_id has to be defined.")
+         # replace possible -100 values in labels by `pad_token_id`
+         shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
+         return shifted_input_ids
+
+     def forward_encoder(self, audio):
+         audio_embs = self.audio_encoder(audio)
+         encoder_outputs = self.bart.model.encoder(
+             input_ids=None,
+             inputs_embeds=audio_embs,
+             return_dict=True
+         )["last_hidden_state"]
+         return encoder_outputs, audio_embs
+
+     def forward_decoder(self, text, encoder_outputs):
+         text = self.tokenizer(text,
+                               padding='longest',
+                               truncation=True,
+                               max_length=self.max_length,
+                               return_tensors="pt")
+         input_ids = text["input_ids"].to(self.device)
+         attention_mask = text["attention_mask"].to(self.device)
+
+         decoder_targets = input_ids.masked_fill(
+             input_ids == self.tokenizer.pad_token_id, -100
+         )
+
+         decoder_input_ids = self.shift_tokens_right(
+             decoder_targets, self.bart.config.pad_token_id, self.bart.config.decoder_start_token_id
+         )
+
+         decoder_outputs = self.bart(
+             input_ids=None,
+             attention_mask=None,
+             decoder_input_ids=decoder_input_ids,
+             decoder_attention_mask=attention_mask,
+             inputs_embeds=None,
+             labels=None,
+             encoder_outputs=(encoder_outputs,),
+             return_dict=True
+         )
+         lm_logits = decoder_outputs["logits"]
+         loss = self.loss_fct(lm_logits.view(-1, self.tokenizer.vocab_size), decoder_targets.view(-1))
+         return loss
+
+     def forward(self, audio, text):
+         encoder_outputs, _ = self.forward_encoder(audio)
+         loss = self.forward_decoder(text, encoder_outputs)
+         return loss
+
+     def generate(self,
+                  samples,
+                  use_nucleus_sampling=False,
+                  num_beams=5,
+                  max_length=128,
+                  min_length=2,
+                  top_p=0.9,
+                  repetition_penalty=1.0,
+                  ):
+
+         # self.bart.force_bos_token_to_be_generated = True
+         audio_embs = self.audio_encoder(samples)
+         encoder_outputs = self.bart.model.encoder(
+             input_ids=None,
+             attention_mask=None,
+             head_mask=None,
+             inputs_embeds=audio_embs,
+             output_attentions=None,
+             output_hidden_states=None,
+             return_dict=True)
+
+         input_ids = torch.zeros((encoder_outputs['last_hidden_state'].size(0), 1)).long().to(self.device)
+         input_ids[:, 0] = self.bart.config.decoder_start_token_id
+         decoder_attention_mask = torch.ones((encoder_outputs['last_hidden_state'].size(0), 1)).long().to(self.device)
+         if use_nucleus_sampling:
+             outputs = self.bart.generate(
+                 input_ids=None,
+                 attention_mask=None,
+                 decoder_input_ids=input_ids,
+                 decoder_attention_mask=decoder_attention_mask,
+                 encoder_outputs=encoder_outputs,
+                 max_length=max_length,
+                 min_length=min_length,
+                 do_sample=True,
+                 top_p=top_p,
+                 num_return_sequences=1,
+                 repetition_penalty=1.1)
+         else:
+             outputs = self.bart.generate(input_ids=None,
+                                          attention_mask=None,
+                                          decoder_input_ids=input_ids,
+                                          decoder_attention_mask=decoder_attention_mask,
+                                          encoder_outputs=encoder_outputs,
+                                          head_mask=None,
+                                          decoder_head_mask=None,
+                                          inputs_embeds=None,
+                                          decoder_inputs_embeds=None,
+                                          use_cache=None,
+                                          output_attentions=None,
+                                          output_hidden_states=None,
+                                          max_length=max_length,
+                                          min_length=min_length,
+                                          num_beams=num_beams,
+                                          repetition_penalty=repetition_penalty)
+
+         captions = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
+         return captions
model/modules.py ADDED
@@ -0,0 +1,95 @@
+ ### code reference: https://github.com/openai/whisper/blob/main/whisper/audio.py
+
+ import os
+ import torch
+ import torchaudio
+ import numpy as np
+ import torch.nn.functional as F
+ from torch import Tensor, nn
+ from typing import Dict, Iterable, Optional
+
+ # hard-coded audio hyperparameters
+ SAMPLE_RATE = 16000
+ N_FFT = 1024
+ N_MELS = 128
+ HOP_LENGTH = int(0.01 * SAMPLE_RATE)
+ DURATION = 10
+ N_SAMPLES = int(DURATION * SAMPLE_RATE)
+ N_FRAMES = N_SAMPLES // HOP_LENGTH + 1
+
+
+ def sinusoids(length, channels, max_timescale=10000):
+     """Returns sinusoids for positional embedding"""
+     log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
+     inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2))
+     scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
+     return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
+
+
+ class MelEncoder(nn.Module):
+     """
+     Time-frequency representation (log-mel spectrogram front end).
+     """
+     def __init__(self,
+                  sample_rate=16000,
+                  f_min=0,
+                  f_max=8000,
+                  n_fft=1024,
+                  win_length=1024,
+                  hop_length=int(0.01 * 16000),
+                  n_mels=128,
+                  power=None,
+                  pad=0,
+                  normalized=False,
+                  center=True,
+                  pad_mode="reflect"
+                  ):
+         super(MelEncoder, self).__init__()
+         self.window = torch.hann_window(win_length)
+         self.spec_fn = torchaudio.transforms.Spectrogram(
+             n_fft=n_fft,
+             win_length=win_length,
+             hop_length=hop_length,
+             power=power
+         )
+         self.mel_scale = torchaudio.transforms.MelScale(
+             n_mels,
+             sample_rate,
+             f_min,
+             f_max,
+             n_fft // 2 + 1)
+
+         self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB()
+
+     def forward(self, wav):
+         spec = self.spec_fn(wav)
+         power_spec = spec.real.abs().pow(2)
+         mel_spec = self.mel_scale(power_spec)
+         mel_spec = self.amplitude_to_db(mel_spec)  # log10(max(reference value and amin))
+         return mel_spec
+
+
+ class AudioEncoder(nn.Module):
+     def __init__(
+         self, n_mels: int, n_ctx: int, audio_dim: int, text_dim: int, num_of_stride_conv: int,
+     ):
+         super().__init__()
+         self.mel_encoder = MelEncoder(n_mels=n_mels)
+         self.conv1 = nn.Conv1d(n_mels, audio_dim, kernel_size=3, padding=1)
+         self.conv_stack = nn.ModuleList([])
+         for _ in range(num_of_stride_conv):
+             self.conv_stack.append(
+                 nn.Conv1d(audio_dim, audio_dim, kernel_size=3, stride=2, padding=1)
+             )
+         # self.proj = nn.Linear(audio_dim, text_dim, bias=False)
+         self.register_buffer("positional_embedding", sinusoids(n_ctx, text_dim))
+
+     def forward(self, x: Tensor):
+         """
+         x : torch.Tensor, shape = (batch_size, waveform)
+             single-channel waveform
+         """
+         x = self.mel_encoder(x)  # (batch_size, n_mels, n_ctx)
+         x = F.gelu(self.conv1(x))
+         for conv in self.conv_stack:
+             x = F.gelu(conv(x))
+         x = x.permute(0, 2, 1)
+         x = (x + self.positional_embedding).to(x.dtype)
+         return x
muzyka_AI.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:480742943da1b14e194684919a8e531e383503318c28420a29f723468c3407dc
+ size 6376447
prompt.txt ADDED
@@ -0,0 +1 @@
+ ma to być
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ torch
+ torchaudio
+ transformers
+ ctransformers --no-binary=ctransformers
+ librosa >= 0.8
+ pip>=23.2
+ gradio_client==0.2.7
+ invisible_watermark
+ safetensors
+ diffusers
+ soundfile
+ openai-whisper
+ accelerate
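
To try the Space outside Hugging Face, a minimal local-run sketch (assuming a CUDA-capable GPU, since app.py moves every model to "cuda", and ffmpeg available on the PATH, which utils/audio_utils.py shells out to for decoding):

    pip install -r requirements.txt
    python app.py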
temp_audio.wav ADDED
Binary file (72.7 kB).
 
temp_file.wav ADDED
Binary file (228 kB).
 
utils/__pycache__/audio_utils.cpython-310.pyc ADDED
Binary file (7.74 kB).
 
utils/audio_utils.py ADDED
@@ -0,0 +1,247 @@
+ STR_CLIP_ID = 'clip_id'
+ STR_AUDIO_SIGNAL = 'audio_signal'
+ STR_TARGET_VECTOR = 'target_vector'
+
+ STR_CH_FIRST = 'channels_first'
+ STR_CH_LAST = 'channels_last'
+
+ import io
+ import os
+ import tqdm
+ import logging
+ import subprocess
+ from typing import Tuple
+ from pathlib import Path
+
+ # import librosa
+ import numpy as np
+ import soundfile as sf
+
+ import itertools
+ from numpy.fft import irfft
+
+
+ def _resample_load_ffmpeg(path: str, sample_rate: int, downmix_to_mono: bool) -> Tuple[np.ndarray, int]:
+     """
+     Decoding, downmixing, and downsampling by ffmpeg.
+     Returns a channel-first audio signal.
+
+     Args:
+         path: audio file path
+         sample_rate: target sampling rate (None keeps the original rate)
+         downmix_to_mono: whether to downmix to a single channel
+
+     Returns:
+         (audio signal, sample rate)
+     """
+
+     def _decode_resample_by_ffmpeg(filename, sr):
+         """decode, downmix, and resample audio file"""
+         channel_cmd = '-ac 1 ' if downmix_to_mono else ''  # downmixing option
+         resampling_cmd = f'-ar {str(sr)}' if sr else ''  # downsampling option
+         cmd = f"ffmpeg -i \"{filename}\" {channel_cmd} {resampling_cmd} -f wav -"
+         p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+         out, err = p.communicate()
+         return out
+
+     src, sr = sf.read(io.BytesIO(_decode_resample_by_ffmpeg(path, sr=sample_rate)))
+     return src.T, sr
+
+
+ def _resample_load_librosa(path: str, sample_rate: int, downmix_to_mono: bool, **kwargs) -> Tuple[np.ndarray, int]:
+     """
+     Decoding, downmixing, and downsampling by librosa.
+     Returns a channel-first audio signal.
+     """
+     import librosa  # imported lazily; only needed when resample_by='librosa'
+     src, sr = librosa.load(path, sr=sample_rate, mono=downmix_to_mono, **kwargs)
+     return src, sr
+
+
+ def load_audio(
+     path: str or Path,
+     ch_format: str,
+     sample_rate: int = None,
+     downmix_to_mono: bool = False,
+     resample_by: str = 'ffmpeg',
+     **kwargs,
+ ) -> Tuple[np.ndarray, int]:
+     """A wrapper of librosa.load that:
+     - forces the returned audio to be 2-dim,
+     - defaults to sr=None, and
+     - defaults to downmix_to_mono=False.
+
+     The audio decoding is done by `audioread` or `soundfile` package and ultimately, often by ffmpeg.
+     The resampling is done by `librosa`'s child package `resampy`.
+
+     Args:
+         path: audio file path
+         ch_format: one of 'channels_first' or 'channels_last'
+         sample_rate: target sampling rate. if None, use the rate of the audio file
+         downmix_to_mono: whether to downmix to a single channel
+         resample_by (str): 'librosa' or 'ffmpeg'. it decides backend for audio decoding and resampling.
+         **kwargs: keyword args for librosa.load - offset, duration, dtype, res_type.
+
+     Returns:
+         (audio, sr) tuple
+     """
+     if ch_format not in (STR_CH_FIRST, STR_CH_LAST):
+         raise ValueError(f'ch_format is wrong here -> {ch_format}')
+
+     if os.stat(path).st_size > 8000:
+         if resample_by == 'librosa':
+             src, sr = _resample_load_librosa(path, sample_rate, downmix_to_mono, **kwargs)
+         elif resample_by == 'ffmpeg':
+             src, sr = _resample_load_ffmpeg(path, sample_rate, downmix_to_mono)
+         else:
+             raise NotImplementedError(f'resample_by: "{resample_by}" is not supported yet')
+     else:
+         raise ValueError('Given audio is too short!')
+     return src, sr
+
+     # if src.ndim == 1:
+     #     src = np.expand_dims(src, axis=0)
+     # # now always 2d and channels_first
+
+     # if ch_format == STR_CH_FIRST:
+     #     return src, sr
+     # else:
+     #     return src.T, sr
+
+
+ def ms(x):
+     """Mean value of signal `x` squared.
+     :param x: Dynamic quantity.
+     :returns: Mean squared of `x`.
+     """
+     return (np.abs(x)**2.0).mean()
+
+
+ def normalize(y, x=None):
+     """Normalize power in y to a (standard normal) white noise signal.
+     Optionally normalize to power in signal `x`.
+     # The mean power of a Gaussian with :math:`\\mu=0` and :math:`\\sigma=1` is 1.
+     """
+     if x is not None:
+         x = ms(x)
+     else:
+         x = 1.0
+     return y * np.sqrt(x / ms(y))
+
+
+ def noise(N, color='white', state=None):
+     """Noise generator.
+     :param N: Amount of samples.
+     :param color: Color of noise.
+     :param state: State of PRNG.
+     :type state: :class:`np.random.RandomState`
+     """
+     try:
+         return _noise_generators[color](N, state)
+     except KeyError:
+         raise ValueError("Incorrect color.")
+
+
+ def white(N, state=None):
+     """
+     White noise.
+     :param N: Amount of samples.
+     :param state: State of PRNG.
+     :type state: :class:`np.random.RandomState`
+     White noise has a constant power density. Its narrowband spectrum is therefore flat.
+     The power in white noise will increase by a factor of two for each octave band,
+     and therefore increases with 3 dB per octave.
+     """
+     state = np.random.RandomState() if state is None else state
+     return state.randn(N)
+
+
+ def pink(N, state=None):
+     """
+     Pink noise.
+     :param N: Amount of samples.
+     :param state: State of PRNG.
+     :type state: :class:`np.random.RandomState`
+     Pink noise has equal power in bands that are proportionally wide.
+     Power density decreases with 3 dB per octave.
+     """
+     state = np.random.RandomState() if state is None else state
+     uneven = N % 2
+     X = state.randn(N // 2 + 1 + uneven) + 1j * state.randn(N // 2 + 1 + uneven)
+     S = np.sqrt(np.arange(len(X)) + 1.)  # +1 to avoid divide by zero
+     y = (irfft(X / S)).real
+     if uneven:
+         y = y[:-1]
+     return normalize(y)
+
+
+ def blue(N, state=None):
+     """
+     Blue noise.
+     :param N: Amount of samples.
+     :param state: State of PRNG.
+     :type state: :class:`np.random.RandomState`
+     Power increases with 6 dB per octave.
+     Power density increases with 3 dB per octave.
+     """
+     state = np.random.RandomState() if state is None else state
+     uneven = N % 2
+     X = state.randn(N // 2 + 1 + uneven) + 1j * state.randn(N // 2 + 1 + uneven)
+     S = np.sqrt(np.arange(len(X)))  # Filter
+     y = (irfft(X * S)).real
+     if uneven:
+         y = y[:-1]
+     return normalize(y)
+
+
+ def brown(N, state=None):
+     """
+     Brown noise.
+     :param N: Amount of samples.
+     :param state: State of PRNG.
+     :type state: :class:`np.random.RandomState`
+     Power decreases with 3 dB per octave.
+     Power density decreases with 6 dB per octave.
+     """
+     state = np.random.RandomState() if state is None else state
+     uneven = N % 2
+     X = state.randn(N // 2 + 1 + uneven) + 1j * state.randn(N // 2 + 1 + uneven)
+     S = (np.arange(len(X)) + 1)  # Filter
+     y = (irfft(X / S)).real
+     if uneven:
+         y = y[:-1]
+     return normalize(y)
+
+
+ def violet(N, state=None):
+     """
+     Violet noise.
+     :param N: Amount of samples.
+     :param state: State of PRNG.
+     :type state: :class:`np.random.RandomState`
+     Power increases with +9 dB per octave.
+     Power density increases with +6 dB per octave.
+     """
+     state = np.random.RandomState() if state is None else state
+     uneven = N % 2
+     X = state.randn(N // 2 + 1 + uneven) + 1j * state.randn(N // 2 + 1 + uneven)
+     S = (np.arange(len(X)))  # Filter
+     y = (irfft(X * S)).real
+     if uneven:
+         y = y[:-1]
+     return normalize(y)
+
+
+ _noise_generators = {
+     'white': white,
+     'pink': pink,
+     'blue': blue,
+     'brown': brown,
+     'violet': violet,
+ }
+
+
+ def noise_generator(N=44100, color='white', state=None):
+     """Noise generator.
+     :param N: Amount of unique samples to generate.
+     :param color: Color of noise.
+     Generate `N` amount of unique samples and cycle over these samples.
+     """
+     # yield from itertools.cycle(noise(N, color))  # Python 3.3
+     for sample in itertools.cycle(noise(N, color, state)):
+         yield sample
+
+
+ def heaviside(N):
+     """Heaviside.
+     Returns the value 0 for `x < 0`, 1 for `x > 0`, and 1/2 for `x = 0`.
+     """
+     return 0.5 * (np.sign(N) + 1)
voice_cloning_fraud.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:440d118fdb3e6e210c5435cec6bf50d1c61190a2e567b62ba39137cc9274ce3b
+ size 4672978