Aspik101 committed on
Commit f03f46e
1 Parent(s): 0c212b0

Update app.py

Files changed (1)
  1. app.py +131 -256
app.py CHANGED
@@ -1,15 +1,13 @@
- from transformers import VitsModel, AutoTokenizer
  import soundfile as sf
  import torch
  from datetime import datetime
  import random
  import time
- from ctransformers import AutoModelForCausalLM
  from datetime import datetime
  import whisper
- from transformers import VitsModel, AutoTokenizer
  import torch
- from transformers import MusicgenForConditionalGeneration, AutoProcessor, set_seed
  import torch
  import numpy as np
  import os
@@ -19,25 +17,15 @@ from timeit import default_timer as timer
  import torch
  import numpy as np
  import pandas as pd
- from huggingface_hub import hf_hub_download
- from model.bart import BartCaptionModel
- from utils.audio_utils import load_audio, STR_CH_FIRST
- from diffusers import DiffusionPipeline
-
- from PIL import Image
-
- def image_grid(imgs, rows, cols):
-     assert len(imgs) == rows*cols
-
-     w, h = imgs[0].size
-     grid = Image.new('RGB', size=(cols*w, rows*h))
-     grid_w, grid_h = grid.size
-
-     for i, img in enumerate(imgs):
-         grid.paste(img, box=(i%cols*w, i//cols*h))
-     return grid


  def save_to_txt(text_to_save):
      with open('prompt.txt', 'w', encoding='utf-8') as f:
@@ -48,253 +36,140 @@ def read_txt():
      lines = f.readlines()
      return lines

  ##### Chat z LLAMA ####
  ##### Chat z LLAMA ####
  ##### Chat z LLAMA ####
- params = {
-     "max_new_tokens":512,
-     "stop":["<end>" ,"<|endoftext|>","[", "<user>"],
-     "temperature":0.7,
-     "top_p":0.8,
-     "stream":True,
-     "batch_size": 8}


- whisper_model = whisper.load_model("medium").to("cuda")
- print("Whisper Loaded!")
- llm = AutoModelForCausalLM.from_pretrained("Aspik101/trurl-2-7b-pl-instruct_GGML", model_type="llama")
- print("LLM Loaded!")
- tts_model = VitsModel.from_pretrained("facebook/mms-tts-pol")
- tts_model.to("cuda")
- print("TTS Loaded!")
- tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-pol")

- pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0",
-                                          torch_dtype=torch.float16,
-                                          use_safetensors=True,
-                                          variant="fp16").to("cuda")
- print("DiffusionPipeline Loaded!")

- model_audio_gen = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small").to("cuda")
- processor_audio_gen = AutoProcessor.from_pretrained("facebook/musicgen-small")

  with gr.Blocks() as chat_demo:
-     chatbot = gr.Chatbot()
-     audio_input = gr.Audio(source="microphone", type="filepath", show_label=False)
-     submit_audio = gr.Button("Submit Audio")
-     clear = gr.Button("Clear")
      audio_output = gr.Audio('temp_file.wav', label="Generated Audio (wav)", type='filepath', autoplay=False)

-     def translate(audio):
-         print("__Wysyłam nagranie do whisper!")
-         transcription = whisper_model.transcribe(audio, language="pl")
-         return transcription["text"]
-
-     def read_text(text):
-         print("Tutaj jest tekst to przeczytania!", text[-1][-1])
-         inputs = tokenizer(text[-1][-1], return_tensors="pt").to("cuda")
-         with torch.no_grad():
-             output = tts_model(**inputs).waveform.squeeze().cpu().numpy()
-         sf.write('temp_file.wav', output, tts_model.config.sampling_rate)
-         return 'temp_file.wav'
-
-     def user(audio_data, history):
-         if audio_data:
-             user_message = translate(audio_data)
-             print("USER!:")
-             print("", history + [[user_message, None]])
-             return history + [[user_message, None]]
-
-     def parse_history(hist):
-         history_ = ""
-         for q, a in hist:
-             history_ += f"<user>: {q } \n"
-             if a:
-                 history_ += f"<assistant>: {a} \n"
-         return history_
-
-     def bot(history):
-         print(f"When: {datetime.today().strftime('%Y-%m-%d %H:%M:%S')}")
-         prompt = f"Jesteś AI assystentem. Odpowiadaj krótko i po polsku. {parse_history(history)}. <assistant>:"
-         stream = llm(prompt, **params)
-         history[-1][1] = ""
-         answer_save = ""
-         for character in stream:
-             history[-1][1] += character
-             answer_save += character
-             time.sleep(0.005)
-             yield history
-
-     submit_audio.click(user, [audio_input, chatbot], [chatbot], queue=False).then(bot, chatbot, chatbot).then(read_text, chatbot, audio_output)
-     clear.click(lambda: None, None, chatbot, queue=False)
-
-
- ##### Audio Gen ####
- ##### Audio Gen ####
- ##### Audio Gen ####
-
-
-
- sampling_rate = model_audio_gen.audio_encoder.config.sampling_rate
- frame_rate = model_audio_gen.audio_encoder.config.frame_rate
- text_encoder = model_audio_gen.get_text_encoder()
-
- def generate_audio(decade, genre, instrument, guidance_scale=8, audio_length_in_s=20, seed=0):
-     prompt = " ".join([decade, genre, 'track with ', instrument])
-     save_to_txt(prompt)
-     inputs = processor_audio_gen(
-         text=[prompt, "drums"],
-         padding=True,
-         return_tensors="pt",
-     ).to(device)
-
-     with torch.no_grad():
-         encoder_outputs = text_encoder(**inputs)
-
-     max_new_tokens = int(frame_rate * audio_length_in_s)
-
-     set_seed(seed)
-     audio_values = model_audio_gen.generate(inputs.input_ids[0][None, :], attention_mask=inputs.attention_mask, encoder_outputs=encoder_outputs, do_sample=True, guidance_scale=guidance_scale, max_new_tokens=max_new_tokens)
-     sf.write('generated_audio.wav', audio_values.cpu()[0][0], 32_000)
-     audio_values = (audio_values.cpu().numpy() * 32767).astype(np.int16)
-     return (sampling_rate, audio_values)
-
-
-
- audio_gen = gr.Interface(
-     fn=generate_audio,
-     inputs=[
-         # gr.Text(label="Negative prompt", value="drums"),
-         gr.Radio(["50s", " 60s", "70s", "80s", "90s"], label="decade", info=""),
-         gr.Radio(["classic", "rock", "pop", "metal", "jazz", "synth"], label="genre", info=""),
-         gr.Radio(["acoustic guitar", "electric guitar", "drums", "saxophone", "keyboard", "accordion", "fiddle"], label="instrument", info=""),
-         gr.Slider(1.5, 10, value=8, step=0.5, label="Guidance scale"),
-         gr.Slider(5, 30, value=20, step=5, label="Audio length in s"),
-         # gr.Slider(0, 10, value=0, step=1, label="Seed"),
-     ],
-     outputs=[
-         gr.Audio(label="Generated Music", type="numpy"),
-     ]#,
-     # examples=EXAMPLES,
- )
-
- #### Audio desc and Stable ###
- #### Audio desc and Stable ###
- #### Audio desc and Stable ###
-
- if os.path.isfile("transfer.pth") == False:
-     torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/transfer.pth', 'transfer.pth')
-     torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/folk.wav', 'folk.wav')
-     torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/electronic.mp3', 'electronic.mp3')
-     torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/orchestra.wav', 'orchestra.wav')
-
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
-
- example_list = ['folk.wav', 'electronic.mp3', 'orchestra.wav']
- model = BartCaptionModel(max_length = 128)
- pretrained_object = torch.load('./transfer.pth', map_location='cpu')
- state_dict = pretrained_object['state_dict']
- model.load_state_dict(state_dict)
- if torch.cuda.is_available():
-     torch.cuda.set_device(device)
-     model = model.cuda(device)
- model.eval()
-
-
-
-
-
- def get_audio(audio_path, duration=10, target_sr=16000):
-     n_samples = int(duration * target_sr)
-     audio, sr = load_audio(
-         path= audio_path,
-         ch_format= STR_CH_FIRST,
-         sample_rate= target_sr,
-         downmix_to_mono= True,
-     )
-     if len(audio.shape) == 2:
-         audio = audio.mean(0, False) # to mono
-     input_size = int(n_samples)
-     if audio.shape[-1] < input_size: # pad sequence
-         pad = np.zeros(input_size)
-         pad[: audio.shape[-1]] = audio
-         audio = pad
-     ceil = int(audio.shape[-1] // n_samples)
-     audio = torch.from_numpy(np.stack(np.split(audio[:ceil * n_samples], ceil)).astype('float32'))
-     return audio
-
- def captioning(audio_path):
-     audio_tensor = get_audio(audio_path = audio_path)
-     if torch.cuda.is_available():
-         audio_tensor = audio_tensor.to(device)
-     with torch.no_grad():
-         output = model.generate(
-             samples=audio_tensor,
-             num_beams=5,
-         )
-     inference = ""
-     number_of_chunks = range(audio_tensor.shape[0])
-     for chunk, text in zip(number_of_chunks, output):
-         time = f"[{chunk * 10}:00-{(chunk + 1) * 10}:00]"
-         inference += f"{time}\n{text} \n \n"
-     return inference
-
- title = ""
- description = ""
-
- article = ""
- def captioning():
-     audio_path = 'generated_audio.wav'
-     audio_tensor = get_audio(audio_path=audio_path)
-
-     if torch.cuda.is_available():
-         audio_tensor = audio_tensor.to(device)
-
-     with torch.no_grad():
-         output = model.generate(
-             samples=audio_tensor,
-             num_beams=5)
-
-     inference = ""
-     number_of_chunks = range(audio_tensor.shape[0])
-     for chunk, text in zip(number_of_chunks, output):
-         time = f"[{chunk * 10}:00-{(chunk + 1) * 10}:00]"
-         inference += f"{time}\n{text} \n \n"
-
-     prompt = read_txt()
-     print(prompt[0])
-     # Generate the image from the text
-     #generated_images = pipe(prompt=prompt[0]*5 + inference + prompt[0]*5).images
-     #image = generated_images[0]
-
-     num_images = 3
-     prompt = [prompt[0]*5 + inference + prompt[0]*5] * num_images
-     images = pipe(prompt, height=768, width=768).images
-     grid = image_grid(images, rows=1, cols=3)
-
-     return inference, grid
-
- audio_desc = gr.Interface(fn=captioning,
-                           inputs=None,
-                           outputs=[
-                               gr.Textbox(label="Caption generated by LP-MusicCaps Transfer Model"),
-                               gr.Image(label="Generated Image") # added output for the generated image
-                           ],
-                           title=title,
-                           description=description,
-                           article=article,
-                           cache_examples=False
-                           )
-
- music = gr.Video("muzyka_AI.mp4")
- voice_cloning = gr.Video("voice_cloning_fraud.mp4")
-
- ##### Run Alll #######
- ##### Run Alll #######
- ##### Run Alll #######


- demo_all = gr.TabbedInterface([music, audio_gen, audio_desc, voice_cloning, chat_demo], ["1.Music", "2.Audio Generation", "3.Image Generation", "4.Voice Cloning", "5.Chat with LLama"])

- demo_all.queue()
- demo_all.launch()

+
  import soundfile as sf
  import torch
  from datetime import datetime
  import random
  import time
  from datetime import datetime
  import whisper
  import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer, VitsModel
  import torch
  import numpy as np
  import os

  import torch
  import numpy as np
  import pandas as pd
+ import whisper


+ whisper_model = whisper.load_model("medium").to("cuda")
+ tts_model = VitsModel.from_pretrained("facebook/mms-tts-pol")
+ tts_model.to("cuda")
+ print("TTS Loaded!")

+ tokenizer_tss = AutoTokenizer.from_pretrained("facebook/mms-tts-pol")

  def save_to_txt(text_to_save):
      with open('prompt.txt', 'w', encoding='utf-8') as f:

      lines = f.readlines()
      return lines

+
  ##### Chat z LLAMA ####
  ##### Chat z LLAMA ####
  ##### Chat z LLAMA ####


+ def _load_model_tokenizer():
+     model_id = 'tangger/Qwen-7B-Chat'
+     tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+     model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", trust_remote_code=True, fp16=True).eval()
+     return model, tokenizer


+ model, tokenizer = _load_model_tokenizer()
+ def postprocess(self, y):
+     if y is None:
+         return []
+     for i, (message, response) in enumerate(y):
+         y[i] = (
+             None if message is None else mdtex2html.convert(message),
+             None if response is None else mdtex2html.convert(response),
+         )
+     return y
+
+
+ def _parse_text(text):
+     lines = text.split("\n")
+     lines = [line for line in lines if line != ""]
+     count = 0
+     for i, line in enumerate(lines):
+         if "```" in line:
+             count += 1
+             items = line.split("`")
+             if count % 2 == 1:
+                 lines[i] = f'<pre><code class="language-{items[-1]}">'
+             else:
+                 lines[i] = f"<br></code></pre>"
+         else:
+             if i > 0:
+                 if count % 2 == 1:
+                     line = line.replace("`", r"\`")
+                     line = line.replace("<", "&lt;")
+                     line = line.replace(">", "&gt;")
+                     line = line.replace(" ", "&nbsp;")
+                     line = line.replace("*", "&ast;")
+                     line = line.replace("_", "&lowbar;")
+                     line = line.replace("-", "&#45;")
+                     line = line.replace(".", "&#46;")
+                     line = line.replace("!", "&#33;")
+                     line = line.replace("(", "&#40;")
+                     line = line.replace(")", "&#41;")
+                     line = line.replace("$", "&#36;")
+                 lines[i] = "<br>" + line
+     text = "".join(lines)
+     return text
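For illustration only (the input string is hypothetical, not from app.py): `_parse_text` drops empty lines, turns a Markdown code fence into a `<pre><code class="language-...">` block, and escapes the characters inside it so the Gradio chatbot renders them literally.

    sample = "Oto przykład:\n```python\nprint('hi')\n```"
    print(_parse_text(sample))
    # -> "Oto przykład:<pre><code class=\"language-python\">..." with the parentheses
    #    inside the fenced block escaped to &#40; and &#41;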
+
+ def predict(_query, _chatbot, _task_history):
+     print(f"User: {_parse_text(_query)}")
+     _chatbot.append((_parse_text(_query), ""))
+     full_response = ""
+
+     for response in model.chat_stream(tokenizer, _query, history=_task_history, system="Jesteś assystentem AI. Odpowiadaj zawsze w języku poslkim"):
+         _chatbot[-1] = (_parse_text(_query), _parse_text(response))
+
+         yield _chatbot
+         full_response = _parse_text(response)
+
+     print(f"History: {_task_history}")
+     _task_history.append((_query, full_response))
+     print(f"Qwen-7B-Chat: {_parse_text(full_response)}")
+
+ def read_text(text):
+     print("___Tekst do przeczytania!")
+     inputs = tokenizer_tss(text, return_tensors="pt").to("cuda")
+     with torch.no_grad():
+         output = tts_model(**inputs).waveform.squeeze().cpu().numpy()
+     sf.write('temp_file.wav', output, tts_model.config.sampling_rate)
+     return 'temp_file.wav'
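For illustration (hypothetical input; as written the function assumes a CUDA device): `read_text` synthesizes the Polish text with MMS-TTS and overwrites `temp_file.wav`, the file the `audio_output` component below points at.

    wav_path = read_text("Dzień dobry, w czym mogę pomóc?")
    print(wav_path)  # 'temp_file.wav', written at tts_model.config.sampling_rate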
+
+
+ def update_audio(text):
+     return 'temp_file.wav'
+
+ def translate(audio):
+     print("__Wysyłam nagranie do whisper!")
+     transcription = whisper_model.transcribe(audio, language="pl")
+     return transcription["text"]
+
+
+ def predict(audio, _chatbot, _task_history):
+     # Use the translate function to turn the recording into text
+     _query = translate(audio)
+
+     print(f"____User: {_parse_text(_query)}")
+     _chatbot.append((_parse_text(_query), ""))
+     full_response = ""
+
+     for response in model.chat_stream(tokenizer,
+                                       _query,
+                                       history=_task_history,
+                                       system="Jesteś assystentem AI. Odpowiadaj zawsze w języku polskim. Odpowiadaj krótko."):
+         _chatbot[-1] = (_parse_text(_query), _parse_text(response))
+         yield _chatbot
+         full_response = _parse_text(response)
+
+     print(f"____History: {_task_history}")
+     _task_history.append((_query, full_response))
+     print(f"__Qwen-7B-Chat: {_parse_text(full_response)}")
+     print("____full_response", full_response)
+     audio_file = read_text(_parse_text(full_response))  # generate the spoken reply
+     return full_response
+
+ def regenerate(_chatbot, _task_history):
+     if not _task_history:
+         yield _chatbot
+         return
+     item = _task_history.pop(-1)
+     _chatbot.pop(-1)
+     yield from predict(item[0], _chatbot, _task_history)

  with gr.Blocks() as chat_demo:
+     chatbot = gr.Chatbot(label='Llama Voice Chatbot', elem_classes="control-height")
+     query = gr.Textbox(lines=2, label='Input')
+     task_history = gr.State([])
      audio_output = gr.Audio('temp_file.wav', label="Generated Audio (wav)", type='filepath', autoplay=False)

+     with gr.Row():
+         submit_btn = gr.Button("🚀 Wyślij tekst")

+     with gr.Row():
+         audio_upload = gr.Audio(source="microphone", type="filepath", show_label=False)
+         submit_audio_btn = gr.Button("🎙️ Wyślij audio")

+     submit_btn.click(predict, [query, chatbot, task_history], [chatbot], show_progress=True)
+     submit_audio_btn.click(predict, [audio_upload, chatbot, task_history], [chatbot], show_progress=True).then(update_audio, chatbot, audio_output)

+ chat_demo.queue().launch(share=True)
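A note on the wiring above: the audio button chains two Gradio events. `predict` transcribes the recording with Whisper, streams the Qwen reply into the chatbot and has `read_text` write `temp_file.wav`; the chained `update_audio` step (which ignores its input) then re-points `audio_output` at that file so the spoken reply is available for playback.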