Update app.py
Browse files
app.py
CHANGED
@@ -5,7 +5,7 @@ import torchaudio
|
|
5 |
from transformers import AutoProcessor, SeamlessM4TModel
|
6 |
processor = AutoProcessor.from_pretrained("facebook/hf-seamless-m4t-medium")
|
7 |
model = SeamlessM4TModel.from_pretrained("facebook/hf-seamless-m4t-medium")
|
8 |
-
model.to('cuda')
|
9 |
|
10 |
language_dict = {
|
11 |
"Modern Standard Arabic" : "arb",
|
@@ -57,14 +57,14 @@ def png(source_lang,target_lang,audio,text):
|
|
57 |
processed_inputs = processor(text, src_lang=source_lang_code, return_tensors="pt")
|
58 |
else:
|
59 |
sample_rate, audio_data = audio
|
60 |
-
audio_tokens = torch.from_numpy(audio_data)
|
61 |
audio_tokens = audio_tokens.to(torch.float32)
|
62 |
audio_tokens = torchaudio.functional.resample(audio_tokens, orig_freq=sample_rate, new_freq=16_000)
|
63 |
-
audio_tokens = audio_tokens.cpu()
|
64 |
processed_inputs = processor(audios=audio_tokens, sampling_rate=16000, return_tensors="pt")
|
65 |
|
66 |
|
67 |
-
processed_inputs = processed_inputs.to("cuda")
|
68 |
generated_audio = model.generate(**processed_inputs, tgt_lang=target_lang_code)[0].cpu().numpy().squeeze()
|
69 |
output_tokens = model.generate(**processed_inputs, tgt_lang=target_lang_code, generate_speech=False)
|
70 |
generated_text = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
|
|
|
5 |
from transformers import AutoProcessor, SeamlessM4TModel
|
6 |
processor = AutoProcessor.from_pretrained("facebook/hf-seamless-m4t-medium")
|
7 |
model = SeamlessM4TModel.from_pretrained("facebook/hf-seamless-m4t-medium")
|
8 |
+
# model.to('cuda')
|
9 |
|
10 |
language_dict = {
|
11 |
"Modern Standard Arabic" : "arb",
|
|
|
57 |
processed_inputs = processor(text, src_lang=source_lang_code, return_tensors="pt")
|
58 |
else:
|
59 |
sample_rate, audio_data = audio
|
60 |
+
audio_tokens = torch.from_numpy(audio_data) #.to(torch.device("cuda"))
|
61 |
audio_tokens = audio_tokens.to(torch.float32)
|
62 |
audio_tokens = torchaudio.functional.resample(audio_tokens, orig_freq=sample_rate, new_freq=16_000)
|
63 |
+
# audio_tokens = audio_tokens.cpu()
|
64 |
processed_inputs = processor(audios=audio_tokens, sampling_rate=16000, return_tensors="pt")
|
65 |
|
66 |
|
67 |
+
# processed_inputs = processed_inputs.to("cuda")
|
68 |
generated_audio = model.generate(**processed_inputs, tgt_lang=target_lang_code)[0].cpu().numpy().squeeze()
|
69 |
output_tokens = model.generate(**processed_inputs, tgt_lang=target_lang_code, generate_speech=False)
|
70 |
generated_text = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
|