"""Demo script: run a DiVA model loaded from the current directory on audio clips.

Two English (Ireland) clips from the SD-QA dev set are downloaded, decoded
with librosa at 16 kHz, and passed to ``DiVAModel.generate``:

1. the first clip alone, with no text prompt;
2. the first clip with an instruction prompt;
3. both clips as one batch, each paired with its own prompt.

Whatever ``generate`` returns is printed as-is.
"""

from pathlib import Path
from urllib.parse import urlparse

# NOTE(review): AutoModel is not referenced in this script; the import is kept
# in case something relies on it — confirm before removing.
from transformers import AutoModel  # noqa: F401
import librosa
import wget

from modeling_diva import DiVAModel

# Sample rate (Hz) requested from librosa when decoding each clip.
SAMPLE_RATE = 16_000

# Directory handed to ``DiVAModel.from_pretrained``.
MODEL_DIR = "./"

CLIP_URL_1 = "https://github.com/ffaisal93/SD-QA/raw/refs/heads/master/dev/eng/irl/wav_eng/-1008642825401516622.wav"
CLIP_URL_2 = "https://github.com/ffaisal93/SD-QA/raw/refs/heads/master/dev/eng/irl/wav_eng/-2426554427049983479.wav"


def _fetch_clip(url):
    """Return a local path for *url*, downloading the file only if needed.

    The local name is the last path component of the URL. When a file with
    that name already exists in the working directory it is reused, so
    repeated runs neither hit the network again nor pile up renamed
    duplicate copies of the same clip.
    """
    local_name = Path(urlparse(url).path).name
    if Path(local_name).exists():
        return local_name
    return wget.download(url, out=local_name)


def _load_speech(url):
    """Fetch the clip at *url* and return its samples as loaded by librosa."""
    # librosa.load returns (samples, sample_rate); the rate is the one we
    # asked for, so only the samples are kept.
    samples, _ = librosa.load(_fetch_clip(url), sr=SAMPLE_RATE)
    return samples


def main():
    """Run the three demo generations and print each result."""
    speech_data = _load_speech(CLIP_URL_1)

    model = DiVAModel.from_pretrained(MODEL_DIR)

    # Single clip, first without and then with a text prompt.
    print(model.generate([speech_data]))
    print(model.generate([speech_data], ["Repeat verbatim what is said to you."]))

    speech_data2 = _load_speech(CLIP_URL_2)

    # Batch of two clips, one prompt per clip.
    print(
        model.generate(
            [speech_data, speech_data2],
            ["Reply Briefly Like A Pirate", "Reply Briefly Like A New Yorker"],
        )
    )


if __name__ == "__main__":
    main()
|