Update app.py
Browse files
app.py
CHANGED
@@ -12,25 +12,25 @@ transcriber = pipeline("automatic-speech-recognition", model="facebook/s2t-small
|
|
12 |
generator = pipeline("text-generation", model="gpt2")
|
13 |
|
14 |
# Initialize TTS tokenizer and model
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
# Initialize ASR pipeline
|
19 |
-
print("TTS Tokenizer:", tokenizer_tts) # Print the tokenizer for the TTS model
|
20 |
|
21 |
def transcribe_and_generate_audio(audio):
|
|
|
|
|
|
|
22 |
|
23 |
# Transcribe audio
|
24 |
-
asr_output = transcriber(
|
25 |
|
26 |
# Generate text based on ASR output
|
27 |
-
generated_text = generator(
|
28 |
|
29 |
-
# Generate audio from text
|
30 |
-
inputs =
|
31 |
set_seed(555)
|
32 |
with torch.no_grad():
|
33 |
-
outputs =
|
34 |
waveform = outputs.waveform[0]
|
35 |
waveform_path = "output.wav"
|
36 |
sf.write(waveform_path, waveform.numpy(), 16000, format='wav')
|
@@ -47,4 +47,4 @@ audio_input = gr.Interface(
|
|
47 |
)
|
48 |
|
49 |
# Launch the interface
|
50 |
-
audio_input.launch()
|
|
|
12 |
generator = pipeline("text-generation", model="gpt2")
|
13 |
|
14 |
# Initialize TTS tokenizer and model
|
15 |
+
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
|
16 |
+
model = VitsModel.from_pretrained("facebook/mms-tts-eng")
|
|
|
|
|
|
|
17 |
|
18 |
def transcribe_and_generate_audio(audio):
|
19 |
+
sr, y = audio
|
20 |
+
y = y.astype(np.float32)
|
21 |
+
y /= np.max(np.abs(y))
|
22 |
|
23 |
# Transcribe audio
|
24 |
+
asr_output = transcriber({"sampling_rate": sr, "raw": y})["text"]
|
25 |
|
26 |
# Generate text based on ASR output
|
27 |
+
generated_text = generator(asr_output)[0]['generated_text']
|
28 |
|
29 |
+
# Generate audio from text
|
30 |
+
inputs = tokenizer(text=generated_text, return_tensors="pt")
|
31 |
set_seed(555)
|
32 |
with torch.no_grad():
|
33 |
+
outputs = model(**inputs)
|
34 |
waveform = outputs.waveform[0]
|
35 |
waveform_path = "output.wav"
|
36 |
sf.write(waveform_path, waveform.numpy(), 16000, format='wav')
|
|
|
47 |
)
|
48 |
|
49 |
# Launch the interface
|
50 |
+
audio_input.launch()
|