ccourc23 committed on
Commit 1956ad2 · verified · 1 Parent(s): 8dcb050

Create app.py

Files changed (1)
  1. app.py +68 -0
app.py ADDED
import numpy as np
import torch
import gradio as gr
from datasets import load_dataset
from transformers import (
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    SpeechT5Processor,
    pipeline,
)

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Speech recognition/translation: Whisper processes the input audio
pipe = pipeline(
    "automatic-speech-recognition", model="openai/whisper-base", device=device
)

# One streamed VoxPopuli example, handy for local smoke tests (not used by the app itself)
dataset = load_dataset(
    "facebook/voxpopuli", "en", split="validation", streaming=True, trust_remote_code=True
)
sample = next(iter(dataset))


def translate(audio):
    # Whisper's "translate" task only targets English; forcing the French
    # language token with task="transcribe" nudges it to emit French text.
    outputs = pipe(
        audio,
        max_new_tokens=256,
        generate_kwargs={"task": "transcribe", "language": "fr"},
    )
    return outputs["text"]


# Text-to-speech: fine-tuned SpeechT5 checkpoint with the HiFi-GAN vocoder
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("ccourc23/fine_tuned_SpeechT5")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
model.to(device)
vocoder.to(device)

# Fixed speaker embedding that conditions the synthesised voice
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)


def synthesise(text):
    inputs = processor(text=text, return_tensors="pt")
    speech = model.generate_speech(
        inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder
    )
    return speech.cpu()


# Gradio's numpy audio output expects 16-bit PCM, so rescale the float waveform
target_dtype = np.int16
max_range = np.iinfo(target_dtype).max


def speech_to_speech_translation(audio):
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)
    synthesised_speech = (synthesised_speech.numpy() * max_range).astype(np.int16)
    return 16000, synthesised_speech


demo = gr.Blocks()

mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
)

file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
)

with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

demo.launch(debug=True, share=True)
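
A quick way to sanity-check the pipeline before launching the app is to feed the streamed VoxPopuli sample through `speech_to_speech_translation`. This is a minimal sketch, not part of app.py, assuming the standard datasets audio layout where `sample["audio"]` carries `array` and `sampling_rate` keys that the ASR pipeline accepts directly:

# Hypothetical local smoke test, using the `sample` loaded above
rate, wav = speech_to_speech_translation(sample["audio"])
print(f"{rate} Hz, {wav.shape[0]} samples, dtype={wav.dtype}")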