Reverb committed on
Commit
03e487a
1 Parent(s): 5c49105

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +89 -0
app.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import librosa
import numpy as np
import torch

from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan


# Pretrained SpeechT5 text-to-speech pipeline: text processor (tokenizer),
# acoustic model, and the HiFi-GAN vocoder that turns spectrograms into audio.
checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(checkpoint)
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")


# Precomputed speaker embeddings from the CMU ARCTIC corpus (one .npy file
# per voice), keyed by the speaker's three-letter initials.
speaker_embeddings = {
    "BDL": "spkemb/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
    "CLB": "spkemb/cmu_us_clb_arctic-wav-arctic_a0144.npy",
    "KSP": "spkemb/cmu_us_ksp_arctic-wav-arctic_b0087.npy",
    "RMS": "spkemb/cmu_us_rms_arctic-wav-arctic_b0353.npy",
    "SLT": "spkemb/cmu_us_slt_arctic-wav-arctic_a0508.npy",
}
22
+
23
+
24
def predict(text, speaker):
    """Synthesize speech for *text* in the requested speaker's voice.

    Parameters
    ----------
    text : str
        Text to speak. Empty or whitespace-only input returns
        zero-length audio instead of invoking the model.
    speaker : str
        One of the radio-button labels (e.g. "BDL (male)") — the first
        three characters select the embedding — or "Surprise Me!" for a
        randomly scrambled voice.

    Returns
    -------
    tuple[int, np.ndarray]
        ``(16000, waveform)`` where *waveform* is int16 PCM, the format
        ``gr.Audio(type="numpy")`` expects.
    """
    if len(text.strip()) == 0:
        # Nothing to say: return empty 16 kHz audio.
        return (16000, np.zeros(0).astype(np.int16))

    inputs = processor(text=text, return_tensors="pt")

    # Truncate to the model's maximum supported input length.
    input_ids = inputs["input_ids"]
    input_ids = input_ids[..., :model.config.max_text_positions]

    if speaker == "Surprise Me!":
        # Load one of the provided speaker embeddings at random...
        idx = np.random.randint(len(speaker_embeddings))
        key = list(speaker_embeddings.keys())[idx]
        speaker_embedding = np.load(speaker_embeddings[key])

        # ...then scramble it: shuffle the elements and flip the sign of
        # roughly half of them, producing a novel-sounding voice.
        np.random.shuffle(speaker_embedding)
        x = (np.random.rand(512) >= 0.5) * 1.0
        x[x == 0] = -1.0
        speaker_embedding *= x
    else:
        # Radio labels look like "BDL (male)"; the first 3 chars are the key.
        speaker_embedding = np.load(speaker_embeddings[speaker[:3]])

    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)

    # Inference only: disable autograd so no graph is built per request
    # (keeps memory flat and guarantees .numpy() below is safe).
    with torch.no_grad():
        speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)

    # Convert the float waveform (nominally in [-1, 1]) to 16-bit PCM.
    speech = (speech.numpy() * 32767).astype(np.int16)
    return (16000, speech)
58
+
59
# UI metadata for the Gradio page. The Interface call below referenced
# `title`, `description`, and `article` without ever defining them, which
# raised a NameError on startup — define them here.
title = "SpeechT5: Speech Synthesis"

description = """
Generate speech from text with the SpeechT5 text-to-speech model and a
HiFi-GAN vocoder. Pick one of the CMU ARCTIC speaker voices, or choose
"Surprise Me!" for a randomly scrambled speaker embedding.
"""

article = ""

examples = [
    ["It is not in the stars to hold our destiny but in ourselves.", "BDL (male)"],
    ["The octopus and Oliver went to the opera in October.", "CLB (female)"],
    ["She sells seashells by the seashore. I saw a kitten eating chicken in the kitchen.", "RMS (male)"],
    ["Brisk brave brigadiers brandished broad bright blades, blunderbusses, and bludgeons—balancing them badly.", "SLT (female)"],
    ["A synonym for cinnamon is a cinnamon synonym.", "BDL (male)"],
    ["How much wood would a woodchuck chuck if a woodchuck could chuck wood? He would chuck, he would, as much as he could, and chuck as much wood as a woodchuck would if a woodchuck could chuck wood.", "CLB (female)"],
]

# Build and launch the web UI: text box + speaker radio in, audio out.
gr.Interface(
    fn=predict,
    inputs=[
        gr.Text(label="Input Text"),
        gr.Radio(label="Speaker", choices=[
            "BDL (male)",
            "CLB (female)",
            "KSP (male)",
            "RMS (male)",
            "SLT (female)",
            "Surprise Me!"
        ],
        value="BDL (male)"),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch()