Matthijs Hollemans committed on
Commit
caaf71e
•
1 Parent(s): c66db33

add randomized voice

Browse files
app.py CHANGED
@@ -15,6 +15,7 @@ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
15
  speaker_embeddings = {
16
  "BDL": "spkemb/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
17
  "CLB": "spkemb/cmu_us_clb_arctic-wav-arctic_a0144.npy",
 
18
  "RMS": "spkemb/cmu_us_rms_arctic-wav-arctic_b0353.npy",
19
  "SLT": "spkemb/cmu_us_slt_arctic-wav-arctic_a0508.npy",
20
  }
@@ -26,7 +27,24 @@ def predict(text, speaker):
26
 
27
  inputs = processor(text=text, return_tensors="pt")
28
 
29
- speaker_embedding = np.load(speaker_embeddings[speaker[:3]])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)
31
 
32
  speech = model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)
@@ -48,6 +66,7 @@ and the <a href="https://huggingface.co/spaces/Matthijs/speecht5-vc-demo">voice
48
 
49
  <b>How to use:</b> Enter some English text and choose a speaker. The output is a mel spectrogram, which is converted to a mono 16 kHz waveform by the
50
  HiFi-GAN vocoder. Because the model always applies random dropout, each attempt will give slightly different results.
 
51
  """
52
 
53
  article = """
@@ -86,7 +105,15 @@ gr.Interface(
86
  fn=predict,
87
  inputs=[
88
  gr.Text(label="Input Text"),
89
- gr.Radio(label="Speaker", choices=["BDL (male)", "CLB (female)", "RMS (male)", "SLT (female)"], value="BDL (male)"),
 
 
 
 
 
 
 
 
90
  ],
91
  outputs=[
92
  gr.Audio(label="Generated Speech", type="numpy"),
15
  speaker_embeddings = {
16
  "BDL": "spkemb/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
17
  "CLB": "spkemb/cmu_us_clb_arctic-wav-arctic_a0144.npy",
18
+ "KSP": "spkemb/cmu_us_ksp_arctic-wav-arctic_b0087.npy",
19
  "RMS": "spkemb/cmu_us_rms_arctic-wav-arctic_b0353.npy",
20
  "SLT": "spkemb/cmu_us_slt_arctic-wav-arctic_a0508.npy",
21
  }
27
 
28
  inputs = processor(text=text, return_tensors="pt")
29
 
30
+ if speaker == "Surprise Me!":
31
+ # load one of the provided speaker embeddings at random
32
+ idx = np.random.randint(len(speaker_embeddings))
33
+ key = list(speaker_embeddings.keys())[idx]
34
+ speaker_embedding = np.load(speaker_embeddings[key])
35
+
36
+ # randomly shuffle the elements
37
+ np.random.shuffle(speaker_embedding)
38
+
39
+ # randomly flip half the values
40
+ x = (np.random.rand(512) >= 0.5) * 1.0
41
+ x[x == 0] = -1.0
42
+ speaker_embedding *= x
43
+
44
+ #speaker_embedding = np.random.rand(512).astype(np.float32) * 0.3 - 0.15
45
+ else:
46
+ speaker_embedding = np.load(speaker_embeddings[speaker[:3]])
47
+
48
  speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)
49
 
50
  speech = model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)
66
 
67
  <b>How to use:</b> Enter some English text and choose a speaker. The output is a mel spectrogram, which is converted to a mono 16 kHz waveform by the
68
  HiFi-GAN vocoder. Because the model always applies random dropout, each attempt will give slightly different results.
69
+ The <em>Surprise Me!</em> option creates a completely randomized speaker.
70
  """
71
 
72
  article = """
105
  fn=predict,
106
  inputs=[
107
  gr.Text(label="Input Text"),
108
+ gr.Radio(label="Speaker", choices=[
109
+ "BDL (male)",
110
+ "CLB (female)",
111
+ "KSP (male)",
112
+ "RMS (male)",
113
+ "SLT (female)",
114
+ "Surprise Me!"
115
+ ],
116
+ value="BDL (male)"),
117
  ],
118
  outputs=[
119
  gr.Audio(label="Generated Speech", type="numpy"),
spkemb/cmu_us_awb_arctic-wav-arctic_a0002.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5db7a684ab490f21cec1628e00d461a184e369fe4eafb1ee441a796faf4ab6ae
3
+ size 2176
spkemb/cmu_us_ksp_arctic-wav-arctic_b0087.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6c5c2a38c2e400179019c560a74c4322f4ee13beda22ee601807545edee283e
3
+ size 2176