ylacombe HF staff committed on
Commit
e9c1685
1 Parent(s): 7db5157

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -69
app.py CHANGED
@@ -17,93 +17,65 @@ device = _grab_best_device()
17
  default_model_per_language = {
18
  "english": "kakao-enterprise/vits-ljs",
19
  "spanish": "facebook/mms-tts-spa",
20
- "tamil": "facebook/mms-tts-tam"
21
  }
22
 
23
  models_per_language = {
24
  "english": [
25
- "ylacombe/vits_ljs_irish_male_monospeaker",
26
- "ylacombe/vits_ljs_irish_male_monospeaker_2",
27
- "ylacombe/vits_ljs_irish_male_monospeaker_2",
28
- "ylacombe/vits_ljs_irish_male_2",
29
-
30
- "ylacombe/vits_ljs_welsh_female_monospeaker",
31
- "ylacombe/vits_ljs_welsh_female_monospeaker_2",
32
- "ylacombe/vits_ljs_welsh_female_2",
33
-
34
- "ylacombe/vits_ljs_welsh_male_monospeaker",
35
- "ylacombe/vits_ljs_welsh_male_monospeaker_2",
36
-
37
- "ylacombe/vits_ljs_scottish_female_monospeaker",
38
- "ylacombe/vits_ljs_scottish_female_2",
39
  ],
40
  "spanish": [
41
- "ylacombe/mms-spa-finetuned-chilean-monospeaker-all",
42
- "ylacombe/mms-spa-finetuned-chilean-monospeaker",
 
43
  ],
44
- "tamil": [
45
- "ylacombe/mms-tam-finetuned-monospeaker-all",
46
- "ylacombe/mms-tam-finetuned-monospeaker",
47
- ]
48
  }
49
 
50
- HUB_PATH = "ylacombe/vits_ljs_welsh_female_monospeaker"
51
-
52
-
53
  pipe_dict = {
54
- "current_model": "ylacombe/vits_ljs_welsh_female_monospeaker",
55
- "pipe": pipeline("text-to-speech", model=HUB_PATH, device=0),
56
  "original_pipe": pipeline("text-to-speech", model=default_model_per_language["english"], device=0),
57
  "language": "english",
58
  }
59
 
60
- title = "# 🐶 VITS"
61
-
62
- max_speakers = 15
63
 
64
  description = """
65
-
66
  """
67
 
 
68
 
69
  # Inference
70
- def generate_audio(text, model_id, language):
71
 
72
  if pipe_dict["language"] != language:
73
- gr.Warning(f"Language has changed - loading new default model: {default_model_per_language[language]}")
74
  pipe_dict["language"] = language
75
  pipe_dict["original_pipe"] = pipeline("text-to-speech", model=default_model_per_language[language], device=0)
76
 
77
- if pipe_dict["current_model"] != model_id:
78
- gr.Warning("Model has changed - loading new model")
79
- pipe_dict["pipe"] = pipeline("text-to-speech", model=model_id, device=0)
80
- pipe_dict["current_model"] = model_id
81
 
82
  num_speakers = pipe_dict["pipe"].model.config.num_speakers
83
 
84
  out = []
85
  # first generate original model result
86
  output = pipe_dict["original_pipe"](text)
87
- output = gr.Audio(value = (output["sampling_rate"], output["audio"].squeeze()), type="numpy", autoplay=False, label=f"Non finetuned model prediction {default_model_per_language[language]}", show_label=True,
88
  visible=True)
89
  out.append(output)
90
 
91
 
92
- if num_speakers>1:
93
- for i in range(min(num_speakers, max_speakers - 1)):
94
- forward_params = {"speaker_id": i}
95
- output = pipe_dict["pipe"](text, forward_params=forward_params)
96
-
97
- output = gr.Audio(value = (output["sampling_rate"], output["audio"].squeeze()), type="numpy", autoplay=False, label=f"Generated Audio - speaker {i}", show_label=True,
98
- visible=True)
99
- out.append(output)
100
- out.extend([gr.Audio(visible=False)]*(max_speakers-num_speakers))
101
- else:
102
- output = pipe_dict["pipe"](text)
103
- output = gr.Audio(value = (output["sampling_rate"], output["audio"].squeeze()), type="numpy", autoplay=False, label="Generated Audio - Mono speaker", show_label=True,
104
- visible=True)
105
  out.append(output)
106
- out.extend([gr.Audio(visible=False)]*(max_speakers-2))
 
107
  return out
108
 
109
 
@@ -122,30 +94,13 @@ with gr.Blocks() as demo_blocks:
122
  info = "Language that you want to test"
123
  )
124
 
125
- model_id = gr.Dropdown(
126
- models_per_language["english"],
127
- value="ylacombe/vits_ljs_welsh_female_monospeaker_2",
128
- label="Model",
129
- info="Model you want to test",
130
- )
131
-
132
  with gr.Column():
133
  outputs = []
134
  for i in range(max_speakers):
135
  out_audio = gr.Audio(type="numpy", autoplay=False, label=f"Generated Audio - speaker {i}", show_label=True, visible=False)
136
  outputs.append(out_audio)
137
-
138
- language.change(lambda language: gr.Dropdown(
139
- models_per_language[language],
140
- value=models_per_language[language][0],
141
- label="Model",
142
- info="Model you want to test",
143
- ),
144
- language,
145
- model_id
146
- )
147
 
148
- btn.click(generate_audio, [inp_text, model_id, language], outputs)
149
 
150
 
151
  demo_blocks.queue().launch()
 
17
# Baseline (non-finetuned) checkpoint used as the reference prediction
# for each supported language.
default_model_per_language = {
    "english": "kakao-enterprise/vits-ljs",
    "spanish": "facebook/mms-tts-spa",
}

# Finetuned checkpoints per language, as (display name, hub id) pairs.
models_per_language = {
    "english": [
        ("Irish Male Speaker", "ylacombe/vits_ljs_irish_male_monospeaker_2"),
        ("Welsh Female Speaker", "ylacombe/vits_ljs_welsh_female_monospeaker_2"),
        ("Welsh Male Speaker", "ylacombe/vits_ljs_welsh_male_monospeaker_2"),
        ("Scottish Female Speaker", "ylacombe/vits_ljs_scottish_female_monospeaker"),
    ],
    "spanish": [
        ("Male Chilean Speaker", "ylacombe/mms-spa-finetuned-chilean-monospeaker"),
        ("Female Argentinian Speaker", "ylacombe/mms-spa-finetuned-argentinian-monospeaker"),
        ("Male Colombian Speaker", "ylacombe/mms-spa-finetuned-colombian-monospeaker"),
    ],
}

# Mutable cache of loaded pipelines; initialised with the English models.
# "pipe" holds one text-to-speech pipeline per finetuned checkpoint,
# "original_pipe" the baseline checkpoint for the current "language".
pipe_dict = {
    "pipe": [
        pipeline("text-to-speech", model=hub_id, device=0)
        for _, hub_id in models_per_language["english"]
    ],
    "original_pipe": pipeline(
        "text-to-speech",
        model=default_model_per_language["english"],
        device=0,
    ),
    "language": "english",
}

title = "# VITS"

description = """
TODO
"""

# Upper bound on the number of audio widgets rendered in the UI.
max_speakers = 15
49
 
50
# Inference
def generate_audio(text, language):
    """Synthesise `text` with the baseline and every finetuned model for `language`.

    Parameters
    ----------
    text : str
        Input text to convert to speech.
    language : str
        Key into `default_model_per_language` / `models_per_language`
        (e.g. "english" or "spanish").

    Returns
    -------
    list
        Exactly `max_speakers` gr.Audio components: the baseline model's
        prediction first, then one per finetuned speaker, padded with
        hidden components so the count always matches the UI outputs.
    """
    if pipe_dict["language"] != language:
        gr.Warning(f"Language has changed - loading corresponding models: {default_model_per_language[language]}")
        pipe_dict["language"] = language
        pipe_dict["original_pipe"] = pipeline("text-to-speech", model=default_model_per_language[language], device=0)
        # BUG FIX: reload the pipelines for the *selected* language — the
        # original hard-coded models_per_language["english"] here, so
        # switching language kept serving the English finetuned models.
        pipe_dict["pipe"] = [
            pipeline("text-to-speech", model=entry[1], device=0)
            for entry in models_per_language[language]
        ]

    # NOTE: the original computed pipe_dict["pipe"].model.config.num_speakers
    # here, but "pipe" is a *list* of pipelines (no .model attribute), which
    # raises AttributeError; the value was also never used. Removed.

    out = []
    # First generate the original (non-finetuned) model's result as a reference.
    output = pipe_dict["original_pipe"](text)
    out.append(gr.Audio(
        value=(output["sampling_rate"], output["audio"].squeeze()),
        type="numpy",
        autoplay=False,
        label=f"Prediction from the original checkpoint {default_model_per_language[language]}",
        show_label=True,
        visible=True,
    ))

    # One clip per finetuned speaker, capped so the baseline + speakers
    # never exceed the max_speakers output widgets.
    for i in range(min(len(pipe_dict["pipe"]), max_speakers - 1)):
        output = pipe_dict["pipe"][i](text)
        out.append(gr.Audio(
            value=(output["sampling_rate"], output["audio"].squeeze()),
            type="numpy",
            autoplay=False,
            label=f"Finetuned {models_per_language[language][i][0]}",
            show_label=True,
            visible=True,
        ))

    # Pad with hidden components so len(out) == max_speakers, matching the
    # fixed list of outputs wired to btn.click in the UI.
    out.extend([gr.Audio(visible=False)] * (max_speakers - len(out)))
    return out
80
 
81
 
 
94
  info = "Language that you want to test"
95
  )
96
 
 
 
 
 
 
 
 
97
  with gr.Column():
98
  outputs = []
99
  for i in range(max_speakers):
100
  out_audio = gr.Audio(type="numpy", autoplay=False, label=f"Generated Audio - speaker {i}", show_label=True, visible=False)
101
  outputs.append(out_audio)
 
 
 
 
 
 
 
 
 
 
102
 
103
+ btn.click(generate_audio, [inp_text, language], outputs)
104
 
105
 
106
  demo_blocks.queue().launch()