ylacombe HF staff committed on
Commit
ad7e686
1 Parent(s): 1141fd4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +60 -8
app.py CHANGED
@@ -42,11 +42,12 @@ pipe_dict = {
42
  "language": "english",
43
  }
44
 
45
- title = "# VITS"
46
-
47
- description = """
48
- TODO
49
- """
 
50
 
51
  max_speakers = 15
52
 
@@ -80,12 +81,22 @@ def generate_audio(text, language):
80
  out.extend([gr.Audio(visible=False)]*(max_speakers-(len(out))))
81
  return out
82
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
  # Gradio blocks demo
85
- with gr.Blocks() as demo_blocks:
86
  gr.Markdown(title)
87
- gr.Markdown(description)
88
- with gr.Row():
89
  with gr.Column():
90
  inp_text = gr.Textbox(label="Input Text", info="What would you like VITS to synthesise?")
91
  btn = gr.Button("Generate Audio!")
@@ -101,6 +112,47 @@ with gr.Blocks() as demo_blocks:
101
  for i in range(max_speakers):
102
  out_audio = gr.Audio(type="numpy", autoplay=False, label=f"Generated Audio - speaker {i}", show_label=True, visible=False)
103
  outputs.append(out_audio)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
  btn.click(generate_audio, [inp_text, language], outputs)
106
 
 
42
  "language": "english",
43
  }
44
 
45
+ title = """# Explore English and Spanish Accents with VITS finetuning
46
+ ## Or how the best wine comes in old bottles
47
+ [VITS](https://huggingface.co/docs/transformers/model_doc/vits) is a lightweight, low-latency TTS model.
48
+ Coupled with the right datasets and the right training recipes, you can get an excellent finetuned version in 20 minutes with as little as 80 to 150 samples.
49
+ Stay tuned, the training recipe is coming soon!
50
+ """,
51
 
52
  max_speakers = 15
53
 
 
81
  out.extend([gr.Audio(visible=False)]*(max_speakers-(len(out))))
82
  return out
83
 
84
+ css = """
85
+ #container{
86
+ margin: 0 auto;
87
+ max-width: 80rem;
88
+ }
89
+ #intro{
90
+ max-width: 100%;
91
+ text-align: center;
92
+ margin: 0 auto;
93
+ }
94
+ """
95
 
96
  # Gradio blocks demo
97
+ with gr.Blocks(css=css) as demo_blocks:
98
  gr.Markdown(title)
99
+ with gr.Row(elem_id="container"):
 
100
  with gr.Column():
101
  inp_text = gr.Textbox(label="Input Text", info="What would you like VITS to synthesise?")
102
  btn = gr.Button("Generate Audio!")
 
112
  for i in range(max_speakers):
113
  out_audio = gr.Audio(type="numpy", autoplay=False, label=f"Generated Audio - speaker {i}", show_label=True, visible=False)
114
  outputs.append(out_audio)
115
+
116
+
117
+ gr.Markdown("""
118
+ ## Dataset and model details
119
+
120
+ ### English
121
+
122
+ * **Model**: [VITS-ljs](https://huggingface.co/kakao-enterprise/vits-ljs)
123
+ * **Dataset**: [British Isles Accent](https://huggingface.co/datasets/ylacombe/english_dialects). For each accent, we used 100 to 150 samples of a single speaker to finetune [VITS-ljs](https://huggingface.co/kakao-enterprise/vits-ljs).
124
+
125
+ ### Spanish
126
+
127
+ * **Model**: [Spanish MMS TTS](https://huggingface.co/facebook/mms-tts-spa). This model is part of Facebook's [Massively Multilingual Speech](https://arxiv.org/abs/2305.13516) project, aiming to
128
+ provide speech technology across a diverse range of languages. You can find more details about the supported languages
129
+ and their ISO 639-3 codes in the [MMS Language Coverage Overview](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html),
130
+ and see all MMS-TTS checkpoints on the Hugging Face Hub: [facebook/mms-tts](https://huggingface.co/models?sort=trending&search=facebook%2Fmms-tts).
131
+ * **Datasets**: For each accent, we used 100 to 150 samples of a single speaker to finetune the model.
132
+ - [Colombian Spanish TTS dataset](https://huggingface.co/datasets/ylacombe/google-colombian-spanish).
133
+ - [Argentinian Spanish TTS dataset](https://huggingface.co/datasets/ylacombe/google-argentinian-spanish).
134
+ - [Chilean Spanish TTS dataset](https://huggingface.co/datasets/ylacombe/google-chilean-spanish).
135
+
136
+ """)
137
+
138
+ with gr.Accordion("Run with transformers"):
139
+ gr.Markdown(
140
+ """## Running VITS and MMS with transformers
141
+ ```bash
142
+ pip install transformers
143
+ ```
144
+ ```py
145
+ from transformers import pipeline
146
+ import scipy
147
+ pipe = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs", device=0)
148
+
149
+ results = pipe("A cinematic shot of a baby raccoon wearing an intricate Italian priest robe")
150
+
151
+ # write to a wav file
152
+ scipy.io.wavfile.write("audio_vits.wav", rate=results["sampling_rate"], data=results["audio"].squeeze())
153
+ ```
154
+ """
155
+ )
156
 
157
  btn.click(generate_audio, [inp_text, language], outputs)
158