File size: 6,965 Bytes
c36d1d0
 
 
 
 
 
 
 
 
 
 
e95e308
c36d1d0
 
 
 
 
 
 
 
 
e95e308
c36d1d0
 
e95e308
8e3aff1
c36d1d0
 
8e3aff1
 
c36d1d0
 
8e3aff1
c36d1d0
 
 
8e3aff1
 
c36d1d0
8c13550
 
 
 
c36d1d0
 
 
 
 
 
197bc3e
 
c36d1d0
 
 
197bc3e
 
c36d1d0
197bc3e
c36d1d0
 
 
 
 
 
 
aa7c4d4
c36d1d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197bc3e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c36d1d0
197bc3e
 
 
c36d1d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197bc3e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import gradio as gr
import torch
from transformers.models.speecht5.number_normalizer import EnglishNumberNormalizer
from string import punctuation
import re

from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed

# Use the first CUDA device when torch reports one is available, otherwise the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Repository id shared by the model, tokenizer and feature extractor loaded below.
repo_id = "parler-tts/parler-tts-mini-expresso"

# Loaded once at import time; the model is moved to `device` immediately.
model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(repo_id)
feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)

# Sampling rate that gen_tts returns alongside the generated audio array.
SAMPLE_RATE = feature_extractor.sampling_rate
# Fixed seed that gen_tts passes to set_seed() before every generation.
SEED = 42

# Initial values for the "Input Text" and "Description" textboxes in the UI.
default_text = "*Remember* - this is only the first iteration of the model! To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data by a factor of *five times*."
default_description = "Thomas speaks with emphasis and excitement at a moderate pace with high quality."
# Rows handed to gr.Examples; each row is [input text, speaker/style description],
# matching the order of the `inputs` list wired to gen_tts.
examples = [
    [
        "Remember - this is only the first iteration of the model. To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data by a factor of five times.",
        "Thomas speaks in a sad tone at a moderate pace with high quality."
    ],
    [
        "Did you know? You can reproduce this entire training recipe by following the steps outlined on the model card!",
        "Talia speaks quickly with excitement and high quality audio.",
    ],
    [
        "But that's no secret! The entire project is open source first, with all release artefacts on the Hub.",
        "Elisabeth speaks happily at a slightly slower than average pace with high quality audio.",
    ],
    [
        "Hey there! I'm Jerry. Or at least I think I am? I just need to check that quickly.",
        "Jerry speaks in a confused tone at a moderately slow pace with high quality audio.",
    ],
    [
        "<laugh> It can even laugh! Do you believe it ? I don't!",
        "Talia speaks with laughter with high quality.",
    ],
]

# Single shared normalizer instance; preprocess() calls it on the raw input text.
# NOTE(review): presumably spells out digits/numbers as English words — confirm
# against the transformers speecht5 number_normalizer module.
number_normalizer = EnglishNumberNormalizer()

def preprocess(text):
    """Normalise raw input text before it is tokenized as the TTS prompt.

    Steps, in order:
      1. Run the module-level ``number_normalizer`` over the text and strip
         surrounding whitespace.
      2. Append a full stop unless the text already ends with a punctuation
         character (any character in ``string.punctuation``).
      3. Replace the dots inside upper-case abbreviations with spaces
         (e.g. ``U.S.A`` becomes ``U S A``).

    Args:
        text: The text to be spoken.

    Returns:
        The normalised text.
    """
    text = number_normalizer(text).strip()

    # str.endswith() given a plain string tests for that *whole* string as a
    # suffix, so passing `punctuation` directly practically never matched and a
    # "." was appended even after "!" or "?". A tuple of the individual
    # characters checks the final character instead, and is safe on empty text.
    if not text.endswith(tuple(punctuation)):
        text += "."

    # An upper-case letter followed by one or more upper-case letters or dots,
    # delimited by word boundaries.
    abbreviations_pattern = r'\b[A-Z][A-Z\.]+\b'

    def separate_abb(match):
        # Swap the dots of a matched abbreviation for spaces.
        return match.group(0).replace(".", " ")

    return re.sub(abbreviations_pattern, separate_abb, text)

def gen_tts(text, description):
    """Generate speech for ``text`` in the voice/style given by ``description``.

    Args:
        text: The text to speak; it is passed through ``preprocess`` first.
        description: Free-text description of the speaker and speaking style.

    Returns:
        A ``(SAMPLE_RATE, audio)`` tuple, where ``audio`` is the model output
        moved to the CPU, converted to a NumPy array and squeezed.
    """
    # The description conditions the model; the preprocessed text is the prompt.
    description_ids = tokenizer(description, return_tensors="pt").to(device).input_ids
    prompt_ids = tokenizer(preprocess(text), return_tensors="pt").to(device).input_ids

    # Re-seed before every call so the same inputs use the same seed each time.
    set_seed(SEED)
    waveform = model.generate(input_ids=description_ids, prompt_input_ids=prompt_ids)

    return SAMPLE_RATE, waveform.cpu().numpy().squeeze()

# Custom stylesheet passed to gr.Blocks(css=...). It styles elements with the ids
# "share-btn-container" and "share-btn".
# NOTE(review): no component in the visible code is created with either id, so
# these rules may currently be unused — confirm before removing.
css = """
        #share-btn-container {
            display: flex;
            padding-left: 0.5rem !important;
            padding-right: 0.5rem !important;
            background-color: #000000;
            justify-content: center;
            align-items: center;
            border-radius: 9999px !important; 
            width: 13rem;
            margin-top: 10px;
            margin-left: auto;
            flex: unset !important;
        }
        #share-btn {
            all: initial;
            color: #ffffff;
            font-weight: 600;
            cursor: pointer;
            font-family: 'IBM Plex Sans', sans-serif;
            margin-left: 0.5rem !important;
            padding-top: 0.25rem !important;
            padding-bottom: 0.25rem !important;
            right:0;
        }
        #share-btn * {
            all: unset !important;
        }
        #share-btn-container div:nth-child(-n+2){
            width: auto !important;
            min-height: 0px !important;
        }
        #share-btn-container .wrap {
            display: none !important;
        }
"""

# Static HTML fragments rendered, in order, at the top of the demo page:
# the title banner, then the model description and usage tips.
html_blocks = [
    """
    <div style="text-align: center; max-width: 700px; margin: 0 auto;">
      <div
        style="
          display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;
        "
      >
        <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
          Parler-TTS: Expresso ☕️️
        </h1>
      </div>
    </div>
    """,
    # Plain (non-f) string: the fragment contains no placeholders to interpolate.
    # The <ul> sits after the closed <p> because a list may not be nested inside
    # a paragraph in HTML.
    """
    <p><a href="https://huggingface.co/parler-tts/parler-tts-mini-expresso"> Parler-TTS Mini: Expresso</a>
    is a text-to-speech (TTS) model fine-tuned on the <a href="https://huggingface.co/datasets/ylacombe/expresso"> Expresso dataset</a>.
    It generates high-quality speech in a given <b>emotion</b> and <b>voice</b> that can be controlled through a simple text prompt.</p>
    <p>Tips for ensuring good generation:</p>
    <ul>
        <li>Specify the name of a male speaker (Jerry, Thomas) or female speaker (Talia, Elisabeth) for consistent voices</li>
        <li>The model can generate in a range of emotions, including: "happy", "confused", "default" (meaning no particular emotion conveyed), "laughing", "sad", "whisper", "emphasis"</li>
        <li>Punctuation can be used to control the prosody of the generations, e.g. use commas to add small breaks in speech</li>
        <li>To emphasise particular words, wrap them in asterisks (e.g. *Remember* in the default input text below) and include "emphasis" in the prompt</li>
    </ul>
    """,
]

# Build the demo page. Components are created in statement order: the header
# HTML fragments, the input/output widgets, the examples, then the footer text.
with gr.Blocks(css=css) as block:
    for html_block in html_blocks:
        gr.HTML(html_block)

    with gr.Row():
        with gr.Column():
            # First column: the two text inputs and the button that triggers generation.
            input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
            description = gr.Textbox(label="Description", lines=2, value=default_description, elem_id="input_description")
            run_button = gr.Button("Generate Audio", variant="primary")
        with gr.Column():
            # Second column: the audio output; gen_tts returns (SAMPLE_RATE, NumPy array).
            audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out")

    # Order matches gen_tts(text, description) and the rows of `examples`.
    inputs = [input_text, description]
    outputs = [audio_out]
    # NOTE(review): cache_examples=True presumably makes Gradio run gen_tts on every
    # example row up front and store the results — confirm for the Gradio version in use.
    gr.Examples(examples=examples, fn=gen_tts, inputs=inputs, outputs=outputs, cache_examples=True)
    run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs, queue=True)
    gr.HTML(
        """
        <p>To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data to 50k hours of speech.
        The v1 release of the model will be trained on this data, as well as inference optimisations, such as flash attention
        and torch compile, that will improve the latency by 2-4x. If you want to find out more about how this model was trained and even fine-tune it yourself, check-out the 
        <a href="https://github.com/huggingface/parler-tts"> Parler-TTS</a> repository on GitHub. The Parler-TTS codebase and its associated checkpoints are licensed under <a href='https://github.com/huggingface/parler-tts?tab=Apache-2.0-1-ov-file#readme'> Apache 2.0</a>.</p>
        """
    )

# Enable request queuing, then start the server at import time (there is no __main__ guard).
# NOTE(review): share=True asks Gradio for a public share link — confirm that is intended
# for the deployment target.
block.queue()
block.launch(share=True)