from typing import cast

import gradio as gr
import numpy as np
import pandas as pd
import torch
import transformers
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

from speecht5_openjtalk_tokenizer import SpeechT5OpenjtalkTokenizer

# Expose the custom tokenizer on the transformers module so that
# SpeechT5Processor.from_pretrained can resolve it by class name.
setattr(transformers, SpeechT5OpenjtalkTokenizer.__name__, SpeechT5OpenjtalkTokenizer)
class SpeechT5OpenjtalkProcessor(SpeechT5Processor):
    tokenizer_class = SpeechT5OpenjtalkTokenizer.__name__


model = SpeechT5ForTextToSpeech.from_pretrained("esnya/japanese_speecht5_tts")
assert isinstance(model, SpeechT5ForTextToSpeech)

processor = SpeechT5OpenjtalkProcessor.from_pretrained("esnya/japanese_speecht5_tts")
assert isinstance(processor, SpeechT5OpenjtalkProcessor)

vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
assert isinstance(vocoder, SpeechT5HifiGan)

if torch.cuda.is_available():
    model = model.cuda()
    vocoder = vocoder.cuda()
def convert_float32_to_int16(wav: np.ndarray) -> np.ndarray:
    # Scale [-1.0, 1.0] float audio to the int16 range, clipping to avoid overflow.
    assert wav.dtype == np.float32
    return np.clip(wav * 32768.0, -32768.0, 32767.0).astype(np.int16)
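# Quick sanity check (hypothetical values): full-scale floats map to the int16
# extremes, e.g. convert_float32_to_int16(np.array([1.0, -1.0, 0.0], dtype=np.float32))
# yields array([32767, -32768, 0], dtype=int16), since +1.0 * 32768.0 is clipped to 32767.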
@torch.inference_mode()
def text_to_speech(
    text: str,
    threshold: float = 0.5,
    minlenratio: float = 0.0,
    maxlenratio: float = 10.0,
):
    # Sample a random speaker embedding uniformly from [-1, 1); each call
    # therefore synthesizes with a different random voice.
    speaker_embeddings = (
        torch.rand(
            (1, model.config.speaker_embedding_dim),
            dtype=torch.float32,
            device=model.device,
        )
        * 2
        - 1
    )
    inputs = processor(text=text, return_tensors="pt")
    assert inputs is not None
    input_ids = inputs.input_ids.to(model.device)

    speaker_embeddings = cast(torch.FloatTensor, speaker_embeddings)
    wav = model.generate_speech(
        input_ids,
        speaker_embeddings,
        threshold=threshold,
        minlenratio=minlenratio,
        maxlenratio=maxlenratio,
        vocoder=vocoder,
    )
    wav = cast(torch.FloatTensor, wav)
    wav = convert_float32_to_int16(wav.reshape(-1).cpu().float().numpy())
    # Return the waveform for the gr.Audio output and the embedding values
    # for the gr.BarPlot output.
    return [
        (vocoder.config.sampling_rate, wav),
        pd.DataFrame(
            {
                "dim": range(speaker_embeddings.shape[-1]),
                "value": speaker_embeddings[0].cpu().float().numpy(),
            }
        ),
    ]
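# A possible variation (not part of this app): instead of sampling a random
# voice, load a fixed x-vector speaker embedding, as in the upstream SpeechT5
# examples. This assumes the 512-dim CMU ARCTIC x-vectors are compatible with
# this Japanese fine-tune, which is untested here:
#
#   from datasets import load_dataset
#   embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
#   speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)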
demo = gr.Interface(
    fn=text_to_speech,
    inputs=[
        "text",
        gr.Slider(0, 0.5, 0.5, label="threshold"),
        gr.Slider(0, 100, 0, label="minlenratio"),
        gr.Slider(0, 100, 10, label="maxlenratio"),
    ],
    outputs=[
        "audio",
        gr.BarPlot(
            label="speaker_embedding (randomly generated)",
            x="dim",
            y="value",
            y_lim=[-1, 1],
        ),
    ],
)

demo.launch()
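# Note: on Hugging Face Spaces, launch() picks up the host and port automatically;
# for a local run, demo.launch(share=True) would also create a temporary public link.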