File size: 2,455 Bytes
f8672c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268101f
f8672c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import gradio as gr
import spaces
import torch
from loguru import logger
from parler_tts import ParlerTTSForConditionalGeneration
from rubyinserter import add_ruby
from transformers import AutoTokenizer

# Checkpoint served by this demo, and the device it runs on: the first CUDA
# GPU when torch reports one as available, otherwise the CPU.
repo_id = "2121-8/japanese-parler-tts-large-bate"
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

logger.info(f"Using device: {device}")
logger.info(f"Loading model from: {repo_id}")
# Load the weights once at import time, then move them to the target device.
model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id)
model = model.to(device)
logger.success("Model loaded successfully")
# Evaluation mode: the model is only ever used for generation here.
model.eval()
# The same tokenizer is used for both the spoken text and the voice description.
tokenizer = AutoTokenizer.from_pretrained(repo_id)


# Input limits (inclusive), matching the limits advertised in the UI text.
_MAX_PROMPT_CHARS = 150
_MAX_DESCRIPTION_CHARS = 300


@spaces.GPU
def parler_tts(prompt: str, description: str):
    """Synthesize speech for ``prompt`` in the voice described by ``description``.

    Args:
        prompt: Text to be spoken. Must not be blank and may contain at most
            150 characters.
        description: Free-text description of the desired voice, at most
            300 characters.

    Returns:
        A ``(message, audio)`` pair. On success ``message`` is ``"Success"``
        and ``audio`` is ``(sampling_rate, waveform)``, where ``waveform`` is
        the generated output converted to a squeezed NumPy array. When the
        input is rejected, ``message`` explains why and ``audio`` is ``None``.
    """
    logger.info(f"Prompt: {prompt}")
    logger.info(f"Description: {description}")

    # Validate before touching the model so bad input never costs GPU time.
    if not prompt or not prompt.strip():
        return "Text is empty. Please enter some text to synthesize.", None
    if len(prompt) > _MAX_PROMPT_CHARS:
        return (
            "Text is too long. "
            f"Please keep it to {_MAX_PROMPT_CHARS} characters or fewer.",
            None,
        )
    if len(description) > _MAX_DESCRIPTION_CHARS:
        return (
            "Description is too long. "
            f"Please keep it to {_MAX_DESCRIPTION_CHARS} characters or fewer.",
            None,
        )

    # NOTE(review): add_ruby presumably annotates the text with ruby
    # (reading) markup expected by this checkpoint -- confirm against
    # the rubyinserter package.
    prompt = add_ruby(prompt)
    logger.info(f"Prompt with ruby: {prompt}")

    # The voice description goes in as `input_ids`; the text to speak goes in
    # as `prompt_input_ids`.
    input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
    prompt_input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    with torch.no_grad():
        generation = model.generate(
            input_ids=input_ids, prompt_input_ids=prompt_input_ids
        )

    # Move to host memory and drop singleton dimensions before returning.
    audio_arr = generation.cpu().numpy().squeeze()
    return "Success", (model.config.sampling_rate, audio_arr)


# Markdown shown at the top of the demo page (user-facing, in Japanese).
# Assembled line by line; the empty first and last entries reproduce the
# newlines that open and close the text.
md = "\n".join(
    (
        "",
        "# Japanese Parler-TTS Large (β版) デモ",
        "",
        "第三者による [Japanese Parler-TTS Large (β版)](https://huggingface.co/2121-8/japanese-parler-tts-large-bate) の音声合成デモです。",
        "",
        "- 入力文章: 150文字以内の文章を入力してください。",
        "- 説明文章: 300文字以内の文章を入力してください。音声の特徴を説明する文章を入力します(多分)。",
        "",
    )
)

# Demo page layout: intro markdown, two text inputs, a trigger button, and a
# status box plus audio player for the result.
with gr.Blocks() as app:
    gr.Markdown(md)
    prompt = gr.Textbox(label="入力文章")
    description = gr.Textbox(
        label="説明文章",
        value="A female speaker with a slightly high-pitched voice delivers her words at a moderate speed with a quite monotone tone in a confined environment, resulting in a quite clear audio recording.",
    )
    btn = gr.Button("生成")
    info_text = gr.Textbox(label="情報")
    audio = gr.Audio()

    # Output order must match what parler_tts returns: (status message, audio).
    btn.click(
        fn=parler_tts,
        inputs=[prompt, description],
        outputs=[info_text, audio],
    )

# Start the web server only when this file is run as a script, so that
# importing the module (e.g. to call parler_tts directly) does not launch it.
if __name__ == "__main__":
    app.launch()