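# Page residue from the hosting site's file viewer, kept as comments so the
# file parses: author avatar "litagin's picture", commit message
# "I forgot it", commit 268101f.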
import gradio as gr
import spaces
import torch
from loguru import logger
from parler_tts import ParlerTTSForConditionalGeneration
from rubyinserter import add_ruby
from transformers import AutoTokenizer
# Model bootstrap: choose the compute device, then load the checkpoint and its
# tokenizer once at import time; the request handler reuses these globals.
repo_id = "2121-8/japanese-parler-tts-large-bate"
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

logger.info(f"Using device: {device}")
logger.info(f"Loading model from: {repo_id}")

model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id)
model = model.to(device)
logger.success("Model loaded successfully")

# Put the module in evaluation mode; this script only runs generation.
model.eval()
tokenizer = AutoTokenizer.from_pretrained(repo_id)
@spaces.GPU
def parler_tts(prompt: str, description: str):
    """Synthesize speech for ``prompt`` in the voice described by ``description``.

    Args:
        prompt: Text to be spoken. Must be non-blank and at most 150
            characters long.
        description: Text describing the desired voice. At most 300
            characters long.

    Returns:
        A ``(status, audio)`` pair. On success ``status`` is ``"Success"`` and
        ``audio`` is ``(sampling_rate, waveform)`` with the waveform as a
        NumPy array. When validation fails, ``status`` is an error message
        and ``audio`` is ``None``.
    """
    logger.info(f"Prompt: {prompt}")
    logger.info(f"Description: {description}")

    # Reject blank input up front: there is nothing to speak, and running the
    # model on an empty prompt would only waste a GPU generation pass.
    if not prompt or not prompt.strip():
        return "Text is empty. Please enter some text.", None
    if len(prompt) > 150:
        return "Text is too long. Please keep it under 150 characters.", None
    if len(description) > 300:
        return "Description is too long. Please keep it under 300 characters.", None

    # Pass the text through rubyinserter's add_ruby before tokenizing; the
    # transformed prompt is logged for debugging.
    prompt = add_ruby(prompt)
    logger.info(f"Prompt with ruby: {prompt}")

    # The description conditions the voice (input_ids); the prompt is the
    # text to be spoken (prompt_input_ids).
    input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
    prompt_input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    with torch.no_grad():
        generation = model.generate(
            input_ids=input_ids, prompt_input_ids=prompt_input_ids
        )

    # Move the result to host memory and drop singleton dimensions.
    audio_arr = generation.cpu().numpy().squeeze()
    return "Success", (model.config.sampling_rate, audio_arr)
# Markdown shown at the top of the demo page (user-facing text, in Japanese):
# a title, a link to the model card, and the input length limits.
md = """
# Japanese Parler-TTS Large (β版) デモ
第三者による [Japanese Parler-TTS Large (β版)](https://huggingface.co/2121-8/japanese-parler-tts-large-bate) の音声合成デモです。
- 入力文章: 150文字以内の文章を入力してください。
- 説明文章: 300文字以内の文章を入力してください。音声の特徴を説明する文章を入力します(多分)。
"""
# Gradio front end: two text inputs, a trigger button, and status/audio outputs.
with gr.Blocks() as app:
    gr.Markdown(value=md)

    # Inputs: the text to speak and the voice description (pre-filled).
    prompt = gr.Textbox(label="入力文章")
    description = gr.Textbox(
        value=(
            "A female speaker with a slightly high-pitched voice delivers her words "
            "at a moderate speed with a quite monotone tone in a confined "
            "environment, resulting in a quite clear audio recording."
        ),
        label="説明文章",
    )
    btn = gr.Button("生成")

    # Outputs: a status message and the synthesized audio.
    info_text = gr.Textbox(label="情報")
    audio = gr.Audio()

    # Clicking the button runs synthesis and fills both outputs.
    btn.click(
        parler_tts,
        inputs=[prompt, description],
        outputs=[info_text, audio],
    )

app.launch()