"""Gradio demo for the Japanese Parler-TTS Large (beta) text-to-speech model.

The script loads the model and tokenizer once at import time, exposes a single
synthesis function (`parler_tts`) and wires it to a small Gradio Blocks UI.
"""

import gradio as gr
import spaces
import torch
from loguru import logger
from parler_tts import ParlerTTSForConditionalGeneration
from rubyinserter import add_ruby
from transformers import AutoTokenizer

# Input length limits enforced by `parler_tts`. The same numbers are quoted in
# the user-facing help text (`md`) below, so keep the two in sync.
MAX_PROMPT_CHARS = 150
MAX_DESCRIPTION_CHARS = 300

device = "cuda:0" if torch.cuda.is_available() else "cpu"
repo_id = "2121-8/japanese-parler-tts-large-bate"

logger.info(f"Using device: {device}")
logger.info(f"Loading model from: {repo_id}")
model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
logger.success("Model loaded successfully")
model.eval()
tokenizer = AutoTokenizer.from_pretrained(repo_id)


@spaces.GPU
def parler_tts(prompt: str, description: str):
    """Synthesize speech for ``prompt`` in the voice described by ``description``.

    Args:
        prompt: Text to be spoken. Must be non-blank and at most
            ``MAX_PROMPT_CHARS`` characters long.
        description: Free-text description of the desired voice. Must be at
            most ``MAX_DESCRIPTION_CHARS`` characters long.

    Returns:
        A ``(message, audio)`` pair matching the two Gradio outputs.
        On success ``message`` is ``"Success"`` and ``audio`` is
        ``(sampling_rate, samples)`` where ``samples`` is the squeezed NumPy
        array produced by the model. On a validation failure ``message``
        explains the problem and ``audio`` is ``None``.
    """
    logger.info(f"Prompt: {prompt}")
    logger.info(f"Description: {description}")

    # Validate before touching the model so bad input never costs GPU time.
    if not prompt.strip():
        # Nothing to speak: do not send an empty sequence to the model.
        return "Text is empty. Please enter some text to synthesize.", None
    if len(prompt) > MAX_PROMPT_CHARS:
        return (
            f"Text is too long. Please keep it under {MAX_PROMPT_CHARS} characters.",
            None,
        )
    if len(description) > MAX_DESCRIPTION_CHARS:
        return (
            "Description is too long. "
            f"Please keep it under {MAX_DESCRIPTION_CHARS} characters.",
            None,
        )

    # NOTE(review): `add_ruby` presumably annotates the text with ruby
    # (reading) markup expected by this model - confirm against rubyinserter.
    prompt = add_ruby(prompt)
    logger.info(f"Prompt with ruby: {prompt}")

    # The description conditions the voice; the prompt is the text to speak.
    input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
    prompt_input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        generation = model.generate(
            input_ids=input_ids, prompt_input_ids=prompt_input_ids
        )

    # Move to host memory and drop singleton dimensions for gr.Audio.
    audio_arr = generation.cpu().numpy().squeeze()
    return "Success", (model.config.sampling_rate, audio_arr)


# Markdown shown at the top of the page (user-facing text, kept in Japanese).
md = """
# Japanese Parler-TTS Large (β版) デモ

第三者による [Japanese Parler-TTS Large (β版)](https://huggingface.co/2121-8/japanese-parler-tts-large-bate) の音声合成デモです。

- 入力文章: 150文字以内の文章を入力してください。
- 説明文章: 300文字以内の文章を入力してください。音声の特徴を説明する文章を入力します(多分)。
"""

with gr.Blocks() as app:
    gr.Markdown(md)
    prompt = gr.Textbox(label="入力文章")
    description = gr.Textbox(
        label="説明文章",
        value="A female speaker with a slightly high-pitched voice delivers her words at a moderate speed with a quite monotone tone in a confined environment, resulting in a quite clear audio recording.",
    )
    btn = gr.Button("生成")
    info_text = gr.Textbox(label="情報")
    audio = gr.Audio()

    # The two outputs correspond to the (message, audio) pair from parler_tts.
    btn.click(
        fn=parler_tts,
        inputs=[prompt, description],
        outputs=[info_text, audio],
    )

# Start the server only when executed as a script, so importing this module
# (e.g. from tooling) does not launch the web app.
if __name__ == "__main__":
    app.launch()