'''
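# Reference snippets kept in this module docstring (not executed when the app runs):
# environment setup, a script that samples the simon3000/genshin-voice dataset, and a
# gradio_client call against the running demo.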
pip install datasets soundfile huggingface_hub librosa


from datasets import load_dataset
import soundfile as sf
import os
from collections import defaultdict
import io

def download_voices_with_dynamic_counting(output_folder='genshin_voices_sample_5', max_files_per_speaker=5):
    """动态统计并下载所有speaker的音频和转录文件(兼容bytes/path格式)"""
    # 加载数据集(流式模式)
    dataset = load_dataset('simon3000/genshin-voice', split='train', streaming=True)
    
    # 过滤条件:中文、有转录、类型为对话
    filtered_data = dataset.filter(
        lambda x: (
            x['language'] == 'Chinese' and 
            x['transcription'] != '' and 
            x['type'] == 'Dialog'
        )
    )
    
    # Per-speaker counters: speaker_counts is incremented up front (and rolled back
    # on failure), speaker_file_indices only advances after a file is written
    speaker_counts = defaultdict(int)
    speaker_file_indices = defaultdict(int)
    
    os.makedirs(output_folder, exist_ok=True)
    
    for voice in filtered_data:
        speaker = voice['speaker']
        
        # Skip this speaker once enough files have been downloaded
        if speaker_counts[speaker] >= max_files_per_speaker:
            continue
        
        # Update the speaker count
        speaker_counts[speaker] += 1
        file_num = str(speaker_file_indices[speaker] + 1).zfill(5)  # numbering starts at 00001
        
        # Create a subfolder for this speaker
        speaker_folder = os.path.join(output_folder, speaker)
        os.makedirs(speaker_folder, exist_ok=True)
        
        # Build the output file paths
        audio_path = os.path.join(speaker_folder, f'{speaker}_{file_num}.wav')
        transcription_path = os.path.join(speaker_folder, f'{speaker}_{file_num}.txt')
        
        # Handle the audio payload (bytes or path format)
        audio_data = voice['audio']
        try:
            if 'bytes' in audio_data and audio_data['bytes'] is not None:
                # Decode the audio directly from the raw bytes
                with io.BytesIO(audio_data['bytes']) as audio_bytes:
                    data, samplerate = sf.read(audio_bytes)
                    sf.write(audio_path, data, samplerate)
                
            # elif 'path' in audio_data and os.path.exists(audio_data['path']):
            #     # If a path is provided and the file exists, read and re-save it directly
            #     data, samplerate = sf.read(audio_data['path'])
            #     sf.write(audio_path, data, samplerate)

            else:
                print(f"Warning: unsupported audio data format for {speaker}, skipping")
                speaker_counts[speaker] -= 1  # roll back the count
                continue
        except Exception as e:
            print(f"处理{speaker}的音频时出错: {str(e)}")
            speaker_counts[speaker] -= 1
            continue
        
        # Save the transcription file
        with open(transcription_path, 'w', encoding='utf-8') as f:
            f.write(voice['transcription'])
        
        speaker_file_indices[speaker] += 1
        print(
            f"[download] {speaker}_{file_num} | "
            f"progress: {speaker_counts[speaker]}/{max_files_per_speaker}"
        )
    
    # Print the final statistics
    print("\n=== Download summary ===")
    for speaker, count in speaker_counts.items():
        print(f"{speaker}: {count} files")

if __name__ == '__main__':
    download_voices_with_dynamic_counting()
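# Expected layout after a run (illustrative; the folder name comes from output_folder):
#   genshin_voices_sample_5/<speaker>/<speaker>_00001.wav
#   genshin_voices_sample_5/<speaker>/<speaker>_00001.txt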


from gradio_client import Client, handle_file

client = Client("http://localhost:7860")
result = client.predict(
    prompt=handle_file('genshin_voices_sample_5/Ahangar/Ahangar_00001.wav'),
    text="偷窃者没有好下场",
    api_name="/gen_single"
)
print(result)
from shutil import copy2
copy2(result["value"], result["value"].split("/")[-1])
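# copy2 saves the returned wav (a temporary file path) into the current working directory under its base name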

'''

import spaces
import os
import shutil
import threading
import time
import sys

from huggingface_hub import snapshot_download

# Make the bundled indextts package importable
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(current_dir)
sys.path.append(os.path.join(current_dir, "indextts"))

import gradio as gr
from indextts.infer import IndexTTS
from tools.i18n.i18n import I18nAuto

i18n = I18nAuto(language="zh_CN")
MODE = 'local'
# Download the model weights into checkpoints/ and load them
snapshot_download("IndexTeam/IndexTTS-1.5", local_dir="checkpoints")
tts = IndexTTS(model_dir="checkpoints", cfg_path="checkpoints/config.yaml")

os.makedirs("outputs/tasks", exist_ok=True)
os.makedirs("prompts", exist_ok=True)

@spaces.GPU
def infer(voice, text, output_path=None):
    """Synthesize `text` in the voice of the reference audio `voice` and return the generated wav path."""
    if not tts:
        raise Exception("Model not loaded")
    if not output_path:
        output_path = os.path.join("outputs", f"spk_{int(time.time())}.wav")
    tts.infer(voice, text, output_path)
    return output_path

# Wraps infer() in an HTTP-style (status, headers, body) tuple; it is not wired
# to any route or Gradio event in this file.
def tts_api(voice, text):
    try:
        output_path = infer(voice, text)
        with open(output_path, "rb") as f:
            audio_bytes = f.read()
        return (200, {}, audio_bytes)
    except Exception as e:
        return (500, {"error": str(e)}, None)

def gen_single(prompt, text):
    output_path = infer(prompt, text)
    # Reveal the output audio component with the generated file
    return gr.update(value=output_path, visible=True)

def update_prompt_audio():
    # Re-enable the generate button once a reference audio has been provided
    update_button = gr.update(interactive=True)
    return update_button

with gr.Blocks() as demo:
    mutex = threading.Lock()  # currently unused
    gr.HTML('''
    <h2><center>IndexTTS: An Industrial-Level Controllable and Efficient Zero-Shot Text-To-Speech System</center></h2>

    <p align="center">
    <a href='https://arxiv.org/abs/2502.05512'><img src='https://img.shields.io/badge/ArXiv-2502.05512-red'></a>
    </p>
    ''')
    with gr.Tab("音频生成"):
        with gr.Row():
            os.makedirs("prompts", exist_ok=True)
            prompt_audio = gr.Audio(label="请上传参考音频", key="prompt_audio",
                                    sources=["upload", "microphone"], type="filepath")
            # prompt_list / default are currently unused
            prompt_list = os.listdir("prompts")
            default = ''
            if prompt_list:
                default = prompt_list[0]
            input_text_single = gr.Textbox(label="请输入目标文本", key="input_text_single")
            gen_button = gr.Button("生成语音", key="gen_button", interactive=True)
            output_audio = gr.Audio(label="生成结果", visible=False, key="output_audio")

    prompt_audio.upload(update_prompt_audio,
                         inputs=[],
                         outputs=[gen_button])

    gen_button.click(gen_single,
                     inputs=[prompt_audio, input_text_single],
                     outputs=[output_audio])
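
    # This click handler is the endpoint that the gradio_client snippet in the
    # module docstring calls with api_name="/gen_single".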

    # The separate gr.Interface was removed to avoid rendering the UI twice;
    # only the Blocks demo is kept, shared by the web UI and the API.
    # This way there is both a UI and a Gradio HTTP API
    # (callable e.g. via POST /run/predict).

    # add_api_route and mount_gradio_app were removed; Spaces does not support them.

def main():
    tts.load_normalizer()
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)

if __name__ == "__main__":
    main()