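# NOTE: the lines below are the repository page header that was captured
# together with the file; they are not part of the program.
# tts_vc / app.py
# hoansu
# update bark and vc code
# e56fe99
# raw
# history blame
# 5.09 kB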
import gradio as gr
import numpy as np
import os
from scipy.io.wavfile import write
import tempfile
import zipfile
import shutil
from pydub import AudioSegment
from pydub.silence import split_on_silence
import nltk # we'll use this to split into sentences
import subprocess
from bark import SAMPLE_RATE, generate_audio, preload_models
import numpy as np
from bark.generation import (
generate_text_semantic,
preload_models,
)
from bark.api import semantic_to_waveform
from bark import generate_audio, SAMPLE_RATE
# Load the Bark models once, at import time, before the UI is built and any
# request is handled. NOTE: this call is unconditional; it runs on every
# import of this module.
preload_models()
def _clone_voice_path(cloneFile):
    """Return the path of the first uploaded clone-voice file, or ``None``.

    The upload component may deliver nothing at all (``None`` / empty list).
    Depending on the Gradio version and component settings, each entry is
    either a plain path string or a file wrapper exposing its path as
    ``.name`` -- both shapes are accepted here.
    """
    if not cloneFile:
        return None
    first = cloneFile[0]
    return getattr(first, "name", first)


def process_audio_files_with_logging(script, speaker, cloneFile):
    """Synthesize one WAV per script line, trim silence, and zip the results.

    Args:
        script: Text to synthesize. Each non-blank line is treated as one
            sentence and becomes one audio clip.
        speaker: Bark history prompt (for example ``"v2/en_speaker_7"``),
            passed to both the semantic and the waveform stage.
        cloneFile: Uploaded clone-voice files from the UI. Only the first
            entry is used, as the target column of ``FreeVC/convert.txt``.
            May be ``None`` or empty, in which case the conversion list is
            left empty and only the synthesized audio is returned.

    Returns:
        A ``(zip_filename, log_messages)`` tuple: the path of the archive
        holding the processed clips and the accumulated log text.
    """
    log_messages = "Starting audio processing...\n"
    sentences = [line.strip() for line in (script or "").split('\n') if line.strip()]
    GEN_TEMP = 0.4  # Example temperature, adjust as necessary
    clone_path = _clone_voice_path(cloneFile)

    temp_dir = tempfile.mkdtemp()
    try:
        # Remember the clips in sentence order so later steps do not depend
        # on the (filesystem-specific) order of a directory listing.
        generated = []
        for idx, sentence in enumerate(sentences):
            log_messages += f"Processing sentence {idx + 1}: {sentence}\n"
            semantic_tokens = generate_text_semantic(
                sentence,
                history_prompt=speaker,
                temp=GEN_TEMP,
                min_eos_p=0.05,
            )
            audio_array = semantic_to_waveform(semantic_tokens, history_prompt=speaker)
            filename = os.path.join(temp_dir, f"audio_{idx:02d}.wav")
            write(filename, SAMPLE_RATE, audio_array)
            generated.append(filename)
            log_messages += f"Generated audio for sentence {idx + 1}.\n"
        log_messages += "All sentences processed. Starting silence reduction...\n"

        if clone_path is None:
            log_messages += "No clone voice file provided; conversion list left empty.\n"

        # The conversion list is (re)written on every run, even when empty,
        # so a stale list from a previous request is never reused.
        with open("FreeVC/convert.txt", "w") as f:
            for file_path in generated:
                audio = AudioSegment.from_file(file_path, format="wav")
                # Detect non-silent chunks and process
                processed_audio = process_audio_for_silence(audio, log_messages)
                # Strings are immutable, so the helper cannot append to the
                # log on our behalf; record its step here instead.
                log_messages += "Audio processed for silence.\n"
                # Overwrite the original file with processed audio
                processed_audio.export(file_path, format="wav")
                if clone_path is None:
                    continue
                file_name_without_extension = os.path.splitext(
                    os.path.basename(file_path)
                )[0]
                line = f"{file_name_without_extension}|{file_path}|{clone_path}\n"
                f.write(line)
                log_messages += line + "\n"

        # NOTE: the FreeVC conversion step is currently disabled; the command
        # is kept for reference.
        #command = "python FreeVC/convert.py --hpfile FreeVC/configs/freevc.json --ptfile FreeVC/checkpoints/freevc.pth --txtpath FreeVC/convert.txt --outdir FreeVC/outputs/freevc"
        #subprocess.run(command, shell=True)
        log_messages += "Silence reduction complete. Zipping files...\n"

        # Zip the processed files (the archive is created outside temp_dir,
        # so it survives the cleanup below).
        zip_filename = zip_processed_files(temp_dir, log_messages)
        log_messages += "Files zipped successfully.\n"
    finally:
        # Clean up the temporary directory even when a step above failed.
        shutil.rmtree(temp_dir, ignore_errors=True)

    log_messages += "Processing complete. Files ready for download.\n"
    return zip_filename, log_messages
def process_audio_for_silence(audio, log_messages, *, silence_thresh=-32,
                              min_silence_len=1000, keep_silence=300):
    """Return ``audio`` with long silent stretches shortened.

    The segment is split wherever it stays below ``silence_thresh`` for at
    least ``min_silence_len`` milliseconds, and the remaining chunks are
    concatenated again. If no non-silent chunk is found, the result is an
    empty segment.

    Args:
        audio: The pydub ``AudioSegment`` to process.
        log_messages: Kept for backward compatibility. A ``str`` cannot be
            updated in place, so string arguments are left untouched and
            the caller must log the step itself; if a ``list`` is passed,
            one message is appended to it.
        silence_thresh: Silence threshold in dBFS.
        min_silence_len: Minimum length of a silence to split on, in ms.
        keep_silence: Padding, in ms, that ``split_on_silence`` keeps at
            the beginning and end of each chunk so cuts do not sound abrupt.

    Returns:
        The re-assembled ``AudioSegment``.
    """
    # Detect non-silent chunks
    non_silent_chunks = split_on_silence(
        audio,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
        keep_silence=keep_silence,
    )

    # Combine the non-silent chunks back into a single audio segment
    processed_audio = AudioSegment.empty()
    for chunk in non_silent_chunks:
        processed_audio += chunk

    if isinstance(log_messages, list):
        log_messages.append("Audio processed for silence.\n")
    return processed_audio
def zip_processed_files(temp_dir, log_messages):
    """Bundle every file under ``temp_dir`` into one zip archive.

    Files are stored under their base name (directory structure is not
    preserved) and added in sorted order so the archive layout does not
    depend on the filesystem's listing order.

    Args:
        temp_dir: Directory whose files are archived (walked recursively).
        log_messages: Kept for backward compatibility. A ``str`` cannot be
            updated in place, so string arguments are left untouched and
            the caller must log the step itself; if a ``list`` is passed,
            one message is appended to it.

    Returns:
        Path of the archive, ``processed_audio_files.zip`` inside the
        system temporary directory.
    """
    # NOTE: the output path is fixed, so each call overwrites the archive
    # produced by the previous one.
    zip_filename = os.path.join(tempfile.gettempdir(), "processed_audio_files.zip")
    with zipfile.ZipFile(zip_filename, 'w') as zipf:
        for root, dirs, files in os.walk(temp_dir):
            dirs.sort()  # in-place sort makes the traversal order deterministic
            for file in sorted(files):
                zipf.write(os.path.join(root, file), file)

    if isinstance(log_messages, list):
        log_messages.append("Files zipped successfully.\n")
    return zip_filename
# Build the Gradio UI. Inputs: the script text, a Bark speaker preset chosen
# by language, and the clone-voice upload. Outputs: the zip download and the
# processing log.
speaker_choices = [
    ("French", "v2/fr_speaker_7"),
    ("English", "v2/en_speaker_7"),
    ("Japanese", "v2/ja_speaker_2"),
    ("German", "v2/de_speaker_6"),
    ("Hindi", "v2/hi_speaker_2"),
    ("Italian", "v2/it_speaker_6"),
    ("Korean", "v2/ko_speaker_0"),
    ("Polish", "v2/pl_speaker_2"),
    ("Portuguese", "v2/pt_speaker_5"),
    ("Russian", "v2/ru_speaker_4"),
    ("Spanish", "v2/es_speaker_0"),
    ("Turkish", "v2/tr_speaker_1"),
]

# Order matters: it matches the parameters of process_audio_files_with_logging.
input_components = [
    gr.Textbox(label="Script", lines=10),
    gr.Dropdown(label="Speaker", choices=speaker_choices),
    gr.Files(label="clone voice"),
]

# Order matters: it matches the (zip path, log text) return value.
output_components = [
    gr.File(label="Download Processed Files"),
    gr.Textbox(label="Log Messages", lines=20),
]

interface = gr.Interface(
    fn=process_audio_files_with_logging,
    inputs=input_components,
    outputs=output_components,
    title="Audio Processing and Generation",
    description="Enter a script and select a speaker to generate and process audio files. Process logs will be displayed below.",
)

# Start the web server for the app.
interface.launch()