import os
import gradio as gr
from openai import OpenAI
from playdiffusion import PlayDiffusion, InpaintInput, TTSInput, RVCInput

inpainter = PlayDiffusion()

# Created lazily so the app can start even if OPENAI_API_KEY is not yet set.
_whisper_client = None

def get_whisper_client():
    global _whisper_client
    if _whisper_client is None:
        _whisper_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    return _whisper_client
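
# Note: ASR needs OPENAI_API_KEY set in the environment; without it the OpenAI
# client cannot be constructed and the first "Run ASR" click will fail.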

def run_asr(audio):
    """Transcribe the uploaded audio with Whisper and collect word-level timestamps."""
    whisper_client = get_whisper_client()
    # Use a context manager so the file handle is closed after the request.
    with open(audio, "rb") as audio_file:
        transcript = whisper_client.audio.transcriptions.create(
            file=audio_file,
            model="whisper-1",
            response_format="verbose_json",
            timestamp_granularities=["word"]
        )
    word_times = [{
        "word": word.word,
        "start": word.start,
        "end": word.end
    } for word in transcript.words]
    # The transcript is returned twice: once for the read-only "input text" box
    # and once to pre-fill the editable "output text" box.
    return transcript.text, transcript.text, word_times
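
# For reference, the `word_times` payload produced above is shaped like
# (values illustrative): [{"word": "hello", "start": 0.12, "end": 0.48}, ...]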

def run_inpainter(input_text, output_text, word_times, audio, num_steps, init_temp,
                  init_diversity, guidance, rescale, topk, use_manual_ratio,
                  audio_token_syllable_ratio):
    if not use_manual_ratio:
        audio_token_syllable_ratio = None  # let PlayDiffusion derive the ratio
    return inpainter.inpaint(InpaintInput(
        input_text=input_text, output_text=output_text, input_word_times=word_times,
        audio=audio, num_steps=num_steps, init_temp=init_temp,
        init_diversity=init_diversity, guidance=guidance, rescale=rescale,
        topk=topk, audio_token_syllable_ratio=audio_token_syllable_ratio))
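
# The inpainter relies on the ASR word timings to locate the words that differ
# between input_text and output_text; audio outside the edited region is
# intended to be preserved as-is.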

def run_inpainter_tts(input_text, voice_audio, num_steps, init_temp, init_diversity,
                      guidance, rescale, topk, use_manual_ratio,
                      audio_token_syllable_ratio):
    if not use_manual_ratio:
        audio_token_syllable_ratio = None
    return inpainter.tts(TTSInput(
        output_text=input_text, voice=voice_audio, num_steps=num_steps,
        init_temp=init_temp, init_diversity=init_diversity, guidance=guidance,
        rescale=rescale, topk=topk,
        audio_token_syllable_ratio=audio_token_syllable_ratio))

def toggle_ratio_input(use_manual):
    # Show and enable the manual ratio field only when the checkbox is ticked.
    return gr.update(visible=use_manual, interactive=use_manual)

def create_advanced_options_accordion():
    """Build the shared "Advanced options" accordion and return its controls."""
    with gr.Accordion("Advanced options", open=False):
        num_steps_slider = gr.Slider(1, 100, 30, step=1, label="Number of sampling steps per codebook")
        init_temp_slider = gr.Slider(0.5, 10, 1, step=0.1, label="Initial temperature")
        init_diversity_slider = gr.Slider(0, 10, 1, step=0.1, label="Initial diversity")
        guidance_slider = gr.Slider(0, 10, 0.5, step=0.1, label="Guidance")
        rescale_slider = gr.Slider(0, 1, 0.7, step=0.1, label="Guidance rescale factor")
        topk_slider = gr.Slider(1, 10000, 25, step=1, label="Sample from top-k logits")
        gr.Markdown("#### Audio Token Syllable Ratio")
        gr.Markdown("*Automatic calculation (recommended) provides the best results in most cases.*")
        use_manual_ratio = gr.Checkbox(label="Use manual audio token syllable ratio", value=False)
        audio_token_syllable_ratio = gr.Number(
            label="Audio token syllable ratio (manual)",
            value=12.5, precision=2, minimum=5.0, maximum=25.0,
            visible=False, interactive=False
        )
        use_manual_ratio.change(
            toggle_ratio_input,
            inputs=[use_manual_ratio],
            outputs=[audio_token_syllable_ratio]
        )
    return (num_steps_slider, init_temp_slider, init_diversity_slider,
            guidance_slider, rescale_slider, topk_slider,
            use_manual_ratio, audio_token_syllable_ratio)
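
# Note: the tuple order returned above must line up with the trailing parameters
# of run_inpainter and run_inpainter_tts, because Gradio passes `inputs`
# positionally to the click handler.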

def speech_rvc(rvc_source_speech, rvc_target_voice):
    # Voice conversion: re-render the source speech in the target voice.
    return inpainter.rvc(RVCInput(source_speech=rvc_source_speech, target_voice=rvc_target_voice))

if __name__ == '__main__':
    with gr.Blocks(analytics_enabled=False, title="PlayDiffusion") as demo:
        gr.Markdown("## PlayDiffusion")

        with gr.Tab("Inpaint"):
            gr.Markdown("### Upload an audio file and run ASR to get the text.")
            gr.Markdown("### Then, specify the desired output text.")
            gr.Markdown("### Run the inpainter to generate the modified audio.")
            gr.Markdown("### Note: The model and demo currently target English.")
            inpaint_advanced_options = create_advanced_options_accordion()
            with gr.Row():
                audio_input = gr.Audio(label="Upload audio to be modified",
                                       sources=["upload", "microphone"], type="filepath")
            with gr.Row():
                asr_submit = gr.Button("Run ASR")
            with gr.Row():
                with gr.Column():
                    text_input = gr.Textbox(label="Input text from ASR", interactive=False)
                    text_output = gr.Textbox(label="Desired output text")
                with gr.Column():
                    word_times = gr.JSON(label="Word times from ASR")
            with gr.Row():
                inpainter_submit = gr.Button("Run Inpainter")
            with gr.Row():
                audio_output = gr.Audio(label="Output audio")
            asr_submit.click(run_asr, inputs=[audio_input],
                             outputs=[text_input, text_output, word_times])
            inpainter_submit.click(
                run_inpainter,
                inputs=[text_input, text_output, word_times, audio_input] + list(inpaint_advanced_options),
                outputs=[audio_output])

        with gr.Tab("Text to Speech"):
            gr.Markdown("### Text to Speech")
            tts_advanced_options = create_advanced_options_accordion()
            tts_text = gr.Textbox(label="TTS Input", placeholder="Enter text to convert to speech", lines=2)
            tts_voice = gr.Audio(label="Voice to use for TTS",
                                 sources=["upload", "microphone"], type="filepath")
            tts_submit = gr.Button("Convert to Speech")
            tts_output = gr.Audio(label="Generated Speech")
            tts_submit.click(
                run_inpainter_tts,
                inputs=[tts_text, tts_voice] + list(tts_advanced_options),
                outputs=[tts_output]
            )

        with gr.Tab("Voice Conversion"):
            gr.Markdown("### Real-Time Voice Conversion (works best for English)")
            rvc_source_speech = gr.Audio(label="Source speech to convert",
                                         sources=["upload", "microphone"], type="filepath")
            rvc_target_voice = gr.Audio(label="Target Voice",
                                        sources=["upload", "microphone"], type="filepath")
            rvc_submit = gr.Button("Run Voice Conversion")
            rvc_output = gr.Audio(label="Converted Speech")
            rvc_submit.click(
                speech_rvc,
                inputs=[rvc_source_speech, rvc_target_voice],
                outputs=[rvc_output]
            )

    demo.launch(share=True)
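
# A minimal headless sketch of the same TTS call, bypassing the Gradio UI
# ("voice.wav" is a hypothetical placeholder; values mirror the slider defaults):
#
#   from playdiffusion import PlayDiffusion, TTSInput
#   pd = PlayDiffusion()
#   audio = pd.tts(TTSInput(output_text="Hello there", voice="voice.wav",
#                           num_steps=30, init_temp=1, init_diversity=1,
#                           guidance=0.5, rescale=0.7, topk=25,
#                           audio_token_syllable_ratio=None))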