"""Simple demo app. Copyright PolyAI Limited. """ import time from pathlib import Path import gradio as gr from transformer_infer import PhemeClient, parse_arguments VOICE_OPTIONS = [ "male_voice", "POD1000000004_S0000246", "POD1000000018_S0000253", "POD1000000048_S0000035", "YOU1000000006_S0000051", "YOU1000000044_S0000798", ] args = parse_arguments() model = PhemeClient(args) def inference( text, voice, top_k, temperature ): with open("PhemeVoice.log", "a") as f: f.write(f"{voice}: {text} \n") start_time = time.time() data = model.infer( text, voice, top_k=top_k, temperature=temperature) samplerate = 16_000 print("Time taken: ", time.time() - start_time) yield (samplerate, data) def main(): title = "Pheme" description = """Pheme Model can generate a variety of conversational voices in 16 kHz for phone-call applications. Paper: https://arxiv.org/pdf/2401.02839.pdf Github: https://github.com/PolyAI-LDN/pheme Voices are generated in a zero-shot manner, the model has never seen them before. """ text = gr.Textbox( lines=3, value="I gotta say, I never expect that to happened. Um I had some expectations but you know.", label="Text", ) voice = gr.Dropdown( VOICE_OPTIONS, value="POD1000000048_S0000035", label="Select voice:", type="value" ) temperature = gr.Slider(minimum=.3, maximum=1.5, value=0.7, step=0.05) top_k = gr.Slider(minimum=10, maximum=250, value=210) output_audio = gr.Audio(label="audio:", autoplay=True) interface = gr.Interface( fn=inference, inputs=[ text, voice, top_k, temperature, ], title=title, description=description, outputs=[output_audio], ) interface.queue().launch(share=True) if __name__ == "__main__": main()