pheme / app.py
pfb30's picture
Update app.py
e946294 verified
raw
history blame contribute delete
No virus
1.98 kB
"""Simple demo app.
Copyright PolyAI Limited.
"""
import time
from pathlib import Path
import gradio as gr
from transformer_infer import PhemeClient, parse_arguments
# Voice identifiers offered in the demo's "Select voice:" dropdown (see main).
VOICE_OPTIONS = [
    "male_voice",
    "POD1000000004_S0000246",
    "POD1000000018_S0000253",
    "POD1000000048_S0000035",
    "YOU1000000006_S0000051",
    "YOU1000000044_S0000798",
]

# Module-level setup, executed once at import time: the arguments returned by
# transformer_infer.parse_arguments() configure a single PhemeClient, and
# `inference` reuses that shared `model` for every request.
args = parse_arguments()
model = PhemeClient(args)
def inference(
    text,
    voice,
    top_k,
    temperature
):
    """Synthesize speech for ``text`` with the shared model and yield the audio.

    Every request is appended to ``PhemeVoice.log`` before the model is
    called, and the elapsed inference time is printed to stdout.

    Args:
        text: Text to synthesize.
        voice: Voice identifier forwarded to ``model.infer``.
        top_k: Forwarded to ``model.infer`` as ``top_k``.
        temperature: Forwarded to ``model.infer`` as ``temperature``.

    Yields:
        A single ``(samplerate, data)`` tuple, where ``samplerate`` is 16000
        and ``data`` is whatever ``model.infer`` returned.
    """
    # Explicit UTF-8: the text is arbitrary user input, and the platform's
    # default encoding may be unable to represent non-ASCII characters.
    with open("PhemeVoice.log", "a", encoding="utf-8") as f:
        f.write(f"{voice}: {text} \n")

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()
    data = model.infer(
        text, voice, top_k=top_k, temperature=temperature)
    samplerate = 16_000
    print("Time taken: ", time.perf_counter() - start_time)
    yield (samplerate, data)
def main():
    """Assemble the Gradio interface for the Pheme demo and launch it.

    Creates the text box, voice dropdown, sampling sliders and audio output,
    wires them to ``inference`` through a ``gr.Interface``, enables the
    request queue and launches the app with ``share=True``.
    """
    blurb = """Pheme Model can generate a variety of conversational voices in 16 kHz for phone-call applications.
Paper: https://arxiv.org/pdf/2401.02839.pdf
Github: https://github.com/PolyAI-LDN/pheme
Voices are generated in a zero-shot manner, the model has never seen them before.
"""

    # Input widgets; their order in `inputs` below must follow the
    # positional parameters of `inference` (text, voice, top_k, temperature).
    text_box = gr.Textbox(
        label="Text",
        value="I gotta say, I never expect that to happened. Um I had some expectations but you know.",
        lines=3,
    )
    voice_picker = gr.Dropdown(
        VOICE_OPTIONS,
        label="Select voice:",
        value="POD1000000048_S0000035",
        type="value",
    )
    temperature_slider = gr.Slider(
        minimum=0.3,
        maximum=1.5,
        step=0.05,
        value=0.7,
    )
    top_k_slider = gr.Slider(minimum=10, maximum=250, value=210)

    # Output widget: plays the generated audio as soon as it arrives.
    audio_out = gr.Audio(label="audio:", autoplay=True)

    demo = gr.Interface(
        fn=inference,
        inputs=[text_box, voice_picker, top_k_slider, temperature_slider],
        outputs=[audio_out],
        title="Pheme",
        description=blurb,
    )

    queued_demo = demo.queue()
    queued_demo.launch(share=True)
# Launch the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()