"""Simple demo app.

Copyright PolyAI Limited.
"""
import time
from pathlib import Path

import gradio as gr

from transformer_infer import PhemeClient, parse_arguments


# Speaker presets selectable in the UI dropdown. "male_voice" is a generic
# preset; the ID-style entries presumably name reference utterances the model
# conditions on for zero-shot voice cloning — TODO confirm against
# transformer_infer.PhemeClient.
VOICE_OPTIONS = [
        "male_voice",
        "POD1000000004_S0000246", 
        "POD1000000018_S0000253", 
        "POD1000000048_S0000035", 
        "YOU1000000006_S0000051", 
        "YOU1000000044_S0000798", 
]

# Module-level side effects: parse CLI arguments and build the inference
# client once at import time, so every UI request reuses the same model.
args = parse_arguments()

model = PhemeClient(args)


def inference(
    text,
    voice,
    top_k,
    temperature
):
    """Synthesize *text* in the given *voice* and yield the audio.

    Args:
        text: Input sentence to synthesize.
        voice: Voice preset name (one of ``VOICE_OPTIONS``).
        top_k: Top-k sampling parameter forwarded to ``model.infer``.
        temperature: Sampling temperature forwarded to ``model.infer``.

    Yields:
        A ``(samplerate, data)`` tuple in the format Gradio's Audio
        component expects.
    """
    # Append the request to a local log file. Use an explicit encoding so
    # the write does not depend on the platform default (avoids encode
    # errors / mojibake for non-ASCII input text on some platforms).
    with open("PhemeVoice.log", "a", encoding="utf-8") as f:
        f.write(f"{voice}: {text} \n")
    start_time = time.time()

    data = model.infer(
        text, voice, top_k=top_k, temperature=temperature)
    samplerate = 16_000  # the model produces 16 kHz audio
    print("Time taken: ", time.time() - start_time)
    # Yield (not return) so Gradio treats the callback as a streaming
    # generator; requires interface.queue() to be enabled.
    yield (samplerate, data)


def main():
    """Assemble the Gradio demo for Pheme and launch it with a public link."""
    app_title = "Pheme"
    app_description = """Pheme Model can generate a variety of conversational voices in 16 kHz for phone-call applications.
    
    Paper: https://arxiv.org/pdf/2401.02839.pdf
    Github: https://github.com/PolyAI-LDN/pheme

    Voices are generated in a zero-shot manner, the model has never seen them before.
    """
    # Output widget: plays each generated clip as soon as it arrives.
    audio_out = gr.Audio(label="audio:", autoplay=True)

    # Input widgets, listed below in the exact order inference() expects.
    text_box = gr.Textbox(
        lines=3,
        value="I gotta say, I never expect that to happened. Um I had some expectations but you know.",
        label="Text",
    )
    voice_picker = gr.Dropdown(
        VOICE_OPTIONS, value="POD1000000048_S0000035", label="Select voice:", type="value"
    )
    k_slider = gr.Slider(minimum=10, maximum=250, value=210)
    temp_slider = gr.Slider(minimum=.3, maximum=1.5, value=0.7, step=0.05)

    demo = gr.Interface(
        fn=inference,
        inputs=[
            text_box,
            voice_picker,
            k_slider,
            temp_slider,
        ],
        title=app_title,
        description=app_description,
        outputs=[audio_out],
    )
    # queue() enables generator (streaming) callbacks; share=True exposes a
    # temporary public gradio.live URL.
    demo.queue().launch(share=True)


# Script entry point: only launch the demo when run directly, not on import.
if __name__ == "__main__":
    main()