import torch
import spaces
import numpy as np
import gradio as gr
from gtts import gTTS
from transformers import pipeline
from huggingface_hub import InferenceClient
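
# Friday: a minimal voice-assistant demo. Microphone audio is transcribed
# locally with a Whisper pipeline, the transcript goes to Mistral-7B-Instruct
# via the hosted Hugging Face Inference API, and the reply is spoken back
# with gTTS.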


ASR_MODEL_NAME = "openai/whisper-small"
LLM_MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"


system_prompt = """"<s>[INST] You are Friday, a helpful and conversational AI assistant and You respond with one to two sentences. [/INST] Hello there! I'm friday how can I help you?</s>"""

instruct_history = system_prompt

formatted_history = ""

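# InferenceClient sends text-generation requests to the hosted Hugging Face
# Inference API for LLM_MODEL_NAME, so the 7B model never loads locally.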
client = InferenceClient(LLM_MODEL_NAME)

device = 0 if torch.cuda.is_available() else "cpu"

pipe = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    device=device,
)


def generate(instruct_history, temperature=0.1, max_new_tokens=128, top_p=0.95, repetition_penalty=1.0):
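    # Send the full instruct-format history to the hosted model; with
    # return_full_text=False the API returns only the newly generated text.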
    temperature = float(temperature)
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)

    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
    )

    output = client.text_generation(
        instruct_history, **generate_kwargs,
        stream=False, details=False, return_full_text=False,
    )

    return output
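

# For illustration, after one user turn `instruct_history` has the shape:
#   <s>[INST] {system prompt} [/INST] {greeting}</s><s>[INST] {user} [/INST] {reply}</s>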


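# On ZeroGPU Spaces, @spaces.GPU requests a GPU allocation for the call,
# capped here at 60 seconds.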
@spaces.GPU(duration=60)
def transcribe(audio):
    # Accumulate the conversation in the module-level history strings.
    global instruct_history, formatted_history

    sr, y = audio
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:  # normalize to [-1, 1]; skip silent input to avoid divide-by-zero
        y /= peak

    transcribed_user_audio = pipe({"sampling_rate": sr, "raw": y})["text"]

    formatted_history += f"Human: {transcribed_user_audio}\n\n"
    instruct_history += f"<s>[INST] {transcribed_user_audio} [/INST] "

    llm_response = generate(instruct_history)

    instruct_history += f" {llm_response}</s>"
    formatted_history += f"Friday: {llm_response}\n\n"

    audio_response = gTTS(llm_response)
    audio_response.save("response.mp3")

    print(instruct_history)  # debug: log the running prompt

    return "response.mp3", formatted_history


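# Gradio UI: record a question, press Transcribe, and the spoken reply
# autoplays while the running conversation appears in the textbox.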
with gr.Blocks() as demo:
    gr.HTML("<center><h1>Friday: AI Virtual Assistant<h1><center>")

    with gr.Row():
        audio_input = gr.Audio(label="Human", sources=["microphone"])
        output_audio = gr.Audio(label="Friday", type="filepath",
                                interactive=False,
                                autoplay=True,
                                elem_classes="audio")

    transcribe_btn = gr.Button("Transcribe")

    transcription_box = gr.Textbox(label="Conversation")

    transcribe_btn.click(fn=transcribe, inputs=[audio_input],
                         outputs=[output_audio, transcription_box])

if __name__ == "__main__":
    demo.queue()
    demo.launch()