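"""Friday: a voice-driven AI assistant Gradio app.

Speech is transcribed locally with Whisper, a short reply is generated by
Mistral-7B-Instruct via the Hugging Face Inference API, and the reply is
spoken back with gTTS.
"""
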
import torch
import spaces
import numpy as np
import gradio as gr
from gtts import gTTS
from transformers import pipeline
from huggingface_hub import InferenceClient


# Whisper runs locally through a transformers pipeline; Mistral is queried
# remotely through the Hugging Face Inference API.
ASR_MODEL_NAME = "openai/whisper-small"
LLM_MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"


system_prompt = """"<s>[INST] You are Friday, a helpful and conversational AI assistant and You respond with one to two sentences. [/INST] Hello there! I'm friday how can I help you?</s>"""

chat_history = system_prompt + """"""

formatted_history = """"""

# Remote text-generation client for the hosted Mistral endpoint; no LLM
# weights are downloaded locally.
client = InferenceClient(LLM_MODEL_NAME)

# transformers pipelines take a CUDA device index or the string "cpu".
device = 0 if torch.cuda.is_available() else "cpu"

pipe = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    device=device,
)


def generate(user_prompt, temperature=0.1, max_new_tokens=128, top_p=0.95, repetition_penalty=1.0):
    global chat_history

    temperature = float(temperature)
    # The inference endpoint rejects temperature == 0, so clamp to a small
    # positive value.
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)

    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
    )

    # Append the new user turn in Mistral's instruction format.
    chat_history += f"<s>[INST] {user_prompt} [/INST] "

    output = client.text_generation(
        chat_history, **generate_kwargs, stream=False, details=False,
        return_full_text=False)

    print(output)
    return output


@spaces.GPU(duration=60)
def transcribe(audio):
    global chat_history, formatted_history

    sr, y = audio
    y = y.astype(np.float32)
    # Normalize to [-1, 1]; guard against an all-zero (silent) clip.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    transcription = pipe({"sampling_rate": sr, "raw": y})["text"]

    formatted_history += f"Human: {transcription}\n"

    llm_response = generate(transcription)

    # Close the assistant turn in the raw prompt history.
    chat_history += f" {llm_response}</s>"

    formatted_history += f"Friday: {llm_response}\n"

    # Speak the reply with gTTS and hand the file path back to Gradio.
    audio_response = gTTS(llm_response)
    audio_response.save("response.mp3")

    print(chat_history)

    return "response.mp3", formatted_history


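# Gradio UI: record speech, autoplay the spoken reply, show the transcript.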
with gr.Blocks() as demo:
    gr.HTML("<center><h1>Friday: AI Virtual Assistant<h1><center>")

    with gr.Row():
        audio_input = gr.Audio(label="Human", sources=["microphone"])
        output_audio = gr.Audio(label="Friday", type="filepath",
                                interactive=False,
                                autoplay=True,
                                elem_classes="audio")

    transcribe_btn = gr.Button("Transcribe")

    transcription_box = gr.Textbox(
        formatted_history, label="Transcription")

    transcribe_btn.click(fn=transcribe, inputs=audio_input,
                         outputs=[output_audio, transcription_box])


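# Queue requests so long-running transcription calls don't block one another.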
demo.queue()
demo.launch()