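"""Friday: a voice-driven AI assistant Gradio app.

Speech is transcribed locally with Whisper, a short reply is generated by
Mistral-7B-Instruct via the Hugging Face Inference API, and the reply is
spoken back with gTTS.
"""
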
import torch
import spaces
import numpy as np
import gradio as gr
from gtts import gTTS
from transformers import pipeline
from huggingface_hub import InferenceClient


# Whisper runs locally through a transformers pipeline; Mistral is queried
# remotely through the Hugging Face Inference API.
ASR_MODEL_NAME = "openai/whisper-small"
LLM_MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"


system_prompt = """"<s>[INST] You are Friday, a helpful and conversational AI assistant and You respond with one to two sentences. [/INST] Hello there! I'm friday how can I help you?</s>"""

chat_history = system_prompt + """"""

formatted_history = """"""

# Remote text-generation client for the hosted Mistral endpoint; no LLM
# weights are downloaded locally.
client = InferenceClient(LLM_MODEL_NAME)

# transformers pipelines take a CUDA device index or the string "cpu".
device = 0 if torch.cuda.is_available() else "cpu"

pipe = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    device=device,
)


def generate(user_prompt, temperature=0.1, max_new_tokens=128, top_p=0.95, repetition_penalty=1.0):
    global chat_history

    temperature = float(temperature)
    # The inference endpoint rejects temperature == 0, so clamp to a small
    # positive value.
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)

    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
    )

    # Append the new user turn in Mistral's instruction format.
    chat_history += f"<s>[INST] {user_prompt} [/INST] "

    output = client.text_generation(
        chat_history, **generate_kwargs, stream=False, details=False,
        return_full_text=False)

    print(output)
    return output


@spaces.GPU(duration=60)
def transcribe(audio):
    global chat_history, formatted_history

    sr, y = audio
    y = y.astype(np.float32)
    # Normalize to [-1, 1]; guard against an all-zero (silent) clip.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    transcription = pipe({"sampling_rate": sr, "raw": y})["text"]

    formatted_history += f"Human: {transcription}\n"

    llm_response = generate(transcription)

    # Close the assistant turn in the raw prompt history.
    chat_history += f" {llm_response}</s>"

    formatted_history += f"Friday: {llm_response}\n"

    # Speak the reply with gTTS and hand the file path back to Gradio.
    audio_response = gTTS(llm_response)
    audio_response.save("response.mp3")

    print(chat_history)

    return "response.mp3", formatted_history


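# Gradio UI: record speech, autoplay the spoken reply, show the transcript.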
with gr.Blocks() as demo:
    gr.HTML("<center><h1>Friday: AI Virtual Assistant<h1><center>")

    with gr.Row():
        audio_input = gr.Audio(label="Human", sources=["microphone"])
        output_audio = gr.Audio(label="Friday", type="filepath",
                                interactive=False,
                                autoplay=True,
                                elem_classes="audio")

    transcribe_btn = gr.Button("Transcribe")

    transcription_box = gr.Textbox(
        formatted_history, label="Transcription")

    transcribe_btn.click(fn=transcribe, inputs=audio_input,
                         outputs=[output_audio, transcription_box])


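# Queue requests so long-running transcription calls don't block one another.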
demo.queue()
demo.launch()