Spaces:
Running
on
Zero
Running
on
Zero
File size: 2,751 Bytes
d51e19d 1b34aa5 d51e19d 1b34aa5 5ba294c 1b34aa5 c8b7fcf 1b34aa5 d51e19d 86b5bcb d51e19d 5ba294c d51e19d 874fe80 d51e19d 86b5bcb d51e19d 86b5bcb 1b34aa5 86b5bcb 1b34aa5 5ba294c 1b34aa5 874fe80 d51e19d 1b34aa5 d51e19d 5ba294c 3cbdf32 c8b7fcf d51e19d c8b7fcf 98bbe93 f76ff96 9022329 98bbe93 d51e19d 2b54ce7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
import torch
import spaces
import numpy as np
import gradio as gr
from gtts import gTTS
from transformers import pipeline
from huggingface_hub import InferenceClient
# Hugging Face model identifiers: speech-to-text model and chat LLM.
ASR_MODEL_NAME = "openai/whisper-small"
LLM_MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"

# Seed exchange in [INST] ... [/INST] form that every prompt starts with.
# The literal previously opened with four quote characters, which put a stray
# '"' in front of "<s>" in the text sent to the LLM.
system_prompt = """<s>[INST] You are Friday, a helpful and conversational AI assistant and You respond with one to two sentences. [/INST] Hello there! I'm friday how can I help you?</s>"""

# Initial values for the two history strings used as transcribe() defaults:
# the raw instruct-format prompt and the human-readable transcript.
instruct_history = system_prompt
formatted_history = ""

# Remote text-generation client for the LLM.
client = InferenceClient(LLM_MODEL_NAME)

# Use CUDA device 0 when torch reports CUDA is available, otherwise the CPU.
device = 0 if torch.cuda.is_available() else "cpu"

# Speech-recognition pipeline used to transcribe the recorded audio.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    device=device,
)
def generate(instruct_history, temperature=0.1, max_new_tokens=128, top_p=0.95, repetition_penalty=1.0):
    """Request a completion for the given prompt from the module-level client.

    Args:
        instruct_history: Full prompt text passed to the text-generation call.
        temperature: Sampling temperature; coerced to float and raised to
            0.01 if it is smaller than that.
        max_new_tokens: Forwarded unchanged to the text-generation call.
        top_p: Nucleus-sampling value; coerced to float.
        repetition_penalty: Forwarded unchanged to the text-generation call.

    Returns:
        Whatever ``client.text_generation`` returns for a non-streaming,
        no-details request (presumably the generated text only, since
        ``return_full_text`` is False -- confirm against the hub docs).
    """
    # Floor the temperature at 0.01 instead of passing tiny/zero values on.
    sampling_temperature = max(float(temperature), 1e-2)
    nucleus_p = float(top_p)

    # Sampling is enabled with a fixed seed on every request.
    return client.text_generation(
        instruct_history,
        temperature=sampling_temperature,
        max_new_tokens=max_new_tokens,
        top_p=nucleus_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
        stream=False,
        details=False,
        return_full_text=False,
    )
@spaces.GPU(duration=60)
def transcribe(audio, instruct_history=instruct_history, formatted_history=formatted_history):
    """Transcribe a recording, ask the LLM for a reply, and synthesize it.

    Args:
        audio: ``(sample_rate, samples)`` tuple as delivered by the Gradio
            audio input, or ``None`` when nothing was recorded.
        instruct_history: Instruct-format prompt the new user turn is
            appended to. Defaults to the module-level starting prompt.
        formatted_history: Human-readable transcript the new turns are
            appended to. Defaults to the module-level starting transcript.

    Returns:
        A ``(path, transcript)`` tuple: the path of the saved MP3 with the
        spoken reply, and the transcript text including this exchange.

    Raises:
        gr.Error: If no audio (or an empty recording) was supplied.

    Note:
        Both history arguments are strings, so the ``+=`` updates below only
        rebind local names; the module-level defaults are not modified by a
        call.
    """
    # Gradio passes None when the button is clicked without a recording;
    # fail with a readable message instead of a tuple-unpacking TypeError.
    if audio is None:
        raise gr.Error("No audio received. Please record a message first.")

    sr, y = audio
    if y.size == 0:
        raise gr.Error("The recording is empty. Please record a message first.")

    # Down-mix multi-channel recordings to mono before recognition.
    if y.ndim > 1:
        y = y.mean(axis=1)
    y = y.astype(np.float32)

    # Peak-normalize, skipping all-zero (silent) input so the division
    # cannot fill the signal with NaNs.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    transcribed_user_audio = pipe({"sampling_rate": sr, "raw": y})["text"]

    # Record the user turn in both the transcript and the LLM prompt.
    formatted_history += f"""Human: {transcribed_user_audio}\n\n"""
    instruct_history += f"""<s>[INST] {transcribed_user_audio} [/INST] """

    llm_response = generate(instruct_history)

    # Record the assistant turn in both histories.
    instruct_history += f""" {llm_response}</s>"""
    formatted_history += f"""Friday: {llm_response}\n\n"""

    # Convert the reply to speech and write it to a fixed file name.
    audio_response = gTTS(llm_response)
    audio_response.save("response.mp3")

    print(instruct_history)
    return "response.mp3", formatted_history
# UI definition: microphone input and spoken reply side by side, with the
# trigger button and the running transcript below.
# NOTE(review): the nesting under gr.Row() is reconstructed (the two audio
# widgets in the row, the button and textbox beneath it) -- confirm layout.
with gr.Blocks() as demo:
    # Page header; the heading and centering tags are now properly closed.
    gr.HTML("<center><h1>Friday: AI Virtual Assistant</h1></center>")
    with gr.Row():
        audio_input = gr.Audio(label="Human", sources="microphone")
        output_audio = gr.Audio(
            label="Friday",
            type="filepath",
            interactive=False,
            autoplay=True,
            elem_classes="audio",
        )
    transcribe_btn = gr.Button("Transcribe")
    transcription_box = gr.Textbox(label="Transcription")

    # Only the recording is passed in, so transcribe() runs with its default
    # history arguments on every click.
    transcribe_btn.click(
        fn=transcribe,
        inputs=[audio_input],
        outputs=[output_audio, transcription_box],
    )
if __name__ == "__main__":
    # Script entry point: enable the request queue, then start the app.
    demo.queue().launch()
|