# friday/app.py
import torch
import spaces
import numpy as np
import gradio as gr
from gtts import gTTS
from transformers import pipeline
from huggingface_hub import InferenceClient
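
# Model choices: Whisper-small transcribes microphone audio locally, while
# Mistral-7B-Instruct generates replies remotely via the HF Inference API.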
ASR_MODEL_NAME = "openai/whisper-small"
LLM_MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"
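
# Mistral's instruct format wraps each user turn in <s>[INST] ... [/INST] and
# closes each assistant reply with </s>; the history below keeps that framing.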
system_prompt = """<s>[INST] You are Friday, a helpful and conversational AI assistant. You respond with one to two sentences. [/INST] Hello there! I'm Friday, how can I help you?</s>"""

instruct_history = system_prompt
formatted_history = ""
client = InferenceClient(LLM_MODEL_NAME)
device = 0 if torch.cuda.is_available() else "cpu"
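
# Local ASR pipeline; device=0 targets the first GPU when one is available.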
pipe = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    device=device,
)
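

# Send the accumulated instruct-format history to the LLM and return only the
# newly generated text (return_full_text=False drops the echoed prompt).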
def generate(user_prompt, temperature=0.1, max_new_tokens=128, top_p=0.95, repetition_penalty=1.0):
    global instruct_history

    # Clamp temperature to a small positive value; 0 is rejected when sampling.
    temperature = float(temperature)
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)

    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
    )

    # Open a new user turn; the assistant's reply is appended (and the turn
    # closed with </s>) by the caller, transcribe().
    instruct_history += f"""<s>[INST] {user_prompt} [/INST] """

    output = client.text_generation(
        instruct_history, **generate_kwargs, stream=False, details=False,
        return_full_text=False)

    return output
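

# Full voice round trip: transcribe the recording with Whisper, query the LLM,
# then speak the reply with gTTS. @spaces.GPU requests a ZeroGPU slot (up to
# 60 seconds here) on Hugging Face Spaces for the duration of this call.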
@spaces.GPU(duration=60)
def transcribe(audio):
    global instruct_history, formatted_history

    sr, y = audio

    # Normalize the waveform to [-1, 1]; guard against all-silent input so we
    # don't divide by zero.
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    inputs = pipe({"sampling_rate": sr, "raw": y})["text"]
    formatted_history += f"""Human: {inputs}\n"""

    llm_response = generate(inputs)

    # Close the open [INST] turn with the model's reply and record both sides
    # of the exchange in the human-readable transcript.
    instruct_history += f""" {llm_response}</s>"""
    formatted_history += f"""Friday: {llm_response}\n"""

    # Synthesize the reply and hand Gradio the saved file path for playback.
    audio_response = gTTS(llm_response)
    audio_response.save("response.mp3")

    print(instruct_history)

    return "response.mp3", formatted_history
with gr.Blocks() as demo:
    gr.HTML("<center><h1>Friday: AI Virtual Assistant</h1></center>")

    with gr.Row():
        audio_input = gr.Audio(label="Human", sources="microphone")
        output_audio = gr.Audio(label="Friday", type="filepath",
                                interactive=False,
                                autoplay=True,
                                elem_classes="audio")

    transcribe_btn = gr.Button("Transcribe")
    transcription_box = gr.Textbox(label="Transcription")

    transcribe_btn.click(fn=transcribe, inputs=[audio_input],
                         outputs=[output_audio, transcription_box])
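
# queue() enables request queuing so simultaneous users are served in order.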
if __name__ == "__main__":
    demo.queue()
    demo.launch()