import os
import time

import torch
import whisper
import gradio as gr
import soundfile as sf
from datasets import load_dataset
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from llama_index.core import Settings
from llama_index.llms.text_generation_inference import TextGenerationInference
# Load the Whisper ASR model once at startup; "base" trades accuracy for speed.
model = whisper.load_model("base")

HF_API_TOKEN = os.getenv("HF_TOKEN")
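
# A hedged sanity check (assumption: the Space always needs a token); failing
# fast here is clearer than an opaque 401 from the inference endpoint later.
if HF_API_TOKEN is None:
    raise RuntimeError("Set the HF_TOKEN environment variable to a Hugging Face API token.")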

def translate_audio(audio):
    # Despite the name, this performs English transcription (task="transcribe"),
    # not translation.
    # load audio and pad/trim it to fit 30 seconds
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    # make log-Mel spectrogram and move to the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    # decode the audio greedily (temperature=0)
    options = whisper.DecodingOptions(language="en", task="transcribe", temperature=0)
    result = whisper.decode(model, mel, options)
    return result.text
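
# A minimal sketch of an alternative for clips longer than 30 seconds
# (translate_audio_long is a hypothetical helper, not wired into the app):
# Whisper's high-level transcribe() chunks long audio internally, trading the
# fine-grained control of whisper.decode() above for convenience.
def translate_audio_long(audio_path):
    return model.transcribe(audio_path, language="en", temperature=0)["text"]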

def audio_response(text, output_path="speech.wav"):
    # Load the processor, model, and vocoder (reloaded on every call; see the
    # cached variant below)
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    # Process the input text
    inputs = processor(text=text, return_tensors="pt")
    # Load an x-vector containing the speaker's voice characteristics
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
    # Generate speech
    with torch.no_grad():
        speech = tts_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    # Save the audio to a file; SpeechT5 generates 16 kHz audio
    sf.write(output_path, speech.numpy(), samplerate=16000)
    return output_path
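
# Reloading the processor, model, vocoder, and speaker embedding is the dominant
# cost of audio_response(). A sketch of one-time caching, assuming the same
# checkpoints as above (load_tts_components is an illustrative name):
from functools import lru_cache

@lru_cache(maxsize=1)
def load_tts_components():
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
    return processor, tts_model, vocoder, speaker_embeddings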

def messages_to_prompt(messages):
    # Default system message for the chatbot
    default_system_prompt = "You are an AI chatbot designed to assist with user queries in a friendly and conversational manner."
    prompt = f"<|system|>\n{default_system_prompt}</s>\n"
    for message in messages:
        if message.role == "system":
            prompt += f"<|system|>\n{message.content}</s>\n"
        elif message.role == "user":
            prompt += f"<|user|>\n{message.content}</s>\n"
        elif message.role == "assistant":
            prompt += f"<|assistant|>\n{message.content}</s>\n"
    # Ensure we start with a system prompt, insert a blank one if needed
    if not prompt.startswith("<|system|>\n"):
        prompt = "<|system|>\n</s>\n" + prompt
    # Add final assistant prompt
    prompt = prompt + "<|assistant|>\n"
    return prompt

def completion_to_prompt(completion):
    return f"<|system|>\n</s>\n<|user|>\n{completion}</s>\n<|assistant|>\n"
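
# For reference, completion_to_prompt("Hello") produces:
#   <|system|>
#   </s>
#   <|user|>
#   Hello</s>
#   <|assistant|>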

# Route LlamaIndex completions through the hosted TGI endpoint. Note: the
# <|system|>/<|user|>/<|assistant|> tags above follow the Zephyr-style
# convention from the LlamaIndex TGI examples rather than Llama 3's native
# chat template.
Settings.llm = TextGenerationInference(
    model_url="https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct",
    token=HF_API_TOKEN,
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
)

def text_response(t):
    time.sleep(1)  # brief pause to avoid hammering the inference endpoint; adjust as needed
    response = Settings.llm.complete(t)
    return response.text
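
# A hedged streaming variant (assumes LlamaIndex's stream_complete API, which
# yields response chunks carrying a .delta attribute; not wired into the UI
# below, and streaming support depends on the endpoint):
def text_response_stream(t):
    for chunk in Settings.llm.stream_complete(t):
        yield chunk.delta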

def transcribe_(a):
    t1 = translate_audio(a)   # speech -> text
    t2 = text_response(t1)    # text -> LLM reply
    t3 = audio_response(t2)   # reply -> speech
    return (t1, t2, t3)

output_1 = gr.Textbox(label="Speech to Text")
output_2 = gr.Textbox(label="LLM Output")
output_3 = gr.Audio(label="LLM output to audio")

gr.Interface(
    title="AI Voice Assistant",
    fn=transcribe_,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath"),
    ],
    outputs=[
        output_1, output_2, output_3
    ],
).launch(share=True)
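
# With several heavyweight models in one process, a hedged option is to enable
# Gradio's request queue so concurrent users are serialized, e.g.:
#   gr.Interface(...).queue().launch(share=True)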