import gradio
import torch
from transformers import VitsTokenizer, VitsModel, AutoModelForCausalLM, AutoTokenizer, pipeline
import numpy as np

# Speech-to-text, chat, and text-to-speech models for the voice-assistant pipeline.
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-tiny.en")
model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
tts_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
tts_tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")


def ASR(Query):
    """Transcribe microphone audio to text with Whisper."""
    sr, data = Query
    data = data.astype(np.float32)
    # Peak-normalize, guarding against all-zero (silent) input.
    peak = np.max(np.abs(data))
    if peak > 0:
        data /= peak
    return transcriber({"sampling_rate": sr, "raw": data})["text"]


def LLM(Query):
    """Generate a short reply to the transcribed query with Phi-3."""
    messages = [{"role": "user", "content": Query}]
    inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
    outputs = model.generate(inputs, max_new_tokens=64)
    text = tokenizer.batch_decode(outputs)[0]
    # Keep only the assistant turn, trimmed to its last complete sentence.
    reply = text[text.find("<|assistant|>") + len("<|assistant|>"):]
    return reply[:reply.rfind(".") + 1].strip()


def TTS(Query):
    """Synthesize the given reply text to speech with MMS-TTS.

    Takes the text directly rather than calling LLM() again, which
    previously ran the model a second time on its own output.
    """
    toSpeech = tts_tokenizer(text=Query, return_tensors="pt")
    with torch.no_grad():
        outputs = tts_model(**toSpeech)
    return tts_model.config.sampling_rate, outputs.waveform[0].numpy()


def Alexei(Query):
    """Full pipeline: speech in -> transcript -> LLM reply -> speech out."""
    ASRout = ASR(Query)
    LLMout = LLM(ASRout)
    return TTS(LLMout)


gradio.Interface(fn=Alexei, inputs=gradio.Audio(sources=["microphone"]), outputs="audio").launch()