import os

import gradio as gr
import whisper
from playsound import playsound
from transformers import MarianMTModel, MarianTokenizer
from transformers import GPTNeoForCausalLM, GPT2Tokenizer
from TTS.api import TTS


# Load the MarianMT model and tokenizer for multilingual translation
def load_translation_model(src_lang, tgt_lang):
    model_name = f'Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}'
    model = MarianMTModel.from_pretrained(model_name)
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    return model, tokenizer


# Translation function using MarianMT
def translate_text(text, src_lang, tgt_lang):
    model, tokenizer = load_translation_model(src_lang, tgt_lang)
    inputs = tokenizer(text, return_tensors="pt", padding=True)
    translated_tokens = model.generate(**inputs)
    translated_text = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
    return translated_text


# Predefined keyword-based responses in English (kept for reference; superseded
# by the GPT-Neo answer generator below)
# def get_predefined_response(query):
#     """Retrieve a predefined response based on keywords in English."""
#     query_lower = query.lower()
#     # Define some common keywords related to specific queries
#     keywords = {
#         "countries": "There are 195 countries in the world.",
#         "how many countries": "There are 195 countries in the world.",
#         "prime minister": "The Prime Minister of India is Narendra Modi.",
#         "name": "I am Nora, a chatbot.",
#         "favorite color": "My favourite color is white and black.",
#         "how are you": "I'm good, thank you! How about you?",
#         "hello": "Hello! How can I help you?",
#         "bye": "Goodbye! Have a nice day!",
#         "how is today": "Today is a beautiful day for learning something new.",
#         "language": "Language is a bridge that connects cultures and people.",
#         "technology": "Technology is only as good as the person using it.",
#         "consistency": "Consistency is the key to achieving long-term success.",
#         "empathy": "Empathy and understanding make the world a better place.",
#         "impossible": "Nothing is impossible when you put your mind to it.",
#         "communication": "Communication is essential in any relationship.",
#         "honesty": "Honesty is the best policy.",
#         "eating healthy": "Eating healthy foods boosts your immune system.",
#     }
#     # Check if any of the keywords exist in the query
#     for keyword, response in keywords.items():
#         if keyword in query_lower:
#             return response
#     # If no keyword matches, return a default message
#     return "I'm sorry, I didn't understand that."
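
# A minimal usage sketch (not part of the original flow): exercising the MarianMT
# helpers above on a short English sentence. The sample text and the "en" -> "fr"
# pair are assumptions; any pair with a published Helsinki-NLP/opus-mt-{src}-{tgt}
# checkpoint should work the same way. translation_demo is a hypothetical name
# introduced here for illustration; call it manually to try the helpers in isolation.
def translation_demo():
    sample = "Hello, how are you today?"
    print(f"EN: {sample}")
    print(f"FR: {translate_text(sample, 'en', 'fr')}")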
def fetch_answers_gpt_neo(query):
    """Generate an answer to the query with GPT-Neo 1.3B."""
    model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
    tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")

    prompt = query
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    gen_tokens = model.generate(
        input_ids,
        do_sample=True,
        temperature=0.9,
        max_length=100,
    )
    gen_text = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)[0]
    return gen_text


# Initialize the Coqui 🐸TTS XTTS v2 voice-cloning model once at import time.
# (TTS().list_models() prints the other available models; the your_tts model
# previously loaded here was never used, so it has been dropped.)
tts_model = TTS("tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=False, gpu=False)

# Reference clip for voice cloning and path where the synthesized reply is written
SPEAKER_WAV = "/content/drive/MyDrive/WhisperAudio/Speakers/LJ001-0002.wav"
OUTPUT_WAV_PATH = "/content/drive/MyDrive/WhisperAudio/ProcessedAudio/response_audio_coqui.wav"


def synthesize_audio(text, lang):
    """Convert text to speech with Coqui XTTS v2 and write it to OUTPUT_WAV_PATH."""
    # XTTS v2 is a multilingual voice-cloning model, so both a reference
    # speaker clip and the target language must be supplied.
    tts_model.tts_to_file(
        text=text,
        speaker_wav=SPEAKER_WAV,
        language=lang,
        file_path=OUTPUT_WAV_PATH,
    )

    # Play the reply locally; skip this on headless environments such as Colab.
    # playsound(OUTPUT_WAV_PATH)
    # os.remove(OUTPUT_WAV_PATH)

    return OUTPUT_WAV_PATH


def process_audio_with_whisper(audio_path, model_size="medium"):
    """
    Load the Whisper model, detect the language of the audio, transcribe it,
    and produce a spoken response in the detected language.
    """
    # Load the Whisper model
    model = whisper.load_model(model_size)

    # Detect language and transcribe the audio
    result = model.transcribe(audio_path)
    detected_language = result["language"]
    transcription = result["text"]
    print(f"Detected Language: {detected_language}")
    print(f"Original Transcription: {transcription}")

    # Translate the transcription to English if needed
    if detected_language != "en":
        translated_result = model.transcribe(audio_path, task="translate")
        transcription_in_english = translated_result["text"]
        print(f"Translated Text to English: {transcription_in_english}")
    else:
        transcription_in_english = transcription
        print("Audio is already in English.")

    # Generate a response in English with GPT-Neo
    response_text_in_english = fetch_answers_gpt_neo(transcription_in_english)
    print(f"Response in English: {response_text_in_english}")

    # Translate the response back to the original language using MarianMT
    # (skipped when the audio was already English, since there is no en-en model)
    if detected_language != "en":
        translated_response = translate_text(response_text_in_english, "en", detected_language)
    else:
        translated_response = response_text_in_english
    print(f"Translated Response: {translated_response}")

    # Synthesize the response audio in the detected language
    synthesize_audio(translated_response, detected_language)
    return translated_response


def process_audio(audio_path):
    # With type="filepath", Gradio passes the uploaded file's path as a string
    # (not a (filepath, sampling_rate) tuple).
    if audio_path is None:
        return "No audio file received."
    response = process_audio_with_whisper(audio_path)
    return f"Audio file received: {audio_path}\nResponse: {response}"


interface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(source="upload", type="filepath"),  # User uploads an audio file
    outputs="text",
)

interface.launch()

# To run the pipeline directly on a local file instead of through the UI:
# process_audio_with_whisper(audio_path)
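
# An optional variant (a sketch, not part of the original flow): playsound cannot
# play audio in a headless environment such as Colab, so one alternative is to
# return the synthesized reply from the Gradio callback and let the browser play
# it. This assumes Gradio 3.x (gr.Audio(source=...)) and reuses the helpers defined
# above; respond_with_audio and build_audio_interface are hypothetical names
# introduced here for illustration.
def respond_with_audio(audio_path):
    if audio_path is None:
        return "No audio file received.", None
    response_text = process_audio_with_whisper(audio_path)
    # synthesize_audio() has already written the spoken reply to OUTPUT_WAV_PATH;
    # returning that path lets the gr.Audio output component stream it back.
    return response_text, OUTPUT_WAV_PATH


def build_audio_interface():
    return gr.Interface(
        fn=respond_with_audio,
        inputs=gr.Audio(source="upload", type="filepath"),
        outputs=[
            gr.Textbox(label="Response"),
            gr.Audio(type="filepath", label="Spoken response"),
        ],
    )

# build_audio_interface().launch()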