from functools import lru_cache

import gradio as gr
import whisper
from transformers import MarianMTModel, MarianTokenizer, GPTNeoForCausalLM, GPT2Tokenizer
from TTS.api import TTS
# Load the MarianMT model and tokenizer for multilingual translation.
# Cached so repeated requests for the same language pair do not reload the model.
@lru_cache(maxsize=None)
def load_translation_model(src_lang, tgt_lang):
    model_name = f"Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}"
    model = MarianMTModel.from_pretrained(model_name)
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    return model, tokenizer
# Translation function using MarianMT
def translate_text(text, src_lang, tgt_lang):
    model, tokenizer = load_translation_model(src_lang, tgt_lang)
    inputs = tokenizer(text, return_tensors="pt", padding=True)
    translated_tokens = model.generate(**inputs)
    translated_text = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
    return translated_text
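
# Not every language pair has a published opus-mt checkpoint on the Hub, so a
# direct call to translate_text can raise. This fallback wrapper is an addition
# (a hedged sketch, not part of the original app): it returns the untranslated
# text when no model exists for the pair, keeping the demo alive instead of
# crashing.
def translate_text_safe(text, src_lang, tgt_lang):
    if src_lang == tgt_lang:
        return text
    try:
        return translate_text(text, src_lang, tgt_lang)
    except Exception:  # deliberately broad for a demo: unknown repo, network, etc.
        print(f"No opus-mt model for {src_lang}-{tgt_lang}; returning source text.")
        return text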
# # Predefined responses in English (kept for reference; replaced by GPT-Neo below)
# def get_predefined_response(query):
#     """Retrieve a predefined response based on keywords in English."""
#     query_lower = query.lower()
#     # Define some common keywords related to specific queries
#     keywords = {
#         "countries": "There are 195 countries in the world.",
#         "how many countries": "There are 195 countries in the world.",
#         "prime minister": "The Prime Minister of India is Narendra Modi.",
#         "name": "I am Nora, a chatbot.",
#         "favorite color": "My favorite color is white and black.",
#         "how are you": "I'm good, thank you! How about you?",
#         "hello": "Hello! How can I help you?",
#         "bye": "Goodbye! Have a nice day!",
#         "how is today": "Today is a beautiful day for learning something new.",
#         "language": "Language is a bridge that connects cultures and people.",
#         "technology": "Technology is only as good as the person using it.",
#         "consistency": "Consistency is the key to achieving long-term success.",
#         "empathy": "Empathy and understanding make the world a better place.",
#         "impossible": "Nothing is impossible when you put your mind to it.",
#         "communication": "Communication is essential in any relationship.",
#         "honesty": "Honesty is the best policy.",
#         "eating healthy": "Eating healthy foods boosts your immune system.",
#     }
#     # Check if any of the keywords exist in the (lowercased) query
#     for keyword, response in keywords.items():
#         if keyword in query_lower:
#             return response
#     # If no keyword matches, return a default message
#     return "I'm sorry, I didn't understand that."
# GPT-Neo is used for open-ended answers. The 1.3B checkpoint is loaded once at
# module level so it is not re-initialized on every query.
gpt_neo_model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
gpt_neo_tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")

def fetch_answers_gpt_neo(query):
    input_ids = gpt_neo_tokenizer(query, return_tensors="pt").input_ids
    gen_tokens = gpt_neo_model.generate(
        input_ids,
        do_sample=True,
        temperature=0.9,
        max_new_tokens=100,  # cap the continuation length, independent of the prompt
    )
    # Decode only the newly generated tokens so the reply does not echo the query.
    gen_text = gpt_neo_tokenizer.batch_decode(
        gen_tokens[:, input_ids.shape[1]:], skip_special_tokens=True
    )[0]
    return gen_text
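
# Hypothetical quick check (not part of the original app); uncomment to try locally:
# print(fetch_answers_gpt_neo("How many countries are there in the world?"))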
# Initialize the Coqui XTTS v2 model once at module load.
# ❗ Since this is a multilingual voice-cloning model, synthesis needs both a
# reference speaker wav and a target language code.
tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=False, gpu=False)

# Synthesize response audio using Coqui TTS
def synthesize_audio(text, lang):
    """Convert text to speech using Coqui XTTS v2 and write it to a file."""
    # XTTS produces wav audio, so the output file uses a .wav extension.
    output_path = "/content/drive/MyDrive/WhisperAudio/ProcessedAudio/response_audio_coqui.wav"
    # speakers = tts_model.list_speakers()
    # print(speakers)
    # selected_speaker = speakers[1]
    # tts_model.tts_to_file(text=text, file_path=output_path, speaker=selected_speaker)
    # playsound(output_path)
    # os.remove(output_path)
    # To list all available 🐸TTS models: print(TTS().list_models())
    # Reference clip whose voice is cloned for the response.
    speaker = "/content/drive/MyDrive/WhisperAudio/Speakers/LJ001-0002.wav"
    tts_model.tts_to_file(text=text, speaker_wav=speaker, language=lang, file_path=output_path)
    return output_path
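
# XTTS v2 speaks only a fixed set of languages (roughly: en, es, fr, de, it, pt,
# pl, tr, ru, nl, cs, ar, zh-cn, ja, hu, ko, hi; the exact list depends on the
# model version). This guard is an addition, a hedged sketch rather than part of
# the original app: it falls back to English audio when Whisper detects a
# language XTTS cannot synthesize.
XTTS_V2_LANGS = {"en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl",
                 "cs", "ar", "zh-cn", "ja", "hu", "ko", "hi"}

def synthesize_audio_safe(text, lang):
    if lang not in XTTS_V2_LANGS:
        print(f"XTTS v2 does not support '{lang}'; falling back to English audio.")
        lang = "en"
    return synthesize_audio(text, lang)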
def process_audio_with_whisper(audio_path, model_size="medium"):
    """
    Load the Whisper model, detect the language of the audio,
    transcribe it, and produce a spoken response in the detected language.
    """
    # Load the Whisper model
    model = whisper.load_model(model_size)
    # Detect language and transcribe audio
    result = model.transcribe(audio_path)
    detected_language = result["language"]
    transcription = result["text"]
    print(f"Detected Language: {detected_language}")
    print(f"Original Transcription: {transcription}")
    # Translate to English if needed
    if detected_language != "en":
        translated_result = model.transcribe(audio_path, task="translate")
        transcription_in_english = translated_result["text"]
        print(f"Translated Text to English: {transcription_in_english}")
    else:
        transcription_in_english = transcription
        print("Audio is already in English.")
    # Generate a response in English with GPT-Neo
    response_text_in_english = fetch_answers_gpt_neo(transcription_in_english)
    print(f"Response in English: {response_text_in_english}")
    # Translate the response back to the original language (hedged wrapper above)
    translated_response = translate_text_safe(response_text_in_english, "en", detected_language)
    print(f"Translated Response: {translated_response}")
    # Synthesize the response audio in the detected language (hedged wrapper above)
    response_audio_path = synthesize_audio_safe(translated_response, detected_language)
    return translated_response, response_audio_path
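
# Hypothetical local run that bypasses the UI (the sample filename is an assumption):
# process_audio_with_whisper("/content/drive/MyDrive/WhisperAudio/sample_question.wav")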
def process_audio(audio_path):
    # With type="filepath", Gradio passes the uploaded file's path as a string
    # (the (data, sampling_rate) tuple form only applies to type="numpy").
    if audio_path is None:
        return "Please upload an audio file.", None
    return process_audio_with_whisper(audio_path)

interface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(source="upload", type="filepath"),  # user uploads an audio file; Gradio 4.x renames this to sources=["upload"]
    outputs=[gr.Textbox(label="Response"), gr.Audio(label="Spoken Response")],
)
interface.launch()