import gradio as gr
import whisper
import torch
from playsound import playsound  # used only by the commented-out playback code below
import os  # used only by the commented-out cleanup code below
from transformers import MarianMTModel, MarianTokenizer
from TTS.api import TTS
from transformers import GPTNeoForCausalLM, GPT2Tokenizer

# Load the MarianMT model and tokenizer for multilingual translation
def load_translation_model(src_lang, tgt_lang):
    model_name = f'Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}'
    model = MarianMTModel.from_pretrained(model_name)
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    return model, tokenizer
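
# Re-loading a MarianMT checkpoint from the Hub on every call is slow. A minimal
# caching sketch, assuming functools.lru_cache is acceptable here (hypothetical
# variant, not part of the original script):
from functools import lru_cache

@lru_cache(maxsize=8)
def load_translation_model_cached(src_lang, tgt_lang):
    """Like load_translation_model, but keeps each language pair in memory."""
    model_name = f'Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}'
    return MarianMTModel.from_pretrained(model_name), MarianTokenizer.from_pretrained(model_name)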

# Translation function using MarianMT
def translate_text(text, src_lang, tgt_lang):
    model, tokenizer = load_translation_model(src_lang, tgt_lang)
    inputs = tokenizer(text, return_tensors="pt", padding=True)
    translated_tokens = model.generate(**inputs)
    translated_text = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
    return translated_text
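
# Example call (strings are illustrative only):
#   translate_text("How are you today?", "en", "fr")
#   -> roughly "Comment allez-vous aujourd'hui ?"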

# # Predefined responses in English (kept for reference; superseded by
# # fetch_answers_gpt_neo below)
# def get_predefined_response(query):
#     """Retrieve a predefined response based on keywords in English."""
#     query_lower = query.lower()
#     # Define some common keywords related to specific queries
#     keywords = {
#         "countries": "There are 195 countries in the world.",
#         "how many countries": "There are 195 countries in the world.",
#         "prime minister": "The Prime Minister of India is Narendra Modi.",
#         "name": "I am Nora, a chatbot.",
#         "favorite color": "My favorite colors are white and black.",
#         "how are you": "I'm good, thank you! How about you?",
#         "hello": "Hello! How can I help you?",
#         "bye": "Goodbye! Have a nice day!",
#         "how is today": "Today is a beautiful day for learning something new.",
#         "language": "Language is a bridge that connects cultures and people.",
#         "technology": "Technology is only as good as the person using it.",
#         "consistency": "Consistency is the key to achieving long-term success.",
#         "empathy": "Empathy and understanding make the world a better place.",
#         "impossible": "Nothing is impossible when you put your mind to it.",
#         "communication": "Communication is essential in any relationship.",
#         "honesty": "Honesty is the best policy.",
#         "eating healthy": "Eating healthy foods boosts your immune system.",
#     }
#     # Check if any of the keywords exist in the query (keys must be lowercase
#     # to match query_lower)
#     for keyword, response in keywords.items():
#         if keyword in query_lower:
#             return response
#     # If no keyword matches, return a default message
#     return "I'm sorry, I didn't understand that."

def fetch_answers_gpt_neo(query):
    # Note: this reloads the 1.3B checkpoint on every call, which is slow;
    # in practice these two objects should be created once at module level.
    model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
    tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
    prompt = query
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    gen_tokens = model.generate(
        input_ids,
        do_sample=True,
        temperature=0.9,
        max_length=100,
    )
    # Skip special tokens so markers like <|endoftext|> don't leak into the reply
    gen_text = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)[0]
    return gen_text
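
# Causal LMs like GPT-Neo echo the prompt at the start of the decoded output. A
# small post-processing sketch to keep only the continuation (hypothetical
# helper, not part of the original script):
def strip_prompt_echo(prompt, generated):
    if generated.startswith(prompt):
        return generated[len(prompt):].strip()
    return generated.strip()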

# Initialize the Coqui TTS model (used only by the commented-out your_tts block
# inside synthesize_audio; the function itself loads XTTS v2 below)
tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=False)

# Updated function to synthesize audio using Coqui TTS
def synthesize_audio(text, lang):
    """Convert text to speech using Coqui TTS and play it."""
    # Coqui writes WAV data, so use a .wav extension for the output file
    output_path = "/content/drive/MyDrive/WhisperAudio/ProcessedAudio/response_audio_coqui.wav"
    # Earlier your_tts experiment, kept for reference:
    # speakers = tts_model.list_speakers()
    # print(speakers)
    # selected_speaker = speakers[1]
    # tts_model.tts_to_file(text=text, file_path=output_path, speaker=selected_speaker)
    # playsound(output_path)
    # os.remove(output_path)
    # List available 🐸TTS models
    print(TTS().list_models())
    # Init XTTS v2 on GPU when one is available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
    # Run TTS
    # ❗ Since this model is a multilingual voice-cloning model, we must set the
    # target speaker_wav and language
    speaker = "/content/drive/MyDrive/WhisperAudio/Speakers/LJ001-0002.wav"
    # Text to speech as a list of amplitude values
    wav = tts.tts(text=text, speaker_wav=speaker, language=lang)
    # Text to speech written to a file
    tts.tts_to_file(text=text, speaker_wav=speaker, language=lang, file_path=output_path)
    return output_path
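
# Example call (assumes the speaker reference wav above exists on Drive; the
# strings are illustrative only):
#   synthesize_audio("Bonjour, comment puis-je vous aider ?", "fr")
# XTTS v2 expects short language codes such as "en", "fr", or "es".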

def process_audio_with_whisper(audio_path, model_size="medium"):
    """
    Load the Whisper model, detect the language of the audio,
    transcribe it, and provide a response based on the detected language.
    """
    # Load the Whisper model
    model = whisper.load_model(model_size)
    # Detect language and transcribe audio
    result = model.transcribe(audio_path)
    detected_language = result["language"]
    transcription = result["text"]
    print(f"Detected Language: {detected_language}")
    print(f"Original Transcription: {transcription}")
    # Translate to English if needed
    if detected_language != "en":
        translated_result = model.transcribe(audio_path, task="translate")
        transcription_in_english = translated_result["text"]
        print(f"Translated Text to English: {transcription_in_english}")
    else:
        transcription_in_english = transcription
        print("Audio is already in English.")
    # Generate a response in English with GPT-Neo
    response_text_in_english = fetch_answers_gpt_neo(transcription_in_english)
    print(f"Response in English: {response_text_in_english}")
    # Translate the response back to the original language using MarianMT
    # (skip for English: there is no opus-mt-en-en checkpoint)
    if detected_language != "en":
        translated_response = translate_text(response_text_in_english, "en", detected_language)
    else:
        translated_response = response_text_in_english
    print(f"Translated Response: {translated_response}")
    # Synthesize and play response audio in the detected language
    synthesize_audio(translated_response, detected_language)
    return translated_response
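
# Not every language code Whisper detects has a matching
# Helsinki-NLP/opus-mt-en-<code> checkpoint on the Hub. A minimal guard sketch,
# assuming a missing repo surfaces as an OSError from from_pretrained
# (hypothetical helper, not part of the original script):
def translate_text_or_fallback(text, src_lang, tgt_lang):
    try:
        return translate_text(text, src_lang, tgt_lang)
    except OSError:
        # No MarianMT checkpoint for this pair; fall back to the English text
        return text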

def process_audio(audio_path):
    # With type="filepath", Gradio passes the uploaded file's path as a string
    # (not a (filepath, sampling_rate) tuple)
    if audio_path is None:
        return "No audio file received."
    return process_audio_with_whisper(audio_path)

interface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(source="upload", type="filepath"),  # User uploads an audio file
    outputs="text",
)
interface.launch()