File size: 6,334 Bytes
93694e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc081c0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173

import functools
import os

import gradio as gr
import whisper
from playsound import playsound
from transformers import MarianMTModel, MarianTokenizer
from transformers import GPTNeoForCausalLM, GPT2Tokenizer
from TTS.api import TTS


# Load the MarianMT model and tokenizer for multilingual translation.
# Cached so repeated calls for the same language pair do not re-download /
# re-deserialize the (large) checkpoint on every translation request.
@functools.lru_cache(maxsize=8)
def load_translation_model(src_lang, tgt_lang):
    """Return a (model, tokenizer) pair for Helsinki-NLP opus-mt.

    Args:
        src_lang: source language code (e.g. "en").
        tgt_lang: target language code (e.g. "fr").

    Returns:
        Tuple of (MarianMTModel, MarianTokenizer) for src_lang -> tgt_lang.

    Raises:
        OSError: if no pretrained opus-mt model exists for this pair.
    """
    model_name = f'Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}'
    model = MarianMTModel.from_pretrained(model_name)
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    return model, tokenizer

# Translation function using MarianMT
def translate_text(text, src_lang, tgt_lang):
    """Translate *text* from src_lang to tgt_lang with a MarianMT model."""
    mt_model, mt_tokenizer = load_translation_model(src_lang, tgt_lang)
    encoded = mt_tokenizer(text, return_tensors="pt", padding=True)
    output_tokens = mt_model.generate(**encoded)
    decoded = mt_tokenizer.batch_decode(output_tokens, skip_special_tokens=True)
    return decoded[0]

# # Predefined responses in English (updated with more general responses)
# def get_predefined_response(query):
#     """Retrieve a predefined response based on keywords in English."""
#     query_lower = query.lower()

#     # Define some common keywords related to specific queries
#     keywords = {
#         "countries": "There are 195 countries in the world.",
#         "how many countries": "There are 195 countries in the world.",
#         "prime minister": "The Prime Minister of India is Narendra Modi.",
#         "name": "I am Nora, a chatbot.",
#         "favorite color": "My favourite color is white and black.",
#         "how are you": "I'm good, thank you! How about you?",
#         "hello": "Hello! How can I help you?",
#         "bye": "Goodbye! Have a nice day!",
#         "how is today":"Today is a beautiful day for learning something new",
#         "language":"Language is a bridge that connects cultures and people",
#         "Technology":"Technology is only as good as the person using it",
#         "consistency":"Consistency is the key to achieving long-term success",
#         "empathy":"Empathy and understanding make the world a better place",
#         "impossible":"Nothing is impossible when you put your mind to it",
#         "communication":"Communication is essential in any relationship.",
#         "honesty":"Honesty is the best policy.",
#         "eating healthy":"Eating healthy foods boosts your immune system",


#     }

#     # Check if any of the keywords exist in the query
#     for keyword, response in keywords.items():
#         if keyword in query_lower:
#             return response

#     # If no keyword matches, return a default message
#     return "I'm sorry, I didn't understand that."


@functools.lru_cache(maxsize=1)
def _load_gpt_neo():
    """Load GPT-Neo 1.3B and its tokenizer once; loading is very slow."""
    model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
    tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
    return model, tokenizer


def fetch_answers_gpt_neo(query):
    """Generate a free-form answer to *query* with GPT-Neo 1.3B.

    Args:
        query: the user's question/prompt text.

    Returns:
        The decoded generation. Note it includes the prompt itself,
        since the full generated sequence is decoded.
    """
    # The original re-loaded the 1.3B model on every call; reuse a cached copy.
    model, tokenizer = _load_gpt_neo()

    # The original built the prompt as "" + query — a no-op; use query directly.
    input_ids = tokenizer(query, return_tensors="pt").input_ids

    gen_tokens = model.generate(
        input_ids,
        do_sample=True,        # sampled (non-deterministic) decoding
        temperature=0.9,
        max_length=100,        # hard cap on total sequence length
    )
    return tokenizer.batch_decode(gen_tokens)[0]


# Initialize the Coqui TTS model
# NOTE(review): this "your_tts" model is loaded at import time but is never
# used — synthesize_audio() below loads xtts_v2 itself. Consider removing
# this load or reusing the instance; confirm no other module relies on it.
tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=False)

# Updated function to synthesize audio using Coqui TTS
def synthesize_audio(text, lang):
    """Convert *text* to speech with Coqui XTTS-v2 and write it to disk.

    Args:
        text: text to synthesize.
        lang: language code for the multilingual model (must be one
              XTTS-v2 supports — TODO confirm Whisper codes map cleanly).

    Side effects:
        Writes the synthesized speech to a fixed Google Drive path
        (assumes a Colab environment with Drive mounted — TODO confirm).
    """
    output_path = "/content/drive/MyDrive/WhisperAudio/ProcessedAudio/response_audio_coqui.mp3"

    # Load XTTS-v2 once and cache it on the function object; the original
    # re-instantiated the model (and printed the entire model list) on
    # every call, which is extremely slow.
    if not hasattr(synthesize_audio, "_tts"):
        synthesize_audio._tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
    tts = synthesize_audio._tts

    # XTTS-v2 is a voice-cloning model: a reference speaker wav and the
    # target language are both required.
    speaker = "/content/drive/MyDrive/WhisperAudio/Speakers/LJ001-0002.wav"

    # Synthesize directly to file. (The original additionally called
    # tts.tts() first, synthesizing the same text twice and discarding
    # the first result.)
    tts.tts_to_file(text=text, speaker_wav=speaker, language=lang, file_path=output_path)


def process_audio_with_whisper(audio_path, model_size="medium"):
    """
    Load the Whisper model, detect the language of the audio,
    transcribe it, generate a response, translate the response back
    to the detected language, and synthesize it as speech.

    Args:
        audio_path: path to the input audio file.
        model_size: Whisper checkpoint name (default "medium").

    Returns:
        The response text in the detected language (previously None).
    """
    # Load the Whisper model
    model = whisper.load_model(model_size)

    # Detect language and transcribe audio
    result = model.transcribe(audio_path)
    detected_language = result["language"]
    transcription = result["text"]

    print(f"Detected Language: {detected_language}")
    print(f"Original Transcription: {transcription}")

    # Translate to English if needed (Whisper's translate task targets English)
    if detected_language != "en":
        translated_result = model.transcribe(audio_path, task="translate")
        transcription_in_english = translated_result["text"]
        print(f"Translated Text to English: {transcription_in_english}")
    else:
        transcription_in_english = transcription
        print("Audio is already in English.")

    # Generate a response in English
    response_text_in_english = fetch_answers_gpt_neo(transcription_in_english)
    print(f"Response in English: {response_text_in_english}")

    # Translate the response back only when the audio was not English.
    # BUG FIX: the original called translate_text(..., "en", "en") for
    # English audio, requesting the nonexistent "opus-mt-en-en" model.
    if detected_language != "en":
        # NOTE(review): Whisper language codes may not match every
        # Helsinki-NLP opus-mt pair name — verify for supported languages.
        translated_response = translate_text(response_text_in_english, "en", detected_language)
    else:
        translated_response = response_text_in_english
    print(f"Translated Response: {translated_response}")

    # Synthesize and play response audio in the detected language
    synthesize_audio(translated_response, detected_language)

    return translated_response

def process_audio(audio):
    """Report the filepath and sampling rate of an uploaded audio tuple."""
    # Gradio hands the upload over as a (filepath, sampling_rate) pair.
    path, rate = audio
    message = "Audio file received: {}, Sampling Rate: {} Hz".format(path, rate)
    return message

# Simple Gradio UI: upload an audio file, echo back its metadata.
interface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(source="upload", type="filepath"),  # user uploads an audio file
    outputs="text",
)

if __name__ == "__main__":
    interface.launch()

    # BUG FIX: the original called process_audio_with_whisper(audio_path)
    # with `audio_path` undefined, raising NameError when the script ran.
    # TODO: point this at a real recording before use.
    audio_path = "/content/drive/MyDrive/WhisperAudio/input_audio.wav"
    process_audio_with_whisper(audio_path)