import gradio as gr
import whisper
import torch
from playsound import playsound  # used only by the commented-out playback code below
import os  # used only by the commented-out cleanup code below
from transformers import MarianMTModel, MarianTokenizer
from TTS.api import TTS
from transformers import GPTNeoForCausalLM, GPT2Tokenizer

# Load the MarianMT model and tokenizer for multilingual translation
def load_translation_model(src_lang, tgt_lang):
    model_name = f'Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}'
    model = MarianMTModel.from_pretrained(model_name)
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    return model, tokenizer
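
# Re-loading a MarianMT checkpoint from the Hub on every call is slow. A minimal
# caching sketch, assuming functools.lru_cache is acceptable here (hypothetical
# variant, not part of the original script):
from functools import lru_cache

@lru_cache(maxsize=8)
def load_translation_model_cached(src_lang, tgt_lang):
    """Like load_translation_model, but keeps each language pair in memory."""
    model_name = f'Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}'
    return MarianMTModel.from_pretrained(model_name), MarianTokenizer.from_pretrained(model_name)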

# Translation function using MarianMT
def translate_text(text, src_lang, tgt_lang):
    model, tokenizer = load_translation_model(src_lang, tgt_lang)
    inputs = tokenizer(text, return_tensors="pt", padding=True)
    translated_tokens = model.generate(**inputs)
    translated_text = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
    return translated_text
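
# Example call (strings are illustrative only):
#   translate_text("How are you today?", "en", "fr")
#   -> roughly "Comment allez-vous aujourd'hui ?"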

# # Predefined responses in English (kept for reference; superseded by
# # fetch_answers_gpt_neo below)
# def get_predefined_response(query):
#     """Retrieve a predefined response based on keywords in English."""
#     query_lower = query.lower()
#     # Define some common keywords related to specific queries
#     keywords = {
#         "countries": "There are 195 countries in the world.",
#         "how many countries": "There are 195 countries in the world.",
#         "prime minister": "The Prime Minister of India is Narendra Modi.",
#         "name": "I am Nora, a chatbot.",
#         "favorite color": "My favorite colors are white and black.",
#         "how are you": "I'm good, thank you! How about you?",
#         "hello": "Hello! How can I help you?",
#         "bye": "Goodbye! Have a nice day!",
#         "how is today": "Today is a beautiful day for learning something new.",
#         "language": "Language is a bridge that connects cultures and people.",
#         "technology": "Technology is only as good as the person using it.",
#         "consistency": "Consistency is the key to achieving long-term success.",
#         "empathy": "Empathy and understanding make the world a better place.",
#         "impossible": "Nothing is impossible when you put your mind to it.",
#         "communication": "Communication is essential in any relationship.",
#         "honesty": "Honesty is the best policy.",
#         "eating healthy": "Eating healthy foods boosts your immune system.",
#     }
#     # Check if any of the keywords exist in the query (keys must be lowercase
#     # to match query_lower)
#     for keyword, response in keywords.items():
#         if keyword in query_lower:
#             return response
#     # If no keyword matches, return a default message
#     return "I'm sorry, I didn't understand that."

def fetch_answers_gpt_neo(query):
    # Note: this reloads the 1.3B checkpoint on every call, which is slow;
    # in practice these two objects should be created once at module level.
    model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
    tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
    prompt = query
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    gen_tokens = model.generate(
        input_ids,
        do_sample=True,
        temperature=0.9,
        max_length=100,
    )
    # Skip special tokens so markers like <|endoftext|> don't leak into the reply
    gen_text = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)[0]
    return gen_text
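
# Causal LMs like GPT-Neo echo the prompt at the start of the decoded output. A
# small post-processing sketch to keep only the continuation (hypothetical
# helper, not part of the original script):
def strip_prompt_echo(prompt, generated):
    if generated.startswith(prompt):
        return generated[len(prompt):].strip()
    return generated.strip()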

# Initialize the Coqui TTS model (used only by the commented-out your_tts block
# inside synthesize_audio; the function itself loads XTTS v2 below)
tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=False)

# Updated function to synthesize audio using Coqui TTS
def synthesize_audio(text, lang):
    """Convert text to speech using Coqui TTS and play it."""
    # Coqui writes WAV data, so use a .wav extension for the output file
    output_path = "/content/drive/MyDrive/WhisperAudio/ProcessedAudio/response_audio_coqui.wav"
    # Earlier your_tts experiment, kept for reference:
    # speakers = tts_model.list_speakers()
    # print(speakers)
    # selected_speaker = speakers[1]
    # tts_model.tts_to_file(text=text, file_path=output_path, speaker=selected_speaker)
    # playsound(output_path)
    # os.remove(output_path)
    # List available 🐸TTS models
    print(TTS().list_models())
    # Init XTTS v2 on GPU when one is available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
    # Run TTS
    # ❗ Since this model is a multilingual voice-cloning model, we must set the
    # target speaker_wav and language
    speaker = "/content/drive/MyDrive/WhisperAudio/Speakers/LJ001-0002.wav"
    # Text to speech as a list of amplitude values
    wav = tts.tts(text=text, speaker_wav=speaker, language=lang)
    # Text to speech written to a file
    tts.tts_to_file(text=text, speaker_wav=speaker, language=lang, file_path=output_path)
    return output_path
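
# Example call (assumes the speaker reference wav above exists on Drive; the
# strings are illustrative only):
#   synthesize_audio("Bonjour, comment puis-je vous aider ?", "fr")
# XTTS v2 expects short language codes such as "en", "fr", or "es".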

def process_audio_with_whisper(audio_path, model_size="medium"):
    """
    Load the Whisper model, detect the language of the audio,
    transcribe it, and provide a response based on the detected language.
    """
    # Load the Whisper model
    model = whisper.load_model(model_size)
    # Detect language and transcribe audio
    result = model.transcribe(audio_path)
    detected_language = result["language"]
    transcription = result["text"]
    print(f"Detected Language: {detected_language}")
    print(f"Original Transcription: {transcription}")
    # Translate to English if needed
    if detected_language != "en":
        translated_result = model.transcribe(audio_path, task="translate")
        transcription_in_english = translated_result["text"]
        print(f"Translated Text to English: {transcription_in_english}")
    else:
        transcription_in_english = transcription
        print("Audio is already in English.")
    # Generate a response in English with GPT-Neo
    response_text_in_english = fetch_answers_gpt_neo(transcription_in_english)
    print(f"Response in English: {response_text_in_english}")
    # Translate the response back to the original language using MarianMT
    # (skip for English: there is no opus-mt-en-en checkpoint)
    if detected_language != "en":
        translated_response = translate_text(response_text_in_english, "en", detected_language)
    else:
        translated_response = response_text_in_english
    print(f"Translated Response: {translated_response}")
    # Synthesize and play response audio in the detected language
    synthesize_audio(translated_response, detected_language)
    return translated_response
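
# Not every language code Whisper detects has a matching
# Helsinki-NLP/opus-mt-en-<code> checkpoint on the Hub. A minimal guard sketch,
# assuming a missing repo surfaces as an OSError from from_pretrained
# (hypothetical helper, not part of the original script):
def translate_text_or_fallback(text, src_lang, tgt_lang):
    try:
        return translate_text(text, src_lang, tgt_lang)
    except OSError:
        # No MarianMT checkpoint for this pair; fall back to the English text
        return text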

def process_audio(audio_path):
    # With type="filepath", Gradio passes the uploaded file's path as a string
    # (not a (filepath, sampling_rate) tuple)
    if audio_path is None:
        return "No audio file received."
    return process_audio_with_whisper(audio_path)

interface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(source="upload", type="filepath"),  # User uploads an audio file
    outputs="text",
)
interface.launch()