File size: 6,334 Bytes
93694e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc081c0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173

import functools
import os

import gradio as gr
import whisper
from playsound import playsound
from transformers import MarianMTModel, MarianTokenizer
from transformers import GPTNeoForCausalLM, GPT2Tokenizer
from TTS.api import TTS


# Load the MarianMT model and tokenizer for multilingual translation.
# Cached so repeated calls for the same language pair do not re-download /
# re-deserialize the (large) checkpoint on every translation request.
@functools.lru_cache(maxsize=8)
def load_translation_model(src_lang, tgt_lang):
    """Return a (model, tokenizer) pair for Helsinki-NLP opus-mt.

    Args:
        src_lang: source language code (e.g. "en").
        tgt_lang: target language code (e.g. "fr").

    Returns:
        Tuple of (MarianMTModel, MarianTokenizer) for src_lang -> tgt_lang.

    Raises:
        OSError: if no pretrained opus-mt model exists for this pair.
    """
    model_name = f'Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}'
    model = MarianMTModel.from_pretrained(model_name)
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    return model, tokenizer

# Translation function using MarianMT
def translate_text(text, src_lang, tgt_lang):
    """Translate *text* from src_lang to tgt_lang with a MarianMT model."""
    mt_model, mt_tokenizer = load_translation_model(src_lang, tgt_lang)
    encoded = mt_tokenizer(text, return_tensors="pt", padding=True)
    output_tokens = mt_model.generate(**encoded)
    decoded = mt_tokenizer.batch_decode(output_tokens, skip_special_tokens=True)
    return decoded[0]

# # Predefined responses in English (updated with more general responses)
# def get_predefined_response(query):
#     """Retrieve a predefined response based on keywords in English."""
#     query_lower = query.lower()

#     # Define some common keywords related to specific queries
#     keywords = {
#         "countries": "There are 195 countries in the world.",
#         "how many countries": "There are 195 countries in the world.",
#         "prime minister": "The Prime Minister of India is Narendra Modi.",
#         "name": "I am Nora, a chatbot.",
#         "favorite color": "My favourite color is white and black.",
#         "how are you": "I'm good, thank you! How about you?",
#         "hello": "Hello! How can I help you?",
#         "bye": "Goodbye! Have a nice day!",
#         "how is today":"Today is a beautiful day for learning something new",
#         "language":"Language is a bridge that connects cultures and people",
#         "Technology":"Technology is only as good as the person using it",
#         "consistency":"Consistency is the key to achieving long-term success",
#         "empathy":"Empathy and understanding make the world a better place",
#         "impossible":"Nothing is impossible when you put your mind to it",
#         "communication":"Communication is essential in any relationship.",
#         "honesty":"Honesty is the best policy.",
#         "eating healthy":"Eating healthy foods boosts your immune system",


#     }

#     # Check if any of the keywords exist in the query
#     for keyword, response in keywords.items():
#         if keyword in query_lower:
#             return response

#     # If no keyword matches, return a default message
#     return "I'm sorry, I didn't understand that."


@functools.lru_cache(maxsize=1)
def _load_gpt_neo():
    """Load GPT-Neo 1.3B and its tokenizer once; loading is very slow."""
    model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
    tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
    return model, tokenizer


def fetch_answers_gpt_neo(query):
    """Generate a free-form answer to *query* with GPT-Neo 1.3B.

    Args:
        query: the user's question/prompt text.

    Returns:
        The decoded generation. Note it includes the prompt itself,
        since the full generated sequence is decoded.
    """
    # The original re-loaded the 1.3B model on every call; reuse a cached copy.
    model, tokenizer = _load_gpt_neo()

    # The original built the prompt as "" + query — a no-op; use query directly.
    input_ids = tokenizer(query, return_tensors="pt").input_ids

    gen_tokens = model.generate(
        input_ids,
        do_sample=True,        # sampled (non-deterministic) decoding
        temperature=0.9,
        max_length=100,        # hard cap on total sequence length
    )
    return tokenizer.batch_decode(gen_tokens)[0]


# Initialize the Coqui TTS model
# NOTE(review): this "your_tts" model is loaded at import time but is never
# used — synthesize_audio() below loads xtts_v2 itself. Consider removing
# this load or reusing the instance; confirm no other module relies on it.
tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=False)

# Updated function to synthesize audio using Coqui TTS
def synthesize_audio(text, lang):
    """Convert *text* to speech with Coqui XTTS-v2 and write it to disk.

    Args:
        text: text to synthesize.
        lang: language code for the multilingual model (must be one
              XTTS-v2 supports — TODO confirm Whisper codes map cleanly).

    Side effects:
        Writes the synthesized speech to a fixed Google Drive path
        (assumes a Colab environment with Drive mounted — TODO confirm).
    """
    output_path = "/content/drive/MyDrive/WhisperAudio/ProcessedAudio/response_audio_coqui.mp3"

    # Load XTTS-v2 once and cache it on the function object; the original
    # re-instantiated the model (and printed the entire model list) on
    # every call, which is extremely slow.
    if not hasattr(synthesize_audio, "_tts"):
        synthesize_audio._tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
    tts = synthesize_audio._tts

    # XTTS-v2 is a voice-cloning model: a reference speaker wav and the
    # target language are both required.
    speaker = "/content/drive/MyDrive/WhisperAudio/Speakers/LJ001-0002.wav"

    # Synthesize directly to file. (The original additionally called
    # tts.tts() first, synthesizing the same text twice and discarding
    # the first result.)
    tts.tts_to_file(text=text, speaker_wav=speaker, language=lang, file_path=output_path)


def process_audio_with_whisper(audio_path, model_size="medium"):
    """
    Load the Whisper model, detect the language of the audio,
    transcribe it, generate a response, translate the response back
    to the detected language, and synthesize it as speech.

    Args:
        audio_path: path to the input audio file.
        model_size: Whisper checkpoint name (default "medium").

    Returns:
        The response text in the detected language (previously None).
    """
    # Load the Whisper model
    model = whisper.load_model(model_size)

    # Detect language and transcribe audio
    result = model.transcribe(audio_path)
    detected_language = result["language"]
    transcription = result["text"]

    print(f"Detected Language: {detected_language}")
    print(f"Original Transcription: {transcription}")

    # Translate to English if needed (Whisper's translate task targets English)
    if detected_language != "en":
        translated_result = model.transcribe(audio_path, task="translate")
        transcription_in_english = translated_result["text"]
        print(f"Translated Text to English: {transcription_in_english}")
    else:
        transcription_in_english = transcription
        print("Audio is already in English.")

    # Generate a response in English
    response_text_in_english = fetch_answers_gpt_neo(transcription_in_english)
    print(f"Response in English: {response_text_in_english}")

    # Translate the response back only when the audio was not English.
    # BUG FIX: the original called translate_text(..., "en", "en") for
    # English audio, requesting the nonexistent "opus-mt-en-en" model.
    if detected_language != "en":
        # NOTE(review): Whisper language codes may not match every
        # Helsinki-NLP opus-mt pair name — verify for supported languages.
        translated_response = translate_text(response_text_in_english, "en", detected_language)
    else:
        translated_response = response_text_in_english
    print(f"Translated Response: {translated_response}")

    # Synthesize and play response audio in the detected language
    synthesize_audio(translated_response, detected_language)

    return translated_response

def process_audio(audio):
    """Report the filepath and sampling rate of an uploaded audio tuple."""
    # Gradio hands the upload over as a (filepath, sampling_rate) pair.
    path, rate = audio
    message = "Audio file received: {}, Sampling Rate: {} Hz".format(path, rate)
    return message

# Simple Gradio UI: upload an audio file, echo back its metadata.
interface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(source="upload", type="filepath"),  # user uploads an audio file
    outputs="text",
)

if __name__ == "__main__":
    interface.launch()

    # BUG FIX: the original called process_audio_with_whisper(audio_path)
    # with `audio_path` undefined, raising NameError when the script ran.
    # TODO: point this at a real recording before use.
    audio_path = "/content/drive/MyDrive/WhisperAudio/input_audio.wav"
    process_audio_with_whisper(audio_path)