import tempfile

import gradio as gr
import whisper
from gtts import gTTS
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the models and tokenizer once at startup
whisper_model = whisper.load_model("base")
tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
model = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")


def translate_audio(input_file, to_lang):
    # Load the audio file and fit it to Whisper's 30-second input window
    audio = whisper.load_audio(input_file)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)

    # Detect the spoken language using Whisper
    _, probs = whisper_model.detect_language(mel)
    lang = max(probs, key=probs.get)

    # Convert audio to text; fp16=False avoids the FP16 warning on CPU-only machines
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(whisper_model, mel, options)
    text = result.text

    # Translate the text. Per the SMALL-100 model card, the tokenizer is
    # conditioned on the *target* language, so set tgt_lang rather than src_lang.
    tokenizer.tgt_lang = to_lang
    encoded_text = tokenizer(text, return_tensors="pt")
    generated_tokens = model.generate(**encoded_text)
    translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    # Convert the translated text to speech; delete=False keeps the temporary
    # MP3 on disk so Gradio can serve it after this function returns
    tts = gTTS(text=translated_text, lang=to_lang)
    temp_output_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False).name
    tts.save(temp_output_file)

    # Gradio's audio output component accepts a file path directly,
    # so there is no need for IPython.display.Audio here
    return temp_output_file


iface = gr.Interface(
    fn=translate_audio,
    inputs=[
        gr.Audio(type="filepath", label="Input audio"),
        gr.Textbox(label="Target language code (e.g. 'en', 'fr')"),
    ],
    outputs=gr.Audio(label="Translated audio"),
    title="Audio Translation",
    description="Upload an MP3 file and select the target language for translation.",
    examples=[
        ["audio_example.mp3", "en"],
        ["speech_sample.mp3", "fr"],
    ],
)

iface.launch(debug=True)
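# A minimal sketch of calling the pipeline directly, without the Gradio UI
# (the input file name below is hypothetical; any local audio file works):
#
#   translated_path = translate_audio("audio_example.mp3", "fr")
#   print("Translated audio saved to:", translated_path)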