import speech_recognition as sr
from gtts import gTTS
from pydub import AudioSegment
from IPython.display import Audio

import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import soundfile as sf

# Select the compute device: the first CUDA GPU when one is available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"


import os
from groq import Groq

# Initialize the Groq client.
# The API key is read from the GROQ_API_KEY environment variable instead of
# being embedded in the source, so the secret never lands in version control.
# NOTE(review): a key was previously committed in this file; it should be
# revoked and replaced.
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)


#@@##

# Choose device and precision together: half precision on a CUDA GPU,
# full precision on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32

# Load the speech-to-text model and its processor from the same checkpoint.
model_id = "openai/whisper-medium"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    use_safetensors=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch_dtype,
)
model.to(device)  # move the weights onto the selected device
processor = AutoProcessor.from_pretrained(model_id)

from transformers import pipeline
from gtts import gTTS
import gradio as gr
import torch

# Build the speech-recognition pipeline around the already-loaded model,
# reusing the tokenizer and feature extractor bundled in the processor.
asr_pipe = pipeline(
    task="automatic-speech-recognition",
    device=device,
    torch_dtype=torch_dtype,
    model=model,
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
)

# Initialize Groq client.
# The API key comes from the GROQ_API_KEY environment variable rather than a
# literal in the source, so the secret is not exposed to anyone who can read
# this file.
# NOTE(review): a key was previously committed here; it should be revoked and
# replaced.
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY")
)

# Text-to-Speech function
def text_to_speech(text, lang='hi', output_path="response.mp3"):
    """Synthesize ``text`` to an MP3 file with gTTS and return the file path.

    Args:
        text: The text to speak. Anything that is not a non-blank string is
            rejected up front instead of being sent to gTTS.
        lang: Language code handed to gTTS. Defaults to ``'hi'``, the value
            this function always used before it became a parameter.
        output_path: Where the MP3 is written. The default is a fixed file
            name, so every call that uses it overwrites the previous result;
            pass a distinct path when outputs must not clobber each other.

    Returns:
        ``output_path`` on success, or ``None`` when there is nothing to
        speak or synthesis fails (the error is printed, not raised).
    """
    # Nothing to synthesize: bail out before touching gTTS.
    if not isinstance(text, str) or not text.strip():
        print("Text-to-speech error: no text to speak")
        return None

    try:
        # Convert text to speech using gTTS and write it to disk.
        gTTS(text, lang=lang).save(output_path)
    except Exception as e:
        # Best-effort by design: the caller treats None as "no audio".
        print(f"Text-to-speech error: {e}")
        return None

    return output_path  # MP3 file path for playback in Gradio

# Function to process audio, get model response, and return TTS output
def process_audio(audio):
    """Transcribe an audio clip, query the Groq chat model, and voice the reply.

    Args:
        audio: Input handed to ``asr_pipe``. ``None`` (nothing recorded or
            uploaded) is reported as an error message instead of being sent
            to the recognizer.

    Returns:
        A ``(text, audio_path)`` tuple. ``text`` is the model's reply, or an
        error message when there is no audio or no usable transcript.
        ``audio_path`` is whatever ``text_to_speech`` returns for the reply,
        and ``None`` on the error paths.
    """
    # No input at all: fail with a message rather than crash in the recognizer.
    if audio is None:
        print("No audio input received.")
        return "No audio was provided.", None

    # Convert audio to text
    print("Converting audio to text...")
    result = asr_pipe(audio, generate_kwargs={"language": "urdu"})

    # Stop early when transcription produced nothing usable.
    if "text" not in result or not result["text"].strip():
        print("Audio-to-text conversion failed or produced no text.")
        return "Audio-to-text conversion failed or no text was detected.", None

    user_ques = result["text"]
    print("Audio-to-text conversion successful. User Question:", user_ques)

    # Prepare messages for model input
    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant named SSk BOT that stands for (sehar bot) who mostly answers in Roman Urdu. Be professional. No emojis; just Urdu written in English letters, and if you receive a prompt in Urdu font, answer only in English (Roman Urdu).",
        },
        {
            "role": "user",
            "content": user_ques,
        },
    ]

    # Get response from Groq model
    print("Getting response from the model...")
    response = client.chat.completions.create(
        messages=messages,
        model="gemma2-9b-it",
    )

    # The SDK returns a response object, not a dict: use attribute access.
    # Subscripting it (response['choices']...) raises TypeError.
    model_response = response.choices[0].message.content
    print("Model:", model_response)

    # Convert model's response to speech
    audio_path = text_to_speech(model_response)
    return model_response, audio_path

# Gradio UI: one audio input (delivered to the handler as a file path) and
# two outputs, the model's reply as text and the synthesized reply as audio.
interface = gr.Interface(
    title="Real-time ASR to Language Model Response",
    description="Upload an audio file in Urdu, get a text response from the model, and hear the response in English.",
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=[
        gr.Textbox(label="Model Response"),
        gr.Audio(label="Response Audio"),
    ],
)

# Start serving the interface.
interface.launch()