import torch
from transformers import BitsAndBytesConfig, pipeline
import whisper
import gradio as gr
import time
import warnings
import os
from gtts import gTTS
from PIL import Image
import nltk
from nltk import sent_tokenize
import re
import numpy as np
import datetime
import subprocess

# Suppress warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Ensure the nltk punkt tokenizer is available
nltk.download('punkt')

# Model ID
model_id = "llava-hf/llava-1.5-7b-hf"

# Load the LLaVA pipeline without GPU-dependent quantization
pipe = pipeline("image-to-text", model=model_id)

# Initialize the Whisper speech-to-text model
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model = whisper.load_model("medium", device=DEVICE)

# Function to log conversation history to a timestamped file
def writehistory(text):
    tstamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    logfile = f'{tstamp}_log.txt'
    with open(logfile, 'a', encoding='utf-8') as f:
        f.write(text + '\n')

# Function to process an image and generate a description
def img2txt(input_text, input_image):
    image = Image.open(input_image)

    if isinstance(input_text, tuple):
        prompt_instructions = (
            "Describe the image using as much detail as possible: is it a painting or a "
            "photograph, what colors are predominant, and what is the image about?"
        )
    else:
        prompt_instructions = (
            "Act as an expert in imagery descriptive analysis. Using as much detail as possible "
            f"from the image, respond to the following prompt: {input_text}"
        )

    # LLaVA expects the <image> placeholder token in the prompt
    prompt = f"USER: <image>\n{prompt_instructions}\nASSISTANT:"

    outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})

    if outputs and len(outputs[0]["generated_text"]) > 0:
        # Extract everything after "ASSISTANT:" (DOTALL so multi-line replies are kept)
        match = re.search(r'ASSISTANT:\s*(.*)', outputs[0]["generated_text"], re.DOTALL)
        reply = match.group(1) if match else "No response found."
    else:
        reply = "No response generated."

    return reply

# Function to transcribe audio with Whisper
def transcribe(audio):
    if not audio:
        return ''

    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    options = whisper.DecodingOptions(fp16=(DEVICE == "cuda"))
    result = whisper.decode(model, mel, options)
    return result.text

# Function to convert text to speech with gTTS
def text_to_speech(text, file_path):
    audioobj = gTTS(text=text, lang='en', slow=False)
    audioobj.save(file_path)
    return file_path

# Generate a silent audio file used as the initial audio output
command = ['ffmpeg', '-y', '-f', 'lavfi', '-i', 'anullsrc=r=44100:cl=mono',
           '-t', '10', '-q:a', '9', '-acodec', 'libmp3lame', 'Temp.mp3']
subprocess.run(command, check=True)

# Function to handle both inputs and generate all outputs
def process_inputs(audio_path, image_path):
    speech_to_text_output = transcribe(audio_path)
    chatgpt_output = img2txt(speech_to_text_output, image_path) if image_path else "No image provided."
    processed_audio_path = text_to_speech(chatgpt_output, "Temp3.mp3")
    return speech_to_text_output, chatgpt_output, processed_audio_path

# Create the Gradio interface
iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath"),
        gr.Image(type="filepath"),
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),
        gr.Textbox(label="ChatGPT Output"),
        gr.Audio("Temp.mp3", label="Assistant Audio"),
    ],
    title="Multilanguage Voice Assistant App",
    description="Upload an image and interact via voice input and audio response."
)

# Launch the interface
iface.launch(debug=True, share=True)
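
# Optional: the BitsAndBytesConfig import above is only useful if you load the LLaVA
# pipeline quantized on a CUDA GPU (with bitsandbytes installed). This is a minimal
# sketch of that alternative, not part of the flow above; it is left commented out so
# the script still runs on CPU-only machines.
#
#     quant_config = BitsAndBytesConfig(
#         load_in_4bit=True,
#         bnb_4bit_compute_dtype=torch.float16,
#     )
#     pipe = pipeline(
#         "image-to-text",
#         model=model_id,
#         model_kwargs={"quantization_config": quant_config},
#     )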