import torch
from transformers import AutoModelForCausalLM, LlamaTokenizer, pipeline as transformers_pipeline
from kokoro import KPipeline
import soundfile as sf
import numpy as np
import gradio as gr

# Initialize the image-to-text (captioning) pipeline
captionImage = transformers_pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-large",
)

# Text-generation model configuration
device = "cpu"
model_id = "ALLaM-AI/ALLaM-7B-Instruct-preview"

# Load the model; torch_dtype="auto" uses the dtype stored in the checkpoint
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    trust_remote_code=True,
)

# Use LlamaTokenizer for compatibility with the ALLaM checkpoint
tokenizer = LlamaTokenizer.from_pretrained(model_id)

# Initialize the text-generation pipeline
generator = transformers_pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
    return_full_text=False,
    max_new_tokens=500,
    do_sample=False,
)

# Generate a caption for the input image
def Image_Caption(image):
    caption = captionImage(image)
    return caption[0]['generated_text']

# Generate a story from the caption text
def Generate_story(textAbout):
    # Chat-style input must be a list of message dicts
    # (the original trailing comma created a tuple instead)
    messages = [
        {"role": "user", "content": f'write a long story about {textAbout} that takes 3 min to read'}
    ]
    story = generator(messages)
    story = story[0]['generated_text']
    # Flatten newlines for TTS and drop "arafed", a common BLIP captioning artifact
    story = story.replace('\n', ' ').replace('arafed', ' ')
    return story

# Synthesize speech for the story text with Kokoro
def Generate_audio(text, voice='bm_lewis', speed=1):
    tts = KPipeline(lang_code='b')  # 'b' = British English, matching the bm_lewis voice
    segments = tts(text, voice=voice, speed=speed, split_pattern=r'\n+')
    # Each yielded item is (graphemes, phonemes, audio); concatenate the audio chunks
    chunks = [np.asarray(audio) for _, _, audio in segments]
    full_audio = np.concatenate(chunks)
    return full_audio, 24000  # Kokoro outputs 24 kHz audio

# Main function: image -> caption -> story -> audio
def Mustalhim(image):
    caption = Image_Caption(image)
    story = Generate_story(caption)
    audio = Generate_audio(story)
    return audio

# Gradio wrapper: write the waveform to a WAV file and return its path
def gradio_interface(image):
    audio_waveform, sampling_rate = Mustalhim(image)
    audio_file = "output_audio.wav"
    sf.write(audio_file, audio_waveform, sampling_rate)
    return audio_file

# Path to the example image
example_image = "Example.PNG"

# Create the Gradio app
app = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Image(type="pil"),
    outputs=gr.Audio(type="filepath"),
    title="Mustalhim",
    description="Upload an image, and the app will generate a story and convert it to audio.",
    examples=[[example_image]],
)

# Launch the app
app.launch()