from transformers import pipeline
import torch
from datasets import load_dataset
import soundfile as sf  
from transformers import AutoTokenizer, AutoModelForCausalLM, VitsModel
import numpy as np
import re

# Convert image to text description using a vision-language model
def img2text(url):
    """Caption an image and strip art-medium words from the caption.

    Args:
        url: Image reference passed unchanged to the Hugging Face
            ``image-to-text`` pipeline (presumably a URL or local file
            path; confirm against callers).

    Returns:
        The generated caption with the substrings "illustration",
        "drawing", "painting" and "rendering" removed and runs of
        whitespace collapsed to single spaces.
    """
    # NOTE: the pipeline (and its model weights) is rebuilt on every call.
    image_to_text_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
    text = image_to_text_model(url)[0]["generated_text"]

    # Remove art-related words to make the description more neutral
    for word in ("illustration", "drawing", "painting", "rendering"):
        text = text.replace(word, "")

    # Deleting a word leaves the spaces on both sides of it behind
    # ("a painting of a cat" -> "a  of a cat"); collapse whitespace runs and
    # trim the ends so the caption reads cleanly when embedded in a prompt.
    return " ".join(text.split())

# Generate a short story from a given text prompt
def text2story(caption):
    """
    Generate a short story (at most about 100 words) from an image caption.

    The prompt asks the model for a heartwarming children's story built around
    the caption. Nothing here filters the generated text, so tone and content
    ultimately depend on the model.

    Args:
        caption: Text description interpolated into the prompt.

    Returns:
        The generated story, trimmed to whole sentences totalling at most
        100 words. If not even one sentence fits, the first 100 words are
        returned with a trailing period added when needed.
    """
    tokenizer = AutoTokenizer.from_pretrained("pranavpsv/gpt2-genre-story-generator")
    model = AutoModelForCausalLM.from_pretrained("pranavpsv/gpt2-genre-story-generator")

    # Prompt to guide the model
    prompt = (
        "Write a heartwarming story for a child. "
        f"Must use {caption} as places and characters in the story. "
        "\n\nStory:"
    )

    inputs = tokenizer(prompt, return_tensors="pt")

    # Unpack the whole encoding so the attention mask is passed along with the
    # input ids instead of being left for generate() to infer.
    outputs = model.generate(
        **inputs,
        max_length=180,  # counts prompt tokens plus generated tokens
        do_sample=True,
        top_p=0.95,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the newly generated tokens. Splitting the decoded text on
    # "Story:" would discard the real story whenever the model happened to
    # emit that marker again inside its continuation.
    prompt_length = inputs["input_ids"].shape[-1]
    output_text = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    ).strip()

    return _trim_story(output_text)


def _trim_story(text, max_words=100, buffer_words=130):
    """Cut *text* down to whole sentences totalling at most *max_words* words."""
    word_list = text.split()

    # Look slightly past the limit so a sentence ending just beyond it can
    # still be recognised as a sentence boundary.
    window = " ".join(word_list[:buffer_words])
    sentences = re.split(r'(?<=[.!?])\s+', window)

    kept = []
    total_words = 0
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        word_count = len(sentence.split())
        if total_words + word_count > max_words:
            break
        kept.append(sentence)
        total_words += word_count

    # Generation usually stops mid-sentence at the token limit. When complete
    # sentences precede it, drop the unfinished tail so the story ends on a
    # sentence boundary (closing quotes/brackets after the punctuation are ok).
    if len(kept) > 1 and not kept[-1].rstrip("\"')]\u201d\u2019").endswith(('.', '!', '?')):
        kept.pop()

    if kept:
        return " ".join(kept)

    # No sentence fit within the limit: force cut at max_words
    story = " ".join(word_list[:max_words])
    if not story.endswith(('.', '!', '?')):
        story += "."
    return story

# Convert text story into audio using a speech synthesis model
def text2audio(story_text):
    """Synthesize speech for *story_text* and save it as a WAV file.

    Args:
        story_text: Text to be spoken.

    Returns:
        Path of the written file, always ``"generated_audio.wav"`` relative
        to the current working directory (overwritten on every call).
    """
    model = VitsModel.from_pretrained("facebook/mms-tts-eng")
    tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")

    inputs = tokenizer(story_text, return_tensors="pt")

    # Important: convert input IDs to LongTensor to avoid runtime error
    inputs["input_ids"] = inputs["input_ids"].long()

    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        output = model(**inputs).waveform

    # Convert tensor to NumPy array and save it as a .wav file
    audio_np = output.squeeze().cpu().numpy()
    output_path = "generated_audio.wav"

    # Write with the sampling rate the checkpoint itself declares. A
    # hard-coded rate that differs from it plays back at the wrong speed and
    # pitch.
    sf.write(output_path, audio_np, model.config.sampling_rate)

    return output_path