File size: 6,287 Bytes
9e3bd6c
d3a3bf1
69a2277
f29389b
433cace
 
acc4043
 
 
 
 
69a2277
f29389b
acc4043
f29389b
 
69a2277
acc4043
69a2277
f29389b
acc4043
433cace
 
 
8cafce9
 
9362fe6
8cafce9
acc4043
433cace
 
acc4043
433cace
 
 
 
 
 
9362fe6
 
433cace
 
 
 
 
 
 
9362fe6
f29389b
 
69a2277
9362fe6
69a2277
054edad
 
69a2277
 
 
 
 
433cace
69a2277
433cace
9362fe6
 
 
 
 
 
 
433cace
69a2277
 
 
f29389b
433cace
69a2277
 
 
 
acc4043
 
 
9362fe6
acc4043
 
9362fe6
 
 
 
 
 
acc4043
 
69a2277
 
 
 
054edad
 
 
69a2277
 
 
 
f29389b
9362fe6
f29389b
054edad
 
9362fe6
054edad
9362fe6
 
 
054edad
69a2277
054edad
 
 
 
9362fe6
054edad
 
 
 
 
 
9362fe6
054edad
 
 
 
 
 
 
9362fe6
054edad
 
8d8a928
 
9362fe6
8d8a928
acc4043
 
 
 
 
8d8a928
 
acc4043
 
8d8a928
 
 
acc4043
8d8a928
 
acc4043
 
8d8a928
9362fe6
8d8a928
acc4043
 
045059f
9362fe6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import os
import json
from dotenv import load_dotenv
from openai import OpenAI
from PIL import Image
import torch
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    CLIPTokenizer
)

# ----------------------------
# πŸ” Load API Keys & Setup
# ----------------------------
load_dotenv()  # pull OPENAI_API_KEY (and friends) from a local .env file
# NOTE(review): if OPENAI_API_KEY is unset, api_key is None and the first API call fails — confirm deployment always provides it
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
device = "cuda" if torch.cuda.is_available() else "cpu"  # prefer GPU when available

# ----------------------------
# πŸ“Έ Load BLIP Captioning Model
# ----------------------------
# BLIP produces a natural-language caption for the uploaded product image.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

# ----------------------------
# 🧠 Load CLIP Tokenizer (for token check)
# ----------------------------
# Used only to count prompt tokens against CLIP's 77-token limit.
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")

# ----------------------------
# πŸ“Έ Generate Caption from Product Image
# ----------------------------
def generate_blip_caption(image: Image.Image) -> str:
    """Caption a product image with BLIP.

    Returns a short natural-language caption, or the generic fallback
    "a product image" if captioning fails for any reason.
    """
    try:
        inputs = processor(images=image, return_tensors="pt").to(device)
        out = blip_model.generate(**inputs, max_length=50)
        caption = processor.decode(out[0], skip_special_tokens=True)
        # BLIP sometimes stutters ("a a red red shoe"); drop only *consecutive*
        # repeats. The previous dict.fromkeys() approach removed every repeated
        # word globally, which mangled valid captions (e.g. "side by side").
        words = caption.split()
        caption = " ".join(
            w for i, w in enumerate(words) if i == 0 or w != words[i - 1]
        )
        print(f"πŸ–ΌοΈ BLIP Caption: {caption}")
        return caption
    except Exception as e:
        print("❌ BLIP Captioning Error:", e)
        return "a product image"

# ----------------------------
# 🧠 GPT Scene Planning with Caption + Visual Style
# ----------------------------
# System prompt steering GPT to emit a machine-parseable scene plan (raw JSON only);
# consumed by extract_scene_plan() below.
SCENE_SYSTEM_INSTRUCTIONS = """
You are a scene planning assistant for an AI image generation system.
Your job is to take a caption from a product image, a visual style hint, and a user prompt, then return a structured JSON with:
- scene (environment, setting)
- subject (main_actor)
- objects (main_product or items)
- layout (foreground/background elements and their placement)
- rules (validation rules to ensure visual correctness)
Respond ONLY in raw JSON format. Do NOT include explanations.
"""

def extract_scene_plan(prompt: str, image: Image.Image) -> dict:
    """Build a structured scene plan from a product image and a user prompt.

    Captions the image with BLIP, asks GPT for a JSON scene plan, appends the
    exchange to logs/scene_plans.jsonl, and parses the JSON. Returns a minimal
    default plan if anything fails (API error, unparseable JSON, ...).
    """
    try:
        caption = generate_blip_caption(image)
        # Heuristic style hint — assumes footwear-style catalogs; TODO confirm for other product types.
        visual_hint = caption if "shoe" in caption or "product" in caption else "low-top product photo on white background"

        merged_prompt = (
            f"Image Caption: {caption}\n"
            f"Image Visual Style: {visual_hint}\n"
            f"User Prompt: {prompt}"
        )

        response = client.chat.completions.create(
            model="gpt-4o-mini-2024-07-18",
            messages=[
                {"role": "system", "content": SCENE_SYSTEM_INSTRUCTIONS},
                {"role": "user", "content": merged_prompt}
            ],
            temperature=0.3,
            max_tokens=500
        )
        content = response.choices[0].message.content.strip()
        print("🧠 Scene Plan (Raw):", content)

        # Despite the "raw JSON only" instruction, models frequently wrap the
        # payload in markdown fences (```json ... ```); strip them so
        # json.loads() below doesn't fall through to the fallback plan.
        if content.startswith("```"):
            first_newline = content.find("\n")
            if first_newline != -1:
                content = content[first_newline + 1:]
            if content.endswith("```"):
                content = content[:-3]
            content = content.strip()

        # Append-only audit log of every scene-planning exchange.
        os.makedirs("logs", exist_ok=True)
        with open("logs/scene_plans.jsonl", "a", encoding="utf-8") as f:
            f.write(json.dumps({
                "caption": caption,
                "visual_hint": visual_hint,
                "prompt": prompt,
                "scene_plan": content
            }) + "\n")

        return json.loads(content)

    except Exception as e:
        print("❌ extract_scene_plan() Error:", e)
        # Safe default so downstream generation still receives a usable plan.
        return {
            "scene": {"environment": "studio", "setting": "plain white background"},
            "subject": {"main_actor": "a product"},
            "objects": {"main_product": "product"},
            "layout": {},
            "rules": {}
        }

# ----------------------------
# ✨ Enriched Prompt Generation (GPT, 77-token safe)
# ----------------------------
# System prompt for turning a scene plan into one enriched image-generation
# prompt; consumed by generate_prompt_variations_from_scene() below.
ENRICHED_PROMPT_INSTRUCTIONS = """
You are a prompt engineer for an AI image generation model.
Given a structured scene plan and a user prompt, generate a single natural-language enriched prompt that:
1. Describes the subject, product, setting, and layout clearly
2. Uses natural, photo-realistic language
3. Stays strictly under 77 tokens (CLIP token limit)
Return ONLY the enriched prompt string. No explanations.
"""

def generate_prompt_variations_from_scene(scene_plan: dict, base_prompt: str, n: int = 3) -> list:
    """Generate n enriched prompt variations from a scene plan.

    Each variation is requested independently from GPT and hard-capped at
    CLIP's 77-token limit. On any API failure the raw base_prompt is used
    for that variation, so the returned list always has length n.
    """
    prompts = []
    for _ in range(n):
        try:
            user_input = f"Scene Plan:\n{json.dumps(scene_plan)}\n\nUser Prompt:\n{base_prompt}"
            response = client.chat.completions.create(
                model="gpt-4o-mini-2024-07-18",
                messages=[
                    {"role": "system", "content": ENRICHED_PROMPT_INSTRUCTIONS},
                    {"role": "user", "content": user_input}
                ],
                temperature=0.4,
                max_tokens=100
            )
            enriched = response.choices[0].message.content.strip()
            token_count = len(tokenizer(enriched)["input_ids"])
            # The instructions ask for <77 tokens but the model can overshoot;
            # previously the count was measured and ignored. Enforce the CLIP
            # limit here so the image model never silently truncates the prompt.
            if token_count > 77:
                ids = tokenizer(enriched, truncation=True, max_length=77)["input_ids"]
                enriched = tokenizer.decode(ids, skip_special_tokens=True).strip()
                token_count = len(tokenizer(enriched)["input_ids"])
            print(f"πŸ“ Enriched Prompt ({token_count} tokens): {enriched}")
            prompts.append(enriched)
        except Exception as e:
            print("⚠️ Prompt fallback:", e)
            prompts.append(base_prompt)
    return prompts

# ----------------------------
# ❌ Negative Prompt Generator
# ----------------------------
# System prompt for deriving a negative prompt from a scene plan;
# consumed by generate_negative_prompt_from_scene() below.
NEGATIVE_SYSTEM_PROMPT = """
You are a prompt engineer. Given a structured scene plan, generate a short negative prompt
to suppress unwanted visual elements such as: distortion, blurriness, poor anatomy,
logo errors, background noise, or low realism.
Return a single comma-separated list. No intro text.
"""

def generate_negative_prompt_from_scene(scene_plan: dict) -> str:
    """Ask GPT for a comma-separated negative prompt tailored to the scene plan.

    Falls back to a generic negative prompt if the API call fails for any reason.
    """
    request_messages = [
        {"role": "system", "content": NEGATIVE_SYSTEM_PROMPT},
        {"role": "user", "content": json.dumps(scene_plan)},
    ]
    try:
        completion = client.chat.completions.create(
            model="gpt-4o-mini-2024-07-18",
            messages=request_messages,
            temperature=0.2,
            max_tokens=100
        )
        return completion.choices[0].message.content.strip()
    except Exception as err:
        print("❌ Negative Prompt Error:", err)
        return "blurry, distorted, low quality, deformed, watermark"