# Image2prompt / app.py
# pormungtai's picture
# Update app.py
# 261977f verified
# -*- coding: utf-8 -*-
"""
Streamlit app: Prompt Generator from Image (NSFW-ready, self-hosted on Hugging Face Spaces)
- Backends: Gemini API (optional) + Local open-source (Qwen2-VL 2B/7B)
- Detail modes: soft / artistic / raw
- JSONL export with policy fields (adult-only, consent)
- Simple keyword tag extractor (can be swapped for WD14/DeepDanbooru later)
NOTE: To use the local backend you must select a Qwen2-VL model that fits your Space hardware.
Suggested default for T4/low VRAM: "Qwen/Qwen2-VL-2B-Instruct" (loads with 4-bit if bitsandbytes available).
Requirements (put these lines into requirements.txt):
----- requirements.txt -----
streamlit==1.37.1
Pillow
transformers>=4.45.0 # Qwen2-VL support was added in 4.45
accelerate>=0.33.0
sentencepiece
safetensors
huggingface_hub
bitsandbytes; platform_system != 'Darwin'
google-generativeai==0.7.2 # only if you keep Gemini option
---------------------------
"""
import importlib.util
import io
import json
import os
import re
from datetime import datetime, timezone

import streamlit as st
from PIL import Image
# ===== Gemini (optional) =====
# The Gemini SDK is an optional dependency: when it cannot be imported for any
# reason, the flag is cleared and the rest of the module keeps working.
try:
    import google.generativeai as genai  # type: ignore
except Exception:
    USE_GEMINI = False
else:
    USE_GEMINI = True
def get_gemini_api_key() -> str:
    """Return the Gemini API key read from the environment.

    ``SECRET_KEY`` is consulted first, then ``GOOGLE_API_KEY``. Surrounding
    whitespace is stripped, so a blank or whitespace-only variable (for
    example a secret pasted with a trailing newline) counts as unset instead
    of being reported as a usable key.

    Returns:
        The first non-blank key found, or an empty string when neither
        variable holds one.
    """
    for var_name in ("SECRET_KEY", "GOOGLE_API_KEY"):
        value = os.getenv(var_name, "").strip()
        if value:
            return value
    return ""
# ===== Transformers (open-source backend) =====
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
# ---------------- UI CONFIG ----------------
st.set_page_config(page_title="🖼️ Prompt Generator from Image (NSFW-ready)", layout="wide")
st.title("🖼️ Prompt Generator from Image")
st.markdown("> Please try my other tool at : https://imgkey.lovable.app")

with st.sidebar:
    st.header("⚙️ Settings")

    # Decide how the Gemini entry is labelled: usable, missing its key, or
    # unavailable because the SDK import failed.
    gem_key = get_gemini_api_key() if USE_GEMINI else ''
    gem_ready = bool(gem_key)
    if not USE_GEMINI:
        gemini_label = "Gemini API (unavailable)"
    elif gem_ready:
        gemini_label = "Gemini API"
    else:
        gemini_label = "Gemini API (key missing)"
    backend_opts = ["Local Qwen2-VL (Open-Source)", gemini_label]

    # Generation controls; these module-level names are read by the layout
    # code further down the script.
    backend = st.selectbox("Backend", backend_opts, index=0)
    mode = st.selectbox("Detail level", ["soft", "artistic", "raw"], index=2)
    model_id = st.text_input(
        "HF Model (local backend)",
        value="Qwen/Qwen2-VL-2B-Instruct",
        help="Pick a Qwen2-VL Instruct model that fits your GPU (e.g., 2B/7B).",
    )
    max_tokens = st.slider("Max new tokens", 64, 512, 220, 8)
    temperature = st.slider("Temperature", 0.0, 1.2, 0.6, 0.05)

    # Gemini status badge (shown only when the SDK imported successfully).
    if USE_GEMINI and gem_ready:
        st.success("Gemini key detected (SECRET_KEY / GOOGLE_API_KEY)")
    elif USE_GEMINI:
        st.warning("Gemini key not found. Add SECRET_KEY or GOOGLE_API_KEY in Space Secrets.")

    st.divider()
    st.subheader("🔐 Policy")
    st.caption("This app only describes consenting adults. It refuses illegal/underage/forced content.")
# ---------------- Helpers ----------------
@st.cache_resource(show_spinner=False)
def load_qwen(model_id: str):
    """Load a Qwen2-VL processor and model, adapting to the available hardware.

    On CUDA the model is loaded in float16 with ``device_map="auto"`` and,
    when the ``bitsandbytes`` package is installed, with 4-bit quantization.
    Without ``bitsandbytes`` the float16 weights are loaded unquantized
    instead of failing. On CPU the model is loaded in float32.

    The decorator caches the returned objects, so repeated calls with the
    same ``model_id`` reuse the already-loaded model.

    Args:
        model_id: Identifier passed to ``from_pretrained`` for both the
            processor and the model.

    Returns:
        A ``(processor, model)`` tuple; the model has been put in eval mode.
    """
    on_gpu = torch.cuda.is_available()
    load_kwargs = {
        "torch_dtype": torch.float16 if on_gpu else torch.float32,
        "low_cpu_mem_usage": True,
        "trust_remote_code": True,
    }
    if on_gpu:
        load_kwargs["device_map"] = "auto"
        # Only request 4-bit weights when bitsandbytes is actually installed;
        # asking for quantization without it makes from_pretrained raise.
        if importlib.util.find_spec("bitsandbytes") is not None:
            from transformers import BitsAndBytesConfig

            load_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_4bit=True)

    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForVision2Seq.from_pretrained(model_id, **load_kwargs)
    if not on_gpu:
        # No device_map was used on CPU, so place the model explicitly.
        model.to("cpu")
    model.eval()
    return processor, model
def _system_prompt():
return (
"You are an expert adult-image describer. Describe only consenting adults. "
"Never mention or imply minors, coercion, violence, incest, or illegal content."
)
def _style_prompt(mode: str):
if mode == "soft":
return (
"Write an elegant, suggestive description focusing on mood, wardrobe, and lighting. "
"Avoid explicit anatomy terms."
)
if mode == "artistic":
return (
"Describe composition, pose, camera angle, lens, lighting, colors, materials, and setting."
)
# raw
return (
"Use precise, clinical adult anatomy/act terms when visible. Mention positions, garments, accessories, fluids, and interactions explicitly."
)
def caption_with_qwen(image: Image.Image, mode: str, model_id: str, max_new_tokens=220, temperature=0.6):
    """Generate a one-paragraph description of ``image`` with a local Qwen2-VL model.

    Args:
        image: The image to describe.
        mode: Detail mode; it is quoted in the request and selects the style
            instruction via ``_style_prompt``.
        model_id: Model identifier forwarded to ``load_qwen``.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. A value of 0 (or below) switches
            to greedy decoding, because sampling requires a strictly
            positive temperature.

    Returns:
        The generated text with surrounding whitespace removed.
    """
    processor, model = load_qwen(model_id)
    messages = [
        {"role": "system", "content": _system_prompt()},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {
                    "type": "text",
                    "text": (
                        f"Describe the image in the '{mode}' style. "
                        + _style_prompt(mode)
                        + " Return a single coherent paragraph."
                    ),
                },
            ],
        },
    ]

    # Build chat template -> tokens
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[text], images=[image], return_tensors="pt")
    device = next(model.parameters()).device
    inputs = {k: v.to(device) if hasattr(v, "to") else v for k, v in inputs.items()}

    gen_kwargs = {"max_new_tokens": int(max_new_tokens), "repetition_penalty": 1.05}
    temperature = float(temperature)
    if temperature > 0.0:
        gen_kwargs.update(do_sample=True, temperature=temperature, top_p=0.9)
    else:
        # The UI slider allows 0.0, which is invalid for sampling; decode greedily.
        gen_kwargs["do_sample"] = False

    with torch.no_grad():
        out = model.generate(**inputs, **gen_kwargs)

    # Decode only the newly generated tokens. Decoding the whole sequence and
    # splitting on the word "assistant" would truncate any caption that
    # happens to contain that word.
    prompt_len = inputs["input_ids"].shape[1]
    decoded = processor.batch_decode(out[:, prompt_len:], skip_special_tokens=True)[0]
    return decoded.strip()
# Minimal keyword tagger from caption text
KEYWORDS = {
    "acts": [
        "oral", "blowjob", "doggy", "missionary", "cowgirl", "anal", "handjob",
        "cumshot", "facial", "masturbation", "kissing", "threesome", "spanking",
    ],
    "nudity": ["topless", "nude", "full", "see-through", "lingerie", "panties", "stockings"],
    "style": ["photo", "studio", "candid", "mirror", "selfie", "soft lighting", "hard lighting", "bokeh"],
}

# One pre-compiled pattern per keyword. Each keyword must appear as a whole
# word or phrase (optionally followed by a plural "s"), so it is not matched
# inside a longer, unrelated word.
_TAG_PATTERNS = {
    kw: re.compile(rf"(?<!\w){re.escape(kw)}s?(?!\w)")
    for group in KEYWORDS.values()
    for kw in group
}


def extract_tags(text: str):
    """Return the sorted list of ``KEYWORDS`` entries found in ``text``.

    Matching is case-insensitive and whole-word: "oral" is not reported for
    "floral", nor "full" for "fully". A trailing plural "s" is accepted, so
    "selfies" yields "selfie".

    Args:
        text: Caption text to scan. Empty or ``None`` input yields no tags.

    Returns:
        A sorted list of the distinct matching keywords.
    """
    if not text:
        return []
    low = text.lower()
    return sorted(kw for kw, pattern in _TAG_PATTERNS.items() if pattern.search(low))
# Gemini helper
def get_gemini_response(image: Image.Image):
    """Ask the Gemini backend for a text-to-image prompt describing ``image``.

    Args:
        image: The image sent along with the instruction text.

    Returns:
        The response's ``text`` attribute, or an empty string when the
        response object has no such attribute.

    Raises:
        RuntimeError: If the Gemini library is unavailable or no API key is set.
    """
    if not USE_GEMINI:
        raise RuntimeError("Gemini library not available in this Space.")

    api_key = get_gemini_api_key()
    if not api_key:
        raise RuntimeError("Gemini API key not found. Set SECRET_KEY or GOOGLE_API_KEY in Space secrets.")

    genai.configure(api_key=api_key)
    instruction = " ".join(
        (
            "You are a prompt generator for text-to-image models.",
            "When content is adult, describe it clinically without euphemism.",
        )
    )
    client = genai.GenerativeModel("gemini-2.0-flash")
    response = client.generate_content([instruction, image])
    # NOTE(review): getattr's default only covers a missing attribute; confirm
    # whether accessing `.text` can raise for blocked/empty responses.
    return getattr(response, "text", "")
# ---------------- LAYOUT ----------------
col1, col2 = st.columns(2)

# Left column: upload and preview the image.
with col1:
    st.markdown("### 📤 Upload Your Image")
    uploaded_file = st.file_uploader(
        "Drag and drop or click to upload an image...",
        type=["jpg", "jpeg", "png", "webp"],
        label_visibility="collapsed",
    )
    image = None
    if uploaded_file is not None:
        try:
            image = Image.open(uploaded_file).convert("RGB")
            st.image(image, caption="Uploaded Image", use_column_width=True)
        except Exception as e:
            st.error(f"Failed to open image: {e}")

# Right column: run the selected backend and show/export the result.
with col2:
    st.markdown("### 🎯 Generated Prompt")
    if image is None:
        st.info("Please upload an image to generate a prompt.")
    elif st.button("✨ Generate Prompt", use_container_width=True):
        # Only the local backend's label starts with "Local"; every other
        # choice is routed to Gemini.
        is_local = backend.startswith("Local")
        with st.spinner("Generating prompt..."):
            try:
                if is_local:
                    prompt = caption_with_qwen(
                        image,
                        mode=mode,
                        model_id=model_id,
                        max_new_tokens=max_tokens,
                        temperature=temperature,
                    )
                else:
                    prompt = get_gemini_response(image)

                if not prompt:
                    st.warning("No text generated.")
                else:
                    st.code(prompt, language="markdown")
                    # Build JSON record
                    record = {
                        # Aware UTC timestamp rendered with a "Z" suffix;
                        # datetime.utcnow() is deprecated.
                        "timestamp": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
                        "image": uploaded_file.name,
                        "mode": mode if is_local else "gemini_default",
                        "prompt": prompt,
                        "tags": extract_tags(prompt),
                        "policy": {"age": "adult_only", "consent": True},
                        "backend": "qwen2-vl" if is_local else "gemini",
                        "model": model_id if is_local else "gemini-2.0-flash",
                    }
                    st.json(record)
                    # Append to JSONL
                    out_path = "captions.jsonl"
                    with open(out_path, "a", encoding="utf-8") as f:
                        f.write(json.dumps(record, ensure_ascii=False) + "\n")
                    st.success(f"Appended to {out_path}")
            except torch.cuda.OutOfMemoryError:
                st.error("CUDA OOM. Try a smaller model (e.g., Qwen2-VL-2B) or reduce max tokens.")
            except Exception as e:
                st.error(f"Generation failed: {e}")

# Footer
st.markdown("---")
st.caption(
    "This Space is intended for lawful, adult-only NSFW dataset preparation. You are responsible for compliance with local laws and platform policies."
)