Image2prompt

Sleeping

App Files Files Community

Image2prompt / app.py

pormungtai

Update app.py

261977f verified about 2 months ago

raw

history blame contribute delete

10.6 kB

	# -*- coding: utf-8 -*-
	"""
	Streamlit app: Prompt Generator from Image (NSFW-ready, self-hosted on Hugging Face Spaces)
	- Backends: Gemini API (optional) + Local open-source (Qwen2-VL 2B/7B)
	- Detail modes: soft / artistic / raw
	- JSONL export with policy fields (adult-only, consent)
	- Simple keyword tag extractor (can be swapped for WD14/DeepDanbooru later)

	NOTE: To use the local backend you must select a Qwen2-VL model that fits your Space hardware.
	Suggested default for T4/low VRAM: "Qwen/Qwen2-VL-2B-Instruct" (loads with 4-bit if bitsandbytes available).

	Requirements (put these lines into requirements.txt):
	----- requirements.txt -----
	streamlit==1.37.1
	Pillow
	transformers>=4.43.0
	accelerate>=0.33.0
	sentencepiece
	safetensors
	huggingface_hub
	bitsandbytes; platform_system != 'Darwin'
	google-generativeai==0.7.2 # only if you keep Gemini option
	---------------------------
	"""

	import io
	import json
	import os
	import re
	from datetime import datetime, timezone

	import streamlit as st
	from PIL import Image

	# ===== Gemini (optional) =====
	# The Gemini SDK is an optional dependency. USE_GEMINI records whether the
	# import succeeded so the rest of the app can degrade gracefully without it.
	try:
	    import google.generativeai as genai  # type: ignore
	except Exception:
	    USE_GEMINI = False
	else:
	    USE_GEMINI = True

	def get_gemini_api_key() -> str:
	    """Return the Gemini API key taken from the environment.

	    ``SECRET_KEY`` is consulted first, then ``GOOGLE_API_KEY``. A variable
	    that is unset or empty is skipped.

	    Returns:
	        The first non-empty value found, or an empty string when neither
	        variable provides one.
	    """
	    for env_name in ('SECRET_KEY', 'GOOGLE_API_KEY'):
	        candidate = os.getenv(env_name)
	        if candidate:
	            return candidate
	    return ''

	# ===== Transformers (open-source backend) =====
	import torch
	from transformers import AutoProcessor, AutoModelForVision2Seq

	# ---------------- UI CONFIG ----------------
	st.set_page_config(page_title="🖼️ Prompt Generator from Image (NSFW-ready)", layout="wide")
	st.title("🖼️ Prompt Generator from Image")
	st.markdown("> Please try my other tool at : https://imgkey.lovable.app")

	# Sidebar: every generation setting lives here. The values chosen below
	# (backend, mode, model_id, max_tokens, temperature) are read later by the
	# main layout when the user presses the generate button.
	with st.sidebar:
	    st.header("⚙️ Settings")

	    # Gemini availability: the key is only looked up when the SDK imported.
	    gem_key = get_gemini_api_key() if USE_GEMINI else ''
	    gem_ready = bool(gem_key)

	    # The Gemini entry is always listed; its label tells the user why it
	    # may not work (SDK missing vs. key missing).
	    if not USE_GEMINI:
	        gemini_label = "Gemini API (unavailable)"
	    elif gem_ready:
	        gemini_label = "Gemini API"
	    else:
	        gemini_label = "Gemini API (key missing)"
	    backend_opts = ["Local Qwen2-VL (Open-Source)", gemini_label]

	    backend = st.selectbox("Backend", backend_opts, index=0)

	    # Defaults to the last entry ("raw").
	    mode = st.selectbox("Detail level", ["soft", "artistic", "raw"], index=2)

	    model_id = st.text_input(
	        "HF Model (local backend)",
	        value="Qwen/Qwen2-VL-2B-Instruct",
	        help="Pick a Qwen2-VL Instruct model that fits your GPU (e.g., 2B/7B).",
	    )

	    max_tokens = st.slider("Max new tokens", min_value=64, max_value=512, value=220, step=8)
	    temperature = st.slider("Temperature", min_value=0.0, max_value=1.2, value=0.6, step=0.05)

	    # Gemini status badge (shown only when the SDK is importable).
	    if USE_GEMINI and gem_ready:
	        st.success("Gemini key detected (SECRET_KEY / GOOGLE_API_KEY)")
	    elif USE_GEMINI:
	        st.warning("Gemini key not found. Add SECRET_KEY or GOOGLE_API_KEY in Space Secrets.")

	    st.divider()
	    st.subheader("🔐 Policy")
	    st.caption("This app only describes consenting adults. It refuses illegal/underage/forced content.")

	# ---------------- Helpers ----------------
	@st.cache_resource(show_spinner=False)
	def load_qwen(model_id: str):
	    """Load a Qwen2-VL processor and model for the available hardware.

	    The result is wrapped in ``st.cache_resource``, so repeated calls with
	    the same ``model_id`` reuse the already loaded objects.

	    On CUDA the model is loaded in float16 with ``device_map="auto"`` and,
	    only when the ``bitsandbytes`` package is importable, with 4-bit
	    quantization. Without ``bitsandbytes`` the load falls back to plain
	    float16 instead of failing. On CPU the model is loaded in float32.

	    Args:
	        model_id: Model identifier passed to ``from_pretrained``.

	    Returns:
	        A ``(processor, model)`` tuple; the model is in eval mode.
	    """
	    import importlib.util

	    on_gpu = torch.cuda.is_available()
	    device = "cuda" if on_gpu else "cpu"

	    # NOTE(review): trust_remote_code=True runs Python code shipped in the
	    # model repository, and model_id comes from a free-text input. Consider
	    # restricting model_id to an allow-list before exposing this publicly.
	    load_kwargs = {
	        "torch_dtype": torch.float16 if on_gpu else torch.float32,
	        "low_cpu_mem_usage": True,
	        "trust_remote_code": True,
	    }
	    if on_gpu:
	        load_kwargs["device_map"] = "auto"
	        # 4-bit loading needs bitsandbytes; request it only when the package
	        # is actually installed so a missing optional dependency does not
	        # abort the whole load.
	        if importlib.util.find_spec("bitsandbytes") is not None:
	            load_kwargs["load_in_4bit"] = True

	    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
	    model = AutoModelForVision2Seq.from_pretrained(model_id, **load_kwargs)
	    if not on_gpu:
	        # With device_map unset the model must be placed explicitly.
	        model.to(device)
	    model.eval()
	    return processor, model


	def _system_prompt():
	return (
	"You are an expert adult-image describer. Describe only consenting adults. "
	"Never mention or imply minors, coercion, violence, incest, or illegal content."
	)


	def _style_prompt(mode: str):
	if mode == "soft":
	return (
	"Write an elegant, suggestive description focusing on mood, wardrobe, and lighting. "
	"Avoid explicit anatomy terms."
	)
	if mode == "artistic":
	return (
	"Describe composition, pose, camera angle, lens, lighting, colors, materials, and setting."
	)
	# raw
	return (
	"Use precise, clinical adult anatomy/act terms when visible. Mention positions, garments, accessories, fluids, and interactions explicitly."
	)


	def caption_with_qwen(image: Image.Image, mode: str, model_id: str, max_new_tokens=220, temperature=0.6):
	    """Generate a one-paragraph description of ``image`` with a local Qwen2-VL model.

	    Args:
	        image: The image to describe.
	        mode: Detail level forwarded to ``_style_prompt`` and quoted in the request.
	        model_id: Model identifier forwarded to ``load_qwen``.
	        max_new_tokens: Upper bound on generated tokens (coerced to ``int``).
	        temperature: Sampling temperature (coerced to ``float``). Values
	            ``<= 0`` select greedy decoding, because sampling requires a
	            strictly positive temperature.

	    Returns:
	        The generated text, stripped of surrounding whitespace. Only newly
	        generated tokens are decoded, so the prompt is never echoed back.
	    """
	    processor, model = load_qwen(model_id)

	    request_text = (
	        f"Describe the image in the '{mode}' style. "
	        + _style_prompt(mode)
	        + " Return a single coherent paragraph."
	    )
	    messages = [
	        {"role": "system", "content": _system_prompt()},
	        {
	            "role": "user",
	            "content": [
	                {"type": "image", "image": image},
	                {"type": "text", "text": request_text},
	            ],
	        },
	    ]

	    # Build chat template -> tokens
	    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
	    inputs = processor(text=[text], images=[image], return_tensors="pt")
	    device = next(model.parameters()).device
	    inputs = {k: v.to(device) if hasattr(v, 'to') else v for k, v in inputs.items()}

	    gen_kwargs = {"max_new_tokens": int(max_new_tokens), "repetition_penalty": 1.05}
	    temperature = float(temperature)
	    if temperature > 0.0:
	        gen_kwargs.update(do_sample=True, temperature=temperature, top_p=0.9)
	    else:
	        # The UI slider allows 0.0, which is not a valid sampling
	        # temperature; treat it as a request for deterministic output.
	        gen_kwargs["do_sample"] = False

	    with torch.no_grad():
	        out = model.generate(**inputs, **gen_kwargs)

	    # generate() returns prompt + continuation. Drop the prompt tokens rather
	    # than splitting the decoded text on the word "assistant", which would
	    # truncate any caption that happens to contain that word.
	    prompt_len = inputs["input_ids"].shape[1]
	    decoded = processor.batch_decode(out[:, prompt_len:], skip_special_tokens=True)[0]
	    return decoded.strip()


	# Minimal keyword tagger from caption text
	KEYWORDS = {
	    "acts": [
	        "oral", "blowjob", "doggy", "missionary", "cowgirl", "anal", "handjob",
	        "cumshot", "facial", "masturbation", "kissing", "threesome", "spanking",
	    ],
	    "nudity": ["topless", "nude", "full", "see-through", "lingerie", "panties", "stockings"],
	    "style": ["photo", "studio", "candid", "mirror", "selfie", "soft lighting", "hard lighting", "bokeh"],
	}

	# One whole-word pattern per keyword (an optional plural "s" is accepted).
	# Plain substring tests produce false tags, e.g. "floral" -> "oral" or
	# "canal" -> "anal", so matches are anchored on word boundaries.
	_KEYWORD_PATTERNS = {
	    word: re.compile(rf"\b{re.escape(word)}s?\b")
	    for group in KEYWORDS.values()
	    for word in group
	}


	def extract_tags(text: str):
	    """Return the sorted list of ``KEYWORDS`` entries found in ``text``.

	    Matching is case-insensitive and whole-word: a keyword is reported only
	    when it appears as its own word or phrase (optionally followed by a
	    plural "s"), not when it is merely embedded in a longer word.

	    Args:
	        text: Caption text to scan. Empty or ``None`` yields no tags.

	    Returns:
	        Alphabetically sorted list of distinct matching keywords.
	    """
	    if not text:
	        return []
	    lowered = text.lower()
	    return sorted(
	        word for word, pattern in _KEYWORD_PATTERNS.items() if pattern.search(lowered)
	    )


	# Gemini helper
	def get_gemini_response(image: Image.Image):
	    """Ask Gemini ("gemini-2.0-flash") for a text-to-image prompt describing ``image``.

	    Args:
	        image: The image sent alongside the instruction text.

	    Returns:
	        The response's ``text`` attribute, or ``""`` when the response has
	        no such attribute.

	    Raises:
	        RuntimeError: If the Gemini SDK could not be imported, or no API key
	            is configured in the environment.
	    """
	    if not USE_GEMINI:
	        raise RuntimeError("Gemini library not available in this Space.")

	    api_key = get_gemini_api_key()
	    if not api_key:
	        raise RuntimeError("Gemini API key not found. Set SECRET_KEY or GOOGLE_API_KEY in Space secrets.")

	    # The instruction is sent as the first content part, ahead of the image.
	    instruction = "".join((
	        "You are a prompt generator for text-to-image models. ",
	        "When content is adult, describe it clinically without euphemism.",
	    ))

	    genai.configure(api_key=api_key)
	    gemini = genai.GenerativeModel("gemini-2.0-flash")
	    response = gemini.generate_content([instruction, image])
	    return getattr(response, "text", "")


	# ---------------- LAYOUT ----------------
	# Two equal columns: upload/preview on the left, generation on the right.
	col1, col2 = st.columns(2)

	with col1:
	    st.markdown("### 📤 Upload Your Image")
	    uploaded_file = st.file_uploader(
	        label="Drag and drop or click to upload an image...",
	        type=["jpg", "jpeg", "png", "webp"],
	        label_visibility="collapsed",
	    )

	    # `image` stays None until an upload has been decoded successfully; the
	    # right-hand column uses that to decide whether generation is offered.
	    image = None
	    if uploaded_file is not None:
	        try:
	            image = Image.open(uploaded_file).convert("RGB")
	            st.image(image, caption="Uploaded Image", use_column_width=True)
	        except Exception as exc:
	            st.error(f"Failed to open image: {exc}")

	# Right-hand column: run the selected backend on button press, show the
	# prompt, and append a JSON record for it to captions.jsonl.
	with col2:
	    st.markdown("### 🎯 Generated Prompt")
	    if image is None:
	        st.info("Please upload an image to generate a prompt.")
	    elif st.button("✨ Generate Prompt", use_container_width=True):
	        with st.spinner("Generating prompt..."):
	            try:
	                # Any backend label other than the local one is routed to
	                # Gemini, whose helper raises if the SDK or key is missing.
	                use_local = backend.startswith("Local Qwen2-VL")
	                if use_local:
	                    prompt = caption_with_qwen(
	                        image,
	                        mode=mode,
	                        model_id=model_id,
	                        max_new_tokens=max_tokens,
	                        temperature=temperature,
	                    )
	                else:
	                    prompt = get_gemini_response(image)

	                if not prompt:
	                    st.warning("No text generated.")
	                else:
	                    st.code(prompt, language="markdown")
	                    # Build JSON record
	                    record = {
	                        # Timezone-aware replacement for the deprecated
	                        # datetime.utcnow(); the emitted string keeps the
	                        # same "...Z" shape.
	                        "timestamp": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
	                        "image": uploaded_file.name,
	                        "mode": mode if use_local else "gemini_default",
	                        "prompt": prompt,
	                        "tags": extract_tags(prompt),
	                        "policy": {"age": "adult_only", "consent": True},
	                        "backend": "qwen2-vl" if use_local else "gemini",
	                        "model": model_id if use_local else "gemini-2.0-flash",
	                    }
	                    st.json(record)
	                    # Append to JSONL
	                    out_path = "captions.jsonl"
	                    with open(out_path, "a", encoding="utf-8") as f:
	                        f.write(json.dumps(record, ensure_ascii=False) + "\n")
	                    st.success(f"Appended to {out_path}")
	            except torch.cuda.OutOfMemoryError:
	                st.error("CUDA OOM. Try a smaller model (e.g., Qwen2-VL-2B) or reduce max tokens.")
	            except Exception as e:
	                # UI boundary: surface the failure instead of crashing the app.
	                st.error(f"Generation failed: {e}")

	# Footer
	# A horizontal rule followed by the usage disclaimer.
	st.markdown("---")
	footer_note = (
	    "This Space is intended for lawful, adult-only NSFW dataset preparation. "
	    "You are responsible for compliance with local laws and platform policies."
	)
	st.caption(footer_note)