Spaces: Runtime error
Commit · fceeb2f
Parent(s): 10e2bf6
init
Files changed:
- .gitattributes copy +38 -0
- .gitignore +17 -0
- .python-version +1 -0
- README copy.md +12 -0
- api_utils.py +174 -0
- app.py +152 -0
- requirements.txt +5 -0
- schema.py +216 -0
.gitattributes copy ADDED
@@ -0,0 +1,38 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/scene.jpg filter=lfs diff=lfs merge=lfs -text
+assets/style.png filter=lfs diff=lfs merge=lfs -text
+assets/subject.jpg filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,17 @@
+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+
+image_generate.png
+
+# Virtual environments
+.venv
+.env
+
+
+# Images
+images/
.python-version ADDED
@@ -0,0 +1 @@
+3.11
README copy.md ADDED
@@ -0,0 +1,12 @@
+---
+title: FIBO Mashup
+emoji: 🏃
+colorFrom: yellow
+colorTo: gray
+sdk: gradio
+sdk_version: 5.49.1
+app_file: app.py
+pinned: false
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
api_utils.py ADDED
@@ -0,0 +1,174 @@
+import base64
+import io
+import json
+import os
+import time
+from typing import Any, Dict, Optional
+from PIL import Image
+import requests
+
+
+def _image_to_base64(image: Image.Image) -> str:
+    buffer = io.BytesIO()
+    image_format = (image.format or "PNG").upper()
+    if image_format not in {"PNG", "JPEG", "JPG"}:
+        image_format = "PNG"
+    image.save(buffer, format=image_format)
+    return base64.b64encode(buffer.getvalue()).decode("utf-8")
+
+
+def _extract_status(payload: Dict[str, Any]) -> Optional[str]:
+    status_info = payload.get("status") or payload.get("state")
+    if isinstance(status_info, dict):
+        state = status_info.get("state") or status_info.get("status")
+        if isinstance(state, str):
+            return state.lower()
+    elif isinstance(status_info, str):
+        return status_info.lower()
+    return None
+
+
+def _poll_bria_status(
+    status_url: str,
+    headers: Dict[str, str],
+    timeout_seconds: int = 120,
+    poll_interval: float = 1.5,
+) -> Dict[str, Any]:
+    deadline = time.time() + timeout_seconds
+    while True:
+        response = requests.get(status_url, headers=headers, timeout=30)
+        response.raise_for_status()
+        payload: Dict[str, Any] = response.json()
+        state = _extract_status(payload)
+
+        if state in {"succeeded", "success", "completed", "done"}:
+            if isinstance(payload.get("result"), dict):
+                return payload["result"]
+            if payload.get("results") is not None:
+                return payload["results"]
+            return payload
+
+        if state in {"failed", "error", "cancelled", "canceled"}:
+            raise RuntimeError(
+                f"Bria VLM API request failed: {json.dumps(payload, indent=2)}"
+            )
+
+        if time.time() > deadline:
+            raise TimeoutError(
+                f"Bria VLM API request timed out while polling {status_url}"
+            )
+
+        time.sleep(poll_interval)
+
+
+def _submit_bria_request(
+    url: str, payload: Dict[str, Any], api_token: str
+) -> Dict[str, Any]:
+    headers = {
+        "Content-Type": "application/json",
+        "api_token": api_token,
+    }
+    response = requests.post(url, json=payload, headers=headers, timeout=30)
+    response.raise_for_status()
+    initial_payload: Dict[str, Any] = response.json()
+
+    status_url = (
+        initial_payload.get("status_url")
+        or initial_payload.get("statusUrl")
+        or (initial_payload.get("status") or {}).get("status_url")
+    )
+
+    if status_url:
+        return _poll_bria_status(status_url, headers)
+
+    if isinstance(initial_payload.get("result"), dict):
+        return initial_payload["result"]
+    if initial_payload.get("results") is not None:
+        return initial_payload["results"]
+
+    return initial_payload
+
+
+def _parse_vlm_response(data: Any, prompt_role: str) -> str:
+    if isinstance(data, dict):
+        direct_match = data.get(prompt_role)
+        if isinstance(direct_match, str):
+            return direct_match
+
+        for key in ("prompt", "structured_prompt", "structuredPrompt", "text"):
+            if key in data:
+                value = data[key]
+                if isinstance(value, str):
+                    return value
+                if isinstance(value, dict):
+                    nested = value.get(prompt_role)
+                    if isinstance(nested, str):
+                        return nested
+
+        for key in ("result", "results"):
+            if key in data:
+                nested_result = _parse_vlm_response(data[key], prompt_role)
+                if nested_result:
+                    return nested_result
+
+    if isinstance(data, list):
+        for item in data:
+            nested_result = _parse_vlm_response(item, prompt_role)
+            if nested_result:
+                return nested_result
+
+    return json.dumps(data)
+
+
+def get_prompt_api(image_path: str, prompt_role: str) -> str:
+    """Send an image to the Bria VLM API and return the extracted prompt text.
+
+    The payload keys are aligned with the current public docs but may require
+    adjustment if your Bria workspace is configured differently. Override the
+    default endpoint via the ``BRIA_API_VLM_ENDPOINT`` environment variable if
+    you are using a custom workflow.
+    """
+    api_token = os.environ.get("BRIA_API_KEY")
+    if not api_token:
+        raise EnvironmentError(
+            "BRIA_API_KEY environment variable is required to use the Bria VLM API."
+        )
+
+    base_url = os.environ.get("BRIA_API_BASE_URL", "https://engine.prod.bria-api.com")
+    endpoint = os.environ.get("BRIA_API_VLM_ENDPOINT", "/v2/structured_prompt/generate")
+    url = f"{base_url.rstrip('/')}{endpoint}"
+
+    # convert image to base64
+    with Image.open(image_path) as image:
+        image_b64 = _image_to_base64(image)
+
+    payload = {"images": [image_b64]}
+
+    response = _submit_bria_request(url, payload, api_token)
+
+    return response["structured_prompt"]
+
+
+def get_image_from_url(image_url: str) -> Image.Image:
+    """Get an image from a URL."""
+    response = requests.get(image_url)
+    return Image.open(io.BytesIO(response.content))
+
+
+def generate_image(prompt: str) -> Image.Image:
+    """Generate an image from a prompt using the Bria VLM API."""
+    api_token = os.environ.get("BRIA_API_KEY")
+    if not api_token:
+        raise EnvironmentError(
+            "BRIA_API_KEY environment variable is required to use the Bria VLM API."
+        )
+
+    base_url = os.environ.get("BRIA_API_BASE_URL", "https://engine.prod.bria-api.com")
+    endpoint = os.environ.get("BRIA_API_GENERATE_ENDPOINT", "/v2/image/generate")
+    url = f"{base_url.rstrip('/')}{endpoint}"
+
+    payload = {"structured_prompt": prompt}
+
+    response = _submit_bria_request(url, payload, api_token)
+
+    return get_image_from_url(response["image_url"])
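
For orientation, a minimal sketch of how these helpers chain together outside the app. Assumptions: BRIA_API_KEY is exported, the default endpoints above are used, and "my_subject.jpg" is an illustrative local file, not part of the commit.

# Illustrative sketch, not part of the commit. Assumes BRIA_API_KEY is set and
# the default Bria endpoints above are reachable; "my_subject.jpg" is a placeholder.
from api_utils import generate_image, get_prompt_api

# 1) Ask the Bria VLM endpoint to describe a local image as a structured prompt.
structured_prompt = get_prompt_api("my_subject.jpg", prompt_role="subject")

# 2) Feed that structured prompt to the generate endpoint and save the returned image.
image = generate_image(structured_prompt)
image.save("image_generate.png")  # same output name that .gitignore excludes
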
app.py ADDED
@@ -0,0 +1,152 @@
+import gradio as gr
+import tempfile
+import concurrent.futures
+import dotenv
+import os
+from api_utils import get_prompt_api, generate_image
+from schema import claude_structured_output
+import time
+dotenv.load_dotenv()
+
+def get_image_suffix(image):
+    if hasattr(image, 'format') and image.format:
+        return '.' + image.format.lower()
+    if hasattr(image, 'filename') and image.filename:
+        _, ext = os.path.splitext(image.filename)
+        if ext:
+            return ext.lower()
+    return '.jpg'
+
+def process_images(subject_image, scene_image, style_image):
+    """
+    Process three images and generate a combined image.
+
+    Args:
+        subject_image: PIL Image for the main subject
+        scene_image: PIL Image for the scene/background
+        style_image: PIL Image for the artistic style
+
+    Returns:
+        PIL Image: The generated combined image
+    """
+    if subject_image is None or scene_image is None or style_image is None:
+        raise gr.Error("Please upload all three images (subject, scene, and style)")
+
+    try:
+        # Save images temporarily to pass to the API
+        with tempfile.TemporaryDirectory() as temp_dir:
+            subject_path = os.path.join(temp_dir, "subject" + get_image_suffix(subject_image))
+            scene_path = os.path.join(temp_dir, "scene" + get_image_suffix(scene_image))
+            style_path = os.path.join(temp_dir, "style" + get_image_suffix(style_image))
+            subject_image.save(subject_path)
+            scene_image.save(scene_path)
+            style_image.save(style_path)
+
+            # Get descriptions for each image
+            time_start = time.time()
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                future_subject = executor.submit(get_prompt_api, subject_path, "subject")
+                future_scene = executor.submit(get_prompt_api, scene_path, "scene")
+                future_style = executor.submit(get_prompt_api, style_path, "style")
+                subject = future_subject.result()
+                scene = future_scene.result()
+                style = future_style.result()
+            time_end = time.time()
+            print(f"Time taken to get descriptions: {time_end - time_start} seconds")
+            # Create combined prompt
+            prompt = f"""
+place the main subject from the first image description and place it in the scene from the second image description with a style taken from the third image description.
+
+first (subject) image description:
+{subject}
+second (scene) image description:
+{scene}
+third (style) image description:
+{style}
+
+create a new image description that incorporates all of the descriptions.
+put the subject in the scene with the style.
+"""
+
+            # Generate structured output using Claude API
+            time_start = time.time()
+            response = claude_structured_output(prompt)
+            time_end = time.time()
+            print(f"Time taken to generate structured output: {time_end - time_start} seconds")
+            # Generate the final image
+            time_start = time.time()
+            result_image = generate_image(response)
+            time_end = time.time()
+            print(f"Time taken to generate image: {time_end - time_start} seconds")
+
+            return result_image
+
+    except Exception as e:
+        # Clean up temporary files on error (the TemporaryDirectory may already have removed them)
+        if "subject_path" in locals() and os.path.exists(subject_path):
+            os.unlink(subject_path)
+        if "scene_path" in locals() and os.path.exists(scene_path):
+            os.unlink(scene_path)
+        if "style_path" in locals() and os.path.exists(style_path):
+            os.unlink(style_path)
+        raise gr.Error(f"Error processing images: {str(e)}")
+
+
+# Create Gradio interface
+with gr.Blocks(title="2IM - Image Combination Generator") as demo:
+    gr.Markdown("""
+    # 🎨 FIBO Mashup - Image Combination Generator
+
+    Combine three images into one:
+    1. **Subject Image**: The main object or person you want in the final image
+    2. **Scene Image**: The background/environment for the final image
+    3. **Style Image**: The artistic style to apply to the final image
+    """)
+
+    with gr.Row():
+        with gr.Column():
+            subject_input = gr.Image(
+                label="Subject Image",
+                type="pil",
+                height=300,
+                value="assets/subject.jpg",
+            )
+            gr.Markdown("*Upload the main subject/object*")
+
+        with gr.Column():
+            scene_input = gr.Image(
+                label="Scene Image", type="pil", height=300, value="assets/scene.jpg"
+            )
+            gr.Markdown("*Upload the scene/background*")
+
+        with gr.Column():
+            style_input = gr.Image(
+                label="Style Image", type="pil", height=300, value="assets/style.png"
+            )
+            gr.Markdown("*Upload the style reference*")
+
+    generate_btn = gr.Button("🎨 Generate Combined Image", variant="primary", size="lg")
+
+    gr.Markdown("---")
+
+    output_image = gr.Image(label="Generated Image", type="pil", height=500)
+
+    # Set up the event handler
+    generate_btn.click(
+        fn=process_images,
+        inputs=[subject_input, scene_input, style_input],
+        outputs=output_image,
+    )
+
+    gr.Markdown("""
+    ### How it works:
+    1. Upload three images using the fields above
+    2. Click "Generate Combined Image"
+    3. The AI will analyze each image and create a new image that combines the subject from the first image, places it in the scene from the second image, and applies the style from the third image
+
+    *Note: Generation may take a minute or two depending on API response times.*
+    """)
+
+
+if __name__ == "__main__":
+    demo.launch(share=True)
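
For a quick check of process_images outside the Gradio UI, a sketch along these lines should work. Assumptions: python-dotenv is installed so that the `import dotenv` above succeeds (requirements.txt does not list it), BRIA_API_KEY and the Anthropic credentials expected by pydantic-ai are set, and the bundled assets/ images serve as inputs.

# Illustrative sketch, not part of the commit: run the pipeline once without the UI.
# Assumes BRIA_API_KEY and ANTHROPIC_API_KEY (read by pydantic-ai) are set, and
# python-dotenv is installed so that `import dotenv` in app.py succeeds.
from PIL import Image

from app import process_images  # importing app builds the Blocks UI but does not launch it

subject = Image.open("assets/subject.jpg")  # bundled example subject
scene = Image.open("assets/scene.jpg")      # bundled example scene
style = Image.open("assets/style.png")      # bundled example style

result = process_images(subject, scene, style)
result.save("image_generate.png")
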
requirements.txt ADDED
@@ -0,0 +1,5 @@
+gradio>=3.36.1
+pillow>=12.0.0
+setuptools>=80.9.0
+pydantic>=2.12.3
+pydantic-ai>=1.11.0
schema.py ADDED
@@ -0,0 +1,216 @@
+from typing import List, Optional
+
+from pydantic import BaseModel, Field
+from pydantic_ai import Agent
+
+
+class ObjectDescription(BaseModel):
+    description: str = Field(..., description="Short description of the object.")
+    location: str = Field(
+        ..., description="E.g., 'center', 'top-left', 'bottom-right foreground'."
+    )
+    relationship: str = Field(
+        ...,
+        description="Describe the relationship between the object and the other objects in the image.",
+    )
+    relative_size: Optional[str] = Field(
+        None, description="E.g., 'small', 'medium', 'large within frame'."
+    )
+    shape_and_color: Optional[str] = Field(
+        None, description="Describe the basic shape and dominant color."
+    )
+    texture: Optional[str] = Field(
+        None, description="E.g., 'smooth', 'rough', 'metallic', 'furry'."
+    )
+    appearance_details: Optional[str] = Field(
+        None, description="Any other notable visual details."
+    )
+    # If cluster of object
+    number_of_objects: Optional[int] = Field(
+        None, description="The number of objects in the cluster."
+    )
+    # Human-specific fields
+    pose: Optional[str] = Field(None, description="Describe the body position.")
+    expression: Optional[str] = Field(None, description="Describe facial expression.")
+    clothing: Optional[str] = Field(None, description="Describe attire.")
+    action: Optional[str] = Field(None, description="Describe the action of the human.")
+    gender: Optional[str] = Field(None, description="Describe the gender of the human.")
+    skin_tone_and_texture: Optional[str] = Field(
+        None, description="Describe the skin tone and texture."
+    )
+    orientation: Optional[str] = Field(
+        None, description="Describe the orientation of the human."
+    )
+
+
+class LightingDetails(BaseModel):
+    conditions: str = Field(
+        ...,
+        description="E.g., 'bright daylight', 'dim indoor', 'studio lighting', 'golden hour'.",
+    )
+    direction: str = Field(
+        ..., description="E.g., 'front-lit', 'backlit', 'side-lit from left'."
+    )
+    shadows: Optional[str] = Field(
+        None, description="Describe the presence of shadows."
+    )
+
+
+class AestheticsDetails(BaseModel):
+    composition: str = Field(
+        ...,
+        description="E.g., 'rule of thirds', 'symmetrical', 'centered', 'leading lines'.",
+    )
+    color_scheme: str = Field(
+        ...,
+        description="E.g., 'monochromatic blue', 'warm complementary colors', 'high contrast'.",
+    )
+    mood_atmosphere: str = Field(
+        ..., description="E.g., 'serene', 'energetic', 'mysterious', 'joyful'."
+    )
+
+
+class PhotographicCharacteristicsDetails(BaseModel):
+    depth_of_field: str = Field(
+        ..., description="E.g., 'shallow', 'deep', 'bokeh background'."
+    )
+    focus: str = Field(
+        ..., description="E.g., 'sharp focus on subject', 'soft focus', 'motion blur'."
+    )
+    camera_angle: str = Field(
+        ..., description="E.g., 'eye-level', 'low angle', 'high angle', 'dutch angle'."
+    )
+    lens_focal_length: str = Field(
+        ..., description="E.g., 'wide-angle', 'telephoto', 'macro', 'fisheye'."
+    )
+
+
+class TextRender(BaseModel):
+    text: str = Field(..., description="The text content.")
+    location: str = Field(
+        ..., description="E.g., 'center', 'top-left', 'bottom-right foreground'."
+    )
+    size: str = Field(..., description="E.g., 'small', 'medium', 'large within frame'.")
+    color: str = Field(..., description="E.g., 'red', 'blue', 'green'.")
+    font: str = Field(..., description="E.g., 'realistic', 'cartoonish', 'minimalist'.")
+    appearance_details: Optional[str] = Field(
+        None, description="Any other notable visual details."
+    )
+
+
+class ImageAnalysis(BaseModel):
+    short_description: str = Field(
+        ..., description="A concise summary of the image content, 200 words maximum."
+    )
+    objects: List[ObjectDescription] = Field(
+        ..., description="List of prominent foreground/midground objects."
+    )
+    background_setting: str = Field(
+        ...,
+        description="Describe the overall environment, setting, or background, including any notable background elements.",
+    )
+    lighting: LightingDetails = Field(..., description="Details about the lighting.")
+    aesthetics: AestheticsDetails = Field(
+        ..., description="Details about the image aesthetics."
+    )
+    photographic_characteristics: Optional[PhotographicCharacteristicsDetails] = Field(
+        None, description="Details about photographic characteristics."
+    )
+    style_medium: Optional[str] = Field(
+        None, description="Identify the artistic style or medium."
+    )
+    text_render: Optional[List[TextRender]] = Field(
+        None, description="List of text renders in the image."
+    )
+    context: str = Field(
+        ...,
+        description="Provide any additional context that helps understand the image better.",
+    )
+    artistic_style: Optional[str] = Field(
+        None, description="describe specific artistic characteristics, 3 words maximum."
+    )
+
+
+json_schema_full = """1. `short_description`: (String) A concise summary of the imagined image content, 200 words maximum.
+2. `objects`: (Array of Objects) List a maximum of 5 prominent objects. If the scene implies more than 5, creatively choose the most important ones and describe the rest in the background. For each object, include:
+    * `description`: (String) A detailed description of the imagined object, 100 words maximum.
+    * `location`: (String) E.g., "center", "top-left", "bottom-right foreground".
+    * `relative_size`: (String) E.g., "small", "medium", "large within frame". (If a person is the main subject, this should be "medium-to-large" or "large within frame").
+    * `shape_and_color`: (String) Describe the basic shape and dominant color.
+    * `texture`: (String) E.g., "smooth", "rough", "metallic", "furry".
+    * `appearance_details`: (String) Any other notable visual details.
+    * `relationship`: (String) Describe the relationship between the object and the other objects in the image.
+    * `orientation`: (String) Describe the orientation or positioning of the object, e.g., "upright", "tilted 45 degrees", "horizontal", "vertical", "facing left", "facing right", "upside down", "lying on its side".
+    * If the object is a human or a human-like object, include the following:
+        * `pose`: (String) Describe the body position.
+        * `expression`: (String) Describe facial expression and emotion. E.g., "winking", "joyful", "serious", "surprised", "calm".
+        * `clothing`: (String) Describe attire.
+        * `action`: (String) Describe the action of the human.
+        * `gender`: (String) Describe the gender of the human.
+        * `skin_tone_and_texture`: (String) Describe the skin tone and texture.
+    * If the object is a cluster of objects, include the following:
+        * `number_of_objects`: (Integer) The number of objects in the cluster.
+3. `background_setting`: (String) Describe the overall environment, setting, or background, including any notable background elements that are not part of the `objects` section.
+4. `lighting`: (Object)
+    * `conditions`: (String) E.g., "bright daylight", "dim indoor", "studio lighting", "golden hour".
+    * `direction`: (String) E.g., "front-lit", "backlit", "side-lit from left".
+    * `shadows`: (String) Describe the presence and quality of shadows, e.g., "long, soft shadows", "sharp, defined shadows", "minimal shadows".
+5. `aesthetics`: (Object)
+    * `composition`: (String) E.g., "rule of thirds", "symmetrical", "centered", "leading lines". If people are the main subject, specify the shot type, e.g., "medium shot", "close-up", "portrait composition".
+    * `color_scheme`: (String) E.g., "monochromatic blue", "warm complementary colors", "high contrast".
+    * `mood_atmosphere`: (String) E.g., "serene", "energetic", "mysterious", "joyful".
+6. `photographic_characteristics`: (Object)
+    * `depth_of_field`: (String) E.g., "shallow", "deep", "bokeh background".
+    * `focus`: (String) E.g., "sharp focus on subject", "soft focus", "motion blur".
+    * `camera_angle`: (String) E.g., "eye-level", "low angle", "high angle", "dutch angle".
+    * `lens_focal_length`: (String) E.g., "wide-angle", "telephoto", "macro", "fisheye". (If the main subject is a person, prefer "standard lens (e.g., 35mm-50mm)" or "portrait lens (e.g., 50mm-85mm)" to ensure they are framed more closely. Avoid "wide-angle" for people unless specified).
+7. `style_medium`: (String) Identify the artistic style or medium based on the user's prompt or creative interpretation (e.g., "photograph", "oil painting", "watercolor", "3D render", "digital illustration", "pencil sketch").
+8. `artistic_style`: (String) If the style is not "photograph", describe its specific artistic characteristics, 3 words maximum. (e.g., "impressionistic, vibrant, textured" for an oil painting).
+9. `context`: (String) Provide a general description of the type of image this would be. For example: "This is a concept for a high-fashion editorial photograph intended for a magazine spread," or "This describes a piece of concept art for a fantasy video game."
+10. `text_render`: (Array of Objects) By default, this array should be empty (`[]`). Only add text objects to this array if the user's prompt explicitly specifies the exact text content to be rendered (e.g., user asks for "a poster with the title 'Cosmic Dream'"). Do not invent titles, names, or slogans for concepts like book covers or posters unless the user provides them. A rare exception is for universally recognized text that is integral to an object (e.g., the word 'STOP' on a 'stop sign'). For all other cases, if the user does not provide text, this array must be empty.
+    * `text`: (String) The exact text content provided by the user. NEVER use generic placeholders.
+    * `location`: (String) E.g., "center", "top-left", "bottom-right foreground".
+    * `size`: (String) E.g., "medium", "large", "large within frame".
+    * `color`: (String) E.g., "red", "blue", "green".
+    * `font`: (String) E.g., "realistic", "cartoonish", "minimalist", "serif typeface".
+    * `appearance_details`: (String) Any other notable visual details."""
+
+
+def get_system_prompt() -> str:
+    return f"""You are a Meticulous Visual Editor and Senior Art Director at a leading Generative AI company.
+Your expertise is in combining multiple image descriptions into a single image description.
+Your primary task is to receive multiple image descriptions and generate a single image description that incorporates all of the descriptions.
+Adhere strictly to the following structure and guidelines:
+1. **Input:** You will receive three image descriptions. The first image description is the main subject, the second image description is the scene, and the third image description is the style.
+2. **Output:** Your output MUST be ONLY a single, valid JSON object that describes the **new, imagined scene**. Do not describe the original reference image.
+3. **Modification Logic:**
+    * Carefully parse the three image descriptions to understand the desired changes.
+    * Combine the three image descriptions into a single image description that incorporates all of the descriptions.
+    * The image description should be in the same format as the image descriptions provided.
+    * The image description should be a single, valid image description that incorporates all of the descriptions.
+    * The image description should be a single, valid image description that incorporates all of the descriptions.
+4. **Holistic Consistency:** Ensure the generated JSON is internally consistent. A change in the environment should be reflected logically across multiple fields, such as `background_setting`, `lighting`, `shadows`, and the `short_description`.
+5. **Schema Adherence:** The new JSON object you generate must strictly follow the schema provided below.
+The JSON object must contain the following keys precisely:
+{json_schema_full}"""
+
+
+def claude_structured_output(prompt: str) -> str:
+    """
+    Call Claude API with a prompt and return structured output using a Pydantic schema.
+
+    Args:
+        prompt: The user prompt/query to send to Claude
+
+    Returns:
+        A JSON string with the structured response
+    """
+    system_prompt = get_system_prompt()
+    agent = Agent(
+        "anthropic:claude-sonnet-4-5",
+        output_type=ImageAnalysis,
+        system_prompt=system_prompt,
+    )
+
+    response_text = agent.run_sync(prompt)
+    return response_text.output.model_dump_json()
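
Since claude_structured_output returns the model's JSON as a string, it can be validated back into the ImageAnalysis model with pydantic's model_validate_json. A small sketch; the prompt text is illustrative and the call requires the Anthropic credentials expected by pydantic-ai.

# Illustrative sketch, not part of the commit: round-trip the JSON string back
# into the ImageAnalysis model (the prompt text is made up for demonstration).
from schema import ImageAnalysis, claude_structured_output

combined_json = claude_structured_output(
    "subject: a red vintage bicycle; scene: a foggy harbor at dawn; style: watercolor"
)

analysis = ImageAnalysis.model_validate_json(combined_json)
print(analysis.short_description)
print([obj.description for obj in analysis.objects])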