davidi-bria committed
Commit fceeb2f · 1 Parent(s): 10e2bf6
Files changed (8)
  1. .gitattributes copy +38 -0
  2. .gitignore +17 -0
  3. .python-version +1 -0
  4. README copy.md +12 -0
  5. api_utils.py +174 -0
  6. app.py +152 -0
  7. requirements.txt +5 -0
  8. schema.py +216 -0
.gitattributes copy ADDED
@@ -0,0 +1,38 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/scene.jpg filter=lfs diff=lfs merge=lfs -text
+ assets/style.png filter=lfs diff=lfs merge=lfs -text
+ assets/subject.jpg filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,17 @@
+ # Python-generated files
+ __pycache__/
+ *.py[oc]
+ build/
+ dist/
+ wheels/
+ *.egg-info
+
+ image_generate.png
+
+ # Virtual environments
+ .venv
+ .env
+
+
+ # Images
+ images/
.python-version ADDED
@@ -0,0 +1 @@
+ 3.11
README copy.md ADDED
@@ -0,0 +1,12 @@
+ ---
+ title: FIBO Mashup
+ emoji: 🏃
+ colorFrom: yellow
+ colorTo: gray
+ sdk: gradio
+ sdk_version: 5.49.1
+ app_file: app.py
+ pinned: false
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
api_utils.py ADDED
@@ -0,0 +1,174 @@
+ import base64
+ import io
+ import json
+ import os
+ import time
+ from typing import Any, Dict, Optional
+ from PIL import Image
+ import requests
+
+
+ def _image_to_base64(image: Image.Image) -> str:
+     buffer = io.BytesIO()
+     image_format = (image.format or "PNG").upper()
+     if image_format not in {"PNG", "JPEG", "JPG"}:
+         image_format = "PNG"
+     image.save(buffer, format=image_format)
+     return base64.b64encode(buffer.getvalue()).decode("utf-8")
+
+
+ def _extract_status(payload: Dict[str, Any]) -> Optional[str]:
+     status_info = payload.get("status") or payload.get("state")
+     if isinstance(status_info, dict):
+         state = status_info.get("state") or status_info.get("status")
+         if isinstance(state, str):
+             return state.lower()
+     elif isinstance(status_info, str):
+         return status_info.lower()
+     return None
+
+
+ def _poll_bria_status(
+     status_url: str,
+     headers: Dict[str, str],
+     timeout_seconds: int = 120,
+     poll_interval: float = 1.5,
+ ) -> Dict[str, Any]:
+     deadline = time.time() + timeout_seconds
+     while True:
+         response = requests.get(status_url, headers=headers, timeout=30)
+         response.raise_for_status()
+         payload: Dict[str, Any] = response.json()
+         state = _extract_status(payload)
+
+         if state in {"succeeded", "success", "completed", "done"}:
+             if isinstance(payload.get("result"), dict):
+                 return payload["result"]
+             if payload.get("results") is not None:
+                 return payload["results"]
+             return payload
+
+         if state in {"failed", "error", "cancelled", "canceled"}:
+             raise RuntimeError(
+                 f"Bria VLM API request failed: {json.dumps(payload, indent=2)}"
+             )
+
+         if time.time() > deadline:
+             raise TimeoutError(
+                 f"Bria VLM API request timed out while polling {status_url}"
+             )
+
+         time.sleep(poll_interval)
+
+
+ def _submit_bria_request(
+     url: str, payload: Dict[str, Any], api_token: str
+ ) -> Dict[str, Any]:
+     headers = {
+         "Content-Type": "application/json",
+         "api_token": api_token,
+     }
+     response = requests.post(url, json=payload, headers=headers, timeout=30)
+     response.raise_for_status()
+     initial_payload: Dict[str, Any] = response.json()
+
+     status_url = (
+         initial_payload.get("status_url")
+         or initial_payload.get("statusUrl")
+         or (initial_payload.get("status") or {}).get("status_url")
+     )
+
+     if status_url:
+         return _poll_bria_status(status_url, headers)
+
+     if isinstance(initial_payload.get("result"), dict):
+         return initial_payload["result"]
+     if initial_payload.get("results") is not None:
+         return initial_payload["results"]
+
+     return initial_payload
+
+
+ def _parse_vlm_response(data: Any, prompt_role: str) -> str:
+     if isinstance(data, dict):
+         direct_match = data.get(prompt_role)
+         if isinstance(direct_match, str):
+             return direct_match
+
+         for key in ("prompt", "structured_prompt", "structuredPrompt", "text"):
+             if key in data:
+                 value = data[key]
+                 if isinstance(value, str):
+                     return value
+                 if isinstance(value, dict):
+                     nested = value.get(prompt_role)
+                     if isinstance(nested, str):
+                         return nested
+
+         for key in ("result", "results"):
+             if key in data:
+                 nested_result = _parse_vlm_response(data[key], prompt_role)
+                 if nested_result:
+                     return nested_result
+
+     if isinstance(data, list):
+         for item in data:
+             nested_result = _parse_vlm_response(item, prompt_role)
+             if nested_result:
+                 return nested_result
+
+     return json.dumps(data)
+
+
+ def get_prompt_api(image_path: str, prompt_role: str) -> str:
+     """Send an image to the Bria VLM API and return the extracted prompt text.
+
+     The payload keys are aligned with the current public docs but may require
+     adjustment if your Bria workspace is configured differently. Override the
+     default endpoint via the ``BRIA_API_VLM_ENDPOINT`` environment variable if
+     you are using a custom workflow.
+     """
+     api_token = os.environ.get("BRIA_API_KEY")
+     if not api_token:
+         raise EnvironmentError(
+             "BRIA_API_KEY environment variable is required to use the Bria VLM API."
+         )
+
+     base_url = os.environ.get("BRIA_API_BASE_URL", "https://engine.prod.bria-api.com")
+     endpoint = os.environ.get("BRIA_API_VLM_ENDPOINT", "/v2/structured_prompt/generate")
+     url = f"{base_url.rstrip('/')}{endpoint}"
+
+     # Convert the image to base64 before sending it to the API
+     with Image.open(image_path) as image:
+         image_b64 = _image_to_base64(image)
+
+     payload = {"images": [image_b64]}
+
+     response = _submit_bria_request(url, payload, api_token)
+
+     # Prefer the canonical key; fall back to the generic parser so alternative
+     # response shapes still yield a usable prompt string.
+     if isinstance(response, dict):
+         structured_prompt = response.get("structured_prompt")
+         if isinstance(structured_prompt, str):
+             return structured_prompt
+     return _parse_vlm_response(response, prompt_role)
+
+
+ def get_image_from_url(image_url: str) -> Image.Image:
+     """Download an image from a URL and return it as a PIL image."""
+     response = requests.get(image_url, timeout=60)
+     response.raise_for_status()
+     return Image.open(io.BytesIO(response.content))
+
+
+ def generate_image(prompt: str) -> Image.Image:
+     """Generate an image from a structured prompt using the Bria image generation API."""
+     api_token = os.environ.get("BRIA_API_KEY")
+     if not api_token:
+         raise EnvironmentError(
+             "BRIA_API_KEY environment variable is required to use the Bria image generation API."
+         )
+
+     base_url = os.environ.get("BRIA_API_BASE_URL", "https://engine.prod.bria-api.com")
+     endpoint = os.environ.get("BRIA_API_GENERATE_ENDPOINT", "/v2/image/generate")
+     url = f"{base_url.rstrip('/')}{endpoint}"
+
+     payload = {"structured_prompt": prompt}
+
+     response = _submit_bria_request(url, payload, api_token)
+
+     return get_image_from_url(response["image_url"])
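A minimal usage sketch for these two helpers, assuming BRIA_API_KEY is exported and the default endpoints above match your Bria workspace (override the env vars otherwise):

    import os
    from api_utils import get_prompt_api, generate_image

    # Hypothetical local test; requires a valid Bria API token in the environment.
    assert os.environ.get("BRIA_API_KEY"), "export BRIA_API_KEY first"

    subject_prompt = get_prompt_api("assets/subject.jpg", prompt_role="subject")
    image = generate_image(subject_prompt)  # PIL image fetched from the returned image_url
    image.save("image_generate.png")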
app.py ADDED
@@ -0,0 +1,152 @@
+ import gradio as gr
+ import tempfile
+ import concurrent.futures
+ import dotenv
+ import os
+ from api_utils import get_prompt_api, generate_image
+ from schema import claude_structured_output
+ import time
+ dotenv.load_dotenv()
+
+ def get_image_suffix(image):
+     if hasattr(image, 'format') and image.format:
+         return '.' + image.format.lower()
+     if hasattr(image, 'filename') and image.filename:
+         _, ext = os.path.splitext(image.filename)
+         if ext:
+             return ext.lower()
+     return '.jpg'
+
+ def process_images(subject_image, scene_image, style_image):
+     """
+     Process three images and generate a combined image.
+
+     Args:
+         subject_image: PIL Image for the main subject
+         scene_image: PIL Image for the scene/background
+         style_image: PIL Image for the artistic style
+
+     Returns:
+         PIL Image: The generated combined image
+     """
+     if subject_image is None or scene_image is None or style_image is None:
+         raise gr.Error("Please upload all three images (subject, scene, and style)")
+
+     try:
+         # Save images temporarily to pass to the API
+         with tempfile.TemporaryDirectory() as temp_dir:
+             subject_path = os.path.join(temp_dir, "subject" + get_image_suffix(subject_image))
+             scene_path = os.path.join(temp_dir, "scene" + get_image_suffix(scene_image))
+             style_path = os.path.join(temp_dir, "style" + get_image_suffix(style_image))
+             subject_image.save(subject_path)
+             scene_image.save(scene_path)
+             style_image.save(style_path)
+
+             # Get descriptions for each image
+             time_start = time.time()
+             with concurrent.futures.ThreadPoolExecutor() as executor:
+                 future_subject = executor.submit(get_prompt_api, subject_path, "subject")
+                 future_scene = executor.submit(get_prompt_api, scene_path, "scene")
+                 future_style = executor.submit(get_prompt_api, style_path, "style")
+                 subject = future_subject.result()
+                 scene = future_scene.result()
+                 style = future_style.result()
+             time_end = time.time()
+             print(f"Time taken to get descriptions: {time_end - time_start} seconds")
+ # Create combined prompt
57
+ prompt = f"""
58
+ place the main subject from the first image description and place it in the scene from the second image description with a style taken from the third image description.
59
+
60
+ first (subject) image description:
61
+ {subject}
62
+ second (scene) image description:
63
+ {scene}
64
+ third (style) image description:
65
+ {style}
66
+
67
+ create a new image description that incorporates all of the descriptions.
68
+ put the subject in the scene with the style.
69
+ """
70
+
71
+ # Generate structured output using Claude API
72
+ time_start = time.time()
73
+ response = claude_structured_output(prompt)
74
+ time_end = time.time()
75
+ print(f"Time taken to generate structured output: {time_end - time_start} seconds")
76
+ # Generate the final image
77
+ time_start = time.time()
78
+ result_image = generate_image(response)
79
+ time_end = time.time()
80
+ print(f"Time taken to generate image: {time_end - time_start} seconds")
81
+
82
+ return result_image
83
+
84
+ except Exception as e:
85
+ # Clean up temporary files on error
86
+ if "subject_path" in locals():
87
+ os.unlink(subject_path)
88
+ if "scene_path" in locals():
89
+ os.unlink(scene_path)
90
+ if "style_path" in locals():
91
+ os.unlink(style_path)
92
+ raise gr.Error(f"Error processing images: {str(e)}")
93
+
94
+
95
+ # Create Gradio interface
96
+ with gr.Blocks(title="2IM - Image Combination Generator") as demo:
97
+ gr.Markdown("""
98
+ # 🎨 FIBO Mashup - Image Combination Generator
99
+
100
+ Combine three images into one:
101
+ 1. **Subject Image**: The main object or person you want in the final image
102
+ 2. **Scene Image**: The background/environment for the final image
103
+ 3. **Style Image**: The artistic style to apply to the final image
104
+ """)
105
+
106
+ with gr.Row():
107
+ with gr.Column():
108
+ subject_input = gr.Image(
109
+ label="Subject Image",
110
+ type="pil",
111
+ height=300,
112
+ value="assets/subject.jpg",
113
+ )
114
+ gr.Markdown("*Upload the main subject/object*")
115
+
116
+ with gr.Column():
117
+ scene_input = gr.Image(
118
+ label="Scene Image", type="pil", height=300, value="assets/scene.jpg"
119
+ )
120
+ gr.Markdown("*Upload the scene/background*")
121
+
122
+ with gr.Column():
123
+ style_input = gr.Image(
124
+ label="Style Image", type="pil", height=300, value="assets/style.png"
125
+ )
126
+ gr.Markdown("*Upload the style reference*")
127
+
128
+ generate_btn = gr.Button("🎨 Generate Combined Image", variant="primary", size="lg")
129
+
130
+ gr.Markdown("---")
131
+
132
+ output_image = gr.Image(label="Generated Image", type="pil", height=500)
133
+
134
+ # Set up the event handler
135
+ generate_btn.click(
136
+ fn=process_images,
137
+ inputs=[subject_input, scene_input, style_input],
138
+ outputs=output_image,
139
+ )
140
+
141
+ gr.Markdown("""
142
+ ### How it works:
143
+ 1. Upload three images using the fields above
144
+ 2. Click "Generate Combined Image"
145
+ 3. The AI will analyze each image and create a new image that combines the subject from the first image, places it in the scene from the second image, and applies the style from the third image
146
+
147
+ *Note: Generation may take a minute or two depending on API response times.*
148
+ """)
149
+
150
+
151
+ if __name__ == "__main__":
152
+ demo.launch(share=True)
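For a quick smoke test without the Gradio UI, process_images can be driven directly with PIL images, assuming the bundled assets are present and both BRIA_API_KEY and the Anthropic credentials used by pydantic-ai (e.g. ANTHROPIC_API_KEY) are configured:

    from PIL import Image
    from app import process_images

    # Uses the demo assets committed alongside the app; any three images work.
    subject = Image.open("assets/subject.jpg")
    scene = Image.open("assets/scene.jpg")
    style = Image.open("assets/style.png")

    result = process_images(subject, scene, style)
    result.save("image_generate.png")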
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ gradio>=3.36.1
+ pillow>=12.0.0
+ setuptools>=80.9.0
+ pydantic>=2.12.3
+ pydantic-ai>=1.11.0
schema.py ADDED
@@ -0,0 +1,216 @@
+ from typing import List, Optional
+
+ from pydantic import BaseModel, Field
+ from pydantic_ai import Agent
+
+
+ class ObjectDescription(BaseModel):
+     description: str = Field(..., description="Short description of the object.")
+     location: str = Field(
+         ..., description="E.g., 'center', 'top-left', 'bottom-right foreground'."
+     )
+     relationship: str = Field(
+         ...,
+         description="Describe the relationship between the object and the other objects in the image.",
+     )
+     relative_size: Optional[str] = Field(
+         None, description="E.g., 'small', 'medium', 'large within frame'."
+     )
+     shape_and_color: Optional[str] = Field(
+         None, description="Describe the basic shape and dominant color."
+     )
+     texture: Optional[str] = Field(
+         None, description="E.g., 'smooth', 'rough', 'metallic', 'furry'."
+     )
+     appearance_details: Optional[str] = Field(
+         None, description="Any other notable visual details."
+     )
+     # If cluster of object
+     number_of_objects: Optional[int] = Field(
+         None, description="The number of objects in the cluster."
+     )
+     # Human-specific fields
+     pose: Optional[str] = Field(None, description="Describe the body position.")
+     expression: Optional[str] = Field(None, description="Describe facial expression.")
+     clothing: Optional[str] = Field(None, description="Describe attire.")
+     action: Optional[str] = Field(None, description="Describe the action of the human.")
+     gender: Optional[str] = Field(None, description="Describe the gender of the human.")
+     skin_tone_and_texture: Optional[str] = Field(
+         None, description="Describe the skin tone and texture."
+     )
+     orientation: Optional[str] = Field(
+         None, description="Describe the orientation of the human."
+     )
+
+
+ class LightingDetails(BaseModel):
+     conditions: str = Field(
+         ...,
+         description="E.g., 'bright daylight', 'dim indoor', 'studio lighting', 'golden hour'.",
+     )
+     direction: str = Field(
+         ..., description="E.g., 'front-lit', 'backlit', 'side-lit from left'."
+     )
+     shadows: Optional[str] = Field(
+         None, description="Describe the presence of shadows."
+     )
+
+
+ class AestheticsDetails(BaseModel):
+     composition: str = Field(
+         ...,
+         description="E.g., 'rule of thirds', 'symmetrical', 'centered', 'leading lines'.",
+     )
+     color_scheme: str = Field(
+         ...,
+         description="E.g., 'monochromatic blue', 'warm complementary colors', 'high contrast'.",
+     )
+     mood_atmosphere: str = Field(
+         ..., description="E.g., 'serene', 'energetic', 'mysterious', 'joyful'."
+     )
+
+
+ class PhotographicCharacteristicsDetails(BaseModel):
+     depth_of_field: str = Field(
+         ..., description="E.g., 'shallow', 'deep', 'bokeh background'."
+     )
+     focus: str = Field(
+         ..., description="E.g., 'sharp focus on subject', 'soft focus', 'motion blur'."
+     )
+     camera_angle: str = Field(
+         ..., description="E.g., 'eye-level', 'low angle', 'high angle', 'dutch angle'."
+     )
+     lens_focal_length: str = Field(
+         ..., description="E.g., 'wide-angle', 'telephoto', 'macro', 'fisheye'."
+     )
+
+
+ class TextRender(BaseModel):
+     text: str = Field(..., description="The text content.")
+     location: str = Field(
+         ..., description="E.g., 'center', 'top-left', 'bottom-right foreground'."
+     )
+     size: str = Field(..., description="E.g., 'small', 'medium', 'large within frame'.")
+     color: str = Field(..., description="E.g., 'red', 'blue', 'green'.")
+     font: str = Field(..., description="E.g., 'realistic', 'cartoonish', 'minimalist'.")
+     appearance_details: Optional[str] = Field(
+         None, description="Any other notable visual details."
+     )
+
+
+ class ImageAnalysis(BaseModel):
+     short_description: str = Field(
+         ..., description="A concise summary of the image content, 200 words maximum."
+     )
+     objects: List[ObjectDescription] = Field(
+         ..., description="List of prominent foreground/midground objects."
+     )
+     background_setting: str = Field(
+         ...,
+         description="Describe the overall environment, setting, or background, including any notable background elements.",
+     )
+     lighting: LightingDetails = Field(..., description="Details about the lighting.")
+     aesthetics: AestheticsDetails = Field(
+         ..., description="Details about the image aesthetics."
+     )
+     photographic_characteristics: Optional[PhotographicCharacteristicsDetails] = Field(
+         None, description="Details about photographic characteristics."
+     )
+     style_medium: Optional[str] = Field(
+         None, description="Identify the artistic style or medium."
+     )
+     text_render: Optional[List[TextRender]] = Field(
+         None, description="List of text renders in the image."
+     )
+     context: str = Field(
+         ...,
+         description="Provide any additional context that helps understand the image better.",
+     )
+     artistic_style: Optional[str] = Field(
+         None, description="describe specific artistic characteristics, 3 words maximum."
+     )
+
+
+ json_schema_full = """1. `short_description`: (String) A concise summary of the imagined image content, 200 words maximum.
+ 2. `objects`: (Array of Objects) List a maximum of 5 prominent objects. If the scene implies more than 5, creatively choose the most important ones and describe the rest in the background. For each object, include:
+     * `description`: (String) A detailed description of the imagined object, 100 words maximum.
+     * `location`: (String) E.g., "center", "top-left", "bottom-right foreground".
+     * `relative_size`: (String) E.g., "small", "medium", "large within frame". (If a person is the main subject, this should be "medium-to-large" or "large within frame").
+     * `shape_and_color`: (String) Describe the basic shape and dominant color.
+     * `texture`: (String) E.g., "smooth", "rough", "metallic", "furry".
+     * `appearance_details`: (String) Any other notable visual details.
+     * `relationship`: (String) Describe the relationship between the object and the other objects in the image.
+     * `orientation`: (String) Describe the orientation or positioning of the object, e.g., "upright", "tilted 45 degrees", "horizontal", "vertical", "facing left", "facing right", "upside down", "lying on its side".
+     * If the object is a human or a human-like object, include the following:
+         * `pose`: (String) Describe the body position.
+         * `expression`: (String) Describe facial expression and emotion. E.g., "winking", "joyful", "serious", "surprised", "calm".
+         * `clothing`: (String) Describe attire.
+         * `action`: (String) Describe the action of the human.
+         * `gender`: (String) Describe the gender of the human.
+         * `skin_tone_and_texture`: (String) Describe the skin tone and texture.
+     * If the object is a cluster of objects, include the following:
+         * `number_of_objects`: (Integer) The number of objects in the cluster.
+ 3. `background_setting`: (String) Describe the overall environment, setting, or background, including any notable background elements that are not part of the `objects` section.
+ 4. `lighting`: (Object)
+     * `conditions`: (String) E.g., "bright daylight", "dim indoor", "studio lighting", "golden hour".
+     * `direction`: (String) E.g., "front-lit", "backlit", "side-lit from left".
+     * `shadows`: (String) Describe the presence and quality of shadows, e.g., "long, soft shadows", "sharp, defined shadows", "minimal shadows".
+ 5. `aesthetics`: (Object)
+     * `composition`: (String) E.g., "rule of thirds", "symmetrical", "centered", "leading lines". If people are the main subject, specify the shot type, e.g., "medium shot", "close-up", "portrait composition".
+     * `color_scheme`: (String) E.g., "monochromatic blue", "warm complementary colors", "high contrast".
+     * `mood_atmosphere`: (String) E.g., "serene", "energetic", "mysterious", "joyful".
+ 6. `photographic_characteristics`: (Object)
+     * `depth_of_field`: (String) E.g., "shallow", "deep", "bokeh background".
+     * `focus`: (String) E.g., "sharp focus on subject", "soft focus", "motion blur".
+     * `camera_angle`: (String) E.g., "eye-level", "low angle", "high angle", "dutch angle".
+     * `lens_focal_length`: (String) E.g., "wide-angle", "telephoto", "macro", "fisheye". (If the main subject is a person, prefer "standard lens (e.g., 35mm-50mm)" or "portrait lens (e.g., 50mm-85mm)" to ensure they are framed more closely. Avoid "wide-angle" for people unless specified).
+ 7. `style_medium`: (String) Identify the artistic style or medium based on the user's prompt or creative interpretation (e.g., "photograph", "oil painting", "watercolor", "3D render", "digital illustration", "pencil sketch").
+ 8. `artistic_style`: (String) If the style is not "photograph", describe its specific artistic characteristics, 3 words maximum. (e.g., "impressionistic, vibrant, textured" for an oil painting).
+ 9. `context`: (String) Provide a general description of the type of image this would be. For example: "This is a concept for a high-fashion editorial photograph intended for a magazine spread," or "This describes a piece of concept art for a fantasy video game."
+ 10. `text_render`: (Array of Objects) By default, this array should be empty (`[]`). Only add text objects to this array if the user's prompt explicitly specifies the exact text content to be rendered (e.g., user asks for "a poster with the title 'Cosmic Dream'"). Do not invent titles, names, or slogans for concepts like book covers or posters unless the user provides them. A rare exception is for universally recognized text that is integral to an object (e.g., the word 'STOP' on a 'stop sign'). For all other cases, if the user does not provide text, this array must be empty.
+     * `text`: (String) The exact text content provided by the user. NEVER use generic placeholders.
+     * `location`: (String) E.g., "center", "top-left", "bottom-right foreground".
+     * `size`: (String) E.g., "medium", "large", "large within frame".
+     * `color`: (String) E.g., "red", "blue", "green".
+     * `font`: (String) E.g., "realistic", "cartoonish", "minimalist", "serif typeface".
+     * `appearance_details`: (String) Any other notable visual details."""
+
+
+ def get_system_prompt() -> str:
+     return f"""You are a Meticulous Visual Editor and Senior Art Director at a leading Generative AI company.
+ Your expertise is in combining multiple image descriptions into a single image description.
+ Your primary task is to receive multiple image descriptions and generate a single image description that incorporates all of them.
+ Adhere strictly to the following structure and guidelines:
+ 1. **Input:** You will receive three image descriptions: the first describes the main subject, the second the scene, and the third the style.
+ 2. **Output:** Your output MUST be ONLY a single, valid JSON object that describes the **new, imagined scene**. Do not describe the original reference images.
+ 3. **Modification Logic:**
+     * Carefully parse the three image descriptions to understand the desired changes.
+     * Combine them into a single image description that incorporates all three.
+     * The combined description must be a single, valid image description in the same format as the descriptions provided.
+ 4. **Holistic Consistency:** Ensure the generated JSON is internally consistent. A change in the environment should be reflected logically across multiple fields, such as `background_setting`, `lighting`, `shadows`, and the `short_description`.
+ 5. **Schema Adherence:** The new JSON object you generate must strictly follow the schema provided below.
+ The JSON object must contain the following keys precisely:
+ {json_schema_full}"""
+
+
+ def claude_structured_output(prompt: str) -> str:
+     """
+     Call Claude API with a prompt and return structured output using a Pydantic schema.
+
+     Args:
+         prompt: The user prompt/query to send to Claude
+
+     Returns:
+         A JSON string with the structured response
+     """
+     system_prompt = get_system_prompt()
+     agent = Agent(
+         "anthropic:claude-sonnet-4-5",
+         output_type=ImageAnalysis,
+         system_prompt=system_prompt,
+     )
+
+     # run_sync returns a result object whose .output is a validated ImageAnalysis
+     result = agent.run_sync(prompt)
+     return result.output.model_dump_json()
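Because claude_structured_output returns the ImageAnalysis payload as a JSON string, callers can round-trip it through the same Pydantic model when they need typed access rather than raw JSON; a small sketch, assuming Anthropic credentials are configured for pydantic-ai:

    from schema import ImageAnalysis, claude_structured_output

    prompt_json = claude_structured_output(
        "A red bicycle leaning against a brick wall at golden hour."
    )
    analysis = ImageAnalysis.model_validate_json(prompt_json)  # re-validate the JSON
    print(analysis.short_description)
    print([obj.location for obj in analysis.objects])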