Spaces:
Runtime error
Runtime error
| from typing import List, Optional | |
| from pydantic import BaseModel, Field | |
| from pydantic_ai import Agent | |
class ObjectDescription(BaseModel):
    # Structured description of a single prominent object in the scene.
    #
    # NOTE: documented with `#` comments instead of a class docstring on
    # purpose. Pydantic publishes a model's docstring as the `description`
    # of its JSON schema, so adding one would alter the schema that is
    # handed to the LLM.

    # --- Fields that apply to every object ---
    description: str = Field(..., description="Short description of the object.")
    location: str = Field(
        ..., description="E.g., 'center', 'top-left', 'bottom-right foreground'."
    )
    relationship: str = Field(
        ...,
        description="Describe the relationship between the object and the other objects in the image.",
    )
    relative_size: Optional[str] = Field(
        None, description="E.g., 'small', 'medium', 'large within frame'."
    )
    shape_and_color: Optional[str] = Field(
        None, description="Describe the basic shape and dominant color."
    )
    texture: Optional[str] = Field(
        None, description="E.g., 'smooth', 'rough', 'metallic', 'furry'."
    )
    appearance_details: Optional[str] = Field(
        None, description="Any other notable visual details."
    )

    # --- Only when the object is a cluster of objects ---
    number_of_objects: Optional[int] = Field(
        None, description="The number of objects in the cluster."
    )

    # --- Only when the object is a human or human-like ---
    pose: Optional[str] = Field(None, description="Describe the body position.")
    expression: Optional[str] = Field(None, description="Describe facial expression.")
    clothing: Optional[str] = Field(None, description="Describe attire.")
    action: Optional[str] = Field(None, description="Describe the action of the human.")
    gender: Optional[str] = Field(None, description="Describe the gender of the human.")
    skin_tone_and_texture: Optional[str] = Field(
        None, description="Describe the skin tone and texture."
    )

    # --- Applies to every object ---
    # The system-prompt schema in this file defines `orientation` for any
    # object, not just humans, so the description is worded for objects in
    # general. The field stays last so the serialized key order is unchanged.
    orientation: Optional[str] = Field(
        None,
        description=(
            "Describe the orientation or positioning of the object, e.g., "
            "'upright', 'tilted 45 degrees', 'facing left', 'lying on its side'."
        ),
    )
class LightingDetails(BaseModel):
    # Lighting of the scene: overall conditions and direction are mandatory,
    # the shadow description is optional.
    # (Kept as comments, not a docstring, so the generated JSON schema is
    # not given an extra model-level description.)

    # Required.
    conditions: str = Field(
        description=(
            "E.g., 'bright daylight', 'dim indoor', "
            "'studio lighting', 'golden hour'."
        ),
    )
    direction: str = Field(
        description="E.g., 'front-lit', 'backlit', 'side-lit from left'.",
    )

    # Optional; defaults to None when omitted.
    shadows: Optional[str] = Field(
        default=None,
        description="Describe the presence of shadows.",
    )
class AestheticsDetails(BaseModel):
    # Aesthetic qualities of the image; all three fields are required.
    # (Kept as comments, not a docstring, so the generated JSON schema is
    # not given an extra model-level description.)

    composition: str = Field(
        description=(
            "E.g., 'rule of thirds', 'symmetrical', "
            "'centered', 'leading lines'."
        ),
    )
    color_scheme: str = Field(
        description=(
            "E.g., 'monochromatic blue', 'warm complementary colors', "
            "'high contrast'."
        ),
    )
    mood_atmosphere: str = Field(
        description="E.g., 'serene', 'energetic', 'mysterious', 'joyful'.",
    )
class PhotographicCharacteristicsDetails(BaseModel):
    # Camera-related properties of the image; every field is required once
    # this sub-object is present.
    # (Kept as comments, not a docstring, so the generated JSON schema is
    # not given an extra model-level description.)

    depth_of_field: str = Field(
        description="E.g., 'shallow', 'deep', 'bokeh background'.",
    )
    focus: str = Field(
        description=(
            "E.g., 'sharp focus on subject', 'soft focus', 'motion blur'."
        ),
    )
    camera_angle: str = Field(
        description=(
            "E.g., 'eye-level', 'low angle', 'high angle', 'dutch angle'."
        ),
    )
    lens_focal_length: str = Field(
        description="E.g., 'wide-angle', 'telephoto', 'macro', 'fisheye'.",
    )
class TextRender(BaseModel):
    # One piece of text rendered in the image, with its placement and look.
    # (Kept as comments, not a docstring, so the generated JSON schema is
    # not given an extra model-level description.)

    # Required.
    text: str = Field(description="The text content.")
    location: str = Field(
        description="E.g., 'center', 'top-left', 'bottom-right foreground'.",
    )
    size: str = Field(
        description="E.g., 'small', 'medium', 'large within frame'.",
    )
    color: str = Field(description="E.g., 'red', 'blue', 'green'.")
    font: str = Field(
        description="E.g., 'realistic', 'cartoonish', 'minimalist'.",
    )

    # Optional; defaults to None when omitted.
    appearance_details: Optional[str] = Field(
        default=None,
        description="Any other notable visual details.",
    )
class ImageAnalysis(BaseModel):
    # Top-level structured description of an image. It is the `output_type`
    # of the agent built in `claude_structured_output`.
    # Field order is preserved because it determines the key order of the
    # serialized JSON.
    # (Kept as comments, not a docstring, so the generated JSON schema is
    # not given an extra model-level description.)

    short_description: str = Field(
        description="A concise summary of the image content, 200 words maximum.",
    )
    objects: List[ObjectDescription] = Field(
        description="List of prominent foreground/midground objects.",
    )
    background_setting: str = Field(
        description=(
            "Describe the overall environment, setting, or background, "
            "including any notable background elements."
        ),
    )

    # Nested sub-models; lighting and aesthetics are mandatory.
    lighting: LightingDetails = Field(description="Details about the lighting.")
    aesthetics: AestheticsDetails = Field(
        description="Details about the image aesthetics.",
    )
    photographic_characteristics: Optional[PhotographicCharacteristicsDetails] = Field(
        default=None,
        description="Details about photographic characteristics.",
    )

    style_medium: Optional[str] = Field(
        default=None,
        description="Identify the artistic style or medium.",
    )
    text_render: Optional[List[TextRender]] = Field(
        default=None,
        description="List of text renders in the image.",
    )
    context: str = Field(
        description=(
            "Provide any additional context that helps understand "
            "the image better."
        ),
    )
    artistic_style: Optional[str] = Field(
        default=None,
        description="describe specific artistic characteristics, 3 words maximum.",
    )
# Plain-text description of the JSON keys the model is asked to produce.
# `get_system_prompt()` interpolates this string verbatim at the end of the
# system prompt, so every character here is part of the prompt sent to the
# model.
# NOTE(review): the key names listed here appear to mirror the fields of
# `ImageAnalysis` and its sub-models, but the two are maintained by hand --
# confirm they are kept in sync whenever either one changes.
json_schema_full = """1. `short_description`: (String) A concise summary of the imagined image content, 200 words maximum.
2. `objects`: (Array of Objects) List a maximum of 5 prominent objects. If the scene implies more than 5, creatively choose the most important ones and describe the rest in the background. For each object, include:
* `description`: (String) A detailed description of the imagined object, 100 words maximum.
* `location`: (String) E.g., "center", "top-left", "bottom-right foreground".
* `relative_size`: (String) E.g., "small", "medium", "large within frame". (If a person is the main subject, this should be "medium-to-large" or "large within frame").
* `shape_and_color`: (String) Describe the basic shape and dominant color.
* `texture`: (String) E.g., "smooth", "rough", "metallic", "furry".
* `appearance_details`: (String) Any other notable visual details.
* `relationship`: (String) Describe the relationship between the object and the other objects in the image.
* `orientation`: (String) Describe the orientation or positioning of the object, e.g., "upright", "tilted 45 degrees", "horizontal", "vertical", "facing left", "facing right", "upside down", "lying on its side".
* If the object is a human or a human-like object, include the following:
* `pose`: (String) Describe the body position.
* `expression`: (String) Describe facial expression and emotion. E.g., "winking", "joyful", "serious", "surprised", "calm".
* `clothing`: (String) Describe attire.
* `action`: (String) Describe the action of the human.
* `gender`: (String) Describe the gender of the human.
* `skin_tone_and_texture`: (String) Describe the skin tone and texture.
* If the object is a cluster of objects, include the following:
* `number_of_objects`: (Integer) The number of objects in the cluster.
3. `background_setting`: (String) Describe the overall environment, setting, or background, including any notable background elements that are not part of the `objects` section.
4. `lighting`: (Object)
* `conditions`: (String) E.g., "bright daylight", "dim indoor", "studio lighting", "golden hour".
* `direction`: (String) E.g., "front-lit", "backlit", "side-lit from left".
* `shadows`: (String) Describe the presence and quality of shadows, e.g., "long, soft shadows", "sharp, defined shadows", "minimal shadows".
5. `aesthetics`: (Object)
* `composition`: (String) E.g., "rule of thirds", "symmetrical", "centered", "leading lines". If people are the main subject, specify the shot type, e.g., "medium shot", "close-up", "portrait composition".
* `color_scheme`: (String) E.g., "monochromatic blue", "warm complementary colors", "high contrast".
* `mood_atmosphere`: (String) E.g., "serene", "energetic", "mysterious", "joyful".
6. `photographic_characteristics`: (Object)
* `depth_of_field`: (String) E.g., "shallow", "deep", "bokeh background".
* `focus`: (String) E.g., "sharp focus on subject", "soft focus", "motion blur".
* `camera_angle`: (String) E.g., "eye-level", "low angle", "high angle", "dutch angle".
* `lens_focal_length`: (String) E.g., "wide-angle", "telephoto", "macro", "fisheye". (If the main subject is a person, prefer "standard lens (e.g., 35mm-50mm)" or "portrait lens (e.g., 50mm-85mm)" to ensure they are framed more closely. Avoid "wide-angle" for people unless specified).
7. `style_medium`: (String) Identify the artistic style or medium based on the user's prompt or creative interpretation (e.g., "photograph", "oil painting", "watercolor", "3D render", "digital illustration", "pencil sketch").
8. `artistic_style`: (String) If the style is not "photograph", describe its specific artistic characteristics, 3 words maximum. (e.g., "impressionistic, vibrant, textured" for an oil painting).
9. `context`: (String) Provide a general description of the type of image this would be. For example: "This is a concept for a high-fashion editorial photograph intended for a magazine spread," or "This describes a piece of concept art for a fantasy video game."
10. `text_render`: (Array of Objects) By default, this array should be empty (`[]`). Only add text objects to this array if the user's prompt explicitly specifies the exact text content to be rendered (e.g., user asks for "a poster with the title 'Cosmic Dream'"). Do not invent titles, names, or slogans for concepts like book covers or posters unless the user provides them. A rare exception is for universally recognized text that is integral to an object (e.g., the word 'STOP' on a 'stop sign'). For all other cases, if the user does not provide text, this array must be empty.
* `text`: (String) The exact text content provided by the user. NEVER use generic placeholders.
* `location`: (String) E.g., "center", "top-left", "bottom-right foreground".
* `size`: (String) E.g., "medium", "large", "large within frame".
* `color`: (String) E.g., "red", "blue", "green".
* `font`: (String) E.g., "realistic", "cartoonish", "minimalist", "serif typeface".
* `appearance_details`: (String) Any other notable visual details."""
def get_system_prompt() -> str:
    """Build the system prompt for the description-combining agent.

    The prompt tells the model to merge three image descriptions (main
    subject, scene, style) into a single JSON description, and ends with the
    module-level ``json_schema_full`` text listing the keys the JSON must
    contain.

    Returns:
        The complete system prompt as one string.
    """
    # The guideline "The image description should be a single, valid image
    # description ..." used to appear twice in a row under "Modification
    # Logic"; it is stated once here.
    # `json_schema_full` is the only interpolated value in this f-string.
    return f"""You are a Meticulous Visual Editor and Senior Art Director at a leading Generative AI company.
Your expertise is in combining multiple image descriptions into a single image description.
Your primary task is to receive multiple image descriptions and generate a single image description that incorporates all of the descriptions.
Adhere strictly to the following structure and guidelines:
1. **Input:** You will receive three image descriptions. The first image description is the main subject, the second image description is the scene, and the third image description is the style.
2. **Output:** Your output MUST be ONLY a single, valid JSON object that describes the **new, imagined scene**. Do not describe the original reference image.
3. **Modification Logic:**
* Carefully parse the three image descriptions to understand the desired changes.
* Combine the three image descriptions into a single image description that incorporates all of the descriptions.
* The image description should be in the same format as the image descriptions provided.
* The image description should be a single, valid image description that incorporates all of the descriptions.
4. **Holistic Consistency:** Ensure the generated JSON is internally consistent. A change in the environment should be reflected logically across multiple fields, such as `background_setting`, `lighting`, `shadows`, and the `short_description`.
5. **Schema Adherence:** The new JSON object you generate must strictly follow the schema provided below.
The JSON object must contain the following keys precisely:
{json_schema_full}"""
def claude_structured_output(prompt: str) -> str:
    """Run the prompt through a Claude-backed agent and return JSON text.

    A new ``Agent`` is created on every call, configured with the system
    prompt from ``get_system_prompt()`` and with ``ImageAnalysis`` as its
    output type. The prompt is executed synchronously.

    Args:
        prompt: The user prompt/query passed to the agent.

    Returns:
        The agent's ``ImageAnalysis`` output serialized as a JSON string.
    """
    instructions = get_system_prompt()
    combiner = Agent(
        "anthropic:claude-sonnet-4-5",
        output_type=ImageAnalysis,
        system_prompt=instructions,
    )

    # `run_sync` returns a run-result object; the validated model instance
    # is read from its `.output` attribute.
    run_result = combiner.run_sync(prompt)
    analysis = run_result.output
    return analysis.model_dump_json()