thanhnt-cf commited on
Commit
8ba64a4
·
1 Parent(s): 0a2ea2e

initial commit

Browse files
.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ env
2
+ .env
3
+ app.log
4
+ gradio_temp/
5
+
6
+ __pycache__/
app.py ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ os.environ["HUGGINGFACE_DEMO"] = "1" # set before import from app
4
+
5
+ from dotenv import load_dotenv
6
+ load_dotenv()
7
+ ################################################################################################
8
+
9
+ import gradio as gr
10
+ import uuid
11
+ import shutil
12
+
13
+ from app.config import get_settings
14
+ from app.schemas.requests import Attribute
15
+ from app.request_handler import handle_extract
16
+ from app.services.factory import AIServiceFactory
17
+
18
+
19
+ settings = get_settings()
20
+ IMAGE_MAX_SIZE = 1536
21
+
22
+
23
async def forward_request(attributes, product_taxonomy, product_data, ai_model, pil_images):
    """Gradio callback: extract product attributes from uploaded images.

    Args mirror the UI inputs: `attributes` is the schema textbox content
    (dict body without surrounding braces), `product_taxonomy` a free-text
    taxonomy hint, `product_data` an optional dict literal of known
    attributes, `ai_model` the selected model name, and `pil_images` the
    gallery value (list of (PIL.Image, caption) tuples).

    Returns the extracted attributes as a JSON-serializable dict.
    """
    import ast  # local import: only this handler needs it

    # Per-request scratch directory for resized / re-encoded images.
    request_id = str(uuid.uuid4())
    request_temp_folder = os.path.join('gradio_temp', request_id)
    os.makedirs(request_temp_folder, exist_ok=True)

    try:
        # Parse the schema with ast.literal_eval instead of exec(): the
        # textbox is user-controlled, and exec() would let visitors run
        # arbitrary code on the server. literal_eval accepts only literals.
        # Using a local variable (instead of an exec-injected global) also
        # avoids cross-request races when Gradio handles concurrent users.
        try:
            attributes_object = ast.literal_eval("{" + attributes + "}")
        except (ValueError, SyntaxError, MemoryError, RecursionError):
            raise gr.Error("Invalid `Attribute Schema`. Please insert valid schema following the example.")
        attributes_object = {key: Attribute(**value) for key, value in attributes_object.items()}

        if product_data == "":
            product_data = "{}"
        try:
            product_data_object = ast.literal_eval(product_data)
        except (ValueError, SyntaxError, MemoryError, RecursionError):
            raise gr.Error('Invalid `Product Data`. Please insert valid dictionary or leave it empty.')

        if pil_images is None:
            raise gr.Error('Please upload image(s) of the product')
        pil_images = [pil_image[0] for pil_image in pil_images]

        img_paths = []
        for i, pil_image in enumerate(pil_images):
            # Downscale oversized images to keep request payloads small.
            if max(pil_image.size) > IMAGE_MAX_SIZE:
                ratio = IMAGE_MAX_SIZE / max(pil_image.size)
                pil_image = pil_image.resize((int(pil_image.width * ratio), int(pil_image.height * ratio)))
            img_path = os.path.join(request_temp_folder, f'{i}.jpg')
            # Keep PNG only when real transparency is present; JPEG otherwise.
            if pil_image.mode in ('RGBA', 'LA') or (pil_image.mode == 'P' and 'transparency' in pil_image.info):
                pil_image = pil_image.convert("RGBA")
                if pil_image.getchannel("A").getextrema() == (255, 255):  # fully opaque, save as JPEG
                    pil_image = pil_image.convert("RGB")
                    image_format = 'JPEG'
                else:
                    image_format = 'PNG'
            else:
                image_format = 'JPEG'
            pil_image.save(img_path, image_format, quality=100, subsampling=0)
            img_paths.append(img_path)

        # Map model name to vendor. Fail fast on unknown models instead of
        # leaving `ai_vendor` unbound (the original could raise NameError).
        if ai_model in settings.OPENAI_MODELS:
            ai_vendor = 'openai'
        elif ai_model in settings.ANTHROPIC_MODELS:
            ai_vendor = 'anthropic'
        else:
            raise gr.Error(f'Unsupported AI model: {ai_model}')
        service = AIServiceFactory.get_service(ai_vendor)

        try:
            json_attributes = await service.extract_attributes_with_validation(
                attributes_object,
                ai_model,
                None,
                product_taxonomy,
                product_data_object,
                img_paths=img_paths,
            )
        except Exception:
            raise gr.Error('Failed to extract attributes. Something went wrong.')
    finally:
        # Remove the per-request scratch directory no matter what happened.
        shutil.rmtree(request_temp_folder, ignore_errors=True)

    gr.Info('Process completed!')
    return json_attributes
93
+
94
+
95
def add_attribute_schema(attributes, attr_name, attr_desc, attr_type, allowed_values):
    """Append one attribute entry to the schema textbox content.

    Builds a JSON-ish fragment from (`attr_name`, `attr_desc`, `attr_type`,
    comma-separated `allowed_values`) and appends it to `attributes`.

    Returns the updated schema text plus four empty strings, which clear
    the corresponding input widgets in the UI.
    """
    # An empty "allowed values" field means "no restriction": emit an empty
    # list instead of the bogus single entry '""' the naive split produced.
    values = [v.strip() for v in allowed_values.split(',') if v.strip()]
    rendered_values = ', '.join(f'"{v}"' for v in values)
    schema = f"""
    "{attr_name}": {{
        "description": "{attr_desc}",
        "data_type": "{attr_type}",
        "allowed_values": [
            {rendered_values}
        ]
    }},
    """
    return attributes + schema, "", "", "", ""
106
+
107
+
108
# ---------------------------------------------------------------------------
# Static UI text
# ---------------------------------------------------------------------------

sample_schema = """"category": {
"description": "Category of the garment",
"data_type": "list[string]",
"allowed_values": [
"upper garment", "lower garment", "footwear", "accessory", "headwear", "dresses"
]
},

"color": {
"description": "Color of the garment",
"data_type": "list[string]",
"allowed_values": [
"black", "white", "red", "blue", "green", "yellow", "pink", "purple", "orange", "brown", "grey", "beige", "multi-color", "other"
]
},

"pattern": {
"description": "Pattern of the garment",
"data_type": "list[string]",
"allowed_values": [
"plain", "striped", "checkered", "floral", "polka dot", "camouflage", "animal print", "abstract", "other"
]
},

"material": {
"description": "Material of the garment",
"data_type": "string",
"allowed_values": []
}
"""

description = """
This is a simple demo for Attribution. Follow the steps below:

1. Upload image(s) of a product.
2. Enter the product taxonomy (e.g. 'upper garment', 'lower garment', 'bag'). If only one product is in the image, you can leave this field empty.
3. Select the AI model to use.
4. Enter known attributes (optional).
5. Enter the attribute schema or use the "Add Attributes" section to add attributes.
6. Click "Extract Attributes" to get the extracted attributes.
"""

product_data_placeholder = """Example:
{
"brand": "Leaf",
"size": "M",
"product_name": "Leaf T-shirt",
"color": "red"
}
"""

product_data_value = """
{
"data1": "",
"data2": ""
}
"""

# ---------------------------------------------------------------------------
# UI layout
# ---------------------------------------------------------------------------

with gr.Blocks(title="Internal Demo for Attribution") as demo:
    # Page header with title and usage instructions.
    with gr.Row():
        with gr.Column(scale=12):
            gr.Markdown(
                """<div style="text-align: center; font-size: 24px;"><strong>Internal Demo for Attribution</strong></div>"""
            )
            gr.Markdown(description)

    with gr.Row():
        with gr.Column(scale=12):
            with gr.Row():
                # Left column: product images and metadata inputs.
                with gr.Column():
                    gallery = gr.Gallery(
                        label="Upload images of your product here", type="pil"
                    )
                    product_taxnomy = gr.Textbox(
                        label="Product Taxonomy",
                        placeholder="Enter product taxonomy here (e.g. 'upper garment', 'lower garment', 'bag')",
                        lines=1,
                        max_lines=1,
                    )
                    ai_model = gr.Dropdown(
                        label="AI Model",
                        choices=settings.SUPPORTED_MODELS,
                        interactive=True,
                    )
                    product_data = gr.TextArea(
                        label="Product Data (Optional)",
                        placeholder=product_data_placeholder,
                        value=product_data_value.strip(),
                        interactive=True,
                        lines=10,
                        max_lines=10,
                    )

                # Right column: schema editor plus a small form to append
                # attribute entries without hand-writing JSON.
                with gr.Column():
                    attributes = gr.TextArea(
                        label="Attribute Schema",
                        value=sample_schema,
                        placeholder="Enter schema here or use Add Attributes below",
                        interactive=True,
                        lines=30,
                        max_lines=30,
                    )

                    with gr.Accordion("Add Attributes", open=False):
                        attr_name = gr.Textbox(
                            label="Attribute name", placeholder="Enter attribute name"
                        )
                        attr_desc = gr.Textbox(
                            label="Description", placeholder="Enter description"
                        )
                        attr_type = gr.Dropdown(
                            label="Type",
                            choices=[
                                "string",
                                "list[string]",
                                "int",
                                "list[int]",
                                "float",
                                "list[float]",
                                "bool",
                                "list[bool]",
                            ],
                            interactive=True,
                        )
                        allowed_values = gr.Textbox(
                            label="Allowed values (separated by comma)",
                            placeholder="yellow, red, blue",
                        )
                        add_btn = gr.Button("Add Attribute")

            with gr.Row():
                submit_btn = gr.Button("Extract Attributes")

        with gr.Column(scale=6):
            output_json = gr.Json(
                label="Extracted Attributes", value={}, show_indices=False
            )

    # Wire the buttons to their handlers.
    add_btn.click(
        add_attribute_schema,
        inputs=[attributes, attr_name, attr_desc, attr_type, allowed_values],
        outputs=[attributes, attr_name, attr_desc, attr_type, allowed_values],
    )

    submit_btn.click(
        forward_request,
        inputs=[attributes, product_taxnomy, product_data, ai_model, gallery],
        outputs=output_json,
    )

demo.launch()
app/__init__.py ADDED
File without changes
app/config.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from functools import lru_cache
3
+ from typing import Optional
4
+
5
+ from pydantic_settings import BaseSettings
6
+
7
+
8
# Resolve vendor credentials once at import time.
# - Hugging Face demo: keys come straight from the environment (.env).
# - Deployed service: keys come from AWS Secrets Manager, and Weights &
#   Biases is configured as a side effect.
if os.getenv("HUGGINGFACE_DEMO"):
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
    ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
else:
    # Imported lazily so the demo build does not need AWS dependencies.
    from app.aws.secrets import get_secret

    secrets = get_secret()
    OPENAI_API_KEY = secrets["OPENAI_API_KEY"]
    ANTHROPIC_API_KEY = secrets["ANTHROPIC_API_KEY"]
    os.environ["WANDB_API_KEY"] = secrets["WANDB_API_KEY"]
    os.environ["WANDB_BASE_URL"] = "https://api.wandb.ai"
19
+
20
+
21
class Settings(BaseSettings):
    """Application configuration, created once via get_settings()."""

    # Supported OpenAI models; the first entry is the vendor default.
    OPENAI_MODELS: list = [
        "gpt-4o",
        "gpt-4o-2024-11-20",
        "gpt-4o-mini",
    ]

    # Supported Anthropic models; the first entry is the vendor default.
    ANTHROPIC_MODELS: list = [
        "claude-3-5-sonnet-latest"
    ]

    # All models accepted by the API.
    SUPPORTED_MODELS: list = OPENAI_MODELS + ANTHROPIC_MODELS

    # API keys, resolved above from the environment or AWS secrets.
    # NOTE(review): the original declared these fields twice (once without
    # a default, once with); only the last declaration takes effect, so the
    # duplicates were removed.
    OPENAI_API_KEY: str = OPENAI_API_KEY
    ANTHROPIC_API_KEY: str = ANTHROPIC_API_KEY

    DEFAULT_MAX_ATTEMPTS: int = 1

    # AI service configuration
    DEFAULT_MODEL: str = OPENAI_MODELS[0]
    MAX_TOKENS: int = 2000
    TEMPERATURE: float = 0.0

    # CORS configuration
    CORS_ALLOW_ORIGINS: bool = True

    # API configuration
    API_V1_PREFIX: str = "/api/v1"
    PROJECT_NAME: str = "Dreem Attribution"
    DEBUG: bool = False

    # Rate limiting
    RATE_LIMIT_CALLS: int = 100
    RATE_LIMIT_PERIOD: int = 60

    # Cache configuration
    REDIS_URL: Optional[str] = None
    CACHE_TTL: int = 3600  # 1 hour

    # Logging
    LOG_LEVEL: str = "INFO"
    LOG_FORMAT: str = "json"

    # Per-vendor request timeouts (seconds)
    OPENAI_TIMEOUT: float = 30.0
    ANTHROPIC_TIMEOUT: float = 30.0

    def validate_api_keys(self):
        """Validate that required API keys are present."""
        if not self.OPENAI_API_KEY:
            raise ValueError("OPENAI_API_KEY is required")
        if not self.ANTHROPIC_API_KEY:
            raise ValueError("ANTHROPIC_API_KEY is required")


# Create a cached instance of settings
@lru_cache
def get_settings() -> Settings:
    """
    Create and cache a Settings instance.
    Returns the same instance for subsequent calls.
    """
    settings = Settings()
    settings.validate_api_keys()
    return settings
app/core/__init__.py ADDED
File without changes
app/core/errors.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Message templates shared by the error paths.
VENDOR_ERROR_INVALID_JSON = "Vendor Error: Invalid JSON data"
VENDOR_THROW_ERROR = "Vendor Error: {error_message}"


class VendorError(Exception):
    """Raised when an AI vendor call fails or returns unusable output."""

    def __init__(self, message: str):
        super().__init__(message)


class BadRequestError(Exception):
    """Raised when an incoming request is malformed or unsupported."""

    def __init__(self, message: str):
        super().__init__(message)
app/core/prompts.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import lru_cache
2
+ from typing import Optional
3
+
4
+ from pydantic_settings import BaseSettings
5
+
6
# Prompt templates used by the AI services. The *_HUMAN templates contain
# {placeholders} filled in at call time.

EXTRACT_INFO_SYSTEM = "You are an expert at structured data extraction. You will be given an image of a product and should output the its properties into the given structure."

EXTRACT_INFO_HUMAN = (
    """Output properties of the {product_taxonomy} product in the images. You should use the following attributes to help you if it exists:

{product_data}

If an attribute is both in the image and the attributes, use the one in the attribute."""
).replace(" ", "")

FOLLOW_SCHEMA_SYSTEM = "You are an expert at structured data extraction. You will be given an dictionary of attributes of a product and should output the its properties into the given structure."

FOLLOW_SCHEMA_HUMAN = """Convert following attributes to structured schema. Keep all the keys and number of values. Only replace the values themselves. :

{json_info}"""


class Prompts(BaseSettings):
    """Prompt templates exposed as settings-style fields."""

    EXTRACT_INFO_SYSTEM_MESSAGE: str = EXTRACT_INFO_SYSTEM
    EXTRACT_INFO_HUMAN_MESSAGE: str = EXTRACT_INFO_HUMAN
    FOLLOW_SCHEMA_SYSTEM_MESSAGE: str = FOLLOW_SCHEMA_SYSTEM
    FOLLOW_SCHEMA_HUMAN_MESSAGE: str = FOLLOW_SCHEMA_HUMAN


@lru_cache
def get_prompts() -> Prompts:
    """Create and cache the single Prompts instance."""
    return Prompts()
app/core/security.py ADDED
File without changes
app/request_handler/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from app.request_handler.extract_handler import handle_extract
2
+ from app.request_handler.follow_handler import handle_follow
app/request_handler/extract_handler.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from io import BytesIO
2
+
3
+ import requests
4
+ from fastapi import HTTPException
5
+ from PIL import Image
6
+
7
+ from app.config import get_settings
8
+ from app.core.errors import BadRequestError, VendorError
9
+ from app.schemas.requests import ExtractionRequest
10
+ from app.schemas.responses import APIResponse
11
+ from app.services.factory import AIServiceFactory
12
+ from app.utils.logger import setup_logger
13
+
14
+ logger = setup_logger(__name__)
15
+ settings = get_settings()
16
+
17
+
18
async def handle_extract(request: ExtractionRequest):
    """Handle an attribute-extraction request with retry and vendor failover.

    Tries up to request.max_attempts times (clamped to [1, 5]). On vendor
    errors or Anthropic overloads it switches to the other vendor's default
    model before retrying.

    Returns:
        (json_attributes, attempt) — the extracted data and the attempt
        number that succeeded.

    Raises:
        HTTPException: 400 for bad requests / invalid models / bad images,
            500 when all attempts fail. The "attempt" header records the
            attempt on which the failure occurred.
    """
    # Clamp the retry budget to a sane range.
    request.max_attempts = min(max(request.max_attempts, 1), 5)

    for attempt in range(1, request.max_attempts + 1):
        try:
            logger.info("Attempt: %s", attempt)
            if request.ai_model in settings.OPENAI_MODELS:
                ai_vendor = "openai"
            elif request.ai_model in settings.ANTHROPIC_MODELS:
                ai_vendor = "anthropic"
            else:
                raise ValueError(
                    f"Invalid AI model: {request.ai_model}, only support {settings.SUPPORTED_MODELS}"
                )
            service = AIServiceFactory.get_service(ai_vendor)

            pil_images = []
            for url in request.img_urls:
                try:
                    # Timeout so a dead image host cannot hang the request.
                    response = requests.get(url, timeout=30)
                    response.raise_for_status()
                    pil_images.append(Image.open(BytesIO(response.content)))
                except Exception as e:
                    # %-style lazy args: the original passed `e` with no
                    # placeholder, which breaks the log formatting.
                    logger.error("Failed to download or process image from %s: %s", url, e)
                    raise HTTPException(
                        status_code=400,
                        detail=f"Failed to process image from {url}",
                        # Header values must be strings for the response encoder.
                        headers={"attempt": str(attempt)},
                    )

            json_attributes = await service.extract_attributes_with_validation(
                request.attributes,
                request.ai_model,
                request.img_urls,
                request.product_taxonomy,
                request.product_data,
                pil_images=pil_images,
            )
            break
        except BadRequestError as e:
            logger.error("Bad request error: %s", e)
            raise HTTPException(
                status_code=400, detail=str(e), headers={"attempt": str(attempt)}
            )
        except ValueError as e:
            logger.error("Value error: %s", e)
            raise HTTPException(
                status_code=400, detail=str(e), headers={"attempt": str(attempt)}
            )
        except VendorError as e:
            logger.error("Vendor error: %s", e)
            if attempt == request.max_attempts:
                raise HTTPException(
                    status_code=500, detail=str(e), headers={"attempt": str(attempt)}
                )
            # Fail over to the other vendor's default model and retry.
            if request.ai_model in settings.ANTHROPIC_MODELS:
                request.ai_model = settings.OPENAI_MODELS[0]
                logger.info(
                    "Switching from anthropic to %s for attempt %s",
                    request.ai_model, attempt + 1,
                )
            elif request.ai_model in settings.OPENAI_MODELS:
                request.ai_model = settings.ANTHROPIC_MODELS[0]
                logger.info(
                    "Switching from OpenAI to %s for attempt %s",
                    request.ai_model, attempt + 1,
                )
        except HTTPException:
            # Already shaped for the client; propagate unchanged.
            raise
        except Exception as e:
            logger.exception("Unexpected error during extraction: %s", e)
            # Anthropic overload: retry on OpenAI's default model.
            if (
                "overload" in str(e).lower()
                and request.ai_model in settings.ANTHROPIC_MODELS
            ):
                request.ai_model = settings.OPENAI_MODELS[0]
            if attempt == request.max_attempts:
                raise HTTPException(
                    status_code=500,
                    detail="Internal server error",
                    headers={"attempt": str(attempt)},
                )

    return json_attributes, attempt
app/request_handler/follow_handler.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, HTTPException
2
+
3
+ from app.config import get_settings
4
+ from app.core.errors import VendorError
5
+ from app.schemas.requests import FollowSchemaRequest
6
+ from app.services.factory import AIServiceFactory
7
+ from app.utils.logger import setup_logger
8
+
9
+ logger = setup_logger(__name__)
10
+ settings = get_settings()
11
+
12
+
13
async def handle_follow(request: FollowSchemaRequest):
    """Handle a schema-following request with bounded retries.

    Tries up to request.max_attempts times (clamped to [1, 5]).

    Returns:
        (json_attributes, attempt) — the converted data and the attempt
        number that succeeded.

    Raises:
        HTTPException: 400 for validation errors, 500 for vendor or
            unexpected errors, once the final attempt has failed. The
            "attempt" header records the failing attempt.
    """
    request.max_attempts = min(max(request.max_attempts, 1), 5)

    for attempt in range(1, request.max_attempts + 1):
        try:
            logger.info("Attempt: %s", attempt)
            if request.ai_model in settings.OPENAI_MODELS:
                ai_vendor = "openai"
            elif request.ai_model in settings.ANTHROPIC_MODELS:
                ai_vendor = "anthropic"
            else:
                raise ValueError(
                    f"Invalid AI model: {request.ai_model}, only support {settings.SUPPORTED_MODELS}"
                )
            service = AIServiceFactory.get_service(ai_vendor)
            json_attributes = await service.follow_schema_with_validation(
                request.data_schema, request.data
            )
            break
        except ValueError as e:
            # Log every failed attempt (the original retried silently).
            logger.error("Value error on attempt %s: %s", attempt, e)
            if attempt == request.max_attempts:
                raise HTTPException(
                    # Header values must be strings for the response encoder.
                    status_code=400, detail=str(e), headers={"attempt": str(attempt)}
                )
        except VendorError as e:
            logger.error("Vendor error on attempt %s: %s", attempt, e)
            if attempt == request.max_attempts:
                raise HTTPException(
                    status_code=500, detail=str(e), headers={"attempt": str(attempt)}
                )
        except Exception as e:
            logger.exception("Unexpected error on attempt %s: %s", attempt, e)
            if attempt == request.max_attempts:
                raise HTTPException(
                    status_code=500,
                    detail="Internal server error",
                    headers={"attempt": str(attempt)},
                )

    return json_attributes, attempt
app/request_handler/validate.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.config import get_settings
2
+ from app.schemas.requests import ExtractionRequest, FollowSchemaRequest
3
+ from app.schemas.schema_tools import validate_json_schema
4
+ from app.utils.logger import setup_logger
5
+
6
+ logger = setup_logger(__name__)
7
+ settings = get_settings()
8
+
9
+
10
def validate_extract_request(request: ExtractionRequest):
    """Validate the request to extract attributes."""
    # Clamp the retry budget into [1, 5].
    request.max_attempts = min(5, max(1, request.max_attempts))

    # Cap the number of images at 10 rather than rejecting the request.
    if len(request.img_urls) > 10:
        logger.warning(
            f"Number of images exceeds 10: {len(request.img_urls)}. Limiting to 10."
        )
        request.img_urls = request.img_urls[:10]

    # Every image URL must be an http(s) URL.
    bad_urls = [url for url in request.img_urls if not url.startswith("http")]
    if bad_urls:
        raise ValueError(f"Invalid URL: {bad_urls[0]}")

    if request.ai_model.lower() not in settings.SUPPORTED_MODELS:
        raise ValueError(
            f"Invalid ai_model: {request.ai_model}, only support {settings.SUPPORTED_MODELS}"
        )
32
+
33
+
34
def validate_follow_request(request: FollowSchemaRequest):
    """Validate the request to follow a schema."""
    # Clamp the retry budget into [1, 5].
    request.max_attempts = min(5, max(1, request.max_attempts))

    # Reject schemas that JSF cannot generate data for.
    validate_json_schema(request.data_schema)

    if request.ai_model.lower() not in settings.SUPPORTED_MODELS:
        raise ValueError(
            f"Invalid ai_model: {request.ai_model}, only support {settings.SUPPORTED_MODELS}"
        )
app/schemas/__init__.py ADDED
File without changes
app/schemas/requests.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Dict, List, Optional
2
+
3
+ from pydantic import BaseModel
4
+
5
+ from app.config import get_settings
6
+
7
+ settings = get_settings()
8
+
9
+
10
class Attribute(BaseModel):
    """One attribute definition in an extraction schema: a description,
    a data type (see schema_tools.SUPPORTED_DATA_TYPE), and an optional
    whitelist of allowed values."""

    description: str
    data_type: str
    allowed_values: Optional[List[str]] = []


class ExtractionRequest(BaseModel):
    """Payload for the attribute-extraction endpoint."""

    attributes: Dict[str, Attribute]
    img_urls: Optional[List[str]] = None
    product_taxonomy: str
    request_meta: Optional[Dict[str, str]] = None
    product_data: Optional[Dict[str, str]] = None
    ai_model: str = settings.DEFAULT_MODEL  # type: ignore
    max_attempts: int = settings.DEFAULT_MAX_ATTEMPTS  # type: ignore


class FollowSchemaRequest(BaseModel):
    """Payload for the schema-following endpoint: free-form `data` to be
    reshaped into `data_schema`."""

    data_schema: Dict[str, Any]
    data: Dict[str, Any]
    request_meta: Optional[Dict[str, str]] = None
    ai_model: str = settings.DEFAULT_MODEL
    max_attempts: int = settings.DEFAULT_MAX_ATTEMPTS  # type: ignore


class ResultRequest(BaseModel):
    """Lookup payload for polling a task result by id."""

    task_id: str
app/schemas/responses.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Dict, Optional
2
+
3
+ from pydantic import BaseModel
4
+
5
+
6
class SubmitResponse(BaseModel):
    """Returned when a task is accepted; carries the task id."""

    task_id: str


class ResultResponse(BaseModel):
    """Final result of a task, including status and attempt count."""

    request_meta: Optional[Dict[str, str]] = None
    task_id: str
    result: dict
    status_code: int
    detail: str
    attempt: int


class HealthCheckResponse(BaseModel):
    """Simple liveness payload."""

    status: str


class APIResponse(BaseModel):
    """Successful API envelope."""

    detail: str
    data: Dict[str, Any]
    attempts: int


class APIErrorResponse(BaseModel):
    """Error envelope used in the OpenAPI response declarations below."""

    detail: str


# OpenAPI "responses" maps. The 400/500 pair is shared by the submit
# endpoints; result endpoints additionally document 404. Each public name
# gets its own dict copy so mutating one cannot affect another (the
# original repeated the same literals four times).
_ERROR_400_500 = {
    400: {"model": APIErrorResponse},
    500: {"model": APIErrorResponse},
}
_ERROR_400_404_500 = {
    400: {"model": APIErrorResponse},
    404: {"model": APIErrorResponse},
    500: {"model": APIErrorResponse},
}

HEALTH_CHECK_RESPONSES = {}

SUBMIT_EXTRACT_RESPONSES = dict(_ERROR_400_500)

SUBMIT_FOLLOW_RESPONSES = dict(_ERROR_400_500)

RESULT_RESPONSES = dict(_ERROR_400_404_500)

RESPONSES = dict(_ERROR_400_404_500)
app/schemas/schema_tools.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from enum import Enum # do not remove this import for exec
2
+ from typing import List # do not remove this import for exec
3
+ from typing import Any, Dict
4
+
5
+ import jsonschema
6
+ from jsf import JSF
7
+ from pydantic import BaseModel, Field # do not remove this import for exec
8
+
9
+ from app.core.errors import VendorError
10
+ from app.schemas.requests import Attribute
11
+
12
+
13
def validate_json_data(data: Dict[str, Any], schema: Dict[str, Any]):
    """
    Validate `data` against `schema`; wrap any violation in VendorError
    since the data is model-generated.
    """
    try:
        jsonschema.validate(instance=data, schema=schema)
    except jsonschema.ValidationError as e:
        raise VendorError(f"Vendor generated invalid data: {e}")
21
+
22
+
23
def validate_json_schema(schema: Dict[str, Any]):
    """
    Check that `schema` is a usable JSON schema.

    An empty schema is rejected outright; otherwise JSF must be able to
    generate a fake document from it. Any failure raises ValueError.
    """
    if schema == {}:
        raise ValueError("JSON Schema validation failed")

    try:
        faker = JSF(schema)
        _ = faker.generate()
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt / SystemExit
        # are no longer swallowed and re-labelled as schema errors.
        raise ValueError("JSON Schema validation failed")
35
+
36
+
37
# Data types accepted in Attribute.data_type.
SUPPORTED_DATA_TYPE = [
    "string",
    "int",
    "float",
    "bool",
    "list[string]",
    "list[int]",
    "list[float]",
    "list[bool]",
]


def convert_attribute_to_model(attributes: Dict[str, Attribute]) -> Dict[str, Any]:
    """Build a pydantic model class ("Product") from an attribute schema.

    Each attribute becomes a field; attributes with allowed_values get a
    generated str-Enum constraining the field. Returns the generated class.

    NOTE(review): the class source is assembled as text and exec()'d into
    module globals, mirroring the original design. Keys that share a
    capitalized form would overwrite each other's enums, and enum member
    names are not fully sanitized (values starting with a digit would
    produce invalid identifiers) — confirm inputs are trusted/controlled
    before exposing this to untrusted callers.
    """
    enum_code_list = []
    master_class_code = "class Product(BaseModel):\n"
    for key, value in attributes.items():
        description = value.description
        data_type = value.data_type
        # allowed_values is Optional on the schema; treat None the same as
        # "no restriction" instead of crashing on len(None).
        allowed_values = value.allowed_values or []

        if data_type not in SUPPORTED_DATA_TYPE:
            raise ValueError(f"Data type {data_type} is not supported")

        is_list = "list" in data_type

        # Map the schema type names onto Python type names.
        if "int" in data_type:
            data_type = "int"
        elif "float" in data_type:
            data_type = "float"
        elif "bool" in data_type:
            data_type = "bool"
        elif "string" in data_type:
            data_type = "str"

        if allowed_values:
            # Constrain the field via a generated str-Enum.
            enum_code = f"class {key.capitalize()}Enum(str, Enum):\n"
            for allowed_value in allowed_values:
                member = allowed_value.replace(' ', '_').replace('-', '_').upper()
                enum_code += f"    {member} = '{allowed_value}'\n"
            enum_code_list.append(enum_code)
            data_type = f"{key.capitalize()}Enum"

        if is_list:
            data_type = f"List[{data_type}]"

        master_class_code += (
            f"    {key}: {data_type} = Field(..., description='{description}')\n"
        )

    entire_code = "\n".join(enum_code_list) + "\n" + master_class_code
    exec(entire_code, globals())

    return Product  # type: ignore
app/services/__init__.py ADDED
File without changes
app/services/base.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ from typing import Any, Dict, List, Type
3
+
4
+ from pydantic import BaseModel
5
+
6
+ from app.schemas.schema_tools import (
7
+ convert_attribute_to_model,
8
+ validate_json_data,
9
+ validate_json_schema,
10
+ )
11
+
12
+
13
class BaseAttributionService(ABC):
    """Common interface and validation wrappers for vendor AI services."""

    @abstractmethod
    async def extract_attributes(
        self,
        attributes_model: Type[BaseModel],
        ai_model: str,
        img_urls: List[str],
        product_taxonomy: str,
        # product_data and img_paths added to the abstract signature: the
        # wrapper below (and the Anthropic implementation) already pass
        # them, so the original declaration did not match actual usage.
        product_data: Dict[str, str] = None,
        pil_images: List[Any] = None,
        img_paths: List[str] = None,
    ) -> Dict[str, Any]:
        """Vendor-specific extraction; returns data matching the model."""

    @abstractmethod
    async def follow_schema(
        self, schema: Dict[str, Any], data: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Vendor-specific conversion of `data` into `schema`'s shape."""

    async def extract_attributes_with_validation(
        self,
        attributes: Dict[str, Any],
        ai_model: str,
        img_urls: List[str],
        product_taxonomy: str,
        product_data: Dict[str, str],
        pil_images: List[Any] = None,
        img_paths: List[str] = None,
    ) -> Dict[str, Any]:
        """Build the pydantic model from `attributes`, run extraction, then
        validate the vendor output against the model's JSON schema."""
        attributes_model = convert_attribute_to_model(attributes)
        schema = attributes_model.model_json_schema()
        data = await self.extract_attributes(
            attributes_model,
            ai_model,
            img_urls,
            product_taxonomy,
            product_data,
            # pil_images=pil_images,  # temporarily removed for save cost
            img_paths=img_paths,
        )
        validate_json_data(data, schema)
        return data

    async def follow_schema_with_validation(
        self, schema: Dict[str, Any], data: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Validate the schema, convert the data, validate the output."""
        validate_json_schema(schema)
        data = await self.follow_schema(schema, data)
        validate_json_data(data, schema)
        return data
app/services/factory.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Type
2
+
3
+ from ..config import get_settings
4
+ from .base import BaseAttributionService
5
+ from .service_anthropic import AnthropicService
6
+ from .service_openai import OpenAIService
7
+
8
+ settings = get_settings()
9
+
10
+
11
class AIServiceFactory:
    """Maps vendor names to attribution-service implementations."""

    _services = {"openai": OpenAIService, "anthropic": AnthropicService}

    @classmethod
    def get_service(cls, ai_vendor: str = None) -> BaseAttributionService:
        """Return a new service instance for `ai_vendor`.

        Falls back to "openai" when no vendor is given: the original read
        settings.DEFAULT_VENDOR unconditionally, but Settings does not
        define that field, so a missing vendor raised AttributeError
        instead of selecting a default.
        """
        ai_vendor = ai_vendor or getattr(settings, "DEFAULT_VENDOR", "openai")
        service_class = cls._services.get(ai_vendor.lower())
        if not service_class:
            raise ValueError(f"Unsupported ai_vendor: {ai_vendor}")
        return service_class()
app/services/service_anthropic.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from typing import Any, Dict, List, Type
4
+
5
+ import anthropic
6
+ import weave
7
+ from anthropic import APIStatusError, AsyncAnthropic
8
+ from pydantic import BaseModel
9
+
10
+ from app.config import get_settings
11
+ from app.core import errors
12
+ from app.core.errors import BadRequestError, VendorError
13
+ from app.core.prompts import get_prompts
14
+ from app.services.base import BaseAttributionService
15
+ from app.utils.converter import product_data_to_str
16
+ from app.utils.image_processing import get_data_format, get_image_data
17
+ from app.utils.logger import setup_logger
18
+
19
deployment = os.getenv("DEPLOYMENT", "LOCAL")
# Map the deployment stage to its weave project.  Default to the experiment
# project so an unrecognized DEPLOYMENT value cannot leave weave_project_name
# undefined (the original if/elif chain raised NameError at weave.init then).
if deployment == "DEV":
    weave_project_name = "cfai/attribution-dev"
elif deployment == "PROD":
    weave_project_name = "cfai/attribution-prod"
else:  # "LOCAL" (local or demo) and any unexpected value
    weave_project_name = "cfai/attribution-exp"

weave.init(project_name=weave_project_name)
settings = get_settings()
prompts = get_prompts()
logger = setup_logger(__name__)
31
+
32
+
33
class AnthropicService(BaseAttributionService):
    """Attribute extraction backed by the Anthropic Messages API.

    Structured output is obtained via "tool use": the attribute schema is
    exposed as a tool's input schema (with prompt caching) and the model's
    tool-call arguments are returned as the extracted data.
    """

    def __init__(self):
        self.client = AsyncAnthropic(api_key=settings.ANTHROPIC_API_KEY)

    @weave.op
    async def extract_attributes(
        self,
        attributes_model: Type[BaseModel],
        ai_model: str,
        img_urls: List[str],
        product_taxonomy: str,
        product_data: Dict[str, str],
        pil_images: List[Any] = None,  # do not remove, this is for weave
        img_paths: List[str] = None,
    ) -> Dict[str, Any]:
        """Extract product attributes from images plus product metadata.

        Exactly one of *img_urls* (remote images, passed by URL) or
        *img_paths* (local files, inlined as base64) must be provided.

        Raises:
            BadRequestError: neither image source was given, or Anthropic
                rejected the request.
            VendorError: any other vendor failure, or a response that did
                not include a tool call.
        """
        logger.info("Extracting info via Anthropic...")
        tools = [
            {
                "name": "extract_garment_info",
                "description": "Extracts key information from the image.",
                "input_schema": attributes_model.model_json_schema(),
                # cache the (potentially large) schema across requests
                "cache_control": {"type": "ephemeral"},
            }
        ]

        if img_urls is not None:
            image_messages = [
                {
                    "type": "image",
                    "source": {"type": "url", "url": img_url},
                }
                for img_url in img_urls
            ]
        elif img_paths is not None:
            image_messages = [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": f"image/{get_data_format(img_path)}",
                        "data": get_image_data(img_path),
                    },
                }
                for img_path in img_paths
            ]
        else:
            # Previously this branch only contained `pass`, leaving
            # image_messages undefined and crashing below with NameError;
            # fail fast with a clear error instead.
            raise BadRequestError("Either img_urls or img_paths must be provided.")

        system_message = [{"type": "text", "text": prompts.EXTRACT_INFO_SYSTEM_MESSAGE}]

        text_messages = [
            {
                "type": "text",
                "text": prompts.EXTRACT_INFO_HUMAN_MESSAGE.format(
                    product_taxonomy=product_taxonomy,
                    product_data=product_data_to_str(product_data),
                ),
            }
        ]

        messages = [{"role": "user", "content": text_messages + image_messages}]

        try:
            response = await self.client.messages.create(
                model=ai_model,
                extra_headers={"anthropic-beta": "prompt-caching-2024-07-31"},
                max_tokens=2048,
                system=system_message,
                tools=tools,
                messages=messages,
            )
        except anthropic.BadRequestError as e:
            raise BadRequestError(e.message)
        except Exception as e:
            raise VendorError(errors.VENDOR_THROW_ERROR.format(error_message=str(e)))

        for content in response.content:
            if content.type == "tool_use":
                return content.input

        # The model answered without calling the tool.  The original fell off
        # the loop and returned None, which only failed later during schema
        # validation with a confusing message; surface the real cause here.
        raise VendorError(
            errors.VENDOR_THROW_ERROR.format(
                error_message="no tool_use block in Anthropic response"
            )
        )

    @weave.op
    async def follow_schema(self, schema, data):
        """Reshape *data* so it conforms to *schema*, again via tool use.

        Returns the reshaped payload, or ``{"status": "ERROR: ..."}`` when
        the model does not produce a tool call.

        Raises:
            VendorError: the Anthropic call itself failed.
        """
        logger.info("Following structure via Anthropic...")
        tools = [
            {
                "name": "extract_garment_info",
                "description": prompts.FOLLOW_SCHEMA_HUMAN_MESSAGE,
                "input_schema": schema,
                "cache_control": {"type": "ephemeral"},
            }
        ]

        text_messages = [
            {
                "type": "text",
                "text": prompts.FOLLOW_SCHEMA_HUMAN_MESSAGE.format(json_info=data),
            }
        ]

        system_message = [
            {"type": "text", "text": prompts.FOLLOW_SCHEMA_SYSTEM_MESSAGE}
        ]

        messages = [{"role": "user", "content": text_messages}]
        try:
            response = await self.client.messages.create(
                model=settings.ANTHROPIC_DEFAULT_MODEL,
                extra_headers={"anthropic-beta": "prompt-caching-2024-07-31"},
                max_tokens=2048,
                system=system_message,
                tools=tools,
                messages=messages,
            )
        except Exception as e:
            raise VendorError(errors.VENDOR_THROW_ERROR.format(error_message=str(e)))

        for content in response.content:
            if content.type == "tool_use":
                return content.input["json_info"]

        return {"status": "ERROR: no tool_use found"}
app/services/service_openai.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from typing import Any, Dict, List, Type
4
+
5
+ import openai
6
+ import weave
7
+ from openai import AsyncOpenAI
8
+ from pydantic import BaseModel
9
+
10
+ from app.utils.converter import product_data_to_str
11
+ from app.utils.image_processing import get_data_format, get_image_data
12
+ from app.utils.logger import setup_logger
13
+
14
+ from ..config import get_settings
15
+ from ..core import errors
16
+ from ..core.errors import BadRequestError, VendorError
17
+ from ..core.prompts import get_prompts
18
+ from .base import BaseAttributionService
19
+
20
deployment = os.getenv("DEPLOYMENT", "LOCAL")
# Map the deployment stage to its weave project.  Default to the experiment
# project so an unrecognized DEPLOYMENT value cannot leave weave_project_name
# undefined (the original if/elif chain raised NameError at weave.init then).
if deployment == "DEV":
    weave_project_name = "cfai/attribution-dev"
elif deployment == "PROD":
    weave_project_name = "cfai/attribution-prod"
else:  # "LOCAL" (local or demo) and any unexpected value
    weave_project_name = "cfai/attribution-exp"

weave.init(project_name=weave_project_name)
settings = get_settings()
prompts = get_prompts()
logger = setup_logger(__name__)
32
+
33
+
34
def get_response_format(json_schema: Dict[str, Any]) -> Dict[str, Any]:
    """Wrap a JSON schema into OpenAI's strict ``response_format`` payload.

    OpenAI strict mode requires ``additionalProperties: False`` on the root
    schema and on every definition under ``$defs``; those flags are set in
    place on *json_schema* (the mutated schema is embedded in the result).

    Args:
    - json_schema: a JSON schema dict (e.g. from ``model_json_schema()``)

    Returns:
    - a ``response_format`` dict for the chat completions API

    Fixed: annotations previously used the builtin ``any`` instead of
    ``typing.Any``.
    """
    json_schema["additionalProperties"] = False

    # $defs holds nested model definitions when the pydantic model contains
    # sub-models; each one needs the same strictness flag.
    for definition in json_schema.get("$defs", {}).values():
        definition["additionalProperties"] = False

    return {
        "type": "json_schema",
        "json_schema": {"strict": True, "name": "GarmentSchema", "schema": json_schema},
    }
48
+
49
+
50
class OpenAIService(BaseAttributionService):
    """Attribute extraction backed by OpenAI structured outputs
    (``beta.chat.completions.parse``)."""

    def __init__(self):
        self.client = AsyncOpenAI(api_key=settings.OPENAI_API_KEY)

    @weave.op
    async def extract_attributes(
        self,
        attributes_model: Type[BaseModel],
        ai_model: str,
        img_urls: List[str],
        product_taxonomy: str,
        product_data: Dict[str, str],
        pil_images: List[Any] = None,  # do not remove, this is for weave
        img_paths: List[str] = None,
    ) -> Dict[str, Any]:
        """Extract product attributes from images plus product metadata.

        Exactly one of *img_urls* (remote images) or *img_paths* (local
        files, inlined as base64 data URLs) must be provided.

        Raises:
            BadRequestError: neither image source was given, or OpenAI
                rejected the request.
            VendorError: any other vendor failure, or an unparseable
                response body.
        """
        logger.info("Extracting info via OpenAI...")
        text_content = [
            {
                "type": "text",
                "text": prompts.EXTRACT_INFO_HUMAN_MESSAGE.format(
                    product_taxonomy=product_taxonomy,
                    product_data=product_data_to_str(product_data),
                ),
            },
        ]
        if img_urls is not None:
            image_content = [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": img_url,
                    },
                }
                for img_url in img_urls
            ]
        elif img_paths is not None:
            image_content = [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/{get_data_format(img_path)};base64,{get_image_data(img_path)}",
                    },
                }
                for img_path in img_paths
            ]
        else:
            # Previously image_content was left undefined here, producing a
            # NameError below; fail fast with a clear error instead.
            raise BadRequestError("Either img_urls or img_paths must be provided.")

        try:
            response = await self.client.beta.chat.completions.parse(
                model=ai_model,
                messages=[
                    {
                        "role": "system",
                        "content": prompts.EXTRACT_INFO_SYSTEM_MESSAGE,
                    },
                    {
                        "role": "user",
                        "content": text_content + image_content,
                    },
                ],
                max_tokens=1000,
                response_format=attributes_model,
                logprobs=False,
                # top_logprobs=2,
                temperature=0.0,
            )
        except openai.BadRequestError as e:
            raise BadRequestError(str(e))
        except Exception as e:
            raise VendorError(errors.VENDOR_THROW_ERROR.format(error_message=str(e)))

        try:
            content = response.choices[0].message.content
            parsed_data = json.loads(content)
        except (TypeError, ValueError):
            # was a bare `except:`; content may be None (TypeError) or
            # invalid JSON (json.JSONDecodeError, a ValueError subclass)
            raise VendorError(errors.VENDOR_ERROR_INVALID_JSON)

        return parsed_data

    @weave.op
    async def follow_schema(
        self, schema: Dict[str, Any], data: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Reshape *data* so it conforms to *schema* using strict JSON mode.

        Returns the reshaped payload, or ``{"status": "refused"}`` when the
        model refuses the request.

        Raises:
            VendorError: the OpenAI call itself failed.
            ValueError: the response body was not valid JSON.
                NOTE(review): extract_attributes raises VendorError for the
                same condition; kept as ValueError so existing callers that
                catch it are unaffected.
        """
        logger.info("Following structure via OpenAI...")
        text_content = [
            {
                "type": "text",
                "text": prompts.FOLLOW_SCHEMA_HUMAN_MESSAGE.format(json_info=data),
            },
        ]

        try:
            response = await self.client.beta.chat.completions.parse(
                model="gpt-4o-2024-11-20",
                messages=[
                    {
                        "role": "system",
                        "content": prompts.FOLLOW_SCHEMA_SYSTEM_MESSAGE,
                    },
                    {
                        "role": "user",
                        "content": text_content,
                    },
                ],
                max_tokens=1000,
                response_format=get_response_format(schema),
                logprobs=False,
                # top_logprobs=2,
                temperature=0.0,
            )
        except Exception as e:
            raise VendorError(errors.VENDOR_THROW_ERROR.format(error_message=str(e)))

        if response.choices[0].message.refusal:
            logger.info("OpenAI refused to respond to the request")
            return {"status": "refused"}

        try:
            content = response.choices[0].message.content
            parsed_data = json.loads(content)
        except (TypeError, ValueError):
            # was a bare `except:` — see note in extract_attributes
            raise ValueError(errors.VENDOR_ERROR_INVALID_JSON)

        return parsed_data
app/utils/__init__.py ADDED
File without changes
app/utils/converter.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def product_data_to_str(product_data: dict[str, object]) -> str:
    """
    Convert product data to a newline-separated "key: value" string.

    Args:
    - product_data: a dictionary of product data, or None

    Returns:
    - a string representation of the product data ("" for None or empty)

    Fixed: the annotation previously used the builtin ``any`` (a function)
    as a type argument instead of a real type.
    """
    if not product_data:
        return ""

    # generator is enough for join; no need to materialize a list
    return "\n".join(f"{k}: {v}" for k, v in product_data.items())
app/utils/image_processing.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+
3
+
4
def get_image_data(image_path):
    """Read the file at *image_path* and return its contents base64-encoded
    as a UTF-8 string (suitable for inline image payloads)."""
    with open(image_path, "rb") as handle:
        raw = handle.read()
    return base64.b64encode(raw).decode("utf-8")
8
+
9
+
10
def get_data_format(image_path):
    """Return the media-type subtype for *image_path*'s file extension.

    The extension is lowercased (callers embed it in strings such as
    ``image/jpeg``, and ``image/JPG`` is not a valid media type) and "jpg"
    is normalized to "jpeg".
    """
    image_format = image_path.split(".")[-1].lower()
    if image_format == "jpg":
        image_format = "jpeg"
    return image_format
app/utils/logger.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ from logging.handlers import RotatingFileHandler
4
+
5
+
6
# Configure logger
def setup_logger(name: str) -> logging.Logger:
    """Create (or fetch) a logger with console and rotating-file handlers.

    Level, file path, rotation size and backup count are read from the
    LOG_LEVEL, LOG_FILE, LOG_MAX_BYTES and LOG_BACKUP_COUNT environment
    variables.

    Fixed: calling this more than once for the same *name* previously
    stacked duplicate handlers, so every record was emitted N times; the
    handler guard makes the call idempotent.
    """
    log_level = os.getenv("LOG_LEVEL", "INFO").upper()
    log_file = os.getenv("LOG_FILE", "app.log")
    max_bytes = int(os.getenv("LOG_MAX_BYTES", 10 * 1024 * 1024))  # 10 MB
    backup_count = int(os.getenv("LOG_BACKUP_COUNT", 5))

    logger = logging.getLogger(name)
    logger.setLevel(log_level)
    logger.propagate = False

    # Already configured (module imported twice / repeated setup calls):
    # return as-is instead of attaching duplicate handlers.
    if logger.handlers:
        return logger

    formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    )

    # Console handler
    console_handler = logging.StreamHandler()
    console_handler.setLevel(log_level)
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

    # Rotating file handler
    file_handler = RotatingFileHandler(
        log_file, maxBytes=max_bytes, backupCount=backup_count
    )
    file_handler.setLevel(log_level)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    return logger
app/utils/rate_limiter.py ADDED
File without changes
app/utils/token_counter.py ADDED
File without changes
clean_for_gradio.sh ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# Strip server-only modules so the repo can run as a Gradio Space.
# -f makes the script idempotent: plain `rm` errors (non-zero exit) when a
# file has already been removed on a previous run.
rm -rf app/api
rm -rf app/aws
rm -f app/main.py
rm -f worker.py
requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.115.6
2
+ fastapi-cli==0.0.7
3
+ pydantic==2.7.4
4
+ pydantic_settings==2.7.0
5
+ openai
6
+ anthropic==0.42.0
7
+ Pillow==11.0.0
8
+ requests==2.32.3
9
+ jsonschema==4.23.0
10
+ jsf==0.11.2
11
+ pytest==8.3.4
12
+ boto3==1.35.87
13
+ redis==5.2.1
14
+ weave==0.51.39
15
+ gradio==5.22.0