from typing import Union

from fastapi.encoders import jsonable_encoder

from inference.core.devices.utils import GLOBAL_INFERENCE_SERVER_ID
from inference.core.entities.requests.inference import InferenceRequest
from inference.core.entities.responses.inference import InferenceResponse
from inference.core.env import TINY_CACHE
from inference.core.logger import logger
from inference.core.version import __version__


def to_cachable_inference_item(
    infer_request: InferenceRequest,
    infer_response: Union[InferenceResponse, list[InferenceResponse]],
) -> dict:
    """Convert an inference request/response pair into a dict suitable for caching.

    When TINY_CACHE is enabled, only a small subset of request fields and a
    condensed form of the response are stored; otherwise the full request and
    response are serialized.
    """
    if not TINY_CACHE:
        return {
            "inference_id": infer_request.id,
            "inference_server_version": __version__,
            "inference_server_id": GLOBAL_INFERENCE_SERVER_ID,
            "request": jsonable_encoder(infer_request),
            "response": jsonable_encoder(infer_response),
        }
    included_request_fields = {
        "api_key",
        "confidence",
        "model_id",
        "model_type",
        "source",
        "source_info",
    }
    request = infer_request.dict(include=included_request_fields)
    response = build_condensed_response(infer_response)
    return {
        "inference_id": infer_request.id,
        "inference_server_version": __version__,
        "inference_server_id": GLOBAL_INFERENCE_SERVER_ID,
        "request": jsonable_encoder(request),
        "response": jsonable_encoder(response),
    }


def build_condensed_response(responses):
    """Reduce one or more inference responses to per-prediction confidence,
    class name, and inference time. Responses without predictions are ignored,
    and responses that fail to format are skipped with a warning."""
    if not isinstance(responses, list):
        responses = [responses]

    formatted_responses = []
    for response in responses:
        if not getattr(response, "predictions", None):
            continue
        try:
            predictions = [
                {"confidence": pred.confidence, "class": pred.class_name}
                for pred in response.predictions
            ]
            formatted_responses.append(
                {
                    "predictions": predictions,
                    "time": response.time,
                }
            )
        except Exception as e:
            logger.warning(f"Error formatting response, skipping caching: {e}")

    return formatted_responses
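

if __name__ == "__main__":
    # Illustrative sketch only, not part of the library API: demonstrates the
    # condensed shape produced by build_condensed_response. SimpleNamespace
    # objects stand in for real InferenceResponse / prediction instances, which
    # are assumed here to expose `predictions`, `confidence`, `class_name`,
    # and `time` attributes as used above.
    from types import SimpleNamespace

    fake_response = SimpleNamespace(
        predictions=[
            SimpleNamespace(confidence=0.92, class_name="car"),
            SimpleNamespace(confidence=0.71, class_name="truck"),
        ],
        time=0.034,
    )
    print(build_condensed_response(fake_response))
    # Roughly expected output:
    # [{'predictions': [{'confidence': 0.92, 'class': 'car'},
    #                   {'confidence': 0.71, 'class': 'truck'}], 'time': 0.034}]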