"""Gradio front end that sends an image and a text prompt to a hosted NVCF function."""

import base64
import io
import os

import gradio as gr
import requests
from PIL import Image

api_key = os.getenv('API_KEY')

# (connect, read) timeout in seconds for every HTTP call. The read value is
# deliberately generous so slow responses are not cut short, while still
# preventing a request from hanging forever.
# NOTE(review): tune the read timeout to the service's long-poll duration.
REQUEST_TIMEOUT = (10, 300)


def resize_image(image_path, max_size=(800, 800), quality=85):
    """Return the image at *image_path* as JPEG bytes, shrunk to fit *max_size*.

    Args:
        image_path: Path of the image file to read.
        max_size: Maximum (width, height); the aspect ratio is preserved and
            images that already fit are not enlarged.
        quality: JPEG quality passed to the encoder.

    Returns:
        The encoded JPEG as ``bytes``.
    """
    with Image.open(image_path) as img:
        # JPEG cannot store alpha or palette data; saving an RGBA/P/LA image
        # raises OSError, so normalise to RGB first (transparency is dropped).
        if img.mode not in ("RGB", "L"):
            img = img.convert("RGB")
        img.thumbnail(max_size, Image.Resampling.LANCZOS)
        buffer = io.BytesIO()
        img.save(buffer, format="JPEG", quality=quality)
        return buffer.getvalue()


def filepath_to_base64(image_path):
    """Return the resized JPEG version of *image_path* as a base64 string."""
    img_bytes = resize_image(image_path)
    return base64.b64encode(img_bytes).decode('utf-8')


def format_response(response_body):
    """Extract the first choice's message text from *response_body*.

    Literal ``<0x0A>`` markers in the text are replaced with real newlines.
    """
    content = response_body['choices'][0]['message']['content']
    return content.replace("<0x0A>", "\n")


def call_deplot_api(image_path, content, temperature=0.2, top_p=0.7, max_tokens=1024):
    """Send *content* plus the image at *image_path* to the API and return the reply.

    Args:
        image_path: Path of the uploaded image file.
        content: The user's text prompt.
        temperature: Sampling temperature forwarded in the payload.
        top_p: Nucleus-sampling value forwarded in the payload.
        max_tokens: Maximum number of tokens forwarded in the payload.

    Returns:
        The formatted text of the first choice in the API response.

    Raises:
        gr.Error: If no image was supplied or ``API_KEY`` is not set.
        RuntimeError: If a 202 response carries no ``NVCF-REQID`` header.
        requests.HTTPError: If the final response has an error status.
    """
    if not image_path:
        raise gr.Error("Please upload an image.")

    api_key = os.getenv('API_KEY')
    if not api_key:
        raise gr.Error("The API_KEY environment variable is not set.")

    image_base64 = filepath_to_base64(image_path)
    invoke_url = "https://api.nvcf.nvidia.com/v2/nvcf/pexec/functions/0bcd1a8c-451f-4b12-b7f0-64b4781190d1"

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Accept": "application/json",
    }

    payload = {
        "messages": [
            {
                # The image travels inline as a data URI; without this tag the
                # encoded image is never sent. resize_image() always emits
                # JPEG, hence the MIME type.
                "content": f'{content} <img src="data:image/jpeg;base64,{image_base64}" />',
                "role": "user",
            }
        ],
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": max_tokens,
        "stream": False,
    }

    # The context manager closes the pooled connections when we are done.
    with requests.Session() as session:
        response = session.post(
            invoke_url, headers=headers, json=payload, timeout=REQUEST_TIMEOUT
        )

        # 202 means the result is not ready yet: keep polling the status URL.
        while response.status_code == 202:
            request_id = response.headers.get("NVCF-REQID")
            if not request_id:
                raise RuntimeError(
                    "API returned 202 without an NVCF-REQID header; cannot poll for the result."
                )
            fetch_url = f"https://api.nvcf.nvidia.com/v2/nvcf/pexec/status/{request_id}"
            response = session.get(fetch_url, headers=headers, timeout=REQUEST_TIMEOUT)

        response.raise_for_status()
        response_body = response.json()

    return format_response(response_body)


content_input = gr.Textbox(lines=2, placeholder="Enter your content here...", label="Content")
image_input = gr.Image(type="filepath", label="Upload Image")
temperature_input = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.2, label="Temperature")
top_p_input = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.7, label="Top P")
max_tokens_input = gr.Slider(minimum=1, maximum=1024, step=1, value=1024, label="Max Tokens")

iface = gr.Interface(
    fn=call_deplot_api,
    inputs=[image_input, content_input, temperature_input, top_p_input, max_tokens_input],
    outputs="text",
    title="Kosmos-2 API Explorer",
    description="""
Explore Visual Language Understanding with Kosmos-2

Kosmos-2 model is a groundbreaking multimodal large language model (MLLM). Kosmos-2 is designed to ground text to the visual world, enabling it to understand and reason about visual elements in images.

""",
)

iface.launch()