artificialguybr committed on
Commit
d69ccc5
1 Parent(s): df6a72f

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -0
app.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ import os
4
+ import base64
5
+ from PIL import Image
6
+ import io
7
+
8
# Value of the API_KEY environment variable (None when it is unset),
# captured once when this module is loaded.
api_key = os.environ.get('API_KEY')
9
+
10
+
11
def resize_image(image_path, max_size=(800, 800), quality=85):
    """Downscale an image file and return it re-encoded as JPEG bytes.

    The image is shrunk with ``Image.thumbnail`` so that it fits inside
    ``max_size``; the result is then written to an in-memory buffer as JPEG.

    Args:
        image_path: Location of the image, passed straight to ``Image.open``.
        max_size: ``(width, height)`` bounding box handed to ``thumbnail``.
        quality: JPEG quality setting handed to ``Image.save``.

    Returns:
        The JPEG-encoded image as ``bytes``.
    """
    with Image.open(image_path) as source:
        # JPEG has no alpha channel or palette, and Pillow raises OSError when
        # asked to save e.g. an RGBA or P image (common for PNG uploads) as
        # JPEG. Convert such images to RGB first; the alpha channel is dropped.
        if source.mode in ("RGB", "L", "CMYK"):
            img = source
        else:
            img = source.convert("RGB")
        img.thumbnail(max_size, Image.Resampling.LANCZOS)
        buffer = io.BytesIO()
        img.save(buffer, format="JPEG", quality=quality)
    return buffer.getvalue()
17
+
18
def filepath_to_base64(image_path):
    """Return the resized JPEG of ``image_path`` as base64 text.

    The file is first run through ``resize_image``; the resulting bytes are
    base64-encoded and decoded to a ``str`` so they can be embedded in text.
    """
    jpeg_bytes = resize_image(image_path)
    return base64.b64encode(jpeg_bytes).decode('utf-8')
22
+
23
def format_response(response_body):
    """Pull the reply text out of a parsed API response.

    Reads ``response_body['choices'][0]['message']['content']`` and swaps
    every literal ``<0x0A>`` token in it for a real newline character.

    Args:
        response_body: Parsed JSON response (nested dicts/lists).

    Returns:
        The first choice's message content with ``<0x0A>`` replaced by ``\\n``.
    """
    first_choice = response_body['choices'][0]
    text = first_choice['message']['content']
    # Splitting on the token and re-joining with "\n" replaces every occurrence.
    return "\n".join(text.split("<0x0A>"))
27
+
28
def call_deplot_api(image_path, content, temperature=0.2, top_p=0.7, max_tokens=1024,
                    *, request_timeout=60, max_wait=300):
    """Send an image plus a text prompt to the NVCF endpoint and return the reply.

    The image is resized and base64-encoded via ``filepath_to_base64`` and
    embedded in the user message as an ``<img>`` data URI. While the service
    answers with HTTP 202, the status URL for the returned ``NVCF-REQID`` is
    polled until a different status arrives.

    NOTE(review): the function name says "deplot" while the UI in this file
    presents the endpoint as Kosmos-2 - confirm which model the function id
    in ``invoke_url`` targets.

    Args:
        image_path: Path of the image file to send.
        content: Prompt text placed before the image in the user message.
        temperature: Forwarded as ``temperature`` in the request payload.
        top_p: Forwarded as ``top_p`` in the request payload.
        max_tokens: Forwarded as ``max_tokens`` in the request payload.
        request_timeout: Seconds allowed for each individual HTTP request.
        max_wait: Overall seconds allowed for polling while the service
            keeps answering 202.

    Returns:
        The reply text produced by ``format_response``.

    Raises:
        gr.Error: If no image was supplied.
        requests.exceptions.RequestException: On HTTP errors or request timeouts.
        RuntimeError: If a 202 response carries no ``NVCF-REQID`` header.
        TimeoutError: If the result is still pending after ``max_wait`` seconds.
    """
    import time  # local import: only the polling deadline needs it

    if not image_path:
        # Without this guard a missing upload fails deep inside Pillow with
        # an error that tells the user nothing.
        raise gr.Error("Please upload an image before submitting.")

    image_base64 = filepath_to_base64(image_path)
    invoke_url = "https://api.nvcf.nvidia.com/v2/nvcf/pexec/functions/0bcd1a8c-451f-4b12-b7f0-64b4781190d1"
    api_key = os.getenv('API_KEY')
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Accept": "application/json",
    }
    payload = {
        "messages": [
            {
                "content": f"{content} <img src=\"data:image/jpeg;base64,{image_base64}\" />",
                "role": "user"
            }
        ],
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": max_tokens,
        "stream": False
    }

    deadline = time.monotonic() + max_wait
    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as session:
        response = session.post(invoke_url, headers=headers, json=payload,
                                timeout=request_timeout)
        while response.status_code == 202:
            request_id = response.headers.get("NVCF-REQID")
            if not request_id:
                # Polling ".../status/None" could never succeed.
                raise RuntimeError("Received HTTP 202 without an NVCF-REQID header.")
            if time.monotonic() >= deadline:
                raise TimeoutError(
                    f"Request {request_id} still pending after {max_wait} seconds."
                )
            fetch_url = f"https://api.nvcf.nvidia.com/v2/nvcf/pexec/status/{request_id}"
            response = session.get(fetch_url, headers=headers, timeout=request_timeout)
        response.raise_for_status()
        response_body = response.json()
    return format_response(response_body)
57
+
58
# Input widgets. They are handed to gr.Interface below in the positional
# order of call_deplot_api's parameters (image first, then the prompt text).
content_input = gr.Textbox(
    label="Content",
    placeholder="Enter your content here...",
    lines=2,
)
image_input = gr.Image(label="Upload Image", type="filepath")
temperature_input = gr.Slider(
    label="Temperature",
    minimum=0,
    maximum=1,
    step=0.01,
    value=0.2,
)
top_p_input = gr.Slider(
    label="Top P",
    minimum=0,
    maximum=1,
    step=0.01,
    value=0.7,
)
max_tokens_input = gr.Slider(
    label="Max Tokens",
    minimum=1,
    maximum=1024,
    step=1,
    value=1024,
)

# HTML shown under the title of the interface.
description_html = """
<div style="text-align: center; font-size: 1.5em; margin-bottom: 20px;">
<strong>Explore Visual Language Understanding with Kosmos-2</strong>
</div>
<p>
Kosmos-2 model is a groundbreaking multimodal large language model (MLLM). Kosmos-2 is designed to ground text to the visual world, enabling it to understand and reason about visual elements in images.
</p>
"""

iface = gr.Interface(
    fn=call_deplot_api,
    inputs=[
        image_input,
        content_input,
        temperature_input,
        top_p_input,
        max_tokens_input,
    ],
    outputs="text",
    title="Kosmos-2 API Explorer",
    description=description_html,
)

# Start the Gradio app as soon as the module is executed.
iface.launch()