wglu2024 committed on
Commit
10fe343
1 Parent(s): 42f0ceb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1 -296
app.py CHANGED
@@ -1,296 +1 @@
1
"""### Step 1 - Export the NVIDIA_API_KEY

You can supply the NVIDIA_API_KEY directly in this notebook when you run the cell below
"""

import getpass
import os

## API Key can be found by going to NVIDIA NGC -> AI Foundation Models -> (some model) -> Get API Code or similar.
## 10K free queries to any endpoint (which is a lot actually).

# del os.environ['NVIDIA_API_KEY'] ## delete key and reset
if os.environ.get("NVIDIA_API_KEY", "").startswith("nvapi-"):
    print("Valid NVIDIA_API_KEY already in environment. Delete to reset")
    # Bug fix: the rest of the file reads the module-level `nvapi_key`
    # (nv_api_response, both tools). The original never bound it on this
    # branch, so a pre-set environment key caused a NameError later.
    nvapi_key = os.environ["NVIDIA_API_KEY"]
else:
    nvapi_key = getpass.getpass("NVAPI Key (starts with nvapi-): ")
    assert nvapi_key.startswith("nvapi-"), f"{nvapi_key[:5]}... is not a valid key"
    os.environ["NVIDIA_API_KEY"] = nvapi_key
    # (Removed the original `global nvapi_key` — `global` is a no-op at
    # module scope.)
-
20
- """### Step 2 - wrap the NeVa API call into a function and verify by supplying an image to get a respond"""
21
-
22
- import openai, httpx, sys
23
-
24
- import base64, io
25
- from PIL import Image
26
-
27
-
28
def img2base64_string(img_path):
    """Load the image at *img_path* and return it as a base64-encoded JPEG string.

    Images larger than 800x800 are downscaled in place (preserving aspect
    ratio) to keep the request payload small.
    """
    # Context manager closes the underlying file handle — the original
    # called Image.open() and never closed it (resource leak).
    with Image.open(img_path) as image:
        if image.width > 800 or image.height > 800:
            image.thumbnail((800, 800))
        buffered = io.BytesIO()
        image.convert("RGB").save(buffered, format="JPEG", quality=85)
    return base64.b64encode(buffered.getvalue()).decode()
36
-
37
def nv_api_response(prompt, img_path):
    """Send *prompt* plus the image at *img_path* to the NVIDIA NeVa-22B
    NVCF endpoint and stream the generated text to stdout.

    Returns the (already consumed) openai streaming result object.
    Relies on the module-level ``nvapi_key`` set in Step 1.
    """
    base = "https://api.nvcf.nvidia.com"
    url = "/v2/nvcf/pexec/functions/8bf70738-59b9-4e5f-bc87-7ab4203be7a0"

    # Get your key at: https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-foundation/models/neva-22b/api
    # click on the "Generate Key" button

    def hook(request):
        # httpx event hook: rewrite every outgoing request so it hits the
        # NVCF function path above, and ask for a server-sent-event stream.
        request.url = httpx.URL(request.url, path=url)
        request.headers['Accept'] = 'text/event-stream'

    client = openai.OpenAI(
        base_url=base,
        api_key=nvapi_key,
        http_client=httpx.Client(event_hooks={'request': [hook]})
    )
    # NOTE(review): img2base64_string encodes JPEG, but the data URL below
    # says image/png — the endpoint appears tolerant; confirm.
    base64_str=img2base64_string(img_path)

    result = client.chat.completions.create(
        model="neva-22b",
        messages=[
            {"role": "user", "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": f"data:image/png;base64,{base64_str}"} # or image/jpeg
            ]
            },

            # {"role": "assistant", "labels": {'creativity': 0}} # Uncomment to get less verbose response
        ],
        max_tokens=512, # Minimum 32, maximum 512. This is a bug.
        temperature=0.2,
        top_p=0.7,
        stream=True # Use streaming mode for responses longer than 32 tokens.
    )

    # Drain the stream, echoing each delta as it arrives; flush so the
    # output appears incrementally in a notebook.
    for chunk in result:
        print(chunk.choices[0].delta.content, end="")
        sys.stdout.flush()
    return result
76
-
77
"""fetch a test image of a pair of white sneakers and verify the function works"""

# Bug fix: the original line was a bare `wget "…" -O ./jordan.png` — a
# notebook `!wget` with the bang stripped, which is a SyntaxError in plain
# Python. Download with the standard library instead.
import urllib.request

urllib.request.urlretrieve(
    "https://docs.google.com/uc?export=download&id=12ZpBBFkYu-jzz1iz356U5kMikn4uN9ww",
    "./jordan.png",
)

img_path = "./jordan.png"
prompt = "describe the image"
out = nv_api_response(prompt, img_path)
84
-
85
"""### Step 3 - we are gonna use mixtral_8x7b model as our main LLM"""

# Test run: verify that the Mixtral endpoint can generate a response with
# the key obtained in Step 1. This LLM drives the agent built in Step 5.
from langchain_nvidia_ai_endpoints import ChatNVIDIA
llm = ChatNVIDIA(model="mixtral_8x7b", nvidia_api_key=nvapi_key)
90
-
91
- #Set up Prerequisites for Image Captioning App User Interface
92
- import os
93
- import io
94
- import IPython.display
95
- from PIL import Image
96
- import base64
97
- import requests
98
- import gradio as gr
99
-
100
- """### Step 4- wrap Deplot and Neva as tools for later usage"""
101
-
102
- #Set up Prerequisites for Image Captioning App User Interface
103
- import os
104
- import io
105
- import IPython.display
106
- from PIL import Image
107
- import base64
108
- import requests
109
- import gradio as gr
110
-
111
- from langchain.tools import BaseTool
112
- from transformers import BlipProcessor, BlipForConditionalGeneration, DetrImageProcessor, DetrForObjectDetection
113
- from PIL import Image
114
- import torch
115
- #
116
- import os
117
- from tempfile import NamedTemporaryFile
118
- from langchain.agents import initialize_agent
119
- from langchain.chains.conversation.memory import ConversationBufferWindowMemory
120
-
121
class ImageCaptionTool(BaseTool):
    """Langchain tool that captions an image via the NVIDIA NeVa-22B NVCF endpoint.

    Relies on the module-level ``nvapi_key`` set in Step 1.
    """

    name = "Image captioner from NeVa"
    description = "Use this tool when given the path to an image that you would like to be described. " \
                  "It will return a simple caption describing the image."

    # generate api key via https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-foundation/models/neva-22b/api
    def img2base64_string(self, img_path):
        """Return the image at *img_path* as a base64-encoded JPEG string,
        downscaled to at most 800x800."""
        print(img_path)
        # Context manager closes the file handle — the original leaked it.
        with Image.open(img_path) as image:
            if image.width > 800 or image.height > 800:
                image.thumbnail((800, 800))
            buffered = io.BytesIO()
            image.convert("RGB").save(buffered, format="JPEG", quality=85)
        return base64.b64encode(buffered.getvalue()).decode()

    def _run(self, img_path):
        """Send the image to NeVa-22B and return the caption text.

        Polls the NVCF status endpoint while the request is pending (202).
        Raises requests.HTTPError on a failure status.
        """
        invoke_url = "https://api.nvcf.nvidia.com/v2/nvcf/pexec/functions/8bf70738-59b9-4e5f-bc87-7ab4203be7a0"
        fetch_url_format = "https://api.nvcf.nvidia.com/v2/nvcf/pexec/status/"

        headers = {
            "Authorization": f"Bearer {nvapi_key}",
            "Accept": "application/json",
        }
        base64_str = self.img2base64_string(img_path)
        prompt = """\
can you summarize what is in the image\
and return the answer \
"""
        payload = {
            "messages": [
                {"role": "user", "content": [
                    {"type": "text", "text": prompt},
                    # Bug fix: the payload is JPEG-encoded (see
                    # img2base64_string), so label it image/jpeg — the
                    # original said image/png.
                    {"type": "image_url", "image_url": f"data:image/jpeg;base64,{base64_str}"}
                ]
                },
                {
                    # Steer the assistant: factual and helpful, no humor.
                    "labels": {
                        "creativity": 6,
                        "helpfulness": 6,
                        "humor": 0,
                        "quality": 6
                    },
                    "role": "assistant"
                }],
            "temperature": 0.2,
            "top_p": 0.7,
            "max_tokens": 512,
            "stream": False
        }

        # re-use connections across the poll loop
        session = requests.Session()

        response = session.post(invoke_url, headers=headers, json=payload)
        print(response)
        # 202 == still processing: poll the status endpoint with the
        # request id until a terminal status arrives.
        while response.status_code == 202:
            request_id = response.headers.get("NVCF-REQID")
            fetch_url = fetch_url_format + request_id
            response = session.get(fetch_url, headers=headers)

        response.raise_for_status()
        response_body = response.json()
        print(response_body)
        return response_body['choices'][0]['message']['content']

    def _arun(self, query: str):
        raise NotImplementedError("This tool does not support async")
190
-
191
-
192
class TabularPlotTool(BaseTool):
    """Langchain tool that extracts tabular data from chart images (bar, pie)
    via the NVIDIA DePlot NVCF endpoint.

    Relies on the module-level ``nvapi_key`` set in Step 1.
    """

    name = "Tabular Plot reasoning tool"
    description = "Use this tool when given the path to an image that contain bar, pie chart objects. " \
                  "It will extract and return the tabular data "

    def img2base64_string(self, img_path):
        """Return the image at *img_path* as a base64-encoded JPEG string,
        downscaled to at most 800x800."""
        print(img_path)
        # Context manager closes the file handle — the original leaked it.
        with Image.open(img_path) as image:
            if image.width > 800 or image.height > 800:
                image.thumbnail((800, 800))
            buffered = io.BytesIO()
            image.convert("RGB").save(buffered, format="JPEG", quality=85)
        return base64.b64encode(buffered.getvalue()).decode()

    def _run(self, img_path):
        """Send the chart image to DePlot and return the extracted table text.

        Polls the NVCF status endpoint while the request is pending (202).
        Raises requests.HTTPError on a failure status.
        """
        # using DePlot from NVIDIA AI Endpoint playground, generate your key via: https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-foundation/models/deplot/api
        invoke_url = "https://api.nvcf.nvidia.com/v2/nvcf/pexec/functions/3bc390c7-eeec-40f7-a64d-0c6a719985f7"
        fetch_url_format = "https://api.nvcf.nvidia.com/v2/nvcf/pexec/status/"

        headers = {
            "Authorization": f"Bearer {nvapi_key}",
            "Accept": "application/json",
        }

        base64_str = self.img2base64_string(img_path)
        # Bug fix: the original reused the NeVa "summarize the image" prompt,
        # which contradicts this tool's stated purpose (extract tabular
        # data). Ask DePlot for the underlying data table instead.
        prompt = "Generate underlying data table of the figure below:"
        payload = {
            "messages": [
                {"role": "user", "content": [
                    {"type": "text", "text": prompt},
                    # Bug fix: payload is JPEG-encoded, so label it
                    # image/jpeg — the original said image/png.
                    {"type": "image_url", "image_url": f"data:image/jpeg;base64,{base64_str}"}
                ]
                },
            ],
            "temperature": 0.2,
            "top_p": 0.7,
            "max_tokens": 512,
            "stream": False
        }

        # re-use connections across the poll loop
        session = requests.Session()

        response = session.post(invoke_url, headers=headers, json=payload)

        # 202 == still processing: poll the status endpoint with the
        # request id until a terminal status arrives.
        while response.status_code == 202:
            request_id = response.headers.get("NVCF-REQID")
            fetch_url = fetch_url_format + request_id
            response = session.get(fetch_url, headers=headers)

        response.raise_for_status()
        response_body = response.json()
        print(response_body)
        return response_body['choices'][0]['message']['content']

    def _arun(self, query: str):
        raise NotImplementedError("This tool does not support async")
253
-
254
"""### Step 5 - initaite the agent with tools we previously defined"""

# Initialize the agent with the two image tools defined above.
tools = [ImageCaptionTool(),TabularPlotTool()]

# Keep only the last k=5 conversational turns in memory.
conversational_memory = ConversationBufferWindowMemory(
    memory_key='chat_history',
    k=5,
    return_messages=True
)

# Conversational ReAct-style agent that routes between the tools;
# handle_parsing_errors lets it recover from malformed LLM output,
# max_iterations bounds the tool-use loop.
agent = initialize_agent(
    agent="chat-conversational-react-description",
    tools=tools,
    llm=llm,
    max_iterations=5,
    verbose=True,
    memory=conversational_memory,
    handle_parsing_errors=True,
    early_stopping_method='generate'
)
276
-
277
"""### Step 6 - verify the agent can indeed use the tools with the supplied image and query"""

# Smoke test: ask about the sneaker image downloaded earlier; the image
# path is embedded in the prompt so the agent can pass it to a tool.
user_question = "What is in this image?"
img_path="./jordan.png"
response = agent.run(f'{user_question}, this is the image path: {img_path}')
print(response)
283
-
284
"""### Step 7 - wrap the agent into a simple gradio UI so we can interactively upload arbitrary image"""

import gradio as gr

# Gradio front end: the agent object itself is used as the callback and
# receives the uploaded file path as its input.
ImageCaptionApp = gr.Interface(fn=agent,
                    inputs=[gr.Image(label="Upload image", type="filepath")],
                    outputs=[gr.Textbox(label="Caption")],
                    title="Image Captioning with langchain agent",
                    description="combine langchain agent using tools for image reasoning",
                    allow_flagging="never")

ImageCaptionApp.launch(share=True)
# Bug fix: the original file ended with a stray "!" (leftover notebook
# shell-escape), which is a SyntaxError in plain Python — removed.
 
1
+ print("Valid NVIDIA_API_KEY already in environment. Delete to reset")