taesiri committed on
Commit f163829 • 1 Parent(s): f97cf44
Files changed (3):
  1. README.md +2 -2
  2. app.py +47 -38
  3. requirements.txt +2 -0
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
-title: Detic+ChatGPT
-emoji: 👀
+title: Detic+LangChain
+emoji: 🦜️🔗
 colorFrom: blue
 colorTo: red
 sdk: gradio
app.py CHANGED
@@ -1,5 +1,7 @@
 import os
 from pyChatGPT import ChatGPT
+from langchain.llms import OpenAI
+
 
 os.system("pip install -U gradio")
 
@@ -61,8 +63,6 @@ cfg.MODEL.ROI_HEADS.ONE_CLASS_PER_PROPOSAL = (
 )
 predictor = DefaultPredictor(cfg)
 
-# Setup the model's vocabulary using build-in datasets
-
 BUILDIN_CLASSIFIER = {
     "lvis": "datasets/metadata/lvis_v1_clip_a+cname.npy",
     "objects365": "datasets/metadata/o365_clip_a+cnamefix.npy",
@@ -80,19 +80,22 @@ BUILDIN_METADATA_PATH = {
 session_token = os.environ.get("SessionToken")
 
 
-def get_response_from_chatbot(text):
+def generate_caption(object_list_str, api_key, temperature):
+    query = f"You are an intelligent image captioner. I will hand you the objects and their position, and you should give me a detailed description for the photo. In this photo we have the following objects\n{object_list_str}"
+    llm = OpenAI(
+        model_name="text-davinci-003", openai_api_key=api_key, temperature=temperature
+    )
+
     try:
-        api = ChatGPT(session_token)
-        resp = api.send_message(text)
-        api.refresh_auth()
-        api.reset_conversation()
-        response = resp["message"]
+        caption = llm(query)
+        caption = caption.strip()
     except:
-        response = "Sorry, I'm busy. Try again later."
-    return response
+        caption = "Sorry, something went wrong!"
 
+    return caption
 
-def inference(img, vocabulary):
+
+def inference(img, vocabulary, api_key, temperature):
     metadata = MetadataCatalog.get(BUILDIN_METADATA_PATH[vocabulary])
     classifier = BUILDIN_CLASSIFIER[vocabulary]
     num_classes = len(metadata.thing_classes)
@@ -128,45 +131,51 @@ def inference(img, vocabulary):
         f"{predicted_label} - X:({int(x0)} Y: {int(y0)} Width {int(width)} Height: {int(height)})"
     )
 
-    chat_gpt_response = get_response_from_chatbot(
-        f"You are an intelligent image captioner. I will hand you the objects and their position, and you should give me a detailed description for the photo. In this photo we have the following objects\n{object_list_str}"
-    )
+    if api_key is not None:
+        gpt_response = generate_caption(object_list_str, api_key, temperature)
+    else:
+        gpt_response = "Please paste your OpenAI key to use"
 
     return (
        Image.fromarray(np.uint8(out.get_image())).convert("RGB"),
-        chat_gpt_response,
+        gpt_response,
    )
 
 
-# create a gradio block for image classification
 with gr.Blocks() as demo:
-    gr.Markdown("# Detic+ChatGPT")
-    gr.Markdown(
-        "Use Detic to detect objects in an image and then use ChatGPT to describe the image."
-    )
-
-    gr.HTML(
-        "<p>You can duplicating this space and use your own session token: <a style='display:inline-block' href='https://huggingface.co/spaces/yizhangliu/chatGPT?duplicate=true'><img src='https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14' alt='Duplicate Space'></a></p>"
-    )
-    gr.HTML(
-        "<p> Instruction on how to get session token can be seen in video <a style='display:inline-block' href='https://www.youtube.com/watch?v=TdNSj_qgdFk'><font style='color:blue;weight:bold;'>here</font></a>. Add your session token by going to settings and add under secrets. </p>"
-    )
+    with gr.Column():
+        gr.Markdown("# Image Captioning using LangChain (GPT3.5) 🦜️🔗")
+        gr.Markdown(
+            "Use Detic to detect objects in an image and then use GPT to describe the image."
+        )
 
     with gr.Column():
         with gr.Row():
             inp = gr.Image(label="Input Image", type="filepath")
-            vocab = gr.Dropdown(
-                ["lvis", "objects365", "openimages", "coco"],
-                label="Vocabulary",
-                value="lvis",
-            )
-
-        btn_detic = gr.Button("Run Detic+ChatGPT")
+            with gr.Column():
+                openai_api_key_textbox = gr.Textbox(
+                    placeholder="Paste your OpenAI API key (sk-...)",
+                    show_label=False,
+                    lines=1,
+                    type="password",
+                )
+                temperature = gr.Slider(0, 1, 0.1, label="Temperature")
+            vocab = gr.Dropdown(
+                ["lvis", "objects365", "openimages", "coco"],
+                label="Detic Vocabulary",
+                value="lvis",
+            )
+
+        btn_detic = gr.Button("Run Detic+GPT3.5")
     with gr.Row():
         outviz = gr.Image(label="Visualization", type="pil")
-        output_desc = gr.Textbox(label="ChatGPT Description", lines=5)
-    # outputjson = gr.JSON(label="Detected Objects")
+        output_desc = gr.Textbox(label="Description Description", lines=5)
+
+    btn_detic.click(
+        fn=inference,
+        inputs=[inp, vocab, openai_api_key_textbox, temperature],
+        outputs=[outviz, output_desc],
+    )
 
-    btn_detic.click(fn=inference, inputs=[inp, vocab], outputs=[outviz, output_desc])
 
-demo.launch()
+demo.launch(debug=False)
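
The new `generate_caption` helper replaces the pyChatGPT session-token flow with LangChain's `OpenAI` completion wrapper. Below is a minimal standalone sketch of that call path; the `langchain.llms.OpenAI` import, the `model_name="text-davinci-003"` setting, and the prompt text come straight from the diff, while the sample object list and the environment-variable key lookup are illustrative assumptions, not part of the commit.

```python
# Minimal sketch of the captioning path introduced in this commit.
# Assumes the legacy `langchain.llms.OpenAI` wrapper used in app.py; the
# object list below is a made-up example of Detic's "label - X/Y/Width/Height" strings.
import os
from langchain.llms import OpenAI

object_list_str = "\n".join([
    "dog - X:(12 Y: 40 Width 200 Height: 180)",
    "frisbee - X:(220 Y: 35 Width 60 Height: 58)",
])

query = (
    "You are an intelligent image captioner. I will hand you the objects and their "
    "position, and you should give me a detailed description for the photo. "
    f"In this photo we have the following objects\n{object_list_str}"
)

# API key is read from the environment here for illustration; the Space takes it
# from the Gradio textbox instead.
llm = OpenAI(
    model_name="text-davinci-003",
    openai_api_key=os.environ.get("OPENAI_API_KEY"),
    temperature=0.1,
)
caption = llm(query).strip()
print(caption)
```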
requirements.txt CHANGED
@@ -36,3 +36,5 @@ nltk
 pyChatGPT
 
 git+https://github.com/openai/CLIP.git
+
+langchain
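
Adding `langchain` (unpinned, as committed) is what makes the new import at the top of app.py resolve. A quick sanity check, written in the same install-at-runtime style app.py already uses for gradio; this snippet is illustrative and not part of the commit.

```python
# Verify the new dependency resolves the import added in app.py.
# The version is unpinned here, mirroring the unpinned entry in requirements.txt.
import os

os.system("pip install langchain")   # same os.system install pattern app.py uses for gradio
from langchain.llms import OpenAI    # should import cleanly once langchain is installed

print("langchain import OK:", OpenAI is not None)
```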