taesiri committed on
Commit f163829 • 1 Parent(s): f97cf44
Files changed (3):
  1. README.md +2 -2
  2. app.py +47 -38
  3. requirements.txt +2 -0
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
-title: Detic+ChatGPT
-emoji: 👀
+title: Detic+LangChain
+emoji: 🦜️🔗
 colorFrom: blue
 colorTo: red
 sdk: gradio
app.py CHANGED
@@ -1,5 +1,7 @@
 import os
 from pyChatGPT import ChatGPT
+from langchain.llms import OpenAI
+
 
 os.system("pip install -U gradio")
 
@@ -61,8 +63,6 @@ cfg.MODEL.ROI_HEADS.ONE_CLASS_PER_PROPOSAL = (
 )
 predictor = DefaultPredictor(cfg)
 
-# Setup the model's vocabulary using build-in datasets
-
 BUILDIN_CLASSIFIER = {
     "lvis": "datasets/metadata/lvis_v1_clip_a+cname.npy",
     "objects365": "datasets/metadata/o365_clip_a+cnamefix.npy",
@@ -80,19 +80,22 @@ BUILDIN_METADATA_PATH = {
 session_token = os.environ.get("SessionToken")
 
 
-def get_response_from_chatbot(text):
+def generate_caption(object_list_str, api_key, temperature):
+    query = f"You are an intelligent image captioner. I will hand you the objects and their position, and you should give me a detailed description for the photo. In this photo we have the following objects\n{object_list_str}"
+    llm = OpenAI(
+        model_name="text-davinci-003", openai_api_key=api_key, temperature=temperature
+    )
+
     try:
-        api = ChatGPT(session_token)
-        resp = api.send_message(text)
-        api.refresh_auth()
-        api.reset_conversation()
-        response = resp["message"]
+        caption = llm(query)
+        caption = caption.strip()
     except:
-        response = "Sorry, I'm busy. Try again later."
-    return response
+        caption = "Sorry, something went wrong!"
 
+    return caption
 
-def inference(img, vocabulary):
+
+def inference(img, vocabulary, api_key, temperature):
     metadata = MetadataCatalog.get(BUILDIN_METADATA_PATH[vocabulary])
     classifier = BUILDIN_CLASSIFIER[vocabulary]
     num_classes = len(metadata.thing_classes)
@@ -128,45 +131,51 @@ def inference(img, vocabulary):
         f"{predicted_label} - X:({int(x0)} Y: {int(y0)} Width {int(width)} Height: {int(height)})"
     )
 
-    chat_gpt_response = get_response_from_chatbot(
-        f"You are an intelligent image captioner. I will hand you the objects and their position, and you should give me a detailed description for the photo. In this photo we have the following objects\n{object_list_str}"
-    )
+    if api_key is not None:
+        gpt_response = generate_caption(object_list_str, api_key, temperature)
+    else:
+        gpt_response = "Please paste your OpenAI key to use"
 
     return (
        Image.fromarray(np.uint8(out.get_image())).convert("RGB"),
-        chat_gpt_response,
+        gpt_response,
    )
 
 
-# create a gradio block for image classification
 with gr.Blocks() as demo:
-    gr.Markdown("# Detic+ChatGPT")
-    gr.Markdown(
-        "Use Detic to detect objects in an image and then use ChatGPT to describe the image."
-    )
-
-    gr.HTML(
-        "<p>You can duplicating this space and use your own session token: <a style='display:inline-block' href='https://huggingface.co/spaces/yizhangliu/chatGPT?duplicate=true'><img src='https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14' alt='Duplicate Space'></a></p>"
-    )
-    gr.HTML(
-        "<p> Instruction on how to get session token can be seen in video <a style='display:inline-block' href='https://www.youtube.com/watch?v=TdNSj_qgdFk'><font style='color:blue;weight:bold;'>here</font></a>. Add your session token by going to settings and add under secrets. </p>"
-    )
+    with gr.Column():
+        gr.Markdown("# Image Captioning using LangChain (GPT3.5) 🦜️🔗")
+        gr.Markdown(
+            "Use Detic to detect objects in an image and then use GPT to describe the image."
+        )
 
     with gr.Column():
         with gr.Row():
             inp = gr.Image(label="Input Image", type="filepath")
-            vocab = gr.Dropdown(
-                ["lvis", "objects365", "openimages", "coco"],
-                label="Vocabulary",
-                value="lvis",
-            )
-
-        btn_detic = gr.Button("Run Detic+ChatGPT")
+            with gr.Column():
+                openai_api_key_textbox = gr.Textbox(
+                    placeholder="Paste your OpenAI API key (sk-...)",
+                    show_label=False,
+                    lines=1,
+                    type="password",
+                )
+                temperature = gr.Slider(0, 1, 0.1, label="Temperature")
+            vocab = gr.Dropdown(
+                ["lvis", "objects365", "openimages", "coco"],
+                label="Detic Vocabulary",
+                value="lvis",
+            )
+
+        btn_detic = gr.Button("Run Detic+GPT3.5")
     with gr.Row():
         outviz = gr.Image(label="Visualization", type="pil")
-        output_desc = gr.Textbox(label="ChatGPT Description", lines=5)
-    # outputjson = gr.JSON(label="Detected Objects")
+        output_desc = gr.Textbox(label="Description Description", lines=5)
+
+    btn_detic.click(
+        fn=inference,
+        inputs=[inp, vocab, openai_api_key_textbox, temperature],
+        outputs=[outviz, output_desc],
+    )
 
-    btn_detic.click(fn=inference, inputs=[inp, vocab], outputs=[outviz, output_desc])
 
-demo.launch()
+demo.launch(debug=False)
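
The new `generate_caption` helper replaces the pyChatGPT session-token flow with LangChain's `OpenAI` completion wrapper. Below is a minimal standalone sketch of that call path; the `langchain.llms.OpenAI` import, the `model_name="text-davinci-003"` setting, and the prompt text come straight from the diff, while the sample object list and the environment-variable key lookup are illustrative assumptions, not part of the commit.

```python
# Minimal sketch of the captioning path introduced in this commit.
# Assumes the legacy `langchain.llms.OpenAI` wrapper used in app.py; the
# object list below is a made-up example of Detic's "label - X/Y/Width/Height" strings.
import os
from langchain.llms import OpenAI

object_list_str = "\n".join([
    "dog - X:(12 Y: 40 Width 200 Height: 180)",
    "frisbee - X:(220 Y: 35 Width 60 Height: 58)",
])

query = (
    "You are an intelligent image captioner. I will hand you the objects and their "
    "position, and you should give me a detailed description for the photo. "
    f"In this photo we have the following objects\n{object_list_str}"
)

# API key is read from the environment here for illustration; the Space takes it
# from the Gradio textbox instead.
llm = OpenAI(
    model_name="text-davinci-003",
    openai_api_key=os.environ.get("OPENAI_API_KEY"),
    temperature=0.1,
)
caption = llm(query).strip()
print(caption)
```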
requirements.txt CHANGED
@@ -36,3 +36,5 @@ nltk
 pyChatGPT
 
 git+https://github.com/openai/CLIP.git
+
+langchain
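
Adding `langchain` (unpinned, as committed) is what makes the new import at the top of app.py resolve. A quick sanity check, written in the same install-at-runtime style app.py already uses for gradio; this snippet is illustrative and not part of the commit.

```python
# Verify the new dependency resolves the import added in app.py.
# The version is unpinned here, mirroring the unpinned entry in requirements.txt.
import os

os.system("pip install langchain")   # same os.system install pattern app.py uses for gradio
from langchain.llms import OpenAI    # should import cleanly once langchain is installed

print("langchain import OK:", OpenAI is not None)
```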