geonmo.gu committed on
Commit 4b145c8
• 1 Parent(s): fba8607

add description

Files changed (2)
  1. .gitignore +2 -0
  2. app.py +17 -5
.gitignore ADDED
@@ -0,0 +1,2 @@
+ *.swp
+ *.pt
app.py CHANGED
@@ -3,8 +3,6 @@ import torch
  import gradio as gr
  import time
  import clip
- #from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
- #from flores200_codes import flores_codes
  import requests
  import csv
  import json
@@ -22,7 +20,6 @@ os.environ['CUDA_VISIBLE_DEVICES'] = ''

  API_URL = "https://api-inference.huggingface.co/models/bigscience/bloom"
  HF_TOKEN = os.environ["HF_TOKEN"]
- headers = {"Authorization": f"Bearer {HF_TOKEN}"}

  def load_openimage_classnames(csv_path):
      csv_data = open(csv_path)
@@ -261,8 +258,23 @@ if __name__ == '__main__':

  title = "Socratic models for image captioning with BLOOM"

- demo_status = "Demo is running on CPU"
- description = f"Details: https://github.com/geonm/socratic-models-demo. {demo_status}"
+ description = """
262
+ ## Details
263
+ **Without any fine-tuning**, we can do image captioning using Visual-Language models (e.g., CLIP, SLIP, ...) and Large language models (e.g., GPT, BLOOM, ...).
264
+ In this demo, I choose BLOOM as the language model and CLIP ViT-L/14 as the visual-language model.
265
+ The order of generating image caption is as follow:
266
+ 1. Classify whether there are people, where the location is, and what objects are in the input image using the visual-language model.
267
+ 2. Then, build a prompt using classified results.
268
+ 3. Request BLOOM API with the prompt.
269
+
270
+ This demo is slightly different with the original method proposed in the socratie model paper.
271
+ I used not only tencent ml class names, but also OpenImage class names and I adopt BLOOM for the large language model
272
+
273
+ If you want the demo using GPT3 from OpenAI, check https://github.com/geonm/socratic-models-demo.
274
+
275
+ Demo is running on CPU.
276
+ """
277
+
  article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2204.00598'>Socratic Models: Composing Zero-Shot Multimodal Reasoning with Language</a></p>"
  examples = ['k21-1.jpg']
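For orientation, the three steps listed in the new `description` map onto code roughly as follows. This is a minimal sketch, assuming the openai/CLIP package and the public BLOOM text-generation Inference API; the function names (`zero_shot_classify`, `build_prompt`, `query_bloom`), the prompt wording, and the example class lists are illustrative and are not taken from app.py.

```python
import os
import requests
import torch
import clip
from PIL import Image

API_URL = "https://api-inference.huggingface.co/models/bigscience/bloom"
HF_TOKEN = os.environ["HF_TOKEN"]

device = "cpu"  # the demo runs on CPU
model, preprocess = clip.load("ViT-L/14", device=device)

def zero_shot_classify(image, prompts):
    """Step 1: rank candidate text prompts against the image with CLIP."""
    image_input = preprocess(image).unsqueeze(0).to(device)
    text_tokens = clip.tokenize(prompts).to(device)
    with torch.no_grad():
        image_features = model.encode_image(image_input)
        text_features = model.encode_text(text_tokens)
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    similarity = (image_features @ text_features.T).softmax(dim=-1)
    return prompts[similarity.argmax().item()]

def build_prompt(people, place, objects):
    """Step 2: turn the classification results into a text prompt (illustrative wording)."""
    return (f"I am an intelligent image captioning bot. "
            f"This image is a {place}. There {people}. "
            f"I see {', '.join(objects)}. "
            f"A creative short caption I can generate to describe this image is:")

def query_bloom(prompt):
    """Step 3: ask BLOOM to complete the caption via the HF Inference API."""
    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
    payload = {"inputs": prompt, "parameters": {"max_new_tokens": 32}}
    response = requests.post(API_URL, headers=headers, json=payload)
    # the API returns the prompt plus the completion; keep only the completion
    return response.json()[0]["generated_text"][len(prompt):]

image = Image.open("k21-1.jpg")
people = zero_shot_classify(image, ["are people", "are no people"])
place = zero_shot_classify(image, ["photo of a street", "photo of a beach", "photo of a kitchen"])
objects = ["car", "tree"]  # in the demo these come from the OpenImages / Tencent ML class lists
print(query_bloom(build_prompt(people, place, objects)))
```

The Authorization header here plays the role of the module-level `headers` dict that this commit removes from app.py; the sketch simply builds it where the request is made.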