import os
import torch
import gradio as gr
import time
import clip
import requests
import csv
import json
import wget

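# Precomputed CLIP ViT-L/14 zero-shot classifier weights for the OpenImages,
# Places365, and Tencent ML-Images label sets.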
url_dict = {'clip_ViTL14_openimage_classifier_weights.pt': 'https://raw.githubusercontent.com/geonm/socratic-models-demo/master/prompts/clip_ViTL14_openimage_classifier_weights.pt',
            'clip_ViTL14_place365_classifier_weights.pt': 'https://raw.githubusercontent.com/geonm/socratic-models-demo/master/prompts/clip_ViTL14_place365_classifier_weights.pt',
            'clip_ViTL14_tencentml_classifier_weights.pt': 'https://raw.githubusercontent.com/geonm/socratic-models-demo/master/prompts/clip_ViTL14_tencentml_classifier_weights.pt'}

# Download the precomputed classifier weights if they are not already cached locally.
os.makedirs('./prompts', exist_ok=True)
for filename, url in url_dict.items():
    if not os.path.exists(os.path.join('./prompts', filename)):
        wget.download(url, out='./prompts')

# Hide any GPU so inference always runs on CPU (the hosted demo has no GPU).
os.environ['CUDA_VISIBLE_DEVICES'] = ''

# BLOOM is queried through the Hugging Face Inference API; an access token must be
# provided via the HF_TOKEN environment variable.
API_URL = "https://api-inference.huggingface.co/models/bigscience/bloom"
HF_TOKEN = os.environ["HF_TOKEN"]

def load_openimage_classnames(csv_path):
    # OpenImages class list: the human-readable name is the last column of each CSV row.
    with open(csv_path) as csv_data:
        csv_reader = csv.reader(csv_data)
        classnames = {idx: row[-1] for idx, row in enumerate(csv_reader)}
    return classnames


def load_tencentml_classnames(txt_path):
    # Plain-text class list, one class name per line (also reused for Places365).
    with open(txt_path) as txt_data:
        classnames = {idx: line.strip() for idx, line in enumerate(txt_data)}
    return classnames


def build_simple_classifier(clip_model, text_list, template, device):
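    # Encode the template-filled texts with CLIP and L2-normalize the embeddings,
    # so cosine similarity with image features reduces to a single matrix product.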
    with torch.no_grad():
        texts = [template(text) for text in text_list]
        text_inputs = clip.tokenize(texts).to(device)
        text_features = clip_model.encode_text(text_inputs)
        text_features /= text_features.norm(dim=-1, keepdim=True)

    return text_features, {idx: text for idx, text in enumerate(text_list)}


def load_models():
    # Load CLIP and all zero-shot classifier heads; everything is returned in one dict.
    model_dict = {}

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print('\tLoading CLIP ViT-L/14')
    clip_model, clip_preprocess = clip.load("ViT-L/14", device=device)
    print('\tLoading precomputed zeroshot classifier')
    openimage_classifier_weights = torch.load('./prompts/clip_ViTL14_openimage_classifier_weights.pt', map_location=device).type(torch.FloatTensor)
    openimage_classnames = load_openimage_classnames('./prompts/openimage-classnames.csv')
    tencentml_classifier_weights = torch.load('./prompts/clip_ViTL14_tencentml_classifier_weights.pt', map_location=device).type(torch.FloatTensor)
    tencentml_classnames = load_tencentml_classnames('./prompts/tencent-ml-classnames.txt')
    place365_classifier_weights = torch.load('./prompts/clip_ViTL14_place365_classifier_weights.pt', map_location=device).type(torch.FloatTensor)
    place365_classnames = load_tencentml_classnames('./prompts/place365-classnames.txt')

    print('\tBuilding simple zeroshot classifier')
    img_types = ['photo', 'cartoon', 'sketch', 'painting']
    ppl_texts = ['no people', 'people']
    ifppl_texts = ['is one person', 'are two people', 'are three people', 'are several people', 'are many people']
    imgtype_classifier_weights, imgtype_classnames = build_simple_classifier(clip_model, img_types, lambda c: f'This is a {c}.', device)
    ppl_classifier_weights, ppl_classnames = build_simple_classifier(clip_model, ppl_texts, lambda c: f'There are {c} in this photo.', device)
    ifppl_classifier_weights, ifppl_classnames = build_simple_classifier(clip_model, ifppl_texts, lambda c: f'There {c} in this photo.', device)

    model_dict['clip_model'] = clip_model
    model_dict['clip_preprocess'] = clip_preprocess
    model_dict['openimage_classifier_weights'] = openimage_classifier_weights
    model_dict['openimage_classnames'] = openimage_classnames
    model_dict['tencentml_classifier_weights'] = tencentml_classifier_weights
    model_dict['tencentml_classnames'] = tencentml_classnames
    model_dict['place365_classifier_weights'] = place365_classifier_weights
    model_dict['place365_classnames'] = place365_classnames
    model_dict['imgtype_classifier_weights'] = imgtype_classifier_weights
    model_dict['imgtype_classnames'] = imgtype_classnames
    model_dict['ppl_classifier_weights'] = ppl_classifier_weights
    model_dict['ppl_classnames'] = ppl_classnames
    model_dict['ifppl_classifier_weights'] = ifppl_classifier_weights
    model_dict['ifppl_classnames'] = ifppl_classnames
    model_dict['device'] = device

    return model_dict


def drop_gpu(tensor):
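    # Move the tensor to host memory if a GPU is in use and return it as a NumPy array.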
    if torch.cuda.is_available():
        return tensor.cpu().numpy()
    else:
        return tensor.numpy()


def zeroshot_classifier(image):
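    # Encode the image once, then score it against every zero-shot head
    # (OpenImages, Tencent ML-Images, Places365, image type, people, people count)
    # using CLIP-style scaled cosine similarity followed by a softmax.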
    image_input = model_dict['clip_preprocess'](image).unsqueeze(0).to(model_dict['device'])
    with torch.no_grad():
        image_features = model_dict['clip_model'].encode_image(image_input)
        image_features /= image_features.norm(dim=-1, keepdim=True)

        sim = (100.0 * image_features @ model_dict['openimage_classifier_weights'].T).softmax(dim=-1)
        openimage_scores, indices = [drop_gpu(tensor) for tensor in sim[0].topk(10)]
        openimage_classes = [model_dict['openimage_classnames'][idx] for idx in indices]

        sim = (100.0 * image_features @ model_dict['tencentml_classifier_weights'].T).softmax(dim=-1)
        tencentml_scores, indices = [drop_gpu(tensor) for tensor in sim[0].topk(10)]
        tencentml_classes = [model_dict['tencentml_classnames'][idx] for idx in indices]

        sim = (100.0 * image_features @ model_dict['place365_classifier_weights'].T).softmax(dim=-1)
        place365_scores, indices = [drop_gpu(tensor) for tensor in sim[0].topk(10)]
        place365_classes = [model_dict['place365_classnames'][idx] for idx in indices]

        sim = (100.0 * image_features @ model_dict['imgtype_classifier_weights'].T).softmax(dim=-1)
        imgtype_scores, indices = [drop_gpu(tensor) for tensor in sim[0].topk(len(model_dict['imgtype_classnames']))]
        imgtype_classes = [model_dict['imgtype_classnames'][idx] for idx in indices]

        sim = (100.0 * image_features @ model_dict['ppl_classifier_weights'].T).softmax(dim=-1)
        ppl_scores, indices = [drop_gpu(tensor) for tensor in sim[0].topk(len(model_dict['ppl_classnames']))]
        ppl_classes = [model_dict['ppl_classnames'][idx] for idx in indices]

        sim = (100.0 * image_features @ model_dict['ifppl_classifier_weights'].T).softmax(dim=-1)
        ifppl_scores, indices = [drop_gpu(tensor) for tensor in sim[0].topk(len(model_dict['ifppl_classnames']))]
        ifppl_classes = [model_dict['ifppl_classnames'][idx] for idx in indices]

    return image_features, openimage_scores, openimage_classes, tencentml_scores, tencentml_classes,\
           place365_scores, place365_classes, imgtype_scores, imgtype_classes,\
           ppl_scores, ppl_classes, ifppl_scores, ifppl_classes


def generate_prompt(openimage_classes, tencentml_classes, place365_classes, imgtype_classes, ppl_classes, ifppl_classes):
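    # Turn the zero-shot predictions into a Socratic-style prompt: image type,
    # number of people, top-3 candidate places, and a list of likely objects.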
    img_type = imgtype_classes[0]
    ppl_result = ppl_classes[0]
    if ppl_result == 'people':
        ppl_result = ifppl_classes[0]
    else:
        ppl_result = 'are %s' % ppl_result

    sorted_places = place365_classes

    object_list = ''
    for cls in tencentml_classes:
        object_list += f'{cls}, '
    for cls in openimage_classes[:2]:
        object_list += f'{cls}, '
    object_list = object_list[:-2]

    prompt_caption = f'''I am an intelligent image captioning bot.
    This image is a {img_type}. There {ppl_result}.
    I think this photo was taken at a {sorted_places[0]}, {sorted_places[1]}, or {sorted_places[2]}.
    I think there might be a {object_list} in this {img_type}.
    A creative short caption I can generate to describe this image is:'''

    #prompt_search = f'''Let's list keywords that include the following description.
    #This image is a {img_type}. There {ppl_result}.
    #I think this photo was taken at a {sorted_places[0]}, {sorted_places[1]}, or {sorted_places[2]}.
    #I think there might be a {object_list} in this {img_type}.
    #Relevant keywords which we can list and are separated with commas are:'''

    return prompt_caption


def generate_captions(prompt, num_captions=3):
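    # Query the BLOOM Inference API num_captions times with greedy decoding and
    # keep only the first sentence generated after the prompt.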
    headers = {"Authorization": f"Bearer {HF_TOKEN}"}

    max_length = 16
    seed = 42
    sample_or_greedy = 'Greedy'
    input_sentence = prompt
    if sample_or_greedy == "Sample":
        parameters = {
            "max_new_tokens": max_length,
            "top_p": 0.7,
            "do_sample": True,
            "seed": seed,
            "early_stopping": False,
            "length_penalty": 0.0,
            "eos_token_id": None,
        }
    else:
        parameters = {
            "max_new_tokens": max_length,
            "do_sample": False,
            "seed": seed,
            "early_stopping": False,
            "length_penalty": 0.0,
            "eos_token_id": None,
        }

    payload = {"inputs": input_sentence, "parameters": parameters,"options" : {"use_cache": False}}

    bloom_results = []
    for _ in range(num_captions):
        response = requests.post(API_URL, headers=headers, json=payload)
        output = response.json()
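        # Assumes the request succeeded; on failure the API returns an error dict
        # instead of a list of generations.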
        generated_text = output[0]['generated_text'].replace(prompt, '').split('.')[0] + '.'
        bloom_results.append(generated_text)
    return bloom_results


def sorting_texts(image_features, captions):
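    # Re-rank the generated captions by CLIP image-text similarity, best match first.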
    with torch.no_grad():
        text_inputs = clip.tokenize(captions).to(model_dict['device'])
        text_features = model_dict['clip_model'].encode_text(text_inputs)
        text_features /= text_features.norm(dim=-1, keepdim=True)

        sim = (100.0 * image_features @ text_features.T).softmax(dim=-1)
        scores, indices = [drop_gpu(tensor) for tensor in sim[0].topk(len(captions))]
        sorted_captions = [captions[idx] for idx in indices]

    return scores, sorted_captions


def postprocess_results(scores, classes):
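    # Pair each score (rounded to 4 decimals) with its class name or caption for the JSON output.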
    scores = [float('%.4f' % float(val)) for val in scores]
    outputs = []
    for score, cls in zip(scores, classes):
        outputs.append({'score': score, 'output': cls})
    return outputs


def image_captioning(image):
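    # Full pipeline for one image: zero-shot classification -> prompt construction
    # -> BLOOM captioning -> CLIP re-ranking, with per-stage timings reported.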
    start_time = time.time()
    image_features, openimage_scores, openimage_classes, tencentml_scores, tencentml_classes, place365_scores, place365_classes, imgtype_scores, imgtype_classes, ppl_scores, ppl_classes, ifppl_scores, ifppl_classes = zeroshot_classifier(image)
    end_zeroshot = time.time()
    prompt_caption = generate_prompt(openimage_classes, tencentml_classes, place365_classes, imgtype_classes, ppl_classes, ifppl_classes)
    generated_captions = generate_captions(prompt_caption, num_captions=1)
    end_bloom = time.time()
    caption_scores, sorted_captions = sorting_texts(image_features, generated_captions)

    output_dict = {}
    output_dict['inference_time'] = {'CLIP inference': end_zeroshot - start_time,
                                     'BLOOM request': end_bloom - end_zeroshot}

    output_dict['generated_captions'] = postprocess_results(caption_scores, sorted_captions)
    output_dict['reasoning'] = {'openimage_results': postprocess_results(openimage_scores, openimage_classes),
                                'tencentml_results': postprocess_results(tencentml_scores, tencentml_classes),
                                'place365_results': postprocess_results(place365_scores, place365_classes),
                                'imgtype_results': postprocess_results(imgtype_scores, imgtype_classes),
                                'ppl_results': postprocess_results(ppl_scores, ppl_classes),
                                'ifppl_results': postprocess_results(ifppl_scores, ifppl_classes)}
    return output_dict


if __name__ == '__main__':
    print('\tinit models')

    global model_dict

    model_dict = load_models()
    
    # define gradio demo
    inputs = [gr.inputs.Image(type="pil", label="Image")]

    outputs = gr.outputs.JSON()

    title = "Socratic models for image captioning with BLOOM"

    description = """
    ## Details
    **Without any fine-tuning**, we can do image captioning by combining visual-language models (e.g., CLIP, SLIP, ...) with large language models (e.g., GPT, BLOOM, ...).
    In this demo, I use BLOOM as the language model and CLIP ViT-L/14 as the visual-language model.
    An image caption is generated as follows:
    1. Classify whether there are people, where the photo was taken, and what objects appear in the input image using the visual-language model.
    2. Build a prompt from the classification results.
    3. Send the prompt to the BLOOM API.

    This demo differs slightly from the original method proposed in the Socratic Models paper:
    I use not only the Tencent ML-Images class names but also the OpenImages class names, and I adopt BLOOM as the large language model.

    If you want the demo using GPT-3 from OpenAI, check https://github.com/geonm/socratic-models-demo.

    The demo is running on CPU.
    """

    article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2204.00598'>Socratic Models: Composing Zero-Shot Multimodal Reasoning with Language</a></p>"
    examples = ['k21-1.jpg']

    gr.Interface(image_captioning,
                 inputs,
                 outputs,
                 title=title,
                 description=description,
                 article=article,
                 examples=examples,
                 #examples_per_page=50,
                 ).launch()