import csv
import json
import os
import time

import clip
import gradio as gr
import requests
import torch
import wget
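
# Pre-computed CLIP ViT-L/14 prompt classifiers (text embeddings) published in
# the original socratic-models-demo repository.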
url_dict = {'clip_ViTL14_openimage_classifier_weights.pt': 'https://raw.githubusercontent.com/geonm/socratic-models-demo/master/prompts/clip_ViTL14_openimage_classifier_weights.pt',
            'clip_ViTL14_place365_classifier_weights.pt': 'https://raw.githubusercontent.com/geonm/socratic-models-demo/master/prompts/clip_ViTL14_place365_classifier_weights.pt',
            'clip_ViTL14_tencentml_classifier_weights.pt': 'https://raw.githubusercontent.com/geonm/socratic-models-demo/master/prompts/clip_ViTL14_tencentml_classifier_weights.pt'}

os.makedirs('./prompts', exist_ok=True)
for filename, url in url_dict.items():
    out_path = os.path.join('./prompts', filename)
    # wget.download saves a duplicate ("file (1).pt") instead of overwriting,
    # so skip weights that are already on disk.
    if not os.path.exists(out_path):
        wget.download(url, out=out_path)
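
# Hide all GPUs so inference runs on CPU (the demo is hosted without a GPU).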
os.environ['CUDA_VISIBLE_DEVICES'] = ''
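
# BLOOM is queried through the Hugging Face Inference API; requests are
# authenticated with the HF_TOKEN environment variable.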
API_URL = "https://api-inference.huggingface.co/models/bigscience/bloom"
HF_TOKEN = os.environ["HF_TOKEN"]
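

# Parse the OpenImages class-description CSV; the human-readable class name is
# the last column of each row.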
def load_openimage_classnames(csv_path):
    with open(csv_path) as csv_data:
        csv_reader = csv.reader(csv_data)
        classnames = {idx: row[-1] for idx, row in enumerate(csv_reader)}
    return classnames
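

# Load one class name per line; this loader is reused for the Places365 list.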
def load_tencentml_classnames(txt_path):
    with open(txt_path) as txt_data:
        lines = txt_data.readlines()
    classnames = {idx: line.strip() for idx, line in enumerate(lines)}
    return classnames
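

# Build a zero-shot classifier on the fly: encode one templated prompt per
# class with CLIP's text encoder and L2-normalize, so classification reduces
# to a dot product between image and text embeddings.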
def build_simple_classifier(clip_model, text_list, template, device):
    with torch.no_grad():
        texts = [template(text) for text in text_list]
        text_inputs = clip.tokenize(texts).to(device)
        text_features = clip_model.encode_text(text_inputs)
        text_features /= text_features.norm(dim=-1, keepdim=True)

    return text_features, {idx: text for idx, text in enumerate(text_list)}
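

# Load CLIP and every zero-shot classifier into a single global dictionary.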
def load_models():
    model_dict = {}

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print('\tLoading CLIP ViT-L/14')
    clip_model, clip_preprocess = clip.load("ViT-L/14", device=device)
    print('\tLoading precomputed zero-shot classifiers')
    # .float() casts the weights to float32 while keeping them on `device`.
    openimage_classifier_weights = torch.load('./prompts/clip_ViTL14_openimage_classifier_weights.pt', map_location=device).float()
    openimage_classnames = load_openimage_classnames('./prompts/openimage-classnames.csv')
    tencentml_classifier_weights = torch.load('./prompts/clip_ViTL14_tencentml_classifier_weights.pt', map_location=device).float()
    tencentml_classnames = load_tencentml_classnames('./prompts/tencent-ml-classnames.txt')
    place365_classifier_weights = torch.load('./prompts/clip_ViTL14_place365_classifier_weights.pt', map_location=device).float()
    place365_classnames = load_tencentml_classnames('./prompts/place365-classnames.txt')

    print('\tBuilding simple zero-shot classifiers')
    img_types = ['photo', 'cartoon', 'sketch', 'painting']
    ppl_texts = ['no people', 'people']
    ifppl_texts = ['is one person', 'are two people', 'are three people', 'are several people', 'are many people']
    imgtype_classifier_weights, imgtype_classnames = build_simple_classifier(clip_model, img_types, lambda c: f'This is a {c}.', device)
    ppl_classifier_weights, ppl_classnames = build_simple_classifier(clip_model, ppl_texts, lambda c: f'There are {c} in this photo.', device)
    ifppl_classifier_weights, ifppl_classnames = build_simple_classifier(clip_model, ifppl_texts, lambda c: f'There {c} in this photo.', device)

    model_dict['clip_model'] = clip_model
    model_dict['clip_preprocess'] = clip_preprocess
    model_dict['openimage_classifier_weights'] = openimage_classifier_weights
    model_dict['openimage_classnames'] = openimage_classnames
    model_dict['tencentml_classifier_weights'] = tencentml_classifier_weights
    model_dict['tencentml_classnames'] = tencentml_classnames
    model_dict['place365_classifier_weights'] = place365_classifier_weights
    model_dict['place365_classnames'] = place365_classnames
    model_dict['imgtype_classifier_weights'] = imgtype_classifier_weights
    model_dict['imgtype_classnames'] = imgtype_classnames
    model_dict['ppl_classifier_weights'] = ppl_classifier_weights
    model_dict['ppl_classnames'] = ppl_classnames
    model_dict['ifppl_classifier_weights'] = ifppl_classifier_weights
    model_dict['ifppl_classnames'] = ifppl_classnames
    model_dict['device'] = device

    return model_dict
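

# Move a tensor to host memory (if needed) and convert it to NumPy.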
def drop_gpu(tensor):
    # .cpu() is a no-op for tensors that already live in host memory.
    return tensor.cpu().numpy()
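

# classify_image scores an image embedding against one prompt classifier and
# returns the top-k (softmax score, class name) pairs, highest first.
# zeroshot_classifier below embeds the image once with CLIP and evaluates all
# six prompt classifiers against it.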
def classify_image(image_features, weights_key, classnames_key, topk=10):
    sim = (100.0 * image_features @ model_dict[weights_key].T).softmax(dim=-1)
    scores, indices = [drop_gpu(tensor) for tensor in sim[0].topk(topk)]
    classes = [model_dict[classnames_key][idx] for idx in indices]
    return scores, classes


def zeroshot_classifier(image):
    image_input = model_dict['clip_preprocess'](image).unsqueeze(0).to(model_dict['device'])
    with torch.no_grad():
        image_features = model_dict['clip_model'].encode_image(image_input)
        image_features /= image_features.norm(dim=-1, keepdim=True)

    openimage_scores, openimage_classes = classify_image(image_features, 'openimage_classifier_weights', 'openimage_classnames')
    tencentml_scores, tencentml_classes = classify_image(image_features, 'tencentml_classifier_weights', 'tencentml_classnames')
    place365_scores, place365_classes = classify_image(image_features, 'place365_classifier_weights', 'place365_classnames')
    imgtype_scores, imgtype_classes = classify_image(image_features, 'imgtype_classifier_weights', 'imgtype_classnames', topk=len(model_dict['imgtype_classnames']))
    ppl_scores, ppl_classes = classify_image(image_features, 'ppl_classifier_weights', 'ppl_classnames', topk=len(model_dict['ppl_classnames']))
    ifppl_scores, ifppl_classes = classify_image(image_features, 'ifppl_classifier_weights', 'ifppl_classnames', topk=len(model_dict['ifppl_classnames']))

    return (image_features, openimage_scores, openimage_classes, tencentml_scores, tencentml_classes,
            place365_scores, place365_classes, imgtype_scores, imgtype_classes,
            ppl_scores, ppl_classes, ifppl_scores, ifppl_classes)
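

# Assemble the Socratic prompt from the classifier outputs. With illustrative
# (hypothetical) classifier results it renders like:
#   I am an intelligent image captioning bot.
#   This image is a photo. There are two people.
#   I think this photo was taken at a beach, harbor, or pier.
#   I think there might be a surfboard, sea, sand in this photo.
#   A creative short caption I can generate to describe this image is: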
def generate_prompt(openimage_classes, tencentml_classes, place365_classes, imgtype_classes, ppl_classes, ifppl_classes):
    img_type = imgtype_classes[0]
    ppl_result = ppl_classes[0]
    if ppl_result == 'people':
        # Refine "people" with the people-count classifier.
        ppl_result = ifppl_classes[0]
    else:
        ppl_result = 'are %s' % ppl_result

    sorted_places = place365_classes

    # Candidate objects: the top Tencent ML-Images classes plus the top-2
    # OpenImages classes.
    object_list = ', '.join(tencentml_classes + openimage_classes[:2])

    prompt_caption = f'''I am an intelligent image captioning bot.
This image is a {img_type}. There {ppl_result}.
I think this photo was taken at a {sorted_places[0]}, {sorted_places[1]}, or {sorted_places[2]}.
I think there might be a {object_list} in this {img_type}.
A creative short caption I can generate to describe this image is:'''

    return prompt_caption
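

# Ask BLOOM to continue the prompt via the Hugging Face Inference API: greedy
# decoding by default, nucleus sampling (top_p=0.7) when sample_or_greedy is
# set to "Sample".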
def generate_captions(prompt, num_captions=3):
    headers = {"Authorization": f"Bearer {HF_TOKEN}"}

    max_length = 16
    seed = 42
    sample_or_greedy = 'Greedy'
    input_sentence = prompt
    parameters = {
        "max_new_tokens": max_length,
        "do_sample": sample_or_greedy == "Sample",
        "seed": seed,
        "early_stopping": False,
        "length_penalty": 0.0,
        "eos_token_id": None,
    }
    if sample_or_greedy == "Sample":
        parameters["top_p"] = 0.7

    payload = {"inputs": input_sentence, "parameters": parameters, "options": {"use_cache": False}}

    bloom_results = []
    for _ in range(num_captions):
        response = requests.post(API_URL, headers=headers, json=payload)
        output = response.json()
        if isinstance(output, dict) and 'error' in output:
            # The Inference API reports failures (e.g. model still loading)
            # as {"error": ...} instead of a list of generations.
            raise RuntimeError(output['error'])
        # Keep only the first sentence of the continuation.
        generated_text = output[0]['generated_text'].replace(prompt, '').split('.')[0] + '.'
        bloom_results.append(generated_text)
    return bloom_results
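

# Rank candidate captions by CLIP image-text similarity, best match first.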
def sorting_texts(image_features, captions):
    with torch.no_grad():
        text_inputs = clip.tokenize(captions).to(model_dict['device'])
        text_features = model_dict['clip_model'].encode_text(text_inputs)
        text_features /= text_features.norm(dim=-1, keepdim=True)

    sim = (100.0 * image_features @ text_features.T).softmax(dim=-1)
    scores, indices = [drop_gpu(tensor) for tensor in sim[0].topk(len(captions))]
    sorted_captions = [captions[idx] for idx in indices]

    return scores, sorted_captions
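

# Pair each rounded score with its class/caption for the JSON output.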
def postprocess_results(scores, classes):
    scores = [round(float(val), 4) for val in scores]
    return [{'score': score, 'output': cls} for score, cls in zip(scores, classes)]
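

# Full pipeline: CLIP zero-shot reasoning -> prompt -> BLOOM caption -> CLIP
# re-ranking, timed per stage.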
def image_captioning(image):
    start_time = time.time()
    (image_features, openimage_scores, openimage_classes, tencentml_scores, tencentml_classes,
     place365_scores, place365_classes, imgtype_scores, imgtype_classes,
     ppl_scores, ppl_classes, ifppl_scores, ifppl_classes) = zeroshot_classifier(image)
    end_zeroshot = time.time()
    prompt_caption = generate_prompt(openimage_classes, tencentml_classes, place365_classes, imgtype_classes, ppl_classes, ifppl_classes)
    generated_captions = generate_captions(prompt_caption, num_captions=1)
    end_bloom = time.time()
    caption_scores, sorted_captions = sorting_texts(image_features, generated_captions)

    output_dict = {}
    output_dict['inference_time'] = {'CLIP inference': end_zeroshot - start_time,
                                     'BLOOM request': end_bloom - end_zeroshot}
    output_dict['generated_captions'] = postprocess_results(caption_scores, sorted_captions)
    output_dict['reasoning'] = {'openimage_results': postprocess_results(openimage_scores, openimage_classes),
                                'tencentml_results': postprocess_results(tencentml_scores, tencentml_classes),
                                'place365_results': postprocess_results(place365_scores, place365_classes),
                                'imgtype_results': postprocess_results(imgtype_scores, imgtype_classes),
                                'ppl_results': postprocess_results(ppl_scores, ppl_classes),
                                'ifppl_results': postprocess_results(ifppl_scores, ifppl_classes)}
    return output_dict
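

# Entry point: load all models once at startup, then serve the Gradio demo.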
if __name__ == '__main__':
    print('\tinit models')
    model_dict = load_models()

    inputs = [gr.Image(type="pil", label="Image")]
    outputs = gr.JSON()

    title = "Socratic models for image captioning with BLOOM"

    description = """
## Details
**Without any fine-tuning**, we can do image captioning by combining visual-language models (e.g., CLIP, SLIP, ...) with large language models (e.g., GPT, BLOOM, ...).
In this demo, I choose BLOOM as the language model and CLIP ViT-L/14 as the visual-language model.
A caption is generated in three steps:
1. Classify whether there are people, where the location is, and what objects are in the input image using the visual-language model.
2. Build a prompt from the classification results.
3. Send the prompt to the BLOOM API.

This demo differs slightly from the method proposed in the Socratic Models paper:
I use not only the Tencent ML-Images class names but also the OpenImages class names, and I adopt BLOOM as the large language model.

If you want a demo that uses GPT-3 from OpenAI, see https://github.com/geonm/socratic-models-demo.

Demo is running on CPU.
"""

    article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2204.00598'>Socratic Models: Composing Zero-Shot Multimodal Reasoning with Language</a></p>"
    examples = ['k21-1.jpg']

    gr.Interface(image_captioning,
                 inputs,
                 outputs,
                 title=title,
                 description=description,
                 article=article,
                 examples=examples,
                 ).launch()