import os
import torch
import gradio as gr
import time
import clip
import requests
import csv
import json
import wget
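
# Fetch the precomputed CLIP ViT-L/14 zero-shot classifier weights into ./prompts.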
url_dict = {
    'clip_ViTL14_openimage_classifier_weights.pt': 'https://raw.githubusercontent.com/geonm/socratic-models-demo/master/prompts/clip_ViTL14_openimage_classifier_weights.pt',
    'clip_ViTL14_place365_classifier_weights.pt': 'https://raw.githubusercontent.com/geonm/socratic-models-demo/master/prompts/clip_ViTL14_place365_classifier_weights.pt',
    'clip_ViTL14_tencentml_classifier_weights.pt': 'https://raw.githubusercontent.com/geonm/socratic-models-demo/master/prompts/clip_ViTL14_tencentml_classifier_weights.pt',
}

os.makedirs('./prompts', exist_ok=True)
for k, v in url_dict.items():
    wget.download(v, out='./prompts')

# Run on CPU only (the hosted demo has no GPU); the HF token authenticates requests
# to the hosted BLOOM Inference API.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

API_URL = "https://api-inference.huggingface.co/models/bigscience/bloom"
HF_TOKEN = os.environ["HF_TOKEN"]
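
# Class-name loaders: each returns a dict mapping class index -> class name.
# The index order is assumed to match the row order of the precomputed classifier weights.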
def load_openimage_classnames(csv_path):
    with open(csv_path) as csv_data:
        csv_reader = csv.reader(csv_data)
        classnames = {idx: row[-1] for idx, row in enumerate(csv_reader)}
    return classnames

def load_tencentml_classnames(txt_path):
    with open(txt_path) as txt_data:
        lines = txt_data.readlines()
    classnames = {idx: line.strip() for idx, line in enumerate(lines)}
    return classnames
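
# Build a small zero-shot text classifier on the fly: encode one templated prompt per
# class with CLIP's text encoder and L2-normalize the embeddings.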
def build_simple_classifier(clip_model, text_list, template, device):
    with torch.no_grad():
        texts = [template(text) for text in text_list]
        text_inputs = clip.tokenize(texts).to(device)
        text_features = clip_model.encode_text(text_inputs)
        text_features /= text_features.norm(dim=-1, keepdim=True)
    return text_features, {idx: text for idx, text in enumerate(text_list)}
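
# Load CLIP ViT-L/14, the precomputed OpenImages / Tencent ML-Images / Places365
# classifier weights, and three small on-the-fly classifiers (image type,
# people present, number of people).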
def load_models():
    # build model and tokenizer
    model_dict = {}

    device = "cuda" if torch.cuda.is_available() else "cpu"

    print('\tLoading CLIP ViT-L/14')
    clip_model, clip_preprocess = clip.load("ViT-L/14", device=device)

    print('\tLoading precomputed zeroshot classifier')
    openimage_classifier_weights = torch.load('./prompts/clip_ViTL14_openimage_classifier_weights.pt', map_location=device).type(torch.FloatTensor)
    openimage_classnames = load_openimage_classnames('./prompts/openimage-classnames.csv')
    tencentml_classifier_weights = torch.load('./prompts/clip_ViTL14_tencentml_classifier_weights.pt', map_location=device).type(torch.FloatTensor)
    tencentml_classnames = load_tencentml_classnames('./prompts/tencent-ml-classnames.txt')
    place365_classifier_weights = torch.load('./prompts/clip_ViTL14_place365_classifier_weights.pt', map_location=device).type(torch.FloatTensor)
    place365_classnames = load_tencentml_classnames('./prompts/place365-classnames.txt')

    print('\tBuilding simple zeroshot classifier')
    img_types = ['photo', 'cartoon', 'sketch', 'painting']
    ppl_texts = ['no people', 'people']
    ifppl_texts = ['is one person', 'are two people', 'are three people', 'are several people', 'are many people']
    imgtype_classifier_weights, imgtype_classnames = build_simple_classifier(clip_model, img_types, lambda c: f'This is a {c}.', device)
    ppl_classifier_weights, ppl_classnames = build_simple_classifier(clip_model, ppl_texts, lambda c: f'There are {c} in this photo.', device)
    ifppl_classifier_weights, ifppl_classnames = build_simple_classifier(clip_model, ifppl_texts, lambda c: f'There {c} in this photo.', device)

    model_dict['clip_model'] = clip_model
    model_dict['clip_preprocess'] = clip_preprocess
    model_dict['openimage_classifier_weights'] = openimage_classifier_weights
    model_dict['openimage_classnames'] = openimage_classnames
    model_dict['tencentml_classifier_weights'] = tencentml_classifier_weights
    model_dict['tencentml_classnames'] = tencentml_classnames
    model_dict['place365_classifier_weights'] = place365_classifier_weights
    model_dict['place365_classnames'] = place365_classnames
    model_dict['imgtype_classifier_weights'] = imgtype_classifier_weights
    model_dict['imgtype_classnames'] = imgtype_classnames
    model_dict['ppl_classifier_weights'] = ppl_classifier_weights
    model_dict['ppl_classnames'] = ppl_classnames
    model_dict['ifppl_classifier_weights'] = ifppl_classifier_weights
    model_dict['ifppl_classnames'] = ifppl_classnames
    model_dict['device'] = device

    return model_dict
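
# Move a tensor to the CPU (if a GPU is in use) and convert it to a NumPy array.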
def drop_gpu(tensor):
    if torch.cuda.is_available():
        return tensor.cpu().numpy()
    else:
        return tensor.numpy()
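
# Zero-shot classification with CLIP: embed the image, take the similarity with each
# set of classifier weights (scaled by 100 and softmaxed), and keep the top-k classes.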
def zeroshot_classifier(image):
    image_input = model_dict['clip_preprocess'](image).unsqueeze(0).to(model_dict['device'])

    with torch.no_grad():
        image_features = model_dict['clip_model'].encode_image(image_input)
        image_features /= image_features.norm(dim=-1, keepdim=True)

        sim = (100.0 * image_features @ model_dict['openimage_classifier_weights'].T).softmax(dim=-1)
        openimage_scores, indices = [drop_gpu(tensor) for tensor in sim[0].topk(10)]
        openimage_classes = [model_dict['openimage_classnames'][idx] for idx in indices]

        sim = (100.0 * image_features @ model_dict['tencentml_classifier_weights'].T).softmax(dim=-1)
        tencentml_scores, indices = [drop_gpu(tensor) for tensor in sim[0].topk(10)]
        tencentml_classes = [model_dict['tencentml_classnames'][idx] for idx in indices]

        sim = (100.0 * image_features @ model_dict['place365_classifier_weights'].T).softmax(dim=-1)
        place365_scores, indices = [drop_gpu(tensor) for tensor in sim[0].topk(10)]
        place365_classes = [model_dict['place365_classnames'][idx] for idx in indices]

        sim = (100.0 * image_features @ model_dict['imgtype_classifier_weights'].T).softmax(dim=-1)
        imgtype_scores, indices = [drop_gpu(tensor) for tensor in sim[0].topk(len(model_dict['imgtype_classnames']))]
        imgtype_classes = [model_dict['imgtype_classnames'][idx] for idx in indices]

        sim = (100.0 * image_features @ model_dict['ppl_classifier_weights'].T).softmax(dim=-1)
        ppl_scores, indices = [drop_gpu(tensor) for tensor in sim[0].topk(len(model_dict['ppl_classnames']))]
        ppl_classes = [model_dict['ppl_classnames'][idx] for idx in indices]

        sim = (100.0 * image_features @ model_dict['ifppl_classifier_weights'].T).softmax(dim=-1)
        ifppl_scores, indices = [drop_gpu(tensor) for tensor in sim[0].topk(len(model_dict['ifppl_classnames']))]
        ifppl_classes = [model_dict['ifppl_classnames'][idx] for idx in indices]

    return image_features, openimage_scores, openimage_classes, tencentml_scores, tencentml_classes,\
        place365_scores, place365_classes, imgtype_scores, imgtype_classes,\
        ppl_scores, ppl_classes, ifppl_scores, ifppl_classes
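
# Assemble the Socratic prompt from the zero-shot results: image type, people count,
# the three most likely places, and the detected objects.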
def generate_prompt(openimage_classes, tencentml_classes, place365_classes, imgtype_classes, ppl_classes, ifppl_classes):
    img_type = imgtype_classes[0]
    ppl_result = ppl_classes[0]
    if ppl_result == 'people':
        ppl_result = ifppl_classes[0]
    else:
        ppl_result = 'are %s' % ppl_result

    sorted_places = place365_classes

    object_list = ''
    for cls in tencentml_classes:
        object_list += f'{cls}, '
    for cls in openimage_classes[:2]:
        object_list += f'{cls}, '
    object_list = object_list[:-2]

    prompt_caption = f'''I am an intelligent image captioning bot.
This image is a {img_type}. There {ppl_result}.
I think this photo was taken at a {sorted_places[0]}, {sorted_places[1]}, or {sorted_places[2]}.
I think there might be a {object_list} in this {img_type}.
A creative short caption I can generate to describe this image is:'''

    #prompt_search = f'''Let's list keywords that include the following description.
    #This image is a {img_type}. There {ppl_result}.
    #I think this photo was taken at a {sorted_places[0]}, {sorted_places[1]}, or {sorted_places[2]}.
    #I think there might be a {object_list} in this {img_type}.
    #Relevant keywords which we can list and are separated with comma are:'''

    return prompt_caption
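
# Query the hosted BLOOM Inference API. Greedy decoding with at most 16 new tokens;
# the prompt is stripped from the response and the continuation is cut at the first period.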
def generate_captions(prompt, num_captions=3):
    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
    max_length = 16
    seed = 42
    sample_or_greedy = 'Greedy'
    input_sentence = prompt

    if sample_or_greedy == "Sample":
        parameters = {
            "max_new_tokens": max_length,
            "top_p": 0.7,
            "do_sample": True,
            "seed": seed,
            "early_stopping": False,
            "length_penalty": 0.0,
            "eos_token_id": None,
        }
    else:
        parameters = {
            "max_new_tokens": max_length,
            "do_sample": False,
            "seed": seed,
            "early_stopping": False,
            "length_penalty": 0.0,
            "eos_token_id": None,
        }

    payload = {"inputs": input_sentence, "parameters": parameters, "options": {"use_cache": False}}

    bloom_results = []
    for _ in range(num_captions):
        response = requests.post(API_URL, headers=headers, json=payload)
        output = response.json()
        generated_text = output[0]['generated_text'].replace(prompt, '').split('.')[0] + '.'
        bloom_results.append(generated_text)

    return bloom_results
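
# Rerank candidate captions by CLIP image-text similarity (highest-scoring caption first).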
def sorting_texts(image_features, captions):
    with torch.no_grad():
        text_inputs = clip.tokenize(captions).to(model_dict['device'])
        text_features = model_dict['clip_model'].encode_text(text_inputs)
        text_features /= text_features.norm(dim=-1, keepdim=True)

        sim = (100.0 * image_features @ text_features.T).softmax(dim=-1)
        scores, indices = [drop_gpu(tensor) for tensor in sim[0].topk(len(captions))]
        sorted_captions = [captions[idx] for idx in indices]
    return scores, sorted_captions

def postprocess_results(scores, classes):
    scores = [float('%.4f' % float(val)) for val in scores]

    outputs = []
    for score, cls in zip(scores, classes):
        outputs.append({'score': score, 'output': cls})
    return outputs
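
# End-to-end pipeline used by the Gradio demo: CLIP zero-shot reasoning -> prompt ->
# BLOOM caption -> CLIP reranking, returned as a JSON-style dict with timings.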
def image_captioning(image):
    start_time = time.time()
    image_features, openimage_scores, openimage_classes, tencentml_scores, tencentml_classes, place365_scores, place365_classes, imgtype_scores, imgtype_classes, ppl_scores, ppl_classes, ifppl_scores, ifppl_classes = zeroshot_classifier(image)
    end_zeroshot = time.time()

    prompt_caption = generate_prompt(openimage_classes, tencentml_classes, place365_classes, imgtype_classes, ppl_classes, ifppl_classes)
    generated_captions = generate_captions(prompt_caption, num_captions=1)
    end_bloom = time.time()

    caption_scores, sorted_captions = sorting_texts(image_features, generated_captions)

    output_dict = {}
    output_dict['inference_time'] = {'CLIP inference': end_zeroshot - start_time,
                                     'BLOOM request': end_bloom - end_zeroshot}
    output_dict['generated_captions'] = postprocess_results(caption_scores, sorted_captions)
    output_dict['reasoning'] = {'openimage_results': postprocess_results(openimage_scores, openimage_classes),
                                'tencentml_results': postprocess_results(tencentml_scores, tencentml_classes),
                                'place365_results': postprocess_results(place365_scores, place365_classes),
                                'imgtype_results': postprocess_results(imgtype_scores, imgtype_classes),
                                'ppl_results': postprocess_results(ppl_scores, ppl_classes),
                                'ifppl_results': postprocess_results(ifppl_scores, ifppl_classes)}
    return output_dict
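
# Load the models once at startup, then launch the Gradio interface.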
if __name__ == '__main__':
    print('\tinit models')
    model_dict = load_models()

    # define gradio demo
    inputs = [gr.Image(type="pil", label="Image")]
    outputs = gr.JSON()

    title = "Socratic models for image captioning with BLOOM"
description = """ | |
## Details | |
**Without any fine-tuning**, we can do image captioning using Visual-Language models (e.g., CLIP, SLIP, ...) and Large language models (e.g., GPT, BLOOM, ...). | |
In this demo, I choose BLOOM as the language model and CLIP ViT-L/14 as the visual-language model. | |
The order of generating image caption is as follow: | |
1. Classify whether there are people, where the location is, and what objects are in the input image using the visual-language model. | |
2. Then, build a prompt using classified results. | |
3. Request BLOOM API with the prompt. | |
This demo is slightly different with the original method proposed in the socratic model paper. | |
I used not only tencent ml class names, but also OpenImage class names and I adopt BLOOM for the large language model | |
If you want the demo using GPT3 from OpenAI, check https://github.com/geonm/socratic-models-demo. | |
Demo is running on CPU. | |
""" | |
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2204.00598'>Socratic Models: Composing Zero-Shot Multimodal Reasoning with Language</a></p>" | |
examples = ['k21-1.jpg'] | |
gr.Interface(image_captioning, | |
inputs, | |
outputs, | |
title=title, | |
description=description, | |
article=article, | |
examples=examples, | |
#examples_per_page=50, | |
).launch() | |