# Quickstart

The snippet below loads the grounding base model in bfloat16 on a CUDA GPU, downloads an example image, and asks for a description with bounding-box coordinates for each mentioned object. Note that the original snippet used `requests` without importing it; the import is included here.

```python
import requests
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, LlamaTokenizer

# CogVLM uses the Vicuna tokenizer for its language backbone.
tokenizer = LlamaTokenizer.from_pretrained('lmsys/vicuna-7b-v1.5')
model = AutoModelForCausalLM.from_pretrained(
    'THUDM/cogvlm-grounding-base-hf',
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True
).to('cuda').eval()

query = 'Can you provide a description of the image and include the coordinates [[x0,y0,x1,y1]] for each mentioned object?'
image = Image.open(requests.get('https://github.com/THUDM/CogVLM/blob/main/examples/4.jpg?raw=true', stream=True).raw).convert('RGB')

# Build the multimodal prompt, then add a batch dimension and move all tensors to the GPU.
inputs = model.build_conversation_input_ids(tokenizer, query=query, images=[image])
inputs = {
    'input_ids': inputs['input_ids'].unsqueeze(0).to('cuda'),
    'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to('cuda'),
    'attention_mask': inputs['attention_mask'].unsqueeze(0).to('cuda'),
    'images': [[inputs['images'][0].to('cuda').to(torch.bfloat16)]],
}
gen_kwargs = {"max_length": 2048, "do_sample": False}

with torch.no_grad():
    outputs = model.generate(**inputs, **gen_kwargs)
    # Strip the prompt tokens so only the newly generated answer is decoded.
    outputs = outputs[:, inputs['input_ids'].shape[1]:]
    print(tokenizer.decode(outputs[0]))
```
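To turn the generated text back into pixel coordinates, a minimal parsing sketch follows. It assumes the model emits boxes in the `[[x0,y0,x1,y1]]` format requested in the query, with values normalized to a 0–999 range as in the CogVLM grounding demos; the regex and divisor are illustrative, so verify them against your actual outputs.

```python
import re

def parse_boxes(text, width, height):
    """Extract [[x0,y0,x1,y1]] groups and rescale them to pixel coordinates.

    Assumes coordinates normalized to 0-999 (as in the CogVLM grounding
    demos); adjust the divisor if your outputs use a different range.
    """
    boxes = []
    for x0, y0, x1, y1 in re.findall(r'\[\[(\d+),(\d+),(\d+),(\d+)\]\]', text):
        boxes.append((int(x0) / 999 * width, int(y0) / 999 * height,
                      int(x1) / 999 * width, int(y1) / 999 * height))
    return boxes

# Usage with the quickstart variables above:
# boxes = parse_boxes(tokenizer.decode(outputs[0]), *image.size)
```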