import torch from transformers import IdeficsForVisionText2Text, AutoProcessor device = "cuda" if torch.cuda.is_available() else "cpu" checkpoint = "HuggingFaceM4/idefics-9b-instruct" model = IdeficsForVisionText2Text.from_pretrained(checkpoint, torch_dtype=torch.bfloat16).to(device) processor = AutoProcessor.from_pretrained(checkpoint) # We feed to the model an arbitrary sequence of text strings and images. Images can be either URLs or PIL Images. prompts = [ [ "User: What is in this image?", "https://upload.wikimedia.org/wikipedia/commons/8/86/Id%C3%A9fix.JPG", "", "\nAssistant: This picture depicts Idefix, the dog of Obelix in Asterix and Obelix. Idefix is running on the ground.", "\nUser:", "https://static.wikia.nocookie.net/asterix/images/2/25/R22b.gif/revision/latest?cb=20110815073052", "And who is that?", "\nAssistant:", ], ] # --batched mode inputs = processor(prompts, add_end_of_utterance_token=False, return_tensors="pt").to(device) # --single sample mode # inputs = processor(prompts[0], return_tensors="pt").to(device) # Generation args exit_condition = processor.tokenizer("", add_special_tokens=False).input_ids bad_words_ids = processor.tokenizer(["", ""], add_special_tokens=False).input_ids generated_ids = model.generate(**inputs, eos_token_id=exit_condition, bad_words_ids=bad_words_ids, max_length=100) generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True) for i, t in enumerate(generated_text): print(f"{i}:\n{t}\n")