from PIL import Image | |
from transformers import AutoTokenizer | |
from pydantic import BaseModel | |
from enum import Enum | |
from moonline import Moonline | |
def main():
    """Describe ``example.png`` with the Moondream2 checkpoint and print the answer.

    Steps, in order:
      1. Define the expected answer structure (``ExampleModel`` with a ``Mood`` enum).
      2. Build a prompt that embeds ``ExampleModel.__annotations__``.
      3. Load the tokenizer and the ``Moonline`` model at a pinned revision.
      4. Encode the image, build an FSM from ``ExampleModel`` and the tokenizer,
         and pass both to ``answer_question``.
      5. Print the returned answer.
    """

    class Mood(Enum):
        sad = "sad"
        happy = "happy"
        angry = "angry"
        neutral = "neutral"

    class ExampleModel(BaseModel):
        description: str
        mood: Mood

    # The annotations dict is interpolated with its Python repr, so the model
    # sees the field names together with the Python type objects.
    prompt = (
        "\n"
        "Your job is to describe the image.\n"
        "Please answer in json with the following format: "
        f"{ExampleModel.__annotations__}\n"
    )

    image_path = "example.png"
    model_id = "vikhyatk/moondream2"
    revision = "2024-04-02"  # pinned so tokenizer and weights come from the same snapshot

    tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
    # NOTE(review): .to() is called without a device or dtype argument, which is
    # presumably a no-op -- confirm whether a target device was intended here.
    moonline = Moonline.from_pretrained(
        model_id,
        revision=revision,
    ).to()
    moonline.eval()

    # Use a context manager so the underlying image file is closed once the
    # embeddings have been computed.
    with Image.open(image_path) as image:
        image_embeds = moonline.encode_image(image)

    # presumably the FSM constrains generation to the ExampleModel schema --
    # verify against Moonline.generate_fsm
    fsm = moonline.generate_fsm(ExampleModel, tokenizer)
    answer = moonline.answer_question(image_embeds, prompt, tokenizer, fsm)
    print(f"answer: {answer}")
# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()