| from PIL import Image | |
| from transformers import AutoTokenizer | |
| from pydantic import BaseModel | |
| from enum import Enum | |
| from moonline import Moonline | |
def main():
    """Ask the moondream2 model to describe ``example.png`` and print the answer.

    Steps, in order:

    1. Define the ``ExampleModel`` schema (a free-text description plus a
       ``Mood`` enum value) and embed its annotations in the prompt.
    2. Load the tokenizer and the ``Moonline`` model for a pinned revision.
    3. Open the image, encode it, build an FSM from ``ExampleModel`` with
       ``generate_fsm`` and pass it to ``answer_question``.
    4. Print the returned answer.
    """

    class Mood(Enum):
        sad = "sad"
        happy = "happy"
        angry = "angry"
        neutral = "neutral"

    class ExampleModel(BaseModel):
        description: str
        mood: Mood

    # The literal's lines are flush-left so that no source indentation ends up
    # inside the prompt text sent to the model.
    prompt = f"""
Your job is to describe the image.
Please answer in json with the following format: {ExampleModel.__annotations__}
"""
    image_path = "example.png"

    # Pin both the model and its revision so repeated runs load the same files.
    model_id = "vikhyatk/moondream2"
    revision = "2024-04-02"

    tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
    # NOTE(review): .to() is called without a device or dtype, so no target is
    # selected here -- confirm whether an explicit device was intended.
    moonline = Moonline.from_pretrained(
        model_id,
        revision=revision,
    ).to()
    moonline.eval()

    # Encode inside the context manager: the image file is closed as soon as
    # the embeddings exist instead of staying open until interpreter exit.
    with Image.open(image_path) as image:
        image_embeds = moonline.encode_image(image)

    fsm = moonline.generate_fsm(ExampleModel, tokenizer)
    answer = moonline.answer_question(image_embeds, prompt, tokenizer, fsm)
    print(f"answer: {answer}")
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()