"""Run a single multimodal chat query against MiniCPM-o-2.6.

Loads the model and tokenizer from the Hugging Face Hub, opens two
local product images, and asks the model (in Chinese) to describe the
first image. Requires a CUDA device and the flash-attention-2 kernels.
"""

import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer

# Single source of truth for the checkpoint id (was duplicated inline).
MODEL_ID = "openbmb/MiniCPM-o-2_6"

# Hard-coded machine-local test images; adjust for your environment.
IMG1_PATH = '/home/ieemoo0337/projects/datasets/constrast_pair/8850813311020/8850813311020.jpg'
IMG2_PATH = '/home/ieemoo0337/projects/datasets/constrast_pair/8850511321499/8850511321499.jpg'


def load_model_and_tokenizer():
    """Load the MiniCPM-o model (bf16, flash-attn-2, CUDA) and its tokenizer.

    Returns:
        (model, tokenizer) tuple, with the model in eval mode on the GPU.
    """
    # SECURITY NOTE: trust_remote_code=True executes Python shipped with the
    # checkpoint repo. Only use with model sources you trust.
    model = AutoModel.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        attn_implementation='flash_attention_2',
        torch_dtype=torch.bfloat16,
        # device_map="auto"
    )
    model = model.eval().cuda()
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID, use_fast=True, trust_remote_code=True
    )
    return model, tokenizer


def main():
    """Open the two test images and print the model's answer."""
    model, tokenizer = load_model_and_tokenizer()

    img1 = Image.open(IMG1_PATH)
    img2 = Image.open(IMG2_PATH)

    # Question (Chinese): "Describe ... of the first image."
    question = '描述第一张图像的1。'

    # MiniCPM-o chat format: a user turn whose content interleaves
    # PIL images and text strings.
    msgs = [{'role': 'user', 'content': [img1, img2, question]}]

    answer = model.chat(
        msgs=msgs,
        tokenizer=tokenizer
    )
    print(answer)


if __name__ == "__main__":
    main()