"""Evaluate moondream2 on the NaturalBench benchmark.

Each NaturalBench sample pairs two images with two questions, and the code
below reports four metrics: overall per-answer accuracy, question accuracy
(a question counts only if answered correctly on both images), image accuracy
(an image counts only if both questions about it are correct), and group
accuracy (all four image/question pairs correct).
"""

from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

from ..hf import detect_device

MODEL_ID = "vikhyatk/moondream2"
DEVICE, DTYPE = detect_device()

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
moondream = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    attn_implementation="flash_attention_2",
    torch_dtype=DTYPE,
    device_map={"": DEVICE},
)
moondream.eval()

# Yes, the benchmark test set is stored in the 'train' split...
dataset = load_dataset("BaiqiL/NaturalBench", split="train")

acc = []    # per-answer accuracy (four entries per sample)
q_acc = []  # question accuracy: correct on both images for a given question
i_acc = []  # image accuracy: correct on both questions for a given image
g_acc = []  # group accuracy: all four answers in the sample correct

for row in tqdm(dataset):
    # Yes/no questions get an explicit instruction so the model's reply can
    # match the benchmark's expected "yes"/"no" strings exactly.
    if row["Question_Type"] == "yes_no":
        suffix = " Answer yes or no."
    else:
        suffix = ""

    # Ask both questions about both images in one batch, ordered as
    # (Image_0, Q0), (Image_1, Q0), (Image_0, Q1), (Image_1, Q1).
    answers = moondream.batch_answer(
        images=[row["Image_0"], row["Image_1"], row["Image_0"], row["Image_1"]],
        prompts=[
            row["Question_0"] + suffix,
            row["Question_0"] + suffix,
            row["Question_1"] + suffix,
            row["Question_1"] + suffix,
        ],
        tokenizer=tokenizer,
    )
    expected = [
        row["Image_0_Question_0"],
        row["Image_1_Question_0"],
        row["Image_0_Question_1"],
        row["Image_1_Question_1"],
    ]

    # Exact-match scoring against the ground-truth answers.
    acc.append(answers[0] == expected[0])
    acc.append(answers[1] == expected[1])
    acc.append(answers[2] == expected[2])
    acc.append(answers[3] == expected[3])

    # Image accuracy: indices 0 and 2 share Image_0; 1 and 3 share Image_1.
    i_acc.append(answers[0] == expected[0] and answers[2] == expected[2])
    i_acc.append(answers[1] == expected[1] and answers[3] == expected[3])

    # Question accuracy: indices 0 and 1 share Q0; 2 and 3 share Q1.
    q_acc.append(answers[0] == expected[0] and answers[1] == expected[1])
    q_acc.append(answers[2] == expected[2] and answers[3] == expected[3])

    g_acc.append(
        answers[0] == expected[0]
        and answers[1] == expected[1]
        and answers[2] == expected[2]
        and answers[3] == expected[3]
    )

print("Overall Accuracy:", sum(acc) / len(acc))
print("Image Accuracy:", sum(i_acc) / len(i_acc))
print("Question Accuracy:", sum(q_acc) / len(q_acc))
print("Group Accuracy:", sum(g_acc) / len(g_acc))
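
# A minimal robustness sketch, not part of the eval above: scoring relies on
# exact string equality, so a reply like "Yes." would score as wrong against
# the expected "yes". If the model's phrasing drifts, a hypothetical helper
# like `normalize_answer` (not a NaturalBench or moondream API) could be
# applied to both `answers` and `expected` before comparison.
def normalize_answer(ans: str) -> str:
    # Lowercase and strip surrounding whitespace and a trailing period.
    return ans.strip().rstrip(".").lower()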