"""
CLIPScore
===============================

The CLIPScore is a model-based image captioning metric that correlates well with human judgments.

The benefit of CLIPScore is that it does not require reference captions for evaluation.
"""

# %%
# The following example demonstrates how to use the CLIPScore metric to evaluate image captions:
import matplotlib.animation as animation
import matplotlib.pyplot as plt
import numpy as np
import torch
from matplotlib.table import Table
from skimage.data import astronaut, cat, coffee
from torchmetrics.multimodal import CLIPScore

# %%
# Get sample images

# Load each skimage sample picture once and key it by a short, human-readable name.
images = {
    name: load()
    for name, load in (
        ("astronaut", astronaut),
        ("cat", cat),
        ("coffee", coffee),
    )
}

# %%
# Define hypothetical captions for the images

# All captions share the same "A photo of ..." template; only the subject differs.
captions = [
    f"A photo of {subject}."
    for subject in ("an astronaut", "a cat", "a cup of coffee")
]

# %%
# Define the models for CLIPScore

# Checkpoints to compare, all from the ``openai`` namespace. The commented-out
# entries are further checkpoints that can be enabled to extend the comparison.
models = [
    f"openai/{checkpoint}"
    for checkpoint in (
        "clip-vit-base-patch16",
        # "clip-vit-base-patch32",
        # "clip-vit-large-patch14-336",
        "clip-vit-large-patch14",
    )
]

# %%
# Collect scores for each image-caption pair

# Convert every image once, up front, instead of once per model. CLIPScore documents
# its image input as channels-first ``(C, H, W)``, whereas the skimage samples are
# channels-last ``(H, W, C)`` arrays, hence the ``permute``.
image_tensors = {key: torch.tensor(np.array(img)).permute(2, 0, 1) for key, img in images.items()}

# One entry per (model, image) pair, holding the score of every caption for that image.
score_results = []
for model in models:
    clip_score = CLIPScore(model_name_or_path=model)
    for key, img_tensor in image_tensors.items():
        # Calling the metric returns the score for just this image-caption pair.
        # Store it as a plain float: the returned tensor may still be attached to the
        # autograd graph, and keeping it around would keep that graph alive as well.
        caption_scores = {caption: clip_score(img_tensor, caption).item() for caption in captions}
        score_results.append({"scores": caption_scores, "image": key, "model": model})

# %%
# Create an animation to display the scores

# A single row with two panels: the first axes shows the image, the second the score table.
fig, (ax_img, ax_table) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))


def update(num: int) -> tuple:
    """Redraw the figure for one entry of ``score_results``.

    The figure title is set to the model name, the left axes shows the image and the
    right axes shows a two-column table with every caption and its score.

    Args:
        num: Frame index supplied by the animation; used to index ``score_results``.

    Returns:
        The image axes and the table axes, in that order.

    """
    results = score_results[num]
    scores, image, model = results["scores"], results["image"], results["model"]

    # Only the part after the namespace (e.g. ``clip-vit-base-patch16``) is shown.
    fig.suptitle(f"Model: {model.split('/')[-1]}", fontsize=16, fontweight="bold")

    # Drop whatever the previous frame drew; otherwise every frame would stack another
    # image and another table on top of the existing ones.
    ax_img.clear()
    ax_table.clear()

    # Update image
    ax_img.imshow(images[image])
    ax_img.axis("off")

    # Update table: a bold header row followed by one row per caption
    table = Table(ax_table, bbox=[0, 0, 1, 1])
    header1 = table.add_cell(0, 0, text="Caption", width=3, height=1)
    header2 = table.add_cell(0, 1, text="Score", width=1, height=1)
    header1.get_text().set_weight("bold")
    header2.get_text().set_weight("bold")
    for row, (caption, score) in enumerate(scores.items(), start=1):
        table.add_cell(row, 0, text=caption, width=3, height=1)
        table.add_cell(row, 1, text=f"{score:.2f}", width=1, height=1)
    ax_table.add_table(table)
    ax_table.axis("off")
    return ax_img, ax_table


# One frame per collected result; ``interval`` is the delay between frames in milliseconds.
n_frames = len(score_results)
ani = animation.FuncAnimation(fig=fig, func=update, frames=n_frames, interval=3000)