CLIPScore

The CLIPScore is a model-based image captioning metric that correlates well with human judgments.

The benefit of CLIPScore is that it does not require reference captions for evaluation.

Here’s a hypothetical Python example demonstrating the usage of the CLIPScore metric to evaluate image captions:

12 import warnings
13
14 import matplotlib.animation as animation
15 import matplotlib.pyplot as plt
16 import numpy as np
17 import torch
18 from matplotlib.table import Table
19 from skimage.data import astronaut, cat, coffee
20
21 from torchmetrics.multimodal import CLIPScore

Get sample images

# Load the three demo photographs bundled with scikit-image,
# keyed by a short human-readable label.
images = {
    label: load()
    for label, load in (("astronaut", astronaut), ("cat", cat), ("coffee", coffee))
}

Define hypothetical captions for the images

# Hypothetical candidate captions, one per sample image, in the same
# order as the images above (astronaut, cat, coffee).
captions = [
    "A photo of an astronaut.",
    "A photo of a cat.",
    "A photo of a cup of coffee.",
]

Define the models for CLIPScore

# Hugging Face checkpoint identifiers of the CLIP models to evaluate.
# Additional entries can be uncommented to compare more checkpoints.
models = [
    "openai/clip-vit-base-patch16",
    # "zer0int/LongCLIP-L-Diffusers",
]

Collect scores for each image-caption pair

# Accumulates one record per (model, image) pair:
# {"scores": {caption: CLIPScore tensor}, "image": label, "model": checkpoint}.
score_results = []


def process_model(model):
    """Score every image/caption pair with one CLIP model and record the results.

    Args:
        model: The name or path of the CLIP model to use for evaluation

    This function handles exceptions if the model fails to load or process,
    allowing the program to continue with other models.

    """
    try:
        metric = CLIPScore(model_name_or_path=model)
        for label, image_data in images.items():
            pixels = torch.tensor(np.array(image_data))
            # Evaluate every candidate caption against this single image.
            per_caption = {}
            for caption in captions:
                per_caption[caption] = metric(pixels, caption)
            score_results.append({"scores": per_caption, "image": label, "model": model})
    except Exception as e:
        # Best-effort: a model that cannot be downloaded/loaded is skipped
        # with a warning instead of aborting the whole demo.
        warnings.warn(f"Error loading model {model} - skipping this test. Error details: {e}", stacklevel=2)


for model in models:
    process_model(model)

Create an animation to display the scores

 81 fig, (ax_img, ax_table) = plt.subplots(1, 2, figsize=(10, 5))
 82
 83
 84 def update(num: int) -> tuple:
 85     """Update the image and table with the scores for the given model."""
 86     results = score_results[num]
 87     scores, image, model = results["scores"], results["image"], results["model"]
 88
 89     fig.suptitle(f"Model: {model.split('/')[-1]}", fontsize=16, fontweight="bold")
 90
 91     # Update image
 92     ax_img.imshow(images[image])
 93     ax_img.axis("off")
 94
 95     # Update table
 96     table = Table(ax_table, bbox=[0, 0, 1, 1])
 97     header1 = table.add_cell(0, 0, text="Caption", width=3, height=1)
 98     header2 = table.add_cell(0, 1, text="Score", width=1, height=1)
 99     header1.get_text().set_weight("bold")
100     header2.get_text().set_weight("bold")
101     for i, (caption, score) in enumerate(scores.items()):
102         table.add_cell(i + 1, 0, text=caption, width=3, height=1)
103         table.add_cell(i + 1, 1, text=f"{score:.2f}", width=1, height=1)
104     ax_table.add_table(table)
105     ax_table.axis("off")
106     return ax_img, ax_table
107
108
109 ani = animation.FuncAnimation(fig, update, frames=len(score_results), interval=3000)

Total running time of the script: (0 minutes 9.709 seconds)

Gallery generated by Sphinx-Gallery