CLIPScore

The CLIPScore is a model-based image captioning metric that correlates well with human judgments.

A key benefit of CLIPScore is that it does not require reference captions for evaluation.
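Under the hood, the score is the cosine similarity between the CLIP embedding of the image and the CLIP embedding of the candidate caption, scaled to the range 0 to 100 (higher means a closer image-text match). A minimal sketch of a single call, where the random tensor is only a placeholder for a real image:

import torch
from torchmetrics.multimodal import CLIPScore

metric = CLIPScore(model_name_or_path="openai/clip-vit-base-patch16")
# A random uint8 tensor stands in for a real (C, H, W) image, so the score itself
# is meaningless here; the call only illustrates the interface.
image = torch.randint(0, 255, (3, 224, 224), dtype=torch.uint8)
print(float(metric(image, "a photo of a cat")))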

Here’s a hypothetical Python example demonstrating how to use the CLIPScore metric to evaluate image captions:

import matplotlib.animation as animation
import matplotlib.pyplot as plt
import numpy as np
import torch
from matplotlib.table import Table
from skimage.data import astronaut, cat, coffee
from torchmetrics.multimodal import CLIPScore

Get sample images

images = {
    "astronaut": astronaut(),
    "cat": cat(),
    "coffee": coffee(),
}
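Each of these skimage samples is a plain (H, W, 3) uint8 NumPy array; a quick, optional sanity check on their shapes and dtypes:

for name, img in images.items():
    print(name, img.shape, img.dtype)  # each is an (H, W, 3) uint8 array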

Define hypothetical captions for the images

captions = [
    "A photo of an astronaut.",
    "A photo of a cat.",
    "A photo of a cup of coffee.",
]

Define the models for CLIPScore

models = [
    "openai/clip-vit-base-patch16",
    # "openai/clip-vit-base-patch32",
    # "openai/clip-vit-large-patch14-336",
    "openai/clip-vit-large-patch14",
]

Collect scores for each image-caption pair

score_results = []
for model in models:
    clip_score = CLIPScore(model_name_or_path=model)
    for key, img in images.items():
        # Convert the (H, W, 3) uint8 NumPy image to a tensor for the metric
        img_tensor = torch.tensor(np.array(img))
        # Score every caption against the current image
        caption_scores = {caption: clip_score(img_tensor, caption) for caption in captions}
        score_results.append({"scores": caption_scores, "image": key, "model": model})
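Calling the metric functionally, as above, scores each image-caption pair independently. Since CLIPScore is a regular TorchMetrics metric, matched pairs can also be accumulated and averaged with update()/compute(); a rough sketch, assuming the metric accepts equally long lists of channel-first image tensors and captions:

avg_metric = CLIPScore(model_name_or_path="openai/clip-vit-base-patch16")
# Channel-first tensors, in the same order as the captions that describe them
img_tensors = [torch.tensor(np.array(img)).permute(2, 0, 1) for img in images.values()]
avg_metric.update(img_tensors, captions)
print(avg_metric.compute())  # mean CLIPScore over the three matched pairs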

Create an animation to display the scores

fig, (ax_img, ax_table) = plt.subplots(1, 2, figsize=(10, 5))


def update(num: int) -> tuple:
    """Update the image and table with the scores for the given model."""
    results = score_results[num]
    scores, image, model = results["scores"], results["image"], results["model"]

    fig.suptitle(f"Model: {model.split('/')[-1]}", fontsize=16, fontweight="bold")

    # Update image
    ax_img.imshow(images[image])
    ax_img.axis("off")

    # Update table
    table = Table(ax_table, bbox=[0, 0, 1, 1])
    header1 = table.add_cell(0, 0, text="Caption", width=3, height=1)
    header2 = table.add_cell(0, 1, text="Score", width=1, height=1)
    header1.get_text().set_weight("bold")
    header2.get_text().set_weight("bold")
    for i, (caption, score) in enumerate(scores.items()):
        table.add_cell(i + 1, 0, text=caption, width=3, height=1)
        table.add_cell(i + 1, 1, text=f"{score:.2f}", width=1, height=1)
    ax_table.add_table(table)
    ax_table.axis("off")
    return ax_img, ax_table


ani = animation.FuncAnimation(fig, update, frames=len(score_results), interval=3000)
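The gallery renders the animation automatically; in a standalone script it would still need to be displayed or exported, for example (the filename is only a placeholder):

ani.save("clip_score_models.gif", writer="pillow")  # or simply: plt.show()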
