CLIPScore

The CLIPScore is a model-based image captioning metric that correlates well with human judgments.

The benefit of CLIPScore is that it does not require reference captions for evaluation.

Here’s a hypothetical Python example demonstrating the usage of the CLIPScore metric to evaluate image captions:

 import matplotlib.animation as animation
 import matplotlib.pyplot as plt
 import numpy as np
 import torch
 from matplotlib.table import Table
 from skimage.data import astronaut, cat, coffee

 from torchmetrics.multimodal import CLIPScore

Get sample images

 # Load every bundled skimage sample once and key it by a short name.
 images = {
     name: load_sample()
     for name, load_sample in (("astronaut", astronaut), ("cat", cat), ("coffee", coffee))
 }

Define hypothetical captions for the images

 # Candidate captions; every caption is scored against every sample image.
 captions = [
     "A photo of an astronaut.",
     "A photo of a cat.",
     "A photo of a cup of coffee.",
 ]

Define the models for CLIPScore

 # Checkpoint identifiers passed to ``CLIPScore(model_name_or_path=...)``; one metric is built per entry.
 models = [
     "openai/clip-vit-base-patch16",
     # Disabled entry: uncomment to score a second checkpoint as well.
     # "zer0int/LongCLIP-L-Diffusers",
 ]

Collect scores for each image-caption pair

 # One entry per (model, image) pair: {"scores": {caption: float}, "image": name, "model": checkpoint}.
 score_results = []
 for model in models:
     # Build the metric once per checkpoint so its weights are loaded a single time.
     clip_score = CLIPScore(model_name_or_path=model)
     for key, img in images.items():
         # skimage samples are (H, W, C) arrays, while CLIPScore documents channel-first
         # (C, H, W) image tensors, so move the channel axis to the front explicitly
         # instead of relying on the image processor to guess the layout.
         img_tensor = torch.tensor(np.array(img)).permute(2, 0, 1)
         # Store plain Python floats so the results do not keep tensors alive and
         # format predictably in the score table.
         caption_scores = {caption: clip_score(img_tensor, caption).item() for caption in captions}
         score_results.append({"scores": caption_scores, "image": key, "model": model})

Create an animation to display the scores

 # One row, two panels: the sample image on the left, the caption/score table on the right.
 fig, (ax_img, ax_table) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))


 def update(num: int) -> tuple:
     """Redraw the figure for one entry of ``score_results``.

     Args:
         num: Frame index supplied by the animation; selects ``score_results[num]``.

     Returns:
         The image axes and the table axes, in that order.
     """
     results = score_results[num]
     scores, image, model = results["scores"], results["image"], results["model"]

     # Show only the final path component of the checkpoint identifier.
     fig.suptitle(f"Model: {model.split('/')[-1]}", fontsize=16, fontweight="bold")

     # Remove the previous frame's artists; without this every frame stacks another
     # image and another table on the axes as the animation loops.
     ax_img.clear()
     ax_table.clear()

     # Update image
     ax_img.imshow(images[image])
     ax_img.axis("off")

     # Update table: a bold header row followed by one row per caption.
     table = Table(ax_table, bbox=[0, 0, 1, 1])
     header1 = table.add_cell(0, 0, text="Caption", width=3, height=1)
     header2 = table.add_cell(0, 1, text="Score", width=1, height=1)
     header1.get_text().set_weight("bold")
     header2.get_text().set_weight("bold")
     # Row 0 holds the headers, so caption rows start at 1.
     for row, (caption, score) in enumerate(scores.items(), start=1):
         table.add_cell(row, 0, text=caption, width=3, height=1)
         table.add_cell(row, 1, text=f"{score:.2f}", width=1, height=1)
     ax_table.add_table(table)
     ax_table.axis("off")
     return ax_img, ax_table


 # Keep the animation bound to a name so it stays alive; one frame per scored
 # image/model pair, shown 3000 ms apart.
 ani = animation.FuncAnimation(
     fig=fig,
     func=update,
     frames=len(score_results),
     interval=3000,
 )

Total running time of the script: (0 minutes 11.708 seconds)

Gallery generated by Sphinx-Gallery