{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "\n# CLIPScore\n\nThe CLIPScore is a model-based image captioning metric that correlates well with human judgments.\n\nThe benefit of CLIPScore is that it does not require reference captions for evaluation.\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here's a hypothetical Python example demonstrating the usage of the CLIPScore metric to evaluate image captions:\n\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import matplotlib.animation as animation\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport torch\nfrom matplotlib.table import Table\nfrom skimage.data import astronaut, cat, coffee\nfrom torchmetrics.multimodal import CLIPScore" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get sample images\n\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "images = {\n \"astronaut\": astronaut(),\n \"cat\": cat(),\n \"coffee\": coffee(),\n}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Define a hypothetical captions for the images\n\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "captions = [\n \"A photo of an astronaut.\",\n \"A photo of a cat.\",\n \"A photo of a cup of coffee.\",\n]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Define the models for CLIPScore\n\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "models = [\n \"openai/clip-vit-base-patch16\",\n # \"openai/clip-vit-base-patch32\",\n # \"openai/clip-vit-large-patch14-336\",\n \"openai/clip-vit-large-patch14\",\n]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Collect scores for each image-caption pair\n\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "score_results = []\nfor model in models:\n clip_score = CLIPScore(model_name_or_path=model)\n for key, img in images.items():\n img_tensor = torch.tensor(np.array(img))\n caption_scores = {caption: clip_score(img_tensor, caption) for caption in captions}\n score_results.append({\"scores\": caption_scores, \"image\": key, \"model\": model})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create an animation to display the scores\n\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "fig, (ax_img, ax_table) = plt.subplots(1, 2, figsize=(10, 5))\n\n\ndef update(num: int) -> tuple:\n \"\"\"Update the image and table with the scores for the given model.\"\"\"\n results = score_results[num]\n scores, image, model = results[\"scores\"], results[\"image\"], results[\"model\"]\n\n fig.suptitle(f\"Model: {model.split('/')[-1]}\", fontsize=16, fontweight=\"bold\")\n\n # Update image\n ax_img.imshow(images[image])\n ax_img.axis(\"off\")\n\n # Update table\n table = Table(ax_table, bbox=[0, 0, 1, 1])\n header1 = table.add_cell(0, 0, text=\"Caption\", width=3, height=1)\n header2 = table.add_cell(0, 1, text=\"Score\", width=1, height=1)\n header1.get_text().set_weight(\"bold\")\n header2.get_text().set_weight(\"bold\")\n for i, (caption, score) in enumerate(scores.items()):\n table.add_cell(i + 1, 0, text=caption, width=3, height=1)\n table.add_cell(i + 1, 1, text=f\"{score:.2f}\", width=1, height=1)\n 
ax_table.add_table(table)\n ax_table.axis(\"off\")\n return ax_img, ax_table\n\n\nani = animation.FuncAnimation(fig, update, frames=len(score_results), interval=3000)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.19" } }, "nbformat": 4, "nbformat_minor": 0 }