{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "\n# CLIPScore\n\nThe CLIPScore is a model-based image captioning metric that correlates well with human judgments.\n\nThe benefit of CLIPScore is that it does not require reference captions for evaluation.\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here's a hypothetical Python example demonstrating the usage of the CLIPScore metric to evaluate image captions:\n\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import matplotlib.animation as animation\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport torch\nfrom matplotlib.table import Table\nfrom skimage.data import astronaut, cat, coffee\nfrom torchmetrics.multimodal import CLIPScore" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get sample images\n\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "images = {\n \"astronaut\": astronaut(),\n \"cat\": cat(),\n \"coffee\": coffee(),\n}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Define a hypothetical captions for the images\n\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "captions = [\n \"A photo of an astronaut.\",\n \"A photo of a cat.\",\n \"A photo of a cup of coffee.\",\n]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Define the models for CLIPScore\n\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "models = [\n \"openai/clip-vit-base-patch16\",\n # \"openai/clip-vit-base-patch32\",\n # \"openai/clip-vit-large-patch14-336\",\n \"openai/clip-vit-large-patch14\",\n]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Collect scores for each image-caption pair\n\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "score_results = []\nfor model in models:\n clip_score = CLIPScore(model_name_or_path=model)\n for key, img in images.items():\n img_tensor = torch.tensor(np.array(img))\n caption_scores = {caption: clip_score(img_tensor, caption) for caption in captions}\n score_results.append({\"scores\": caption_scores, \"image\": key, \"model\": model})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create an animation to display the scores\n\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "fig, (ax_img, ax_table) = plt.subplots(1, 2, figsize=(10, 5))\n\n\ndef update(num: int) -> tuple:\n \"\"\"Update the image and table with the scores for the given model.\"\"\"\n results = score_results[num]\n scores, image, model = results[\"scores\"], results[\"image\"], results[\"model\"]\n\n fig.suptitle(f\"Model: {model.split('/')[-1]}\", fontsize=16, fontweight=\"bold\")\n\n # Update image\n ax_img.imshow(images[image])\n ax_img.axis(\"off\")\n\n # Update table\n table = Table(ax_table, bbox=[0, 0, 1, 1])\n header1 = table.add_cell(0, 0, text=\"Caption\", width=3, height=1)\n header2 = table.add_cell(0, 1, text=\"Score\", width=1, height=1)\n header1.get_text().set_weight(\"bold\")\n header2.get_text().set_weight(\"bold\")\n for i, (caption, score) in enumerate(scores.items()):\n table.add_cell(i + 1, 0, text=caption, width=3, height=1)\n table.add_cell(i + 1, 1, text=f\"{score:.2f}\", width=1, height=1)\n 
ax_table.add_table(table)\n ax_table.axis(\"off\")\n return ax_img, ax_table\n\n\nani = animation.FuncAnimation(fig, update, frames=len(score_results), interval=3000)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.19" } }, "nbformat": 4, "nbformat_minor": 0 }