"""ROUGE =============================== The ROUGE (Recall-Oriented Understudy for Gisting Evaluation) metric used to evaluate the quality of generated text compared to a reference text. It does so by computing the overlap between two texts, for which a subsequent precision and recall value can be computed. The ROUGE score is often used in the context of generative tasks such as text summarization and machine translation. A major difference with Perplexity comes from the fact that ROUGE evaluates actual text, whereas Perplexity evaluates logits. """ # %% # Here's a hypothetical Python example demonstrating the usage of unigram ROUGE F-score to evaluate a generative language model: from torchmetrics.text import ROUGEScore from transformers import AutoTokenizer, pipeline pipe = pipeline("text-generation", model="openai-community/gpt2") tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2") # %% # Define the prompt and target texts prompt = "The quick brown fox" target_text = "The quick brown fox jumps over the lazy dog." # %% # Generate a sample text using the GPT-2 model sample_text = pipe(prompt, max_length=20, do_sample=True, temperature=0.1, pad_token_id=tokenizer.eos_token_id)[0][ "generated_text" ] print(sample_text) # %% # Calculate the ROUGE of the generated text rouge = ROUGEScore() rouge(preds=[sample_text], target=[target_text]) # %% # By default, the ROUGE score is calculated using a whitespace tokenizer. You can also calculate the ROUGE for the tokens directly: token_rouge = ROUGEScore(tokenizer=lambda text: tokenizer.tokenize(text)) token_rouge(preds=[sample_text], target=[target_text]) # %% # Since ROUGE is a text-based metric, it can be used to benchmark decoding strategies. For example, you can compare temperature settings: import matplotlib.pyplot as plt # noqa: E402 temperatures = [x * 0.1 for x in range(1, 10)] # Generate temperature values from 0 to 1 with a step of 0.1 n_samples = 100 # Note that a real benchmark typically requires more data average_scores = [] for temperature in temperatures: sample_text = pipe( prompt, max_length=20, do_sample=True, temperature=temperature, pad_token_id=tokenizer.eos_token_id )[0]["generated_text"] scores = [rouge(preds=[sample_text], target=[target_text])["rouge1_fmeasure"] for _ in range(n_samples)] average_scores.append(sum(scores) / n_samples) # Plot the average ROUGE score for each temperature plt.plot(temperatures, average_scores) plt.xlabel("Generation temperature") plt.ylabel("Average unigram ROUGE F-Score") plt.title("ROUGE for varying temperature settings") plt.show()