mauve / mauve.py
evaluate-bot's picture
Update Space (evaluate main: 18932858)
55bcb26
# coding=utf-8
# Copyright 2020 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" MAUVE metric from https://github.com/krishnap25/mauve. """
import datasets
import faiss # Here to have a nice missing dependency error message early on
import numpy # Here to have a nice missing dependency error message early on
import requests # Here to have a nice missing dependency error message early on
import sklearn # Here to have a nice missing dependency error message early on
import tqdm # Here to have a nice missing dependency error message early on
from mauve import compute_mauve # From: mauve-text
import evaluate
_CITATION = """\
@inproceedings{pillutla-etal:mauve:neurips2021,
title={{MAUVE: Measuring the Gap Between Neural Text and Human Text using Divergence Frontiers}},
author={Pillutla, Krishna and Swayamdipta, Swabha and Zellers, Rowan and Thickstun, John and Welleck, Sean and Choi, Yejin and Harchaoui, Zaid},
booktitle = {NeurIPS},
year = {2021}
}
@article{pillutla-etal:mauve:arxiv2022,
title={{MAUVE Scores for Generative Models: Theory and Practice}},
author={Pillutla, Krishna and Liu, Lang and Thickstun, John and Welleck, Sean and Swayamdipta, Swabha and Zellers, Rowan and Oh, Sewoong and Choi, Yejin and Harchaoui, Zaid},
journal={arXiv Preprint},
year={2022}
}
"""
_DESCRIPTION = """\
MAUVE is a measure of the statistical gap between two text distributions, e.g., how far the text written by a model is the distribution of human text, using samples from both distributions.
MAUVE is obtained by computing Kullback–Leibler (KL) divergences between the two distributions in a quantized embedding space of a large language model.
It can quantify differences in the quality of generated text based on the size of the model, the decoding algorithm, and the length of the generated text.
MAUVE was found to correlate the strongest with human evaluations over baseline metrics for open-ended text generation.
This metrics is a wrapper around the official implementation of MAUVE:
https://github.com/krishnap25/mauve
"""
_KWARGS_DESCRIPTION = """
Calculates MAUVE scores between two lists of generated text and reference text.
Args:
predictions: list of generated text to score. Each predictions
should be a string with tokens separated by spaces.
references: list of reference for each prediction. Each
reference should be a string with tokens separated by spaces.
Optional Args:
num_buckets: the size of the histogram to quantize P and Q. Options: 'auto' (default) or an integer
pca_max_data: the number data points to use for PCA dimensionality reduction prior to clustering. If -1, use all the data. Default -1
kmeans_explained_var: amount of variance of the data to keep in dimensionality reduction by PCA. Default 0.9
kmeans_num_redo: number of times to redo k-means clustering (the best objective is kept). Default 5
kmeans_max_iter: maximum number of k-means iterations. Default 500
featurize_model_name: name of the model from which features are obtained. Default 'gpt2-large' Use one of ['gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'].
device_id: Device for featurization. Supply a GPU id (e.g. 0 or 3) to use GPU. If no GPU with this id is found, use CPU
max_text_length: maximum number of tokens to consider. Default 1024
divergence_curve_discretization_size: Number of points to consider on the divergence curve. Default 25
mauve_scaling_factor: "c" from the paper. Default 5.
verbose: If True (default), print running time updates
seed: random seed to initialize k-means cluster assignments.
Returns:
mauve: MAUVE score, a number between 0 and 1. Larger values indicate that P and Q are closer,
frontier_integral: Frontier Integral, a number between 0 and 1. Smaller values indicate that P and Q are closer,
divergence_curve: a numpy.ndarray of shape (m, 2); plot it with matplotlib to view the divergence curve,
p_hist: a discrete distribution, which is a quantized version of the text distribution p_text,
q_hist: same as above, but with q_text.
Examples:
>>> # faiss segfaults in doctest for some reason, so the .compute call is not tested with doctest
>>> import evaluate
>>> mauve = evaluate.load('mauve')
>>> predictions = ["hello there", "general kenobi"]
>>> references = ["hello there", "general kenobi"]
>>> out = mauve.compute(predictions=predictions, references=references) # doctest: +SKIP
>>> print(out.mauve) # doctest: +SKIP
1.0
"""
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Mauve(evaluate.Metric):
def _info(self):
return evaluate.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
homepage="https://github.com/krishnap25/mauve",
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features(
{
"predictions": datasets.Value("string", id="sequence"),
"references": datasets.Value("string", id="sequence"),
}
),
codebase_urls=["https://github.com/krishnap25/mauve"],
reference_urls=[
"https://arxiv.org/abs/2102.01454",
"https://github.com/krishnap25/mauve",
],
)
def _compute(
self,
predictions,
references,
p_features=None,
q_features=None,
p_tokens=None,
q_tokens=None,
num_buckets="auto",
pca_max_data=-1,
kmeans_explained_var=0.9,
kmeans_num_redo=5,
kmeans_max_iter=500,
featurize_model_name="gpt2-large",
device_id=-1,
max_text_length=1024,
divergence_curve_discretization_size=25,
mauve_scaling_factor=5,
verbose=True,
seed=25,
):
out = compute_mauve(
p_text=predictions,
q_text=references,
p_features=p_features,
q_features=q_features,
p_tokens=p_tokens,
q_tokens=q_tokens,
num_buckets=num_buckets,
pca_max_data=pca_max_data,
kmeans_explained_var=kmeans_explained_var,
kmeans_num_redo=kmeans_num_redo,
kmeans_max_iter=kmeans_max_iter,
featurize_model_name=featurize_model_name,
device_id=device_id,
max_text_length=max_text_length,
divergence_curve_discretization_size=divergence_curve_discretization_size,
mauve_scaling_factor=mauve_scaling_factor,
verbose=verbose,
seed=seed,
)
return out