import os
import string

import datasets
import pandas as pd
import sacrebleu as scb
from packaging import version
from sacrebleu import CHRF

import evaluate


_CITATION = """\
@ARTICLE{2023arXiv230513252W,
       author = {{Weller}, Orion and {Marone}, Marc and {Weir}, Nathaniel and {Lawrie}, Dawn and {Khashabi}, Daniel and {Van Durme}, Benjamin},
        title = "{``According to ...'' Prompting Language Models Improves Quoting from Pre-Training Data}",
      journal = {arXiv e-prints},
     keywords = {Computer Science - Computation and Language, Computer Science - Artificial Intelligence},
         year = 2023,
        month = may,
          eid = {arXiv:2305.13252},
        pages = {arXiv:2305.13252},
          doi = {10.48550/arXiv.2305.13252},
archivePrefix = {arXiv},
       eprint = {2305.13252},
 primaryClass = {cs.CL},
       adsurl = {https://ui.adsabs.harvard.edu/abs/2023arXiv230513252W},
      adsnote = {Provided by the SAO/NASA Astrophysics Data System}
}
"""


_DESCRIPTION = """\
In order to understand whether models are able to ground to their pre-training
data, we first need to have a way of measuring this phenomenon. We adopt a
narrow definition of grounding (quoting from source material) while
acknowledging that grounding is a broad term.

To enable fast and efficient measurement of quoting from pre-training data for
many language model generations across large corpora, we build off of a Data
Portrait (Marone and Van Durme, 2023), which allows for fast membership queries
for each n-gram in the output. This approach enables us to perform a one-time
indexing of a large corpus (e.g. Wikipedia) and at inference time simply
compute a constant-time lookup operation (in milliseconds) for each n-gram in
the generation.
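
As a toy illustration of the indexing idea (this is not the actual Data
Portrait implementation, which uses a compact sketch data structure; the
variable names below are purely illustrative), a one-time pass can collect
every character n-gram of the corpus into a set, after which each query is a
constant-time membership test:

    index = {corpus[i:i + 25] for i in range(len(corpus) - 24)}          # one-time indexing
    grams = {generation[i:i + 25] for i in range(len(generation) - 24)}  # n-grams of one generation
    hits = sum(gram in index for gram in grams)                          # constant-time lookups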

We build a Data Portrait on the version of Wikipedia included in the Pile, as
this allows us to exactly test the pre-training data included in many models
like GPT-J, and it is similar to the training data used in T5. However, we note
that for some models evaluated in this paper (e.g. OpenAI models) there is no
public information about the Wikipedia version used in those models.

We use character-based n-grams as opposed to token-based n-grams, as different
models have different tokenization schemes; furthermore, character-based n-gram
metrics have widespread usage in fields such as machine translation with
metrics like chrF and chrF++ (Popović, 2015, 2017). We use 25-character grams
for the sketch (approximately five words), as we found this empirically gave
meaningful results (an n-gram that is neither too small nor too large). The
Data Portrait checks for exact matches and is sensitive to orthographic
variation (e.g. case, whitespace). Therefore we view this as a lower bound on
actual quoting performance.
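
For instance, with n = 25 the gram set of a text is a character-level sliding
window over it: the first two 25-character grams of the sentence
"The Eiffel Tower is in Paris." are "The Eiffel Tower is in Pa" and
"he Eiffel Tower is in Par".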

We define our new metric QUIP-Score as the character n-gram precision of the
generated output compared to the pre-training corpus. More formally, for
generation Y and text corpus C:

    QUIP(Y; C) = ( Σ_{gram_n ∈ Y} 1_C(gram_n) ) / |{gram_n ∈ Y}|,

where 1_C(·) is an indicator function: 1 if gram_n ∈ C, else 0. Thus, a score
of 0.5 would indicate that 50% of the generated text n-grams are found in the
pre-training corpus. We macro-average this quantity over a set of generations
to obtain a single performance number for a given test dataset.
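
For example, if a generation contains 8 character n-grams and 6 of them occur
verbatim in C, its QUIP-Score is 6/8 = 0.75.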
""" |
|
|
|
_KWARGS_DESCRIPTION = """
Produces QUIP scores for checking how much of a prediction is quoted from (grounded in) the references.
Args:
    predictions (list of str): The predicted sentences.
    references (list of list of str): The references. There should be one reference sub-list for each prediction sentence.
Returns:
    (float): The QUIP score, i.e. the fraction of prediction n-grams found in the references.
Examples:

    Example 1--a simple example of calculating QUIP:
        predictions = ["The current goodwill balance is $25,173 million as of December 31, 2022."]
        references = [[
            "Table 7.3: Goodwill (in millions) Consumer Banking and Lending Commercial Banking Corporate and Investment Banking Wealth and Investment Management Corporate Consolidated Company December 31, 2020 $ 16,418 3,018 5,375 1,276 305 26,392 Foreign currency translation — — — — — — Transfers of goodwill — (80) — (932) 1,012 — Divestitures — — — — (1,212) (1,212) December 31, 2021 $ 16,418 2,938 5,375 344 105 25,180 Foreign currency translation — (7) — — — (7) December 31, 2022 $ 16,418 2,931 5,375 344 105 25,173 Table 7.4 presents the components of other assets."]]
        results = quip.compute(predictions=predictions, references=references, return_match_fraction_by_pred_length=True)
        print(results)
        assert results == 0.5
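
    Here `quip` is an instance of this metric; it can be constructed directly as `Quip()` or
    loaded with `evaluate.load` pointing at this metric script.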
""" |
|
|
|
|
|
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Quip(evaluate.Metric):
    def __init__(self, **kwargs):
        # Words considered too common to carry quoting signal; they are stripped from
        # predictions and references before n-gram matching when `reduced=True`.
        self.set_common = set()
        use_ngsl = False  # alternative common-word list (NGSL lemmas), kept for reference
        if use_ngsl:
            common_words_file = "data/NGSL_1.2_stats.csv.zip"
            if os.path.isfile(common_words_file):
                df = pd.read_csv(common_words_file)
                self.set_common = set(df['Lemma'].values.tolist())
        else:
            # Top-1000 entries of the unigram frequency list.
            common_words_file = "data/count_1w.txt.zip"
            if os.path.isfile(common_words_file):
                df = pd.read_csv(common_words_file, names=["word", "freq"], header=None, sep='\t')
                df = df.head(1000)
                self.set_common = set(df['word'].values.tolist())
                # Drop single-letter entries except the actual words a/A/i/I.
                keep = {'i', 'I', 'A', 'a'}
                for k in string.ascii_lowercase:
                    if k in self.set_common and k not in keep:
                        self.set_common.remove(k)

        super().__init__(**kwargs)

    def _info(self):
        if version.parse(scb.__version__) < version.parse("1.4.12"):
            raise ImportWarning(
                "To use `quip`, the module `sacrebleu>=1.4.12` is required, and the current version of `sacrebleu` doesn't match this condition.\n"
                'You can install it with `pip install "sacrebleu>=1.4.12"`.'
            )
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            homepage="https://github.com/h2oai/h2ogpt",
            inputs_description=_KWARGS_DESCRIPTION,
            features=[
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
                    }
                ),
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Value("string", id="sequence"),
                    }
                ),
            ],
            codebase_urls=["https://github.com/h2oai/h2ogpt"],
            reference_urls=[
                "https://github.com/h2oai/h2ogpt",
            ],
        )

    def _compute(
            self,
            predictions=None,
            references=None,
            reduced=True,
            min_len=2,
            max_len=5,
            return_match_count=False,
            return_match_fraction_by_pred_length=False,
            **kwargs,
    ):
        # Allow a single reference string per prediction.
        if isinstance(references[0], str):
            references = [[ref] for ref in references]
        references_per_prediction = len(references[0])
        if any(len(refs) != references_per_prediction for refs in references):
            raise ValueError(
                "Quip requires the same number of references for each prediction"
            )

        if reduced:
            # Replace most punctuation with spaces (apostrophes are kept) and drop common
            # words so that matching focuses on content-bearing n-grams.
            punc = '!"#$%&()*+,-./:;<=>?@[\\]^_{|}~'

            for predi, pred in enumerate(predictions):
                pred = pred.translate(str.maketrans(punc, ' ' * len(punc))).strip()
                predictions[predi] = ' '.join([x for x in pred.split() if x not in self.set_common])

            for refi, refl in enumerate(references):
                for refj, ref in enumerate(refl):
                    ref = ref.translate(str.maketrans(punc, ' ' * len(punc))).strip()
                    references[refi][refj] = ' '.join([x for x in ref.split() if x not in self.set_common])

        # Local imports so these are only needed when the metric is actually computed.
        from nltk.util import everygrams
        from utils import flatten_list

        # All word n-grams of length min_len..max_len in the predictions and references.
        pred_ngrams = set(
            flatten_list([list(everygrams(x.split(), min_len=min_len, max_len=max_len)) for x in predictions]))
        ref_ngrams = set(flatten_list(
            [[list(everygrams(y.split(), min_len=min_len, max_len=max_len)) for y in z] for z in references]))
        # Prediction n-grams that do not occur in any reference.
        residual = pred_ngrams.difference(ref_ngrams)
        nmatches = len(pred_ngrams) - len(residual)
        if return_match_count:
            return nmatches
        if not pred_ngrams:
            # Nothing to match (e.g. empty or fully-reduced prediction).
            return 0.0
        if not return_match_fraction_by_pred_length:
            # QUIP score: fraction of prediction n-grams found in the references.
            return 1.0 - len(residual) / len(pred_ngrams)
        # Fraction of matched n-grams relative to the length (in words) of the first prediction.
        return min(1.0, nmatches / max(1, len(predictions[0].split())))

    def get_reduced_size(self, reduced_query, verbose=True):
        reduced_query_words = reduced_query.split(' ')
        set_common = self.set_common
        # Count how many of the query words are in the common-word set.
        num_common = sum(x.lower() in set_common for x in reduced_query_words)
        frac_common = num_common / len(reduced_query_words) if reduced_query else 0

        if verbose:
            print("frac_common: %s" % frac_common, flush=True)