metadata
license: mit
datasets:
- ZachW/GPT-BookSum
language:
- en
metrics:
- accuracy
base_model:
- FacebookAI/roberta-base
pipeline_tag: zero-shot-classification
tags:
- pacing
- concreteness
- text-evalutaion
Pacing-Judge
Overview
This is the concreteness evaluator developed in the paper Improving Pacing in Long-Form Story Planning (EMNLP 2023).
Quick Start
A simple usage: Input a pair of texts (text_ex_1, text_ex_2) with <sep> as the separator to the model. The output is whether the first or the second is more concrete.
import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification, AutoTokenizer
model_name = "ZachW/pacing-judge"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
text_ex_1 = "The Duke then focused on securing his power and looking to future threats. The Duke eventually turned his attention to acquiring Tuscany but struggled."
text_ex_2 = "Lord Bacon mentioned his book \"The History of Henry VII,\" in the conversation noting that King Charles had conquered Naples without resistance, implying that the conquest was like a dream."
inputs = tokenizer(text_ex_1 + " <sep> " + text_ex_2, return_tensors="pt")
outputs = model(**inputs)
output = int(F.softmax(outputs.logits, dim=1)[:, 0].squeeze(-1).detach().cpu().numpy() > 0.5)
print(f"Output Binary = {output}")
if output:
print("The second text is more concrete.")
else:
print("The first text is more concrete.")
Usage
We have designed this Ranker, which enables fair pairwise comparison (independent of sequence order) and ranking among candidates. We recommend using our model via the Ranker.
import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification, AutoTokenizer
class Ranker:
def __init__(self):
print(f"*** Loading Model from Huggingface ***")
model_name = "ZachW/pacing-judge"
self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
def compare(self, t1, t2):
text_pair = [t1 + ' <sep> ' + t2, t2 + ' <sep> ' + t1]
pair_dataset = self.tokenizer(text_pair, padding=True, truncation=True, return_tensors="pt")
score = self.run_model(pair_dataset)
if score < 0.5:
return 0 # first is more concrete
else:
return 1 # second is more concrete
def compare_logits(self, t1, t2):
text_pair = [t1 + ' <sep> ' + t2, t2 + ' <sep> ' + t1]
pair_dataset = self.tokenizer(text_pair, padding=True, truncation=True, return_tensors="pt")
score = self.run_model(pair_dataset)
return score
def run_model(self, dataset):
outputs = self.model(**dataset)
scores = F.softmax(outputs.logits, dim=1)[:, 0].squeeze(-1).detach().cpu().numpy()
aver_score = (scores[0] + (1 - scores[1]))/2
return aver_score
def rank(self, texts_list): # input a list of texts
def quicksort(arr):
if len(arr) <= 1:
return arr
else:
pivot = arr[0]
less = []
greater = []
for t in arr[1:]:
cmp = self.compare(pivot, t)
if cmp == 0:
less.append(t)
elif cmp == 1:
greater.append(t)
return quicksort(greater) + [pivot] + quicksort(less)
return quicksort(texts_list)
# most concrete -> lest concrete
def rank_idx(self, texts_list): # input a list of texts
def quicksort(arr):
if len(arr) <= 1:
return arr
else:
pivot = arr[0]
less = []
greater = []
for t in arr[1:]:
cmp = self.compare(texts_list[pivot], texts_list[t])
if cmp == 0:
less.append(t)
elif cmp == 1:
greater.append(t)
return quicksort(greater) + [pivot] + quicksort(less)
return quicksort(list(range(len(texts_list))))
def rank_idx_conpletely(self, texts_list):
n = len(texts_list)
texts_idx = list(range(n))
scores = [[0] * n for _ in range(n)]
self_score = [0] * n
for i in texts_idx:
scores[i][i] = self.compare_logits(texts_list[i], texts_list[i])
self_score[i] = scores[i][i]
for j in texts_idx:
if j < i:
scores[i][j] = 1 - scores[j][i]
continue
if j == i:
continue
scores[i][j] = self.compare_logits(texts_list[i], texts_list[j])
# average score is, smaller is more concrete
average_score = [ sum(s)/len(s) for s in scores]
output_score = [ a + 0.5 - s for a, s in zip(average_score, self_score)]
sorted_indices = sorted(range(len(output_score)), key=lambda x: output_score[x])
return sorted_indices
def rank_idx_conpletely_wlogits(self, texts_list, logger=None):
n = len(texts_list)
texts_idx = list(range(n))
scores = [[0] * n for _ in range(n)]
self_score = [0] * n
for i in texts_idx:
scores[i][i] = self.compare_logits(texts_list[i], texts_list[i])
self_score[i] = scores[i][i]
for j in texts_idx:
if j < i:
scores[i][j] = 1 - scores[j][i]
continue
if j == i:
continue
scores[i][j] = self.compare_logits(texts_list[i], texts_list[j])
# average score is, smaller is more concrete
average_score = [ sum(s)/len(s) for s in scores]
output_score = [ a + 0.5 - s for a, s in zip(average_score, self_score)]
sorted_indices = sorted(range(len(output_score)), key=lambda x: output_score[x])
return sorted_indices, output_score
def compare_w_neighbors(self, t, cand):
score = 0.0
for c in cand:
score += self.compare_logits(t, c)
score /= len(cand)
return score
text_ex_1 = "The Duke then focused on securing his power and looking to future threats. The Duke eventually turned his attention to acquiring Tuscany but struggled."
text_ex_2 = "Lord Bacon mentioned his book \"The History of Henry VII,\" in the conversation noting that King Charles had conquered Naples without resistance, implying that the conquest was like a dream."
ranker = Ranker()
output = ranker.compare(text_ex_1, text_ex_2) # it is equvilant to (text_ex_2, text_ex_1)
print(f"Output Binary = {output}")
if output:
print("The second text is more concrete.")
else:
print("The first text is more concrete.")
output_logits = ranker.compare_logits(text_ex_1, text_ex_2)
print(f"Output Logits = {output_logits:.4f}")
For more details on the evaluator usage (e.g., pacing planning and control in generation) and training process, please refer to our paper!