sango07 committed · verified
Commit a3e3740 · 1 Parent(s): 42b4378

Create evaluation_module.py

Files changed (1):
  1. evaluation_module.py +77 -0
evaluation_module.py ADDED
@@ -0,0 +1,77 @@
import torch
from sacrebleu import corpus_bleu
from rouge_score import rouge_scorer
from bert_score import score
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline
from nltk.util import ngrams


class RAGEvaluator:
    """Collects reference-based and reference-free metrics for RAG outputs."""

    def __init__(self):
        self.gpt2_model, self.gpt2_tokenizer = self.load_gpt2_model()
        self.bias_pipeline = pipeline("zero-shot-classification", model="Hate-speech-CNERG/dehatebert-mono-english")

    def load_gpt2_model(self):
        # GPT-2 serves as the language model for perplexity scoring.
        model = GPT2LMHeadModel.from_pretrained('gpt2')
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        return model, tokenizer

    def evaluate_bleu_rouge(self, candidates, references):
        # Corpus-level BLEU via sacrebleu; ROUGE-1 F1 averaged over candidate/reference pairs.
        bleu_score = corpus_bleu(candidates, [references]).score
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        rouge_scores = [scorer.score(ref, cand) for ref, cand in zip(references, candidates)]
        rouge1 = sum(s['rouge1'].fmeasure for s in rouge_scores) / len(rouge_scores)
        return bleu_score, rouge1

    def evaluate_bert_score(self, candidates, references):
        # BERTScore precision, recall, and F1, averaged over the batch.
        P, R, F1 = score(candidates, references, lang="en", model_type='bert-base-multilingual-cased')
        return P.mean().item(), R.mean().item(), F1.mean().item()

    def evaluate_perplexity(self, text):
        # Sliding-window perplexity under GPT-2: tokens outside the current
        # target window are masked with -100 so they do not contribute to the loss.
        encodings = self.gpt2_tokenizer(text, return_tensors='pt')
        max_length = self.gpt2_model.config.n_positions
        stride = 512
        lls = []
        for i in range(0, encodings.input_ids.size(1), stride):
            begin_loc = max(i + stride - max_length, 0)
            end_loc = min(i + stride, encodings.input_ids.size(1))
            trg_len = end_loc - i
            input_ids = encodings.input_ids[:, begin_loc:end_loc]
            target_ids = input_ids.clone()
            target_ids[:, :-trg_len] = -100
            with torch.no_grad():
                outputs = self.gpt2_model(input_ids, labels=target_ids)
                # outputs[0] is the mean cross-entropy over the target tokens;
                # multiply by trg_len to recover the window's total log-likelihood.
                log_likelihood = outputs[0] * trg_len
            lls.append(log_likelihood)
        ppl = torch.exp(torch.stack(lls).sum() / end_loc)
        return ppl.item()

    def evaluate_diversity(self, texts):
        # Distinct-bigram ratio: unique bigrams divided by total token count.
        all_tokens = [tok for text in texts for tok in text.split()]
        unique_bigrams = set(ngrams(all_tokens, 2))
        diversity_score = len(unique_bigrams) / len(all_tokens) if all_tokens else 0
        return diversity_score

    def evaluate_racial_bias(self, text):
        # Zero-shot classification against hate-speech labels; returns the
        # score assigned to the "hate speech" label.
        results = self.bias_pipeline([text], candidate_labels=["hate speech", "not hate speech"])
        bias_score = results[0]['scores'][results[0]['labels'].index('hate speech')]
        return bias_score

    def evaluate_all(self, response, reference):
        candidates = [response]
        references = [reference]
        bleu, rouge1 = self.evaluate_bleu_rouge(candidates, references)
        bert_p, bert_r, bert_f1 = self.evaluate_bert_score(candidates, references)
        perplexity = self.evaluate_perplexity(response)
        diversity = self.evaluate_diversity(candidates)
        racial_bias = self.evaluate_racial_bias(response)
        return {
            "BLEU": bleu,
            "ROUGE-1": rouge1,
            "BERT P": bert_p,
            "BERT R": bert_r,
            "BERT F1": bert_f1,
            "Perplexity": perplexity,
            "Diversity": diversity,
            "Racial Bias": racial_bias,
        }
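
A minimal usage sketch, not part of the commit: it assumes the file above is importable as evaluation_module and that torch, sacrebleu, rouge-score, bert-score, transformers, and nltk are installed; the response/reference strings are illustrative only.

from evaluation_module import RAGEvaluator

# Hypothetical inputs for illustration only.
response = "Paris is the capital of France."
reference = "The capital of France is Paris."

evaluator = RAGEvaluator()  # downloads GPT-2 and the bias model on first use
metrics = evaluator.evaluate_all(response, reference)
for name, value in metrics.items():
    print(f"{name}: {value:.4f}")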