DereAbdulhameed committed on
Commit
19252de
·
verified ·
1 Parent(s): c8744aa

Upload 2 files

Browse files
Files changed (2) hide show
  1. evaluation_module.py +229 -0
  2. memory.py +147 -0
evaluation_module.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''import torch
2
+ from sacrebleu import corpus_bleu
3
+ from rouge_score import rouge_scorer
4
+ from bert_score import score
5
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline
6
+ from transformers import AutoModelForSequenceClassification
7
+ import nltk
8
+ from nltk.util import ngrams
9
+ from nltk.tokenize import word_tokenize
10
+ from nltk.translate.meteor_score import meteor_score
11
+ from nltk.translate.chrf_score import sentence_chrf
12
+ from textstat import flesch_reading_ease, flesch_kincaid_grade
13
+ from sklearn.metrics.pairwise import cosine_similarity
14
+
15
+ class RAGEvaluator:
16
+ def __init__(self):
17
+ self.gpt2_model, self.gpt2_tokenizer = self.load_gpt2_model()
18
+ self.bias_pipeline = pipeline("zero-shot-classification", model="Hate-speech-CNERG/dehatebert-mono-english")
19
+
20
+ def load_gpt2_model(self):
21
+ model = GPT2LMHeadModel.from_pretrained('gpt2')
22
+ tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
23
+ return model, tokenizer
24
+
25
+ def evaluate_bleu_rouge(self, candidates, references):
26
+ bleu_score = corpus_bleu(candidates, [references]).score
27
+ scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
28
+ rouge_scores = [scorer.score(ref, cand) for ref, cand in zip(references, candidates)]
29
+ rouge1 = sum([score['rouge1'].fmeasure for score in rouge_scores]) / len(rouge_scores)
30
+ return bleu_score, rouge1
31
+
32
+ def evaluate_bert_score(self, candidates, references):
33
+ P, R, F1 = score(candidates, references, lang="en", model_type='bert-base-multilingual-cased')
34
+ return P.mean().item(), R.mean().item(), F1.mean().item()
35
+
36
+ def evaluate_perplexity(self, text):
37
+ encodings = self.gpt2_tokenizer(text, return_tensors='pt')
38
+ max_length = self.gpt2_model.config.n_positions
39
+ stride = 512
40
+ lls = []
41
+ for i in range(0, encodings.input_ids.size(1), stride):
42
+ begin_loc = max(i + stride - max_length, 0)
43
+ end_loc = min(i + stride, encodings.input_ids.size(1))
44
+ trg_len = end_loc - i
45
+ input_ids = encodings.input_ids[:, begin_loc:end_loc]
46
+ target_ids = input_ids.clone()
47
+ target_ids[:, :-trg_len] = -100
48
+ with torch.no_grad():
49
+ outputs = self.gpt2_model(input_ids, labels=target_ids)
50
+ log_likelihood = outputs[0] * trg_len
51
+ lls.append(log_likelihood)
52
+ ppl = torch.exp(torch.stack(lls).sum() / end_loc)
53
+ return ppl.item()
54
+
55
+ def evaluate_diversity(self, texts):
56
+ all_tokens = [tok for text in texts for tok in text.split()]
57
+ unique_bigrams = set(ngrams(all_tokens, 2))
58
+ diversity_score = len(unique_bigrams) / len(all_tokens) if all_tokens else 0
59
+ return diversity_score
60
+
61
+ def evaluate_racial_bias(self, text):
62
+ results = self.bias_pipeline([text], candidate_labels=["hate speech", "not hate speech"])
63
+ bias_score = results[0]['scores'][results[0]['labels'].index('hate speech')]
64
+ return bias_score
65
+
66
+ def evaluate_meteor(self, candidates, references):
67
+ nltk.download('punkt', quiet=True)
68
+
69
+ meteor_scores = [
70
+ meteor_score([word_tokenize(ref)], word_tokenize(cand))
71
+ for ref, cand in zip(references, candidates)
72
+ ]
73
+ return sum(meteor_scores) / len(meteor_scores)
74
+
75
+ def evaluate_chrf(self, candidates, references):
76
+ chrf_scores = [sentence_chrf(ref, cand) for ref, cand in zip(references, candidates)]
77
+ return sum(chrf_scores) / len(chrf_scores)
78
+
79
+ def evaluate_readability(self, text):
80
+ flesch_ease = flesch_reading_ease(text)
81
+ flesch_grade = flesch_kincaid_grade(text)
82
+ return flesch_ease, flesch_grade
83
+
84
+ def evaluate_all(self, response, reference):
85
+ candidates = [response]
86
+ references = [reference]
87
+ bleu, rouge1 = self.evaluate_bleu_rouge(candidates, references)
88
+ bert_p, bert_r, bert_f1 = self.evaluate_bert_score(candidates, references)
89
+ perplexity = self.evaluate_perplexity(response)
90
+ diversity = self.evaluate_diversity(candidates)
91
+ racial_bias = self.evaluate_racial_bias(response)
92
+ meteor = self.evaluate_meteor(candidates, references)
93
+ chrf = self.evaluate_chrf(candidates, references)
94
+ flesch_ease, flesch_grade = self.evaluate_readability(response)
95
+ return {
96
+ "BLEU": bleu,
97
+ "ROUGE-1": rouge1,
98
+ "BERT P": bert_p,
99
+ "BERT R": bert_r,
100
+ "BERT F1": bert_f1,
101
+ "Perplexity": perplexity,
102
+ "Diversity": diversity,
103
+ "Racial Bias": racial_bias,
104
+ "METEOR": meteor,
105
+ "CHRF": chrf,
106
+ "Flesch Reading Ease": flesch_ease,
107
+ "Flesch-Kincaid Grade": flesch_grade,
108
+ }'''
109
+
110
+
111
+ import torch
112
+ from sacrebleu import corpus_bleu
113
+ from rouge_score import rouge_scorer
114
+ from bert_score import score
115
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline, AutoModelForSequenceClassification, AutoTokenizer
116
+ import nltk
117
+ from nltk.util import ngrams
118
+ from nltk.tokenize import word_tokenize
119
+ from nltk.translate.meteor_score import meteor_score
120
+ from nltk.translate.chrf_score import sentence_chrf
121
+ from textstat import flesch_reading_ease, flesch_kincaid_grade
122
+ from sklearn.metrics.pairwise import cosine_similarity
123
+
124
class RAGEvaluator:
    """Score a generated answer against a reference text with several automatic metrics.

    Construction loads GPT-2 (used for perplexity) and the
    ``Hate-speech-CNERG/dehatebert-mono-english`` classifier (used for the bias
    score), so instances are expensive to create and should be reused.
    """

    _BIAS_MODEL_NAME = 'Hate-speech-CNERG/dehatebert-mono-english'

    # NLTK resources needed by word_tokenize and meteor_score. Names unknown to
    # the installed NLTK version are reported by nltk.download() without raising.
    _NLTK_RESOURCES = ('punkt', 'punkt_tab', 'wordnet', 'omw-1.4')
    _nltk_ready = False

    def __init__(self):
        self.gpt2_model, self.gpt2_tokenizer = self.load_gpt2_model()
        self.bias_pipeline = self.load_bias_model()

    @staticmethod
    def _mean(values):
        """Return the arithmetic mean of ``values``, or 0.0 when it is empty."""
        values = list(values)
        return sum(values) / len(values) if values else 0.0

    @classmethod
    def _ensure_nltk_data(cls):
        """Fetch the NLTK resources used by METEOR once per process."""
        if cls._nltk_ready:
            return
        for resource in cls._NLTK_RESOURCES:
            nltk.download(resource, quiet=True)
        cls._nltk_ready = True

    def load_gpt2_model(self):
        """Load the pretrained GPT-2 language model and its tokenizer.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        model = GPT2LMHeadModel.from_pretrained('gpt2')
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        return model, tokenizer

    def load_bias_model(self):
        """Build the hate-speech classification pipeline.

        The checkpoint is a plain sequence classifier, not an NLI model, so it
        is served through the ``text-classification`` task. The zero-shot task
        expects an NLI model with an "entailment" label and cannot produce a
        meaningful score from this checkpoint.

        Returns:
            A transformers text-classification pipeline.
        """
        model = AutoModelForSequenceClassification.from_pretrained(self._BIAS_MODEL_NAME)
        tokenizer = AutoTokenizer.from_pretrained(self._BIAS_MODEL_NAME)

        # Pin readable label names so evaluate_racial_bias can look up the hate
        # class by name. The index order (0 = not hate, 1 = hate) is the one the
        # previous label2id mapping assumed.
        # NOTE(review): confirm this order against the model card.
        model.config.id2label = {0: 'NON_HATE', 1: 'HATE'}
        model.config.label2id = {'NON_HATE': 0, 'HATE': 1}

        return pipeline("text-classification", model=model, tokenizer=tokenizer)

    def evaluate_bleu_rouge(self, candidates, references):
        """Compute corpus BLEU and mean ROUGE-1 F-measure.

        Args:
            candidates: List of generated strings.
            references: List of reference strings, aligned with ``candidates``.

        Returns:
            ``(bleu_score, rouge1)``; ``(0.0, 0.0)`` when ``candidates`` is empty.
        """
        if not candidates:
            return 0.0, 0.0
        bleu_score = corpus_bleu(candidates, [references]).score
        # Only ROUGE-1 is reported, so only ROUGE-1 is computed.
        scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
        rouge1 = self._mean(
            scorer.score(ref, cand)['rouge1'].fmeasure
            for ref, cand in zip(references, candidates)
        )
        return bleu_score, rouge1

    def evaluate_bert_score(self, candidates, references):
        """Compute mean BERTScore precision, recall and F1.

        Returns:
            ``(precision, recall, f1)`` as Python floats.
        """
        P, R, F1 = score(candidates, references, lang="en", model_type='bert-base-multilingual-cased')
        return P.mean().item(), R.mean().item(), F1.mean().item()

    def evaluate_perplexity(self, text):
        """Compute GPT-2 perplexity of ``text`` using overlapping windows.

        Args:
            text: The string to score.

        Returns:
            The perplexity as a float, or ``nan`` when ``text`` tokenizes to
            fewer than two tokens (no token has a preceding context to be
            predicted from, so perplexity is undefined).
        """
        encodings = self.gpt2_tokenizer(text, return_tensors='pt')
        all_ids = encodings.input_ids
        seq_len = all_ids.size(1)
        if seq_len < 2:
            return float('nan')

        max_length = self.gpt2_model.config.n_positions
        stride = 512
        nll_sum = 0.0
        scored_total = 0
        for i in range(0, seq_len, stride):
            begin_loc = max(i + stride - max_length, 0)
            end_loc = min(i + stride, seq_len)
            trg_len = end_loc - i
            input_ids = all_ids[:, begin_loc:end_loc]
            target_ids = input_ids.clone()
            # Tokens before the last trg_len positions only provide context.
            target_ids[:, :-trg_len] = -100
            # The model shifts labels by one, so the first position of a window
            # is never scored; count what actually contributes to the loss.
            scored = int((target_ids[:, 1:] != -100).sum())
            if scored == 0:
                continue
            with torch.no_grad():
                outputs = self.gpt2_model(input_ids, labels=target_ids)
            # outputs[0] is the mean loss over the scored tokens of this window.
            nll_sum += float(outputs[0]) * scored
            scored_total += scored

        return torch.exp(torch.tensor(nll_sum / scored_total)).item()

    def evaluate_diversity(self, texts):
        """Return the number of distinct bigrams divided by the number of tokens.

        Tokens are whitespace-separated and pooled across all ``texts`` before
        bigrams are formed. Returns 0 when there are no tokens.
        """
        all_tokens = [tok for text in texts for tok in text.split()]
        if not all_tokens:
            return 0
        unique_bigrams = set(zip(all_tokens, all_tokens[1:]))
        return len(unique_bigrams) / len(all_tokens)

    def evaluate_racial_bias(self, text):
        """Return the classifier's probability that ``text`` is hate speech.

        Returns:
            A float in [0, 1]; 0.0 if the classifier reports no ``HATE`` label.
        """
        # top_k=None asks for the score of every label; truncation keeps long
        # responses within the model's maximum input length.
        predictions = self.bias_pipeline([text], top_k=None, truncation=True)[0]
        for entry in predictions:
            if entry['label'] == 'HATE':
                return entry['score']
        return 0.0

    def evaluate_meteor(self, candidates, references):
        """Compute the mean METEOR score over aligned candidate/reference pairs.

        Returns 0.0 when there are no pairs.
        """
        self._ensure_nltk_data()
        return self._mean(
            meteor_score([word_tokenize(ref)], word_tokenize(cand))
            for ref, cand in zip(references, candidates)
        )

    def evaluate_chrf(self, candidates, references):
        """Compute the mean sentence-level chrF score; 0.0 when there are no pairs."""
        return self._mean(
            sentence_chrf(ref, cand) for ref, cand in zip(references, candidates)
        )

    def evaluate_readability(self, text):
        """Return ``(flesch_reading_ease, flesch_kincaid_grade)`` for ``text``."""
        flesch_ease = flesch_reading_ease(text)
        flesch_grade = flesch_kincaid_grade(text)
        return flesch_ease, flesch_grade

    def evaluate_all(self, response, reference):
        """Run every metric for a single response/reference pair.

        Args:
            response: The generated answer.
            reference: The text the answer is compared against.

        Returns:
            A dict mapping metric display names to their values.
        """
        candidates = [response]
        references = [reference]
        bleu, rouge1 = self.evaluate_bleu_rouge(candidates, references)
        bert_p, bert_r, bert_f1 = self.evaluate_bert_score(candidates, references)
        perplexity = self.evaluate_perplexity(response)
        diversity = self.evaluate_diversity(candidates)
        racial_bias = self.evaluate_racial_bias(response)
        meteor = self.evaluate_meteor(candidates, references)
        chrf = self.evaluate_chrf(candidates, references)
        flesch_ease, flesch_grade = self.evaluate_readability(response)
        return {
            "BLEU": bleu,
            "ROUGE-1": rouge1,
            "BERT P": bert_p,
            "BERT R": bert_r,
            "BERT F1": bert_f1,
            "Perplexity": perplexity,
            "Diversity": diversity,
            "Racial Bias": racial_bias,
            "METEOR": meteor,
            "CHRF": chrf,
            "Flesch Reading Ease": flesch_ease,
            "Flesch-Kincaid Grade": flesch_grade,
        }
229
+
memory.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import openai
3
+ from openai import OpenAI
4
+ from brain import get_index_for_documents
5
+ from langchain.chains import RetrievalQA
6
+ from langchain_community.chat_models import ChatOpenAI
7
+ from langchain_community.embeddings import OpenAIEmbeddings
8
+ from langchain_community.vectorstores import FAISS
9
+ from dotenv import load_dotenv
10
+ import os
11
+ from evaluation_module import RAGEvaluator
12
+
13
# Set the title for the Streamlit app
st.title("DocuChat with Evaluation")

# Load variables from .env BEFORE creating the OpenAI client: OpenAI() reads
# OPENAI_API_KEY from the environment when it is constructed, so a key that
# lives only in .env must already be loaded at that point.
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI()


@st.cache_resource
def load_evaluator():
    """Create the RAGEvaluator once and reuse it across Streamlit reruns.

    Streamlit re-executes this script on every interaction; without caching,
    the evaluator's models would be reloaded each time.
    """
    return RAGEvaluator()


# Initialize evaluator
evaluator = load_evaluator()
23
+
24
# Function to create vector database from different file types
@st.cache_resource
def create_vectordb(files, filenames, raw_texts):
    """Build the vector index from uploaded files and pasted text.

    Args:
        files: Uploaded file objects (each exposing ``getvalue()``, ``type``).
        filenames: Names of ``files``, in the same order.
        raw_texts: Free-form text; each non-blank line becomes one text entry.

    Returns:
        Whatever ``get_index_for_documents`` returns for the collected inputs.
    """
    files = files or []
    filenames = filenames or []

    pdf_contents = []
    pdf_names = []
    text_entries = [line for line in (raw_texts or "").splitlines() if line.strip()]

    if len(filenames) == len(files):
        for uploaded, name in zip(files, filenames):
            if uploaded.type == "application/pdf":
                # Keep each PDF's name paired with its bytes so a TXT upload
                # earlier in the list cannot shift the names out of alignment.
                pdf_contents.append(uploaded.getvalue())
                pdf_names.append(name)
            else:
                # TXT uploads were previously dropped; feed their non-blank
                # lines through the same path as pasted raw text.
                # NOTE(review): assumes get_index_for_documents indexes each
                # entry of its third argument as plain text; confirm in brain.py.
                decoded = uploaded.getvalue().decode("utf-8", errors="replace")
                text_entries.extend(line for line in decoded.splitlines() if line.strip())
    else:
        # Names cannot be paired with files; fall back to the original behavior.
        pdf_contents = [uploaded.getvalue() for uploaded in files if uploaded.type == "application/pdf"]
        pdf_names = filenames

    with st.spinner("Creating vector database..."):
        vectordb = get_index_for_documents(
            pdf_contents,
            pdf_names,
            text_entries,
            openai.api_key
        )
    return vectordb
35
+
36
# File uploader widget: several PDF/TXT documents may be supplied at once
uploaded_files = st.file_uploader(
    "Upload your documents (PDF or TXT)",
    type=["pdf", "txt"],
    accept_multiple_files=True,
    label_visibility="hidden",
)

# Pasted text can be given instead of, or alongside, uploaded files
raw_text = st.text_area("Or enter your raw text here:", height=150)

# As soon as any input exists, build the vector store and keep it in the
# session state so later reruns can query it
if uploaded_files or raw_text:
    file_names = []
    if uploaded_files:
        file_names = [uploaded.name for uploaded in uploaded_files]
    st.session_state["vectordb"] = create_vectordb(uploaded_files, file_names, raw_text)
47
# System-prompt template for the chatbot. The {doc_extract} placeholder is
# filled via str.format with the text of the retrieved document chunks before
# each request.
prompt_template = """
You are a helpful Assistant who answers to users questions based on multiple contexts given to you.

Keep your answer short and to the point.

The evidence is the context of the document extract with metadata.

Carefully focus on the metadata, especially 'filename' and 'page' whenever answering.

Make sure to add filename and page number at the end of the sentence you are citing to.

Also be able to give a summary based on the document extract given to you, but do not hallucinate.

Reply "Not applicable" if text is irrelevant.

The document content is:
{doc_extract}
"""
66
+
67
# Get the current prompt from the session state or set a default value
prompt = st.session_state.get("prompt", [{"role": "system", "content": "none"}])

# Display previous chat messages
for message in prompt:
    if message["role"] != "system":
        with st.chat_message(message["role"]):
            st.write(message["content"])

# Get the user's question using Streamlit's chat input
question = st.chat_input("Ask anything")

# Handle the user's question
if question:
    vectordb = st.session_state.get("vectordb", None)
    if not vectordb:
        with st.chat_message("assistant"):
            st.write("You need to provide a PDF, TXT file, or raw text.")
            st.stop()

    # Search the vectordb for similar content to the user's question
    search_results = vectordb.similarity_search(question, k=3)
    doc_extract = "\n".join([result.page_content for result in search_results])

    # Update the prompt with the document extract
    prompt[0] = {
        "role": "system",
        "content": prompt_template.format(doc_extract=doc_extract),
    }

    # Add the user's question to the prompt and display it
    prompt.append({"role": "user", "content": question})
    with st.chat_message("user"):
        st.write(question)

    # Display an empty assistant message while waiting for the response
    with st.chat_message("assistant"):
        botmsg = st.empty()

    # Call ChatGPT with streaming and display the response as it comes
    response = []
    result = ""
    for chunk in client.chat.completions.create(
        model="gpt-3.5-turbo", messages=prompt, stream=True
    ):
        text = chunk.choices[0].delta.content
        if text is not None:
            response.append(text)
            result = "".join(response).strip()
            botmsg.write(result)

    # Add the assistant's response to the prompt
    prompt.append({"role": "assistant", "content": result})

    # Store the updated prompt in the session state
    st.session_state["prompt"] = prompt

    # Remember the latest answer and its retrieved context. Clicking the
    # evaluation button triggers a rerun in which `question` is None, so these
    # values must survive in the session state rather than in local variables.
    st.session_state["last_result"] = result
    st.session_state["last_doc_extract"] = doc_extract

# Evaluation Section
st.write("## Evaluation Results")
if st.button("Evaluate Response"):
    doc_extract = st.session_state.get("last_doc_extract", "")
    result = st.session_state.get("last_result", "")
    if doc_extract and result:
        # Perform evaluation
        metrics = evaluator.evaluate_all(result, doc_extract)

        # Display metrics with explanations
        st.write(f"**BLEU Score**: {metrics['BLEU']:.2f}")
        st.write("BLEU measures the overlap between the generated output and reference text based on n-grams. Range: 0-100. Higher scores indicate better match.")

        st.write(f"**ROUGE-1 Score**: {metrics['ROUGE-1']:.2f}")
        st.write("ROUGE-1 measures the overlap of unigrams between the generated output and reference text. Range: 0-1. Higher scores indicate better match.")

        st.write(f"**BERT Precision**: {metrics['BERT P']:.2f}")
        st.write(f"**BERT Recall**: {metrics['BERT R']:.2f}")
        st.write(f"**BERT F1 Score**: {metrics['BERT F1']:.2f}")
        st.write("BERTScore evaluates the semantic similarity between the generated output and reference text using BERT embeddings. Range: 0-1. Higher scores indicate better semantic similarity.")

        st.write(f"**Perplexity**: {metrics['Perplexity']:.2f}")
        st.write("Perplexity measures how well a language model predicts the text. Range: 1 to ∞. Lower values indicate better fluency and coherence.")

        st.write(f"**Diversity**: {metrics['Diversity']:.2f}")
        st.write("Diversity measures the uniqueness of bigrams in the generated output. Range: 0-1. Higher values indicate more diverse and varied output.")