Ahmad-Moiz committed
Commit
38e1f76
1 Parent(s): bdbf35c

Update app.py

Files changed (1)
  1. app.py +0 -472
app.py CHANGED
@@ -1,472 +0,0 @@
- import os
- import json
- import time
- from typing import List
- import faiss
- import pypdf
- import random
- import itertools
- import text_utils
- import pandas as pd
- import altair as alt
- import streamlit as st
- from io import StringIO
- from llama_index import Document
- from langchain.llms import Anthropic
- from langchain.chains import RetrievalQA
- from langchain.vectorstores import FAISS
- from llama_index import LangchainEmbedding
- from langchain.chat_models import ChatOpenAI
- from langchain.retrievers import SVMRetriever
- from langchain.chains import QAGenerationChain
- from langchain.retrievers import TFIDFRetriever
- from langchain.evaluation.qa import QAEvalChain
- from langchain.embeddings import HuggingFaceEmbeddings
- from langchain.embeddings.openai import OpenAIEmbeddings
- from gpt_index import LLMPredictor, ServiceContext, GPTFaissIndex
- from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
- from text_utils import GRADE_DOCS_PROMPT, GRADE_ANSWER_PROMPT, GRADE_DOCS_PROMPT_FAST, GRADE_ANSWER_PROMPT_FAST, GRADE_ANSWER_PROMPT_BIAS_CHECK, GRADE_ANSWER_PROMPT_OPENAI
-
- # Keep dataframe in memory to accumulate experimental results
- if "existing_df" not in st.session_state:
-     summary = pd.DataFrame(columns=['chunk_chars',
-                                     'overlap',
-                                     'split',
-                                     'model',
-                                     'retriever',
-                                     'embedding',
-                                     'num_neighbors',
-                                     'Latency',
-                                     'Retrieval score',
-                                     'Answer score'])
-     st.session_state.existing_df = summary
- else:
-     summary = st.session_state.existing_df
-
-
- @st.cache_data
- def load_docs(files: List) -> str:
-     """
-     Load docs from files
-     @param files: list of files to load
-     @return: string of all docs concatenated
-     """
-
-     st.info("Reading doc ...")
-     all_text = ""
-     for file_path in files:
-         file_extension = os.path.splitext(file_path.name)[1]
-         if file_extension == ".pdf":
-             pdf_reader = pypdf.PdfReader(file_path)
-             file_content = ""
-             for page in pdf_reader.pages:
-                 file_content += page.extract_text()
-             file_content = text_utils.clean_pdf_text(file_content)
-             all_text += file_content
-         elif file_extension == ".txt":
-             stringio = StringIO(file_path.getvalue().decode("utf-8"))
-             file_content = stringio.read()
-             all_text += file_content
-         else:
-             st.warning('Please provide txt or pdf.', icon="⚠")
-     return all_text
73
-
74
- #skaks
75
- @st.cache_data
76
- def generate_eval(text: str, num_questions: int, chunk: int):
77
- """
78
- Generate eval set
79
- @param text: text to generate eval set from
80
- @param num_questions: number of questions to generate
81
- @param chunk: chunk size to draw question from in the doc
82
- @return: eval set as JSON list
83
- """
84
- st.info("Generating eval set ...")
85
- n = len(text)
86
- starting_indices = [random.randint(0, n - chunk) for _ in range(num_questions)]
87
- sub_sequences = [text[i:i + chunk] for i in starting_indices]
88
- chain = QAGenerationChain.from_llm(ChatOpenAI(temperature=0))
89
- eval_set = []
90
- for i, b in enumerate(sub_sequences):
91
- try:
92
- qa = chain.run(b)
93
- eval_set.append(qa)
94
- except:
95
- st.warning('Error generating question %s.' % str(i + 1), icon="⚠")
96
- eval_set_full = list(itertools.chain.from_iterable(eval_set))
97
- return eval_set_full
-
-
- @st.cache_resource
- def split_texts(text, chunk_size: int, overlap, split_method: str):
-     """
-     Split text into chunks
-     @param text: text to split
-     @param chunk_size: chunk size in characters
-     @param overlap: character overlap between chunks
-     @param split_method: text splitter to use
-     @return: list of str splits
-     """
-     st.info("Splitting doc ...")
-     if split_method == "RecursiveTextSplitter":
-         text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
-                                                        chunk_overlap=overlap)
-     elif split_method == "CharacterTextSplitter":
-         text_splitter = CharacterTextSplitter(separator=" ",
-                                               chunk_size=chunk_size,
-                                               chunk_overlap=overlap)
-     else:
-         st.warning("Split method not recognized. Using RecursiveCharacterTextSplitter", icon="⚠")
-         text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
-                                                        chunk_overlap=overlap)
-
-     split_text = text_splitter.split_text(text)
-     return split_text
-
-
- @st.cache_resource
- def make_llm(model_version: str):
-     """
-     Make LLM from model version
-     @param model_version: model version to use
-     @return: LLM
-     """
-     if (model_version == "gpt-3.5-turbo") or (model_version == "gpt-4"):
-         chosen_model = ChatOpenAI(model_name=model_version, temperature=0)
-     elif model_version == "anthropic":
-         chosen_model = Anthropic(temperature=0)
-     else:
-         st.warning("Model version not recognized. Using gpt-3.5-turbo", icon="⚠")
-         chosen_model = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
-     return chosen_model
-
-
- @st.cache_resource
- def make_retriever(splits, retriever_type, embedding_type, num_neighbors, _llm):
-     """
-     Make document retriever
-     @param splits: list of str splits
-     @param retriever_type: retriever type
-     @param embedding_type: embedding type
-     @param num_neighbors: number of neighbors for retrieval
-     @param _llm: model (leading underscore keeps Streamlit from hashing it for caching)
-     @return: retriever
-     """
-     st.info("Making retriever ...")
-     # Set embeddings
-     if embedding_type == "OpenAI":
-         embedding = OpenAIEmbeddings()
-     elif embedding_type == "HuggingFace":
-         embedding = HuggingFaceEmbeddings()
-     else:
-         st.warning("Embedding type not recognized. Using OpenAI", icon="⚠")
-         embedding = OpenAIEmbeddings()
-
-     # Select retriever
-     if retriever_type == "similarity-search":
-         try:
-             vector_store = FAISS.from_texts(splits, embedding)
-         except ValueError:
-             st.warning("Error using OpenAI embeddings (disallowed TikToken token in the text). Using HuggingFace.",
-                        icon="⚠")
-             vector_store = FAISS.from_texts(splits, HuggingFaceEmbeddings())
-         retriever_obj = vector_store.as_retriever(k=num_neighbors)
-     elif retriever_type == "SVM":
-         retriever_obj = SVMRetriever.from_texts(splits, embedding)
-     elif retriever_type == "TF-IDF":
-         retriever_obj = TFIDFRetriever.from_texts(splits)
-     elif retriever_type == "Llama-Index":
-         documents = [Document(t, LangchainEmbedding(embedding)) for t in splits]
-         llm_predictor = LLMPredictor(llm=_llm)
-         context = ServiceContext.from_defaults(chunk_size_limit=512, llm_predictor=llm_predictor)
-         d = 1536
-         faiss_index = faiss.IndexFlatL2(d)
-         retriever_obj = GPTFaissIndex.from_documents(documents, faiss_index=faiss_index, service_context=context)
-     else:
-         st.warning("Retriever type not recognized. Using SVM", icon="⚠")
-         retriever_obj = SVMRetriever.from_texts(splits, embedding)
-     return retriever_obj
-
-
- def make_chain(llm, retriever, retriever_type: str) -> RetrievalQA:
-     """
-     Make chain
-     @param llm: model
-     @param retriever: retriever
-     @param retriever_type: retriever type
-     @return: chain (or return retriever for Llama-Index)
-     """
-     st.info("Making chain ...")
-     if retriever_type == "Llama-Index":
-         qa = retriever
-     else:
-         qa = RetrievalQA.from_chain_type(llm,
-                                          chain_type="stuff",
-                                          retriever=retriever,
-                                          input_key="question")
-     return qa
-
-
- def grade_model_answer(predicted_dataset: List, predictions: List, grade_answer_prompt: str) -> List:
-     """
-     Grades the distilled answer based on ground truth and model predictions.
-     @param predicted_dataset: A list of dictionaries containing ground truth questions and answers.
-     @param predictions: A list of dictionaries containing model predictions for the questions.
-     @param grade_answer_prompt: The prompt level for the grading. Either "Fast" or "Full".
-     @return: A list of scores for the distilled answers.
-     """
-     # Grade the distilled answer
-     st.info("Grading model answer ...")
-     # Set the grading prompt based on the grade_answer_prompt parameter
-     if grade_answer_prompt == "Fast":
-         prompt = GRADE_ANSWER_PROMPT_FAST
-     elif grade_answer_prompt == "Descriptive w/ bias check":
-         prompt = GRADE_ANSWER_PROMPT_BIAS_CHECK
-     elif grade_answer_prompt == "OpenAI grading prompt":
-         prompt = GRADE_ANSWER_PROMPT_OPENAI
-     else:
-         prompt = GRADE_ANSWER_PROMPT
-
-     # Create an evaluation chain
-     eval_chain = QAEvalChain.from_llm(
-         llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0),
-         prompt=prompt
-     )
-
-     # Evaluate the predictions and ground truth using the evaluation chain
-     graded_outputs = eval_chain.evaluate(
-         predicted_dataset,
-         predictions,
-         question_key="question",
-         prediction_key="result"
-     )
-
-     return graded_outputs
-
-
- def grade_model_retrieval(gt_dataset: List, predictions: List, grade_docs_prompt: str):
-     """
-     Grades the relevance of retrieved documents based on ground truth and model predictions.
-     @param gt_dataset: list of dictionaries containing ground truth questions and answers.
-     @param predictions: list of dictionaries containing model predictions for the questions
-     @param grade_docs_prompt: prompt level for the grading. Either "Fast" or "Full"
-     @return: list of scores for the retrieved documents.
-     """
-     # Grade the docs retrieval
-     st.info("Grading relevance of retrieved docs ...")
-
-     # Set the grading prompt based on the grade_docs_prompt parameter
-     prompt = GRADE_DOCS_PROMPT_FAST if grade_docs_prompt == "Fast" else GRADE_DOCS_PROMPT
-
-     # Create an evaluation chain
-     eval_chain = QAEvalChain.from_llm(
-         llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0),
-         prompt=prompt
-     )
-
-     # Evaluate the predictions and ground truth using the evaluation chain
-     graded_outputs = eval_chain.evaluate(
-         gt_dataset,
-         predictions,
-         question_key="question",
-         prediction_key="result"
-     )
-     return graded_outputs
-
-
- def run_evaluation(chain, retriever, eval_set, grade_prompt, retriever_type, num_neighbors):
-     """
-     Runs evaluation on a model's performance on a given evaluation dataset.
-     @param chain: Model chain used for answering questions
-     @param retriever: Document retriever used for retrieving relevant documents
-     @param eval_set: List of dictionaries containing questions and corresponding ground truth answers
-     @param grade_prompt: String prompt used for grading model's performance
-     @param retriever_type: String specifying the type of retriever used
-     @param num_neighbors: Number of neighbors to retrieve using the retriever
-     @return: A tuple of four items:
-         - answers_grade: A dictionary containing scores for the model's answers.
-         - retrieval_grade: A dictionary containing scores for the model's document retrieval.
-         - latencies_list: A list of latencies in seconds for each question answered.
-         - predictions_list: A list of dictionaries containing the model's predicted answers and relevant documents for each question.
-     """
-     st.info("Running evaluation ...")
-     predictions_list = []
-     retrieved_docs = []
-     gt_dataset = []
-     latencies_list = []
-
-     for data in eval_set:
-
-         # Get answer and log latency
-         start_time = time.time()
-         if retriever_type != "Llama-Index":
-             predictions_list.append(chain(data))
-         elif retriever_type == "Llama-Index":
-             answer = chain.query(data["question"], similarity_top_k=num_neighbors, response_mode="tree_summarize",
-                                  use_async=True)
-             predictions_list.append({"question": data["question"], "answer": data["answer"], "result": answer.response})
-         gt_dataset.append(data)
-         end_time = time.time()
-         elapsed_time = end_time - start_time
-         latencies_list.append(elapsed_time)
-
-         # Retrieve docs
-         retrieved_doc_text = ""
-         if retriever_type == "Llama-Index":
-             for i, doc in enumerate(answer.source_nodes):
-                 retrieved_doc_text += "Doc %s: " % str(i + 1) + doc.node.text + " "
-
-         else:
-             docs = retriever.get_relevant_documents(data["question"])
-             for i, doc in enumerate(docs):
-                 retrieved_doc_text += "Doc %s: " % str(i + 1) + doc.page_content + " "
-
-         retrieved = {"question": data["question"], "answer": data["answer"], "result": retrieved_doc_text}
-         retrieved_docs.append(retrieved)
-
-     # Grade
-     answers_grade = grade_model_answer(gt_dataset, predictions_list, grade_prompt)
-     retrieval_grade = grade_model_retrieval(gt_dataset, retrieved_docs, grade_prompt)
-     return answers_grade, retrieval_grade, latencies_list, predictions_list
-
-
- # Auth
- st.sidebar.image("img/diagnostic.jpg")
-
- with st.sidebar.form("user_input"):
-     num_eval_questions = st.select_slider("Number of eval questions",
-                                           options=[1, 5, 10, 15, 20], value=5)
-
-     chunk_chars = st.select_slider("Choose chunk size for splitting",
-                                    options=[500, 750, 1000, 1500, 2000], value=1000)
-
-     overlap = st.select_slider("Choose overlap for splitting",
-                                options=[0, 50, 100, 150, 200], value=100)
-
-     split_method = st.radio("Split method",
-                             ("RecursiveTextSplitter",
-                              "CharacterTextSplitter"),
-                             index=0)
-
-     model = st.radio("Choose model",
-                      ("gpt-3.5-turbo",
-                       "gpt-4",
-                       "anthropic"),
-                      index=0)
-
-     retriever_type = st.radio("Choose retriever",
-                               ("TF-IDF",
-                                "SVM",
-                                "Llama-Index",
-                                "similarity-search"),
-                               index=3)
-
-     num_neighbors = st.select_slider("Choose # chunks to retrieve",
-                                      options=[3, 4, 5, 6, 7, 8])
-
-     embeddings = st.radio("Choose embeddings",
-                           ("HuggingFace",
-                            "OpenAI"),
-                           index=1)
-
-     grade_prompt = st.radio("Grading style prompt",
-                             ("Fast",
-                              "Descriptive",
-                              "Descriptive w/ bias check",
-                              "OpenAI grading prompt"),
-                             index=0)
-
-     submitted = st.form_submit_button("Submit evaluation")
-
- # App
- st.header("Auto-evaluator")
- st.info(
-     "`I am an evaluation tool for question-answering. Given documents, I will auto-generate a question-answer eval "
-     "set and evaluate using the selected chain settings. Experiments with different configurations are logged. "
-     "Optionally, provide your own eval set (as a JSON, see docs/karpathy-pod-eval.json for an example).`")
-
- with st.form(key='file_inputs'):
-     uploaded_file = st.file_uploader("Please upload a file to evaluate (.txt or .pdf): ",
-                                      type=['pdf', 'txt'],
-                                      accept_multiple_files=True)
-
-     uploaded_eval_set = st.file_uploader("[Optional] Please upload eval set (.json): ",
-                                          type=['json'],
-                                          accept_multiple_files=False)
-
-     submitted = st.form_submit_button("Submit files")
-
- if uploaded_file:
-
-     # Load docs
-     text = load_docs(uploaded_file)
-     # Generate num_eval_questions questions, each from context of 3k chars randomly selected
-     if not uploaded_eval_set:
-         eval_set = generate_eval(text, num_eval_questions, 3000)
-     else:
-         eval_set = json.loads(uploaded_eval_set.read())
-     # Split text
-     splits = split_texts(text, chunk_chars, overlap, split_method)
-     # Make LLM
-     llm = make_llm(model)
-     # Make vector DB
-     retriever = make_retriever(splits, retriever_type, embeddings, num_neighbors, llm)
-     # Make chain
-     qa_chain = make_chain(llm, retriever, retriever_type)
-     # Grade model
-     graded_answers, graded_retrieval, latency, predictions = run_evaluation(qa_chain, retriever, eval_set, grade_prompt,
-                                                                             retriever_type, num_neighbors)
-
-     # Assemble outputs
-     d = pd.DataFrame(predictions)
-     d['answer score'] = [g['text'] for g in graded_answers]
-     d['docs score'] = [g['text'] for g in graded_retrieval]
-     d['latency'] = latency
-
-     # Summary statistics
-     mean_latency = d['latency'].mean()
-     correct_answer_count = len([text for text in d['answer score'] if "INCORRECT" not in text])
-     correct_docs_count = len([text for text in d['docs score'] if "Context is relevant: True" in text])
-     percentage_answer = (correct_answer_count / len(graded_answers)) * 100
-     percentage_docs = (correct_docs_count / len(graded_retrieval)) * 100
-
-     st.subheader("Run Results")
-     st.info(
-         "`I will grade the chain based on: 1/ the relevance of the retrieved documents relative to the question and 2/ "
-         "the summarized answer relative to the ground truth answer. You can see (and change) the prompts used for "
-         "grading in text_utils`")
-     st.dataframe(data=d, use_container_width=True)
-
-     # Accumulate results
-     st.subheader("Aggregate Results")
-     st.info(
-         "`Retrieval and answer scores are the percentage of retrieved documents deemed relevant by the LLM grader ("
-         "relative to the question) and the percentage of summarized answers deemed relevant (relative to the ground "
-         "truth answer), respectively. The size of each point corresponds to the latency (in seconds) of retrieval + "
-         "answer summarization (larger circle = slower).`")
-     new_row = pd.DataFrame({'chunk_chars': [chunk_chars],
-                             'overlap': [overlap],
-                             'split': [split_method],
-                             'model': [model],
-                             'retriever': [retriever_type],
-                             'embedding': [embeddings],
-                             'num_neighbors': [num_neighbors],
-                             'Latency': [mean_latency],
-                             'Retrieval score': [percentage_docs],
-                             'Answer score': [percentage_answer]})
-     summary = pd.concat([summary, new_row], ignore_index=True)
-     st.dataframe(data=summary, use_container_width=True)
-     st.session_state.existing_df = summary
-
-     # Dataframe for visualization
-     show = summary.reset_index().copy()
-     show.columns = ['expt number', 'chunk_chars', 'overlap',
-                     'split', 'model', 'retriever', 'embedding', 'num_neighbors', 'Latency', 'Retrieval score',
-                     'Answer score']
-     show['expt number'] = show['expt number'].apply(lambda x: "Expt #: " + str(x + 1))
-     c = alt.Chart(show).mark_circle().encode(x='Retrieval score',
-                                              y='Answer score',
-                                              size=alt.Size('Latency'),
-                                              color='expt number',
-                                              tooltip=['expt number', 'Retrieval score', 'Latency', 'Answer score'])
-     st.altair_chart(c, use_container_width=True, theme="streamlit")