Ahmad-Moiz committed
Commit 9333846
1 Parent(s): b0a12fe

Update app.py

Files changed (1)
  1. app.py +34 -53
app.py CHANGED
@@ -13,7 +13,6 @@ import streamlit as st
 from io import StringIO
 from llama_index import Document
 from langchain.llms import Anthropic
-from langchain import HuggingFaceHub
 from langchain.chains import RetrievalQA
 from langchain.vectorstores import FAISS
 from llama_index import LangchainEmbedding
@@ -53,7 +52,7 @@ def load_docs(files: List) -> str:
     @return: string of all docs concatenated
     """

-    st.info("Reading doc ...")
+    st.info("`Reading doc ...`")
     all_text = ""
     for file_path in files:
         file_extension = os.path.splitext(file_path.name)[1]
@@ -69,7 +68,7 @@ def load_docs(files: List) -> str:
             file_content = stringio.read()
             all_text += file_content
         else:
-            st.warning('Please provide txt or pdf.', icon="")
+            st.warning('Please provide txt or pdf.', icon="⚠️")
     return all_text


@@ -82,7 +81,7 @@ def generate_eval(text: str, num_questions: int, chunk: int):
     @param chunk: chunk size to draw question from in the doc
     @return: eval set as JSON list
     """
-    st.info("Generating eval set ...")
+    st.info("`Generating eval set ...`")
     n = len(text)
     starting_indices = [random.randint(0, n - chunk) for _ in range(num_questions)]
     sub_sequences = [text[i:i + chunk] for i in starting_indices]
@@ -93,7 +92,7 @@ def generate_eval(text: str, num_questions: int, chunk: int):
             qa = chain.run(b)
             eval_set.append(qa)
         except:
-            st.warning('Error generating question %s.' % str(i + 1), icon="")
+            st.warning('Error generating question %s.' % str(i + 1), icon="⚠️")
     eval_set_full = list(itertools.chain.from_iterable(eval_set))
     return eval_set_full

@@ -108,7 +107,7 @@ def split_texts(text, chunk_size: int, overlap, split_method: str):
     @param split_method:
     @return: list of str splits
     """
-    st.info("Splitting doc ...")
+    st.info("`Splitting doc ...`")
     if split_method == "RecursiveTextSplitter":
         text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                        chunk_overlap=overlap)
@@ -117,7 +116,7 @@ def split_texts(text, chunk_size: int, overlap, split_method: str):
                                              chunk_size=chunk_size,
                                              chunk_overlap=overlap)
     else:
-        st.warning("Split method not recognized. Using RecursiveCharacterTextSplitter", icon="")
+        st.warning("`Split method not recognized. Using RecursiveCharacterTextSplitter`", icon="⚠️")
         text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                        chunk_overlap=overlap)

@@ -136,13 +135,12 @@ def make_llm(model_version: str):
         chosen_model = ChatOpenAI(model_name=model_version, temperature=0)
     elif model_version == "anthropic":
         chosen_model = Anthropic(temperature=0)
-    elif model_version == "flan-t5-xl":
-        chosen_model = HuggingFaceHub(repo_id="google/flan-t5-xl",model_kwargs={"temperature":0,"max_length":64})
     else:
-        st.warning("Model version not recognized. Using gpt-3.5-turbo", icon="")
+        st.warning("`Model version not recognized. Using gpt-3.5-turbo`", icon="⚠️")
         chosen_model = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
     return chosen_model

+
 @st.cache_resource
 def make_retriever(splits, retriever_type, embedding_type, num_neighbors, _llm):
     """
@@ -154,14 +152,14 @@ def make_retriever(splits, retriever_type, embedding_type, num_neighbors, _llm):
     @param _llm: model
     @return: retriever
     """
-    st.info("Making retriever ...")
+    st.info("`Making retriever ...`")
     # Set embeddings
     if embedding_type == "OpenAI":
         embedding = OpenAIEmbeddings()
     elif embedding_type == "HuggingFace":
         embedding = HuggingFaceEmbeddings()
     else:
-        st.warning("Embedding type not recognized. Using OpenAI", icon="")
+        st.warning("`Embedding type not recognized. Using OpenAI`", icon="⚠️")
         embedding = OpenAIEmbeddings()

     # Select retriever
@@ -169,8 +167,8 @@ def make_retriever(splits, retriever_type, embedding_type, num_neighbors, _llm):
         try:
             vector_store = FAISS.from_texts(splits, embedding)
         except ValueError:
-            st.warning("Error using OpenAI embeddings (disallowed TikToken token in the text). Using HuggingFace.",
-                       icon="")
+            st.warning("`Error using OpenAI embeddings (disallowed TikToken token in the text). Using HuggingFace.`",
+                       icon="⚠️")
             vector_store = FAISS.from_texts(splits, HuggingFaceEmbeddings())
         retriever_obj = vector_store.as_retriever(k=num_neighbors)
     elif retriever_type == "SVM":
@@ -185,7 +183,7 @@ def make_retriever(splits, retriever_type, embedding_type, num_neighbors, _llm):
         faiss_index = faiss.IndexFlatL2(d)
         retriever_obj = GPTFaissIndex.from_documents(documents, faiss_index=faiss_index, service_context=context)
     else:
-        st.warning("Retriever type not recognized. Using SVM", icon="")
+        st.warning("`Retriever type not recognized. Using SVM`", icon="⚠️")
         retriever_obj = SVMRetriever.from_texts(splits, embedding)
     return retriever_obj

@@ -198,7 +196,7 @@ def make_chain(llm, retriever, retriever_type: str) -> RetrievalQA:
     @param retriever_type: retriever type
     @return: chain (or return retriever for Llama-Index)
     """
-    st.info("Making chain ...")
+    st.info("`Making chain ...`")
     if retriever_type == "Llama-Index":
         qa = retriever
     else:
@@ -218,7 +216,7 @@ def grade_model_answer(predicted_dataset: List, predictions: List, grade_answer_
     @return: A list of scores for the distilled answers.
     """
     # Grade the distilled answer
-    st.info("Grading model answer ...")
+    st.info("`Grading model answer ...`")
     # Set the grading prompt based on the grade_answer_prompt parameter
     if grade_answer_prompt == "Fast":
         prompt = GRADE_ANSWER_PROMPT_FAST
@@ -255,7 +253,7 @@ def grade_model_retrieval(gt_dataset: List, predictions: List, grade_docs_prompt
     @return: list of scores for the retrieved documents.
     """
     # Grade the docs retrieval
-    st.info("Grading relevance of retrieved docs ...")
+    st.info("`Grading relevance of retrieved docs ...`")

     # Set the grading prompt based on the grade_docs_prompt parameter
     prompt = GRADE_DOCS_PROMPT_FAST if grade_docs_prompt == "Fast" else GRADE_DOCS_PROMPT
@@ -291,7 +289,7 @@ def run_evaluation(chain, retriever, eval_set, grade_prompt, retriever_type, num
     - latencies_list: A list of latencies in seconds for each question answered.
     - predictions_list: A list of dictionaries containing the model's predicted answers and relevant documents for each question.
     """
-    st.info("Running evaluation ...")
+    st.info("`Running evaluation ...`")
     predictions_list = []
     retrieved_docs = []
     gt_dataset = []
@@ -335,50 +333,43 @@ def run_evaluation(chain, retriever, eval_set, grade_prompt, retriever_type, num
 # Auth
 st.sidebar.image("img/diagnostic.jpg")

-oai_api_key = st.sidebar.text_input("OpenAI API Key:", type="password")
-ant_api_key = st.sidebar.text_input("(Optional) Anthropic API Key:", type="password")
-hf_api_key = st.sidebar.text_input("(Optional) HuggingFace API Token:", type="password")
-
 with st.sidebar.form("user_input"):
-
-    num_eval_questions = st.select_slider("Number of eval questions",
+    num_eval_questions = st.select_slider("`Number of eval questions`",
                                           options=[1, 5, 10, 15, 20], value=5)

-    chunk_chars = st.select_slider("Choose chunk size for splitting",
+    chunk_chars = st.select_slider("`Choose chunk size for splitting`",
                                    options=[500, 750, 1000, 1500, 2000], value=1000)

-    overlap = st.select_slider("Choose overlap for splitting",
+    overlap = st.select_slider("`Choose overlap for splitting`",
                                options=[0, 50, 100, 150, 200], value=100)

-    split_method = st.radio("Split method",
+    split_method = st.radio("`Split method`",
                             ("RecursiveTextSplitter",
                              "CharacterTextSplitter"),
                             index=0)

-    model = st.radio("Choose model",
+    model = st.radio("`Choose model`",
                      ("gpt-3.5-turbo",
                       "gpt-4",
                       "anthropic"),
-                     # Error raised by inference API: Model google/flan-t5-xl time out
-                     #"flan-t5-xl"),
                      index=0)

-    retriever_type = st.radio("Choose retriever",
+    retriever_type = st.radio("`Choose retriever`",
                               ("TF-IDF",
                                "SVM",
                                "Llama-Index",
                                "similarity-search"),
                               index=3)

-    num_neighbors = st.select_slider("Choose # chunks to retrieve",
+    num_neighbors = st.select_slider("`Choose # chunks to retrieve`",
                                      options=[3, 4, 5, 6, 7, 8])

-    embeddings = st.radio("Choose embeddings",
+    embeddings = st.radio("`Choose embeddings`",
                           ("HuggingFace",
                            "OpenAI"),
                           index=1)

-    grade_prompt = st.radio("Grading style prompt",
+    grade_prompt = st.radio("`Grading style prompt`",
                             ("Fast",
                              "Descriptive",
                              "Descriptive w/ bias check",
@@ -387,31 +378,25 @@ with st.sidebar.form("user_input"):

     submitted = st.form_submit_button("Submit evaluation")

-st.sidebar.write("By: [Sentient](https://twitter.com/sentient)")
-
 # App
-st.header("Auto-evaluator")
+st.header("`Auto-evaluator`")
 st.info(
-    "`I am an evaluation tool for question-answering built on LangChain. Given documents, I will auto-generate a question-answer eval "
+    "`I am an evaluation tool for question-answering. Given documents, I will auto-generate a question-answer eval "
     "set and evaluate using the selected chain settings. Experiments with different configurations are logged. "
-    "Optionally, provide your own eval set (as a JSON, see docs/karpathy-pod-eval.json for an example). If you don't have acess to GPT-4 or Anthropic, you can use our free hosted app here: https://autoevaluator.langchain.com/`")
+    "Optionally, provide your own eval set (as a JSON, see docs/karpathy-pod-eval.json for an example).`")

 with st.form(key='file_inputs'):
-    uploaded_file = st.file_uploader("Please upload a file to evaluate (.txt or .pdf): ",
+    uploaded_file = st.file_uploader("`Please upload a file to evaluate (.txt or .pdf):` ",
                                      type=['pdf', 'txt'],
                                      accept_multiple_files=True)

-    uploaded_eval_set = st.file_uploader("[Optional] Please upload eval set (.json): ",
+    uploaded_eval_set = st.file_uploader("`[Optional] Please upload eval set (.json):` ",
                                          type=['json'],
                                          accept_multiple_files=False)

     submitted = st.form_submit_button("Submit files")

-if uploaded_file and oai_api_key:
-
-    os.environ["OPENAI_API_KEY"] = oai_api_key
-    os.environ["ANTHROPIC_API_KEY"] = ant_api_key
-    os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_api_key
+if uploaded_file:

     # Load docs
     text = load_docs(uploaded_file)
@@ -445,7 +430,7 @@ if uploaded_file and oai_api_key:
     percentage_answer = (correct_answer_count / len(graded_answers)) * 100
     percentage_docs = (correct_docs_count / len(graded_retrieval)) * 100

-    st.subheader("Run Results")
+    st.subheader("`Run Results`")
     st.info(
        "`I will grade the chain based on: 1/ the relevance of the retrived documents relative to the question and 2/ "
        "the summarized answer relative to the ground truth answer. You can see (and change) to prompts used for "
@@ -453,7 +438,7 @@ if uploaded_file and oai_api_key:
     st.dataframe(data=d, use_container_width=True)

     # Accumulate results
-    st.subheader("Aggregate Results")
+    st.subheader("`Aggregate Results`")
     st.info(
        "`Retrieval and answer scores are percentage of retrived documents deemed relevant by the LLM grader ("
        "relative to the question) and percentage of summarized answers deemed relevant (relative to ground truth "
@@ -485,7 +470,3 @@ if uploaded_file and oai_api_key:
                     color='expt number',
                     tooltip=['expt number', 'Retrieval score', 'Latency', 'Answer score'])
     st.altair_chart(c, use_container_width=True, theme="streamlit")
-
-else:
-
-    st.warning("Please input file and API key(s)!")
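
Note on running the app after this change: the sidebar no longer collects API keys and the old os.environ[...] assignments are gone, so the keys must already be present in the environment when the app starts (for example as Space secrets or shell exports). A minimal sketch of that assumption follows; the placeholder values and the setdefault approach are illustrative only, not part of the commit:

    # Hypothetical setup: supply the keys through the environment
    # instead of the removed sidebar text inputs.
    import os

    os.environ.setdefault("OPENAI_API_KEY", "sk-...")   # used by ChatOpenAI and OpenAIEmbeddings
    os.environ.setdefault("ANTHROPIC_API_KEY", "...")   # only needed if the "anthropic" model is selected

With the flan-t5-xl / HuggingFaceHub branch and its import removed, a HuggingFace Hub token is no longer required; the HuggingFaceEmbeddings option still works because it runs locally.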