Ahmad-Moiz committed
Commit 9333846
1 Parent(s): b0a12fe

Update app.py

Files changed (1)
  1. app.py +34 -53
app.py CHANGED
@@ -13,7 +13,6 @@ import streamlit as st
 from io import StringIO
 from llama_index import Document
 from langchain.llms import Anthropic
-from langchain import HuggingFaceHub
 from langchain.chains import RetrievalQA
 from langchain.vectorstores import FAISS
 from llama_index import LangchainEmbedding
@@ -53,7 +52,7 @@ def load_docs(files: List) -> str:
     @return: string of all docs concatenated
     """

-    st.info("Reading doc ...")
+    st.info("`Reading doc ...`")
     all_text = ""
     for file_path in files:
         file_extension = os.path.splitext(file_path.name)[1]
@@ -69,7 +68,7 @@ def load_docs(files: List) -> str:
             file_content = stringio.read()
             all_text += file_content
         else:
-            st.warning('Please provide txt or pdf.', icon="")
+            st.warning('Please provide txt or pdf.', icon="⚠️")
     return all_text


@@ -82,7 +81,7 @@ def generate_eval(text: str, num_questions: int, chunk: int):
     @param chunk: chunk size to draw question from in the doc
     @return: eval set as JSON list
     """
-    st.info("Generating eval set ...")
+    st.info("`Generating eval set ...`")
     n = len(text)
     starting_indices = [random.randint(0, n - chunk) for _ in range(num_questions)]
     sub_sequences = [text[i:i + chunk] for i in starting_indices]
@@ -93,7 +92,7 @@ def generate_eval(text: str, num_questions: int, chunk: int):
             qa = chain.run(b)
             eval_set.append(qa)
         except:
-            st.warning('Error generating question %s.' % str(i + 1), icon="")
+            st.warning('Error generating question %s.' % str(i + 1), icon="⚠️")
     eval_set_full = list(itertools.chain.from_iterable(eval_set))
     return eval_set_full

@@ -108,7 +107,7 @@ def split_texts(text, chunk_size: int, overlap, split_method: str):
     @param split_method:
     @return: list of str splits
     """
-    st.info("Splitting doc ...")
+    st.info("`Splitting doc ...`")
     if split_method == "RecursiveTextSplitter":
         text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                        chunk_overlap=overlap)
@@ -117,7 +116,7 @@ def split_texts(text, chunk_size: int, overlap, split_method: str):
                                              chunk_size=chunk_size,
                                              chunk_overlap=overlap)
     else:
-        st.warning("Split method not recognized. Using RecursiveCharacterTextSplitter", icon="")
+        st.warning("`Split method not recognized. Using RecursiveCharacterTextSplitter`", icon="⚠️")
         text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                        chunk_overlap=overlap)

@@ -136,13 +135,12 @@ def make_llm(model_version: str):
         chosen_model = ChatOpenAI(model_name=model_version, temperature=0)
     elif model_version == "anthropic":
         chosen_model = Anthropic(temperature=0)
-    elif model_version == "flan-t5-xl":
-        chosen_model = HuggingFaceHub(repo_id="google/flan-t5-xl",model_kwargs={"temperature":0,"max_length":64})
     else:
-        st.warning("Model version not recognized. Using gpt-3.5-turbo", icon="")
+        st.warning("`Model version not recognized. Using gpt-3.5-turbo`", icon="⚠️")
         chosen_model = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
     return chosen_model

+
 @st.cache_resource
 def make_retriever(splits, retriever_type, embedding_type, num_neighbors, _llm):
     """
@@ -154,14 +152,14 @@ def make_retriever(splits, retriever_type, embedding_type, num_neighbors, _llm):
     @param _llm: model
     @return: retriever
     """
-    st.info("Making retriever ...")
+    st.info("`Making retriever ...`")
     # Set embeddings
     if embedding_type == "OpenAI":
         embedding = OpenAIEmbeddings()
     elif embedding_type == "HuggingFace":
         embedding = HuggingFaceEmbeddings()
     else:
-        st.warning("Embedding type not recognized. Using OpenAI", icon="")
+        st.warning("`Embedding type not recognized. Using OpenAI`", icon="⚠️")
         embedding = OpenAIEmbeddings()

     # Select retriever
@@ -169,8 +167,8 @@ def make_retriever(splits, retriever_type, embedding_type, num_neighbors, _llm):
         try:
             vector_store = FAISS.from_texts(splits, embedding)
         except ValueError:
-            st.warning("Error using OpenAI embeddings (disallowed TikToken token in the text). Using HuggingFace.",
-                       icon="")
+            st.warning("`Error using OpenAI embeddings (disallowed TikToken token in the text). Using HuggingFace.`",
+                       icon="⚠️")
             vector_store = FAISS.from_texts(splits, HuggingFaceEmbeddings())
         retriever_obj = vector_store.as_retriever(k=num_neighbors)
     elif retriever_type == "SVM":
@@ -185,7 +183,7 @@ def make_retriever(splits, retriever_type, embedding_type, num_neighbors, _llm):
         faiss_index = faiss.IndexFlatL2(d)
         retriever_obj = GPTFaissIndex.from_documents(documents, faiss_index=faiss_index, service_context=context)
     else:
-        st.warning("Retriever type not recognized. Using SVM", icon="")
+        st.warning("`Retriever type not recognized. Using SVM`", icon="⚠️")
         retriever_obj = SVMRetriever.from_texts(splits, embedding)
     return retriever_obj

@@ -198,7 +196,7 @@ def make_chain(llm, retriever, retriever_type: str) -> RetrievalQA:
     @param retriever_type: retriever type
     @return: chain (or return retriever for Llama-Index)
     """
-    st.info("Making chain ...")
+    st.info("`Making chain ...`")
     if retriever_type == "Llama-Index":
         qa = retriever
     else:
@@ -218,7 +216,7 @@ def grade_model_answer(predicted_dataset: List, predictions: List, grade_answer_
     @return: A list of scores for the distilled answers.
     """
     # Grade the distilled answer
-    st.info("Grading model answer ...")
+    st.info("`Grading model answer ...`")
     # Set the grading prompt based on the grade_answer_prompt parameter
     if grade_answer_prompt == "Fast":
         prompt = GRADE_ANSWER_PROMPT_FAST
@@ -255,7 +253,7 @@ def grade_model_retrieval(gt_dataset: List, predictions: List, grade_docs_prompt
     @return: list of scores for the retrieved documents.
     """
     # Grade the docs retrieval
-    st.info("Grading relevance of retrieved docs ...")
+    st.info("`Grading relevance of retrieved docs ...`")

     # Set the grading prompt based on the grade_docs_prompt parameter
     prompt = GRADE_DOCS_PROMPT_FAST if grade_docs_prompt == "Fast" else GRADE_DOCS_PROMPT
@@ -291,7 +289,7 @@ def run_evaluation(chain, retriever, eval_set, grade_prompt, retriever_type, num
     - latencies_list: A list of latencies in seconds for each question answered.
     - predictions_list: A list of dictionaries containing the model's predicted answers and relevant documents for each question.
     """
-    st.info("Running evaluation ...")
+    st.info("`Running evaluation ...`")
     predictions_list = []
     retrieved_docs = []
     gt_dataset = []
@@ -335,50 +333,43 @@ def run_evaluation(chain, retriever, eval_set, grade_prompt, retriever_type, num
 # Auth
 st.sidebar.image("img/diagnostic.jpg")

-oai_api_key = st.sidebar.text_input("OpenAI API Key:", type="password")
-ant_api_key = st.sidebar.text_input("(Optional) Anthropic API Key:", type="password")
-hf_api_key = st.sidebar.text_input("(Optional) HuggingFace API Token:", type="password")
-
 with st.sidebar.form("user_input"):
-
-    num_eval_questions = st.select_slider("Number of eval questions",
+    num_eval_questions = st.select_slider("`Number of eval questions`",
                                           options=[1, 5, 10, 15, 20], value=5)

-    chunk_chars = st.select_slider("Choose chunk size for splitting",
+    chunk_chars = st.select_slider("`Choose chunk size for splitting`",
                                    options=[500, 750, 1000, 1500, 2000], value=1000)

-    overlap = st.select_slider("Choose overlap for splitting",
+    overlap = st.select_slider("`Choose overlap for splitting`",
                                options=[0, 50, 100, 150, 200], value=100)

-    split_method = st.radio("Split method",
+    split_method = st.radio("`Split method`",
                             ("RecursiveTextSplitter",
                              "CharacterTextSplitter"),
                             index=0)

-    model = st.radio("Choose model",
+    model = st.radio("`Choose model`",
                      ("gpt-3.5-turbo",
                       "gpt-4",
                       "anthropic"),
-                     # Error raised by inference API: Model google/flan-t5-xl time out
-                     #"flan-t5-xl"),
                      index=0)

-    retriever_type = st.radio("Choose retriever",
+    retriever_type = st.radio("`Choose retriever`",
                               ("TF-IDF",
                                "SVM",
                                "Llama-Index",
                                "similarity-search"),
                               index=3)

-    num_neighbors = st.select_slider("Choose # chunks to retrieve",
+    num_neighbors = st.select_slider("`Choose # chunks to retrieve`",
                                      options=[3, 4, 5, 6, 7, 8])

-    embeddings = st.radio("Choose embeddings",
+    embeddings = st.radio("`Choose embeddings`",
                           ("HuggingFace",
                            "OpenAI"),
                           index=1)

-    grade_prompt = st.radio("Grading style prompt",
+    grade_prompt = st.radio("`Grading style prompt`",
                             ("Fast",
                              "Descriptive",
                              "Descriptive w/ bias check",
@@ -387,31 +378,25 @@ with st.sidebar.form("user_input"):

     submitted = st.form_submit_button("Submit evaluation")

-st.sidebar.write("By: [Sentient](https://twitter.com/sentient)")
-
 # App
-st.header("Auto-evaluator")
+st.header("`Auto-evaluator`")
 st.info(
-    "`I am an evaluation tool for question-answering built on LangChain. Given documents, I will auto-generate a question-answer eval "
+    "`I am an evaluation tool for question-answering. Given documents, I will auto-generate a question-answer eval "
     "set and evaluate using the selected chain settings. Experiments with different configurations are logged. "
-    "Optionally, provide your own eval set (as a JSON, see docs/karpathy-pod-eval.json for an example). If you don't have acess to GPT-4 or Anthropic, you can use our free hosted app here: https://autoevaluator.langchain.com/`")
+    "Optionally, provide your own eval set (as a JSON, see docs/karpathy-pod-eval.json for an example).`")

 with st.form(key='file_inputs'):
-    uploaded_file = st.file_uploader("Please upload a file to evaluate (.txt or .pdf): ",
+    uploaded_file = st.file_uploader("`Please upload a file to evaluate (.txt or .pdf):` ",
                                      type=['pdf', 'txt'],
                                      accept_multiple_files=True)

-    uploaded_eval_set = st.file_uploader("[Optional] Please upload eval set (.json): ",
+    uploaded_eval_set = st.file_uploader("`[Optional] Please upload eval set (.json):` ",
                                          type=['json'],
                                          accept_multiple_files=False)

     submitted = st.form_submit_button("Submit files")

-if uploaded_file and oai_api_key:
-
-    os.environ["OPENAI_API_KEY"] = oai_api_key
-    os.environ["ANTHROPIC_API_KEY"] = ant_api_key
-    os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_api_key
+if uploaded_file:

     # Load docs
     text = load_docs(uploaded_file)
@@ -445,7 +430,7 @@ if uploaded_file and oai_api_key:
     percentage_answer = (correct_answer_count / len(graded_answers)) * 100
     percentage_docs = (correct_docs_count / len(graded_retrieval)) * 100

-    st.subheader("Run Results")
+    st.subheader("`Run Results`")
     st.info(
        "`I will grade the chain based on: 1/ the relevance of the retrived documents relative to the question and 2/ "
        "the summarized answer relative to the ground truth answer. You can see (and change) to prompts used for "
@@ -453,7 +438,7 @@ if uploaded_file and oai_api_key:
     st.dataframe(data=d, use_container_width=True)

     # Accumulate results
-    st.subheader("Aggregate Results")
+    st.subheader("`Aggregate Results`")
     st.info(
        "`Retrieval and answer scores are percentage of retrived documents deemed relevant by the LLM grader ("
        "relative to the question) and percentage of summarized answers deemed relevant (relative to ground truth "
@@ -485,7 +470,3 @@ if uploaded_file and oai_api_key:
                     color='expt number',
                     tooltip=['expt number', 'Retrieval score', 'Latency', 'Answer score'])
     st.altair_chart(c, use_container_width=True, theme="streamlit")
-
-else:
-
-    st.warning("Please input file and API key(s)!")
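
Note on running the app after this change: the sidebar no longer collects API keys and the old os.environ[...] assignments are gone, so the keys must already be present in the environment when the app starts (for example as Space secrets or shell exports). A minimal sketch of that assumption follows; the placeholder values and the setdefault approach are illustrative only, not part of the commit:

    # Hypothetical setup: supply the keys through the environment
    # instead of the removed sidebar text inputs.
    import os

    os.environ.setdefault("OPENAI_API_KEY", "sk-...")   # used by ChatOpenAI and OpenAIEmbeddings
    os.environ.setdefault("ANTHROPIC_API_KEY", "...")   # only needed if the "anthropic" model is selected

With the flan-t5-xl / HuggingFaceHub branch and its import removed, a HuggingFace Hub token is no longer required; the HuggingFaceEmbeddings option still works because it runs locally.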