Spaces:

xangma
/

chat-pykg

Runtime error

App Files Files Community

xangma committed on Apr 9, 2023

Commit

c62e5cd

•

0 Parent(s):

init

Browse files

Files changed (7) hide show

.gitignore +1 -0
README.md +13 -0
app.py +206 -0
chain.py +74 -0
index.html +19 -0
requirements.txt +8 -0
style.css +28 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ pycbc/*

README.md ADDED Viewed

	@@ -0,0 +1,13 @@

+---
+title: Chat PyCBC
+emoji: 🦀
+colorFrom: yellow
+colorTo: indigo
+sdk: gradio
+sdk_version: 3.16.2
+app_file: app.py
+pinned: false
+license: gpl-3.0
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,206 @@

+import datetime
+import os
+import gradio as gr
+from abc import ABC
+from typing import List, Optional, Any
+import chromadb
+import langchain
+# logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.vectorstores import Chroma
+from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter, PythonCodeTextSplitter
+from langchain.document_loaders import TextLoader
+from langchain.docstore.document import Document
+from langchain.embeddings.base import Embeddings
+from langchain.vectorstores import Chroma
+from chain import get_new_chain1
+class CachedChroma(Chroma, ABC):
+    """
+    Wrapper around Chroma to make caching embeddings easier.
+    It automatically uses a cached version of a specified collection, if available:
+    when ``collection_name`` is already listed in the database, the stored
+    collection is opened and ``documents`` are not used; otherwise the
+    collection is created from ``documents``.
+        Example:
+            .. code-block:: python
+                    from langchain.embeddings.openai import OpenAIEmbeddings
+                    embeddings = OpenAIEmbeddings()
+                    vectorstore = CachedChroma.from_documents_with_cache(
+                        ".persisted_data", texts, embeddings, collection_name="fun_experiment"
+                    )
+        """
+    @classmethod
+    def from_documents_with_cache(
+            cls,
+            persist_directory: str,
+            documents: List[Document],
+            embedding: Optional[Embeddings] = None,
+            ids: Optional[List[str]] = None,
+            collection_name: str = Chroma._LANGCHAIN_DEFAULT_COLLECTION_NAME,
+            client_settings: Optional[chromadb.config.Settings] = None,
+            **kwargs: Any,
+    ) -> Chroma:
+        """
+        Open ``collection_name`` if it already exists, otherwise build it.
+        Args:
+            persist_directory: Directory handed to Chroma for persistence.
+            documents: Documents to embed when the collection does not exist yet.
+            embedding: Embedding function used by the returned store.
+            ids: Optional document ids, forwarded to ``Chroma.from_documents``.
+            collection_name: Name of the collection to look up or create.
+            client_settings: Optional chromadb settings. When given they are
+                used both for the existence check and for the returned store.
+            **kwargs: Extra arguments forwarded to ``Chroma.from_documents``
+                (only used when the collection has to be created).
+        Returns:
+            A ``Chroma`` store bound to ``collection_name``.
+        """
+        # The store returned below is handed ``client_settings``; probe with the
+        # same settings when the caller supplied them so that the existence
+        # check and the returned store cannot disagree.
+        if client_settings is not None:
+            settings = client_settings
+        else:
+            settings = chromadb.config.Settings(
+                chroma_db_impl="duckdb+parquet",
+                persist_directory=persist_directory,
+            )
+        client = chromadb.Client(settings)
+        existing_names = {c.name for c in client.list_collections()}
+        if collection_name in existing_names:
+            # Cache hit: reuse the stored collection as-is.
+            return Chroma(
+                collection_name=collection_name,
+                embedding_function=embedding,
+                persist_directory=persist_directory,
+                client_settings=client_settings,
+            )
+        # Cache miss: create the collection from the supplied documents.
+        return Chroma.from_documents(
+            documents=documents,
+            embedding=embedding,
+            ids=ids,
+            collection_name=collection_name,
+            persist_directory=persist_directory,
+            client_settings=client_settings,
+            **kwargs
+        )
+def get_docs():
+    """
+    Load the PyCBC sources found under ``pycbc/`` and split them into chunks.
+    Files are selected relative to the repository root:
+      * ``.py`` and ``.sh`` files under ``pycbc/`` and ``examples/``;
+      * every file under ``bin/``, whatever its extension.
+    ``.py`` files and the ``bin/`` scripts are split with
+    ``PythonCodeTextSplitter``; everything else (and any file the Python
+    splitter fails on) is split with ``RecursiveCharacterTextSplitter``.
+    Both splitters use ``chunk_size=1000`` and ``chunk_overlap=0``.
+    Returns:
+        The list of split documents.
+    """
+    local_repo_path_1 = "pycbc/"
+    selected_paths = []
+    for root, _dirs, files in os.walk(local_repo_path_1):
+        for file in files:
+            rel_file_path = os.path.relpath(os.path.join(root, file), local_repo_path_1)
+            # Filter by file extension and by directory
+            if rel_file_path.endswith((".py", ".sh")) and rel_file_path.startswith(("pycbc/", "examples/")):
+                selected_paths.append(rel_file_path)
+            # bin/ is taken wholesale, independent of extension
+            elif rel_file_path.startswith("bin/"):
+                selected_paths.append(rel_file_path)
+    # Each entry is the list of documents returned by one TextLoader.
+    loaded_files = [TextLoader(os.path.join(local_repo_path_1, rel)).load() for rel in selected_paths]
+    py_splitter = PythonCodeTextSplitter(chunk_size=1000, chunk_overlap=0)
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+    documents = []
+    for loaded in loaded_files:
+        if not loaded:
+            continue
+        source = loaded[0].metadata.get('source', '')
+        # The previous test (`... == ".py" == ""`) was a chained comparison that
+        # is always false, and files failing it were dropped instead of being
+        # routed to the generic splitter.
+        if source.endswith(".py") or "pycbc/bin/" in source:
+            try:
+                documents.extend(py_splitter.split_documents(loaded))
+                continue
+            except Exception:
+                # Best effort: fall through to the generic splitter below.
+                pass
+        documents.extend(text_splitter.split_documents(loaded))
+    return documents
+def set_chain_up(openai_api_key, model_selector, k_textbox, agent):
+    """
+    Build the question-answering chain for the current UI settings.
+    Args:
+        openai_api_key: Key pasted by the user; may be empty.
+        model_selector: Model name; defaults to "gpt-3.5-turbo" when empty.
+        k_textbox: Number of search results to retrieve, as text. Empty,
+            non-numeric or non-positive values fall back to 10.
+        agent: Current chain state passed in by the UI; not used.
+    Returns:
+        The chain from ``get_new_chain1``, or ``None`` when an OpenAI chat
+        model is selected but no API key was supplied.
+    """
+    # set defaults
+    if not model_selector:
+        model_selector = "gpt-3.5-turbo"
+    try:
+        k_textbox = int(k_textbox) if k_textbox else 10
+    except (TypeError, ValueError):
+        # This runs on every textbox change, so partial input is expected.
+        k_textbox = 10
+    if k_textbox < 1:
+        k_textbox = 10
+    # Nothing can be built for the OpenAI chat models without a key, so bail
+    # out before loading and embedding the whole repository.
+    if model_selector in ["gpt-3.5-turbo", "gpt-4"] and not openai_api_key:
+        return None
+    previous_key = os.environ.get("OPENAI_API_KEY")
+    if openai_api_key:
+        # The key must be visible before the embeddings are constructed.
+        os.environ["OPENAI_API_KEY"] = openai_api_key
+    try:
+        documents = get_docs()
+        embeddings = OpenAIEmbeddings()
+        vectorstore = CachedChroma.from_documents_with_cache(".persisted_data", documents, embedding=embeddings)
+        return get_new_chain1(vectorstore, model_selector, k_textbox)
+    finally:
+        # Do not leave the user's key in the process environment, and do not
+        # clobber a key that was configured before this call.
+        if openai_api_key:
+            if previous_key is None:
+                os.environ.pop("OPENAI_API_KEY", None)
+            else:
+                os.environ["OPENAI_API_KEY"] = previous_key
+def chat(inp, history, agent):
+    """
+    Answer one user message and append the exchange to the chat history.
+    Args:
+        inp: The user's question.
+        history: List of ``(question, answer)`` tuples, or ``None``/empty.
+        agent: Callable chain taking ``{"question", "chat_history"}`` and
+            returning a mapping with an ``"answer"`` key; ``None`` when no
+            chain has been set up yet.
+    Returns:
+        The updated history twice (once for the chatbot, once for the state).
+    """
+    if not history:
+        history = []
+    # No chain yet: tell the user what is missing instead of failing.
+    if agent is None:
+        history.append((inp, "Please paste your OpenAI key to use"))
+        return history, history
+    timestamp = datetime.datetime.now()
+    print(f"\n==== date/time: {timestamp} ====")
+    print("inp: " + inp)
+    result = agent({"question": inp, "chat_history": history})
+    history.append((inp, result["answer"]))
+    print(history)
+    return history, history
+# Gradio UI: settings row, chat window, question box, examples and notes.
+block = gr.Blocks(css=".gradio-container {background-color: lightgray}")
+with block:
+    # Top row: title plus the three settings that determine the chain.
+    with gr.Row():
+        gr.Markdown("<h3><center>Repo Code Assistant</center></h3>")
+        openai_api_key_textbox = gr.Textbox(
+            placeholder="Paste your OpenAI API key (sk-...)",
+            show_label=False,
+            lines=1,
+            type="password",
+        )
+        model_selector = gr.Dropdown(["gpt-3.5-turbo", "gpt-4", "other"], label="Model", show_label=True)
+        k_textbox = gr.Textbox(
+            placeholder="k: Number of search results to consider",
+            label="Search Results k:",
+            show_label=True,
+            lines=1,
+        )
+    chatbot = gr.Chatbot()
+    # Question input and send button.
+    with gr.Row():
+        message = gr.Textbox(
+            label="What's your question?",
+            placeholder="What is PyCBC?",
+            lines=1,
+        )
+        submit = gr.Button(value="Send", variant="secondary").style(full_width=False)
+    gr.Examples(
+        examples=[
+            "What is PyCBC?",
+            "Where is the matched filtering done in the pycbc_live script?"
+        ],
+        inputs=message,
+    )
+    gr.HTML(
+        """
+    This simple application is an implementation of ChatGPT but over an external dataset (in this case, the pycbc source code).
+    The source code is split/broken down into many document objects using langchain's pythoncodetextsplitter, which apparently tries to keep whole functions etc. together. This means that each file in the source code is split into many smaller documents, and the k value is the number of documents to consider when searching for the most similar documents to the question. With gpt-3.5-turbo, k=10 seems to work well, but with gpt-4, k=20 seems to work better.
+    The model's memory is set to 5 messages, but I haven't tested with gpt-3.5-turbo yet to see if it works well. It seems to work well with gpt-4."""
+    )
+    gr.HTML(
+        "<center>Powered by <a href='https://github.com/hwchase17/langchain'>LangChain 🦜️🔗</a></center>"
+    )
+    # `state` holds the chat history, `agent_state` the current chain.
+    state = gr.State()
+    agent_state = gr.State()
+    # Both the button and pressing Enter in the textbox send the question.
+    submit.click(chat, inputs=[message, state, agent_state], outputs=[chatbot, state])
+    message.submit(chat, inputs=[message, state, agent_state], outputs=[chatbot, state])
+    # Changing any of the three settings rebuilds the chain with the same
+    # inputs and stores it in `agent_state`.
+    for control in (openai_api_key_textbox, model_selector, k_textbox):
+        control.change(
+            set_chain_up,
+            inputs=[openai_api_key_textbox, model_selector, k_textbox, agent_state],
+            outputs=[agent_state],
+        )
+block.launch(debug=True)

chain.py ADDED Viewed

	@@ -0,0 +1,74 @@

+import json
+import os
+import pathlib
+from typing import Dict, List, Tuple
+from langchain.chains.base import Chain
+import os
+import langchain
+# logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
+from langchain import HuggingFaceHub
+from langchain.chains.question_answering import load_qa_chain
+from langchain.chat_models import ChatOpenAI
+from langchain.chains import ConversationalRetrievalChain
+from langchain.memory import ConversationBufferWindowMemory
+from langchain.chains.llm import LLMChain
+from langchain.callbacks.base import CallbackManager
+from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
+from langchain.chains.conversational_retrieval.prompts import CONDENSE_QUESTION_PROMPT, QA_PROMPT
+from abc import ABC
+from typing import List, Optional, Any
+import chromadb
+from langchain.vectorstores import Chroma
+def get_new_chain1(vectorstore, model_selector, k_textbox) -> Chain:
+    """
+    Build a conversational retrieval chain over ``vectorstore``.
+    Args:
+        vectorstore: Vector store queried for context documents.
+        model_selector: "gpt-4", "gpt-3.5-turbo" or "other" (a Hugging Face
+            Hub model).
+        k_textbox: Number of documents the retriever returns per question.
+    Returns:
+        A ``ConversationalRetrievalChain`` with a 5-exchange window memory.
+    Raises:
+        ValueError: If ``model_selector`` is not one of the supported values.
+    """
+    # Completion budget per OpenAI chat model.
+    max_tokens_dict = {'gpt-4': 2000, 'gpt-3.5-turbo': 1000}
+    # These templates aren't used for the moment.
+    _eg_template = """## Example:
+    Chat History:
+    {chat_history}
+    Follow Up Input: {question}
+    Standalone question: {answer}"""
+    _prefix = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question. You should assume that the question is related to PyCBC."""
+    _suffix = """## Example:
+    Chat History:
+    {chat_history}
+    Follow Up Input: {question}
+    Standalone question:"""
+    # Also unused for now: the chain below is built with the stock QA_PROMPT.
+    template = """You are an AI assistant for the open source library PyCBC. The documentation is located at https://pycbc.readthedocs.io.
+You are given the following extracted parts of a long document and a question. Provide a conversational answer with a hyperlink to the documentation.
+You should only use hyperlinks that are explicitly listed as a source in the context. Do NOT make up a hyperlink that is not listed.
+If the question includes a request for code, provide a code block directly from the documentation.
+If you don't know the answer, just say "Hmm, I'm not sure." Don't try to make up an answer.
+If the question is not about PyCBC, politely inform them that you are tuned to only answer questions about PyCBC.
+Question: {question}
+=========
+{context}
+=========
+Answer in Markdown:"""
+    # Construct a ConversationalRetrievalChain with a streaming llm for combine docs
+    # and a separate, non-streaming llm for question generation
+    if model_selector in max_tokens_dict:
+        llm = ChatOpenAI(client=None, temperature=0.7, model_name=model_selector)
+        doc_chain_llm = ChatOpenAI(
+            client=None,
+            streaming=True,
+            callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
+            verbose=True,
+            temperature=0.7,
+            model_name=model_selector,
+            # Previously hard-coded to 1000, which left the table above unused.
+            max_tokens=max_tokens_dict[model_selector],
+        )
+    elif model_selector == 'other':
+        llm = HuggingFaceHub(repo_id="chavinlo/gpt4-x-alpaca")#, model_kwargs={"temperature":0, "max_length":64})
+        doc_chain_llm = HuggingFaceHub(repo_id="chavinlo/gpt4-x-alpaca")
+    else:
+        # Fail with a clear message rather than an unbound `llm` further down.
+        raise ValueError(f"Unsupported model_selector: {model_selector!r}")
+    question_generator = LLMChain(llm=llm, prompt=CONDENSE_QUESTION_PROMPT)
+    doc_chain = load_qa_chain(doc_chain_llm, chain_type="stuff", prompt=QA_PROMPT)
+    # memory = ConversationKGMemory(llm=llm, input_key="question", output_key="answer")
+    memory = ConversationBufferWindowMemory(input_key="question", output_key="answer", k=5)
+    retriever = vectorstore.as_retriever()
+    retriever.search_kwargs = {"k": k_textbox}
+    qa = ConversationalRetrievalChain(
+        retriever=retriever, memory=memory, combine_docs_chain=doc_chain, question_generator=question_generator)
+    return qa

index.html ADDED Viewed

	@@ -0,0 +1,19 @@

+<!DOCTYPE html>
+<html>
+	<head>
+		<meta charset="utf-8" />
+		<meta name="viewport" content="width=device-width" />
+		<title>My static Space</title>
+		<link rel="stylesheet" href="style.css" />
+	</head>
+	<body>
+		<div class="card">
+			<h1>Welcome to your static Space!</h1>
+			<p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
+			<p>
+				Also don't forget to check the
+				<a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
+			</p>
+		</div>
+	</body>
+</html>

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+langchain
+openai
+black
+isort
+Flask
+transformers
+gradio
+chromadb

style.css ADDED Viewed

	@@ -0,0 +1,28 @@

+body {
+	padding: 2rem;
+	font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
+}
+h1 {
+	font-size: 16px;
+	margin-top: 0;
+}
+p {
+	color: rgb(107, 114, 128);
+	font-size: 15px;
+	margin-bottom: 10px;
+	margin-top: 5px;
+}
+.card {
+	max-width: 620px;
+	margin: 0 auto;
+	padding: 16px;
+	border: 1px solid lightgray;
+	border-radius: 16px;
+}
+.card p:last-child {
+	margin-bottom: 0;
+}