xangma committed on
Commit
c62e5cd
•
0 Parent(s):
Files changed (7) hide show
  1. .gitignore +1 -0
  2. README.md +13 -0
  3. app.py +206 -0
  4. chain.py +74 -0
  5. index.html +19 -0
  6. requirements.txt +8 -0
  7. style.css +28 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ pycbc/*
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Chat PyCBC
3
+ emoji: 🦀
4
+ colorFrom: yellow
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 3.16.2
8
+ app_file: app.py
9
+ pinned: false
10
+ license: gpl-3.0
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import os
3
+ import gradio as gr
4
+ from abc import ABC
5
+ from typing import List, Optional, Any
6
+ import chromadb
7
+ import langchain
8
+ # logging.basicConfig(stream=sys.stdout, level=logging.INFO)
9
+ # logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
10
+ from langchain.embeddings.openai import OpenAIEmbeddings
11
+ from langchain.vectorstores import Chroma
12
+ from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter, PythonCodeTextSplitter
13
+ from langchain.document_loaders import TextLoader
14
+ from langchain.docstore.document import Document
15
+ from langchain.embeddings.base import Embeddings
16
+ from langchain.vectorstores import Chroma
17
+
18
+ from chain import get_new_chain1
19
+
20
class CachedChroma(Chroma, ABC):
    """
    Chroma vector store with a "reuse if already persisted" constructor.

    ``from_documents_with_cache`` looks for ``collection_name`` in the
    persisted Chroma database under ``persist_directory``. When the
    collection is present it is opened as-is; otherwise it is built from
    the supplied documents.

    Example:
        .. code-block:: python

            from langchain.embeddings.openai import OpenAIEmbeddings

            embeddings = OpenAIEmbeddings()
            vectorstore = CachedChroma.from_documents_with_cache(
                ".persisted_data", texts, embeddings, collection_name="fun_experiment"
            )
    """

    @classmethod
    def from_documents_with_cache(
        cls,
        persist_directory: str,
        documents: List[Document],
        embedding: Optional[Embeddings] = None,
        ids: Optional[List[str]] = None,
        collection_name: str = Chroma._LANGCHAIN_DEFAULT_COLLECTION_NAME,
        client_settings: Optional[chromadb.config.Settings] = None,
        **kwargs: Any,
    ) -> Chroma:
        """Open the named collection if it exists, otherwise create it.

        Args:
            persist_directory: Directory holding the persisted Chroma data.
            documents: Documents to index when the collection is missing.
            embedding: Embedding function handed to Chroma.
            ids: Optional document ids, used only when building.
            collection_name: Name of the collection to look up or create.
            client_settings: Forwarded to the Chroma constructors. The
                lookup client below does not use it; it always uses
                duckdb+parquet settings built from ``persist_directory``.
            **kwargs: Extra arguments for ``Chroma.from_documents``; unused
                when an existing collection is opened.

        Returns:
            A ``Chroma`` store backed by ``persist_directory``.
        """
        # A throwaway client whose only job is to list existing collections.
        probe_client = chromadb.Client(
            chromadb.config.Settings(
                chroma_db_impl="duckdb+parquet",
                persist_directory=persist_directory,
            )
        )
        existing_names = {
            collection.name for collection in probe_client.list_collections()
        }

        if collection_name not in existing_names:
            # Nothing cached yet: embed and index the documents.
            return Chroma.from_documents(
                documents=documents,
                embedding=embedding,
                ids=ids,
                collection_name=collection_name,
                persist_directory=persist_directory,
                client_settings=client_settings,
                **kwargs
            )

        # Cached collection found: open it without re-indexing.
        return Chroma(
            collection_name=collection_name,
            embedding_function=embedding,
            persist_directory=persist_directory,
            client_settings=client_settings,
        )
70
+
71
def get_docs(repo_path="pycbc/"):
    """Load and chunk the source files of a local repository checkout.

    Files are selected relative to ``repo_path``:

    * ``.py`` and ``.sh`` files under ``pycbc/`` or ``examples/``;
    * every file under ``bin/``, whatever its extension.

    ``.py`` files and everything under ``bin/`` are split with
    ``PythonCodeTextSplitter``; the remaining files (shell scripts) are
    split with ``RecursiveCharacterTextSplitter``. If the Python splitter
    raises on a file, that file falls back to the generic splitter.

    Args:
        repo_path: Root of the checkout to walk. Defaults to ``"pycbc/"``.

    Returns:
        A list of chunked documents; empty when no file matches.
    """
    selected = []
    for root, _dirs, files in os.walk(repo_path):
        for file_name in files:
            rel_file_path = os.path.relpath(os.path.join(root, file_name), repo_path)
            # Filter by file extension and by directory.
            if rel_file_path.endswith((".py", ".sh")) and rel_file_path.startswith(
                ("pycbc/", "examples/")
            ):
                selected.append(rel_file_path)
            # Everything under bin/ is taken regardless of extension.
            if rel_file_path.startswith("bin/"):
                selected.append(rel_file_path)

    if not selected:
        # Nothing to index; skip building the splitters.
        return []

    py_splitter = PythonCodeTextSplitter(chunk_size=1000, chunk_overlap=0)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

    documents = []
    for rel_file_path in selected:
        loaded = TextLoader(os.path.join(repo_path, rel_file_path)).load()

        # The previous test `source[-3:] == ".py" == ""` was a chained
        # comparison that is always False, so only bin/ files were ever
        # chunked and every other selected file was silently dropped.
        is_python = rel_file_path.endswith(".py") or rel_file_path.startswith("bin/")

        chunks = None
        if is_python:
            try:
                chunks = py_splitter.split_documents(loaded)
            except Exception:
                # Best effort: fall back to the generic splitter below.
                chunks = None
        if chunks is None:
            chunks = text_splitter.split_documents(loaded)
        documents.extend(chunks)
    return documents
97
+
98
def set_chain_up(openai_api_key, model_selector, k_textbox, agent):
    """Build the question-answering chain for the current UI settings.

    Args:
        openai_api_key: Key typed into the UI; may be empty or None.
        model_selector: Model name from the dropdown. Falls back to
            ``"gpt-3.5-turbo"`` when empty.
        k_textbox: Number of search results to retrieve, as typed. Empty,
            non-numeric or non-positive input falls back to 10.
        agent: Current agent state supplied by the UI; not used here.

    Returns:
        The chain from ``get_new_chain1``, or ``None`` when an OpenAI model
        is selected but no API key has been supplied.
    """
    # set defaults
    if not model_selector:
        model_selector = "gpt-3.5-turbo"

    # This handler runs on every change of the textbox, so partial or
    # non-numeric input must not raise.
    try:
        k_textbox = int(k_textbox) if k_textbox else 10
    except (TypeError, ValueError):
        k_textbox = 10
    if k_textbox < 1:
        k_textbox = 10

    uses_openai = model_selector in ["gpt-3.5-turbo", "gpt-4"]
    if uses_openai and not openai_api_key:
        # No chain can be built without a key; skip loading documents.
        return None

    documents = get_docs()
    # NOTE(review): the embeddings object is created before the key is
    # exported below - confirm it can pick up credentials at this point.
    embeddings = OpenAIEmbeddings()
    vectorstore = CachedChroma.from_documents_with_cache(".persisted_data", documents, embedding=embeddings)

    if not uses_openai:
        return get_new_chain1(vectorstore, model_selector, k_textbox)

    os.environ["OPENAI_API_KEY"] = openai_api_key
    try:
        return get_new_chain1(vectorstore, model_selector, k_textbox)
    finally:
        # Clear the key from the process environment even when chain
        # construction fails.
        os.environ["OPENAI_API_KEY"] = ""
120
+
121
def chat(inp, history, agent):
    """Answer one user message and append the exchange to the history.

    Args:
        inp: The user's message text.
        history: List of ``(question, answer)`` tuples, or a falsy value
            to start a new conversation.
        agent: Callable taking ``{"question", "chat_history"}`` and
            returning a mapping with an ``"answer"`` key, or ``None`` when
            no chain has been set up yet.

    Returns:
        The updated history twice, once for each output it is bound to.
    """
    turns = history or []

    if agent is not None:
        timestamp = str(datetime.datetime.now())
        print("\n==== date/time: " + timestamp + " ====")
        print("inp: " + inp)
        result = agent({"question": inp, "chat_history": turns})
        turns.append((inp, result["answer"]))
        print(turns)
    else:
        # No chain yet: reply with a prompt for the key instead.
        turns.append((inp, "Please paste your OpenAI key to use"))

    return turns, turns
134
+
135
# Gradio front end: settings widgets, chat window, and event wiring.
# NOTE(review): the widget nesting was reconstructed from a listing without
# indentation - confirm it against the intended layout.
block = gr.Blocks(css=".gradio-container {background-color: lightgray}")

with block:
    with gr.Row():
        gr.Markdown("<h3><center>Repo Code Assistant</center></h3>")

        openai_api_key_textbox = gr.Textbox(
            type="password",
            lines=1,
            show_label=False,
            placeholder="Paste your OpenAI API key (sk-...)",
        )
        model_selector = gr.Dropdown(
            ["gpt-3.5-turbo", "gpt-4", "other"],
            label="Model",
            show_label=True,
        )
        k_textbox = gr.Textbox(
            label="Search Results k:",
            placeholder="k: Number of search results to consider",
            show_label=True,
            lines=1,
        )

    chatbot = gr.Chatbot()

    with gr.Row():
        message = gr.Textbox(
            label="What's your question?",
            placeholder="What is PyCBC?",
            lines=1,
        )
        submit = gr.Button(value="Send", variant="secondary").style(full_width=False)

    gr.Examples(
        examples=[
            "What is PyCBC?",
            "Where is the matched filtering done in the pycbc_live script?"
        ],
        inputs=message,
    )

    gr.HTML(
        """
        This simple application is an implementation of ChatGPT but over an external dataset (in this case, the pycbc source code).
        The source code is split/broken down into many document objects using langchain's pythoncodetextsplitter, which apparently tries to keep whole functions etc. together. This means that each file in the source code is split into many smaller documents, and the k value is the number of documents to consider when searching for the most similar documents to the question. With gpt-3.5-turbo, k=10 seems to work well, but with gpt-4, k=20 seems to work better.
        The model's memory is set to 5 messages, but I haven't tested with gpt-3.5-turbo yet to see if it works well. It seems to work well with gpt-4."""
    )

    gr.HTML(
        "<center>Powered by <a href='https://github.com/hwchase17/langchain'>LangChain 🦜️🔗</a></center>"
    )

    state = gr.State()
    agent_state = gr.State()

    # Both the Send button and pressing Enter in the message box run chat().
    for bind_chat in (submit.click, message.submit):
        bind_chat(chat, inputs=[message, state, agent_state], outputs=[chatbot, state])

    # Any change to a setting rebuilds the chain and stores it in agent_state.
    for setting in (openai_api_key_textbox, model_selector, k_textbox):
        setting.change(
            set_chain_up,
            inputs=[openai_api_key_textbox, model_selector, k_textbox, agent_state],
            outputs=[agent_state],
        )

block.launch(debug=True)
chain.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import pathlib
4
+ from typing import Dict, List, Tuple
5
+ from langchain.chains.base import Chain
6
+ import os
7
+ import langchain
8
+ # logging.basicConfig(stream=sys.stdout, level=logging.INFO)
9
+ # logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
10
+ from langchain import HuggingFaceHub
11
+ from langchain.chains.question_answering import load_qa_chain
12
+ from langchain.chat_models import ChatOpenAI
13
+ from langchain.chains import ConversationalRetrievalChain
14
+ from langchain.memory import ConversationBufferWindowMemory
15
+ from langchain.chains.llm import LLMChain
16
+ from langchain.callbacks.base import CallbackManager
17
+ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
18
+ from langchain.chains.conversational_retrieval.prompts import CONDENSE_QUESTION_PROMPT, QA_PROMPT
19
+
20
+ from abc import ABC
21
+ from typing import List, Optional, Any
22
+
23
+ import chromadb
24
+ from langchain.vectorstores import Chroma
25
+
26
def get_new_chain1(vectorstore, model_selector, k_textbox) -> Chain:
    """Build a conversational retrieval chain over ``vectorstore``.

    Args:
        vectorstore: Store exposing ``as_retriever()``.
        model_selector: ``"gpt-4"`` or ``"gpt-3.5-turbo"`` for OpenAI chat
            models, or ``"other"`` for the Hugging Face Hub model.
        k_textbox: Number of documents the retriever returns per query.

    Returns:
        A ``ConversationalRetrievalChain`` with a 5-exchange window memory.

    Raises:
        ValueError: If ``model_selector`` is not one of the supported values.
    """
    max_tokens_dict = {'gpt-4': 2000, 'gpt-3.5-turbo': 1000}

    # These templates aren't used for the moment; the chain below uses the
    # stock CONDENSE_QUESTION_PROMPT and QA_PROMPT.
    _eg_template = """## Example:

    Chat History:
    {chat_history}
    Follow Up Input: {question}
    Standalone question: {answer}"""
    _prefix = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question. You should assume that the question is related to PyCBC."""
    _suffix = """## Example:

    Chat History:
    {chat_history}
    Follow Up Input: {question}
    Standalone question:"""

    template = """You are an AI assistant for the open source library PyCBC. The documentation is located at https://pycbc.readthedocs.io.
    You are given the following extracted parts of a long document and a question. Provide a conversational answer with a hyperlink to the documentation.
    You should only use hyperlinks that are explicitly listed as a source in the context. Do NOT make up a hyperlink that is not listed.
    If the question includes a request for code, provide a code block directly from the documentation.
    If you don't know the answer, just say "Hmm, I'm not sure." Don't try to make up an answer.
    If the question is not about PyCBC, politely inform them that you are tuned to only answer questions about PyCBC.
    Question: {question}
    =========
    {context}
    =========
    Answer in Markdown:"""

    # Use a streaming llm for combining docs and a separate, non-streaming
    # llm for question generation.
    if model_selector in max_tokens_dict:
        llm = ChatOpenAI(client=None, temperature=0.7, model_name=model_selector)
        doc_chain_llm = ChatOpenAI(
            client=None,
            streaming=True,
            callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
            verbose=True,
            temperature=0.7,
            model_name=model_selector,
            # Per-model limit; this was previously hard-coded to 1000.
            max_tokens=max_tokens_dict[model_selector],
        )
    elif model_selector == 'other':
        llm = HuggingFaceHub(repo_id="chavinlo/gpt4-x-alpaca")
        doc_chain_llm = HuggingFaceHub(repo_id="chavinlo/gpt4-x-alpaca")
    else:
        # Without this, `llm` would be unbound further down.
        raise ValueError(f"Unsupported model_selector: {model_selector!r}")

    question_generator = LLMChain(llm=llm, prompt=CONDENSE_QUESTION_PROMPT)
    doc_chain = load_qa_chain(doc_chain_llm, chain_type="stuff", prompt=QA_PROMPT)

    memory = ConversationBufferWindowMemory(input_key="question", output_key="answer", k=5)
    retriever = vectorstore.as_retriever()
    retriever.search_kwargs = {"k": k_textbox}
    qa = ConversationalRetrievalChain(
        retriever=retriever,
        memory=memory,
        combine_docs_chain=doc_chain,
        question_generator=question_generator,
    )

    return qa
index.html ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <meta name="viewport" content="width=device-width" />
6
+ <title>My static Space</title>
7
+ <link rel="stylesheet" href="style.css" />
8
+ </head>
9
+ <body>
10
+ <div class="card">
11
+ <h1>Welcome to your static Space!</h1>
12
+ <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
13
+ <p>
14
+ Also don't forget to check the
15
+ <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
16
+ </p>
17
+ </div>
18
+ </body>
19
+ </html>
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ openai
3
+ black
4
+ isort
5
+ Flask
6
+ transformers
7
+ gradio
8
+ chromadb
style.css ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ body {
2
+ padding: 2rem;
3
+ font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
4
+ }
5
+
6
+ h1 {
7
+ font-size: 16px;
8
+ margin-top: 0;
9
+ }
10
+
11
+ p {
12
+ color: rgb(107, 114, 128);
13
+ font-size: 15px;
14
+ margin-bottom: 10px;
15
+ margin-top: 5px;
16
+ }
17
+
18
+ .card {
19
+ max-width: 620px;
20
+ margin: 0 auto;
21
+ padding: 16px;
22
+ border: 1px solid lightgray;
23
+ border-radius: 16px;
24
+ }
25
+
26
+ .card p:last-child {
27
+ margin-bottom: 0;
28
+ }