jamescalam committed
Commit cb9e56f
1 Parent(s): 5855c82

Create app.py

Files changed (1): app.py (+275, -0)
app.py ADDED
@@ -0,0 +1,275 @@
+ import streamlit as st
+ import pinecone
+ import openai
+ from openai.embeddings_utils import get_embedding
+ import json
+ 
+ OPENAI_KEY = st.secrets["OPENAI_KEY"]
+ PINECONE_KEY = st.secrets["PINECONE_KEY"]
+ INDEX = 'openai-ml-qa'
+ instructions = {
+     "conservative q&a": "Answer the question based on the context below, and if the question can't be answered based on the context, say \"I don't know\"\n\nContext:\n{0}\n\n---\n\nQuestion: {1}\nAnswer:",
+     "paragraph about a question": "Write a paragraph, addressing the question, and use the text below to obtain relevant information\n\nContext:\n{0}\n\n---\n\nQuestion: {1}\nParagraph long Answer:",
+     "bullet points": "Write a bullet point list of possible answers, addressing the question, and use the text below to obtain relevant information\n\nContext:\n{0}\n\n---\n\nQuestion: {1}\nBullet point Answer:",
+     "summarize problems given a topic": "Write a summary of the problems addressed by the questions below\n\n{0}\n\n---\n\n",
+     "extract key libraries and tools": "Write a list of libraries and tools present in the context below\n\nContext:\n{0}\n\n---\n\n",
+     "simple instructions": "{1} given the common questions and answers below\n\n{0}\n\n---\n\n",
+     "summarize": "Write an elaborate, paragraph long summary about \"{1}\" given the questions and answers from a public forum on this topic\n\n{0}\n\n---\n\nSummary:",
+ }
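+ # each template receives the retrieved context as {0} and the user's question as {1}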
+ 
+ @st.experimental_singleton(show_spinner=False)
+ def init_openai():
+     # initialize connection to OpenAI
+     openai.api_key = OPENAI_KEY
+ 
+ @st.experimental_singleton(show_spinner=False)
+ def init_key_value():
+     with open('./beyond_search_openai/src/beyond_search/mapping.json', 'r') as fp:
+         mappings = json.load(fp)
+     return mappings
+ 
+ @st.experimental_singleton(show_spinner=False)
+ def init_pinecone(index_name):
+     # initialize connection to Pinecone vector DB (app.pinecone.io for API key)
+     pinecone.init(
+         api_key=PINECONE_KEY,
+         environment='us-west1-gcp'
+     )
+     index = pinecone.Index(index_name)
+     stats = index.describe_index_stats()
+     dims = stats['dimension']
+     count = stats['namespaces']['']['vector_count']
+     return index, dims, count
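+ 
+ # retrieval step: embed the query, search Pinecone for similar Q&A threads,
+ # and pack the top matches into one context string under a character budget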
+ def create_context(question, index, lib_meta, max_len=3750, size="curie", top_k=5):
+     """
+     Find the most relevant context for a question via Pinecone search
+     """
+     q_embed = get_embedding(question, engine='text-embedding-ada-002')
+     res = index.query(
+         q_embed, top_k=top_k,
+         include_metadata=True, filter={
+             'docs': {'$in': lib_meta}
+         })
+ 
+     cur_len = 0
+     contexts = []
+     sources = []
+ 
+     for row in res['matches']:
+         meta = row['metadata']
+         text = (
+             f"Thread title: {meta['thread']}\n\n" +
+             f"Question asked: {meta['question']}\n\n" +
+             f"Given answer: {meta['context']}"
+         )
+         cur_len += len(text)
+         if cur_len < max_len:
+             contexts.append(text)
+             sources.append(row['metadata'])
+         else:
+             cur_len -= len(text) + 4
+             if max_len - cur_len < 200:
+                 break
+     return "\n\n###\n\n".join(contexts), sources
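+ 
+ # generation step: format the chosen instruction template with the retrieved
+ # context and the question, then send the prompt to an OpenAI completion model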
+ def answer_question(
+     index,
+     mappings,
+     fine_tuned_qa_model="text-davinci-002",
+     question="Am I allowed to publish model outputs to Twitter, without a human review?",
+     instruction="Answer the question based on the context below, and if the question can't be answered based on the context, say \"I don't know\"\n\nContext:\n{0}\n\n---\n\nQuestion: {1}\nAnswer:",
+     max_len=3550,
+     size="curie",
+     top_k=5,
+     debug=False,
+     max_tokens=400,
+     stop_sequence=None,
+     domains=["huggingface", "tensorflow", "streamlit", "pytorch"],
+ ):
+     """
+     Answer a question based on the most similar contexts retrieved from Pinecone
+     """
+     context, sources = create_context(
+         question,
+         index,
+         lib_meta=domains,
+         max_len=max_len,
+         size=size,
+         top_k=top_k
+     )
+     if debug:
+         print("Context:\n" + context)
+         print("\n\n")
+     try:
+         # fine-tuned models require a `model` parameter, whereas other models require an `engine` parameter
+         model_param = (
+             {"model": fine_tuned_qa_model}
+             if ":" in fine_tuned_qa_model
+             and fine_tuned_qa_model.split(":")[1].startswith("ft")
+             else {"engine": fine_tuned_qa_model}
+         )
+         #print(instruction.format(context, question))
+         response = openai.Completion.create(
+             prompt=instruction.format(context, question),
+             temperature=0,
+             max_tokens=max_tokens,
+             top_p=1,
+             frequency_penalty=0,
+             presence_penalty=0,
+             stop=stop_sequence,
+             **model_param,
+         )
+         return response["choices"][0]["text"].strip(), sources
+     except Exception as e:
+         print(e)
+         # return an empty answer and no sources so callers can still unpack two values
+         return "", []
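+ 
+ # note: this helper is unused; the same retrieve-and-display flow runs at the
+ # bottom of the script, and the `search` name is later rebound to a container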
+ def search(index, text_map, query, style, top_k, lib_filters):
+     if query != "":
+         with st.spinner("Retrieving, please wait..."):
+             answer, sources = answer_question(
+                 index, text_map,
+                 question=query,
+                 instruction=instructions[style],
+                 top_k=top_k
+             )
+             # lowercase relevant lib filters
+             lib_meta = [lib.lower() for lib in lib_filters.keys() if lib_filters[lib]]
+             lower_libs = [lib.lower() for lib in libraries]
+             # display the answer
+             st.write(answer)
+             with st.expander("Sources"):
+                 for source in sources:
+                     st.write(f"""
+                     {source['docs']} > {source['category']} > [{source['thread']}]({source['href']})
+                     """)
+ 
+ st.markdown("""
+ <link
+     rel="stylesheet"
+     href="https://fonts.googleapis.com/css?family=Roboto:300,400,500,700&display=swap"
+ />
+ """, unsafe_allow_html=True)
+ 
+ #model_name = 'mpnet-discourse'
+ 
+ libraries = [
+     "Streamlit",
+     "HuggingFace",
+     "PyTorch",
+     "TensorFlow"
+ ]
+ 
+ with st.spinner("Connecting to OpenAI..."):
+     init_openai()
+ 
+ with st.spinner("Connecting to Pinecone..."):
+     index, dims, count = init_pinecone(INDEX)
+     text_map = init_key_value()
+ 
+ st.write("# ML Q&A")
+ search = st.container()
+ query = search.text_input('Ask a framework-specific question!', "")
+ 
+ with search.expander("Search Options"):
+     style = st.radio(label='Style', options=[
+         'Paragraph about a question', 'Conservative Q&A',
+         'Bullet points', 'Summarize problems given a topic',
+         'Extract key libraries and tools', 'Simple instructions',
+         'Summarize'
+     ])
+     # add section for filters
+     st.write("""
+     #### Metadata Filters
+ 
+     **Libraries**
+     """)
+     # create two columns
+     cols = st.columns(2)
+     # add filtering based on library
+     lib_filters = {}
+     for lib in libraries:
+         i = len(lib_filters.keys()) % 2
+         with cols[i]:
+             lib_filters[lib] = st.checkbox(lib, value=True)
+     st.write("---")
+     top_k = st.slider(
+         "top_k",
+         min_value=1,
+         max_value=20,
+         value=5
+     )
+ 
+ st.sidebar.write(f"""
+ ### Info
+ 
+ **Pinecone index name**: {INDEX}
+ 
+ **Pinecone index size**: {count}
+ 
+ **OpenAI embedding model**: *text-embedding-ada-002*
+ 
+ **Vector dimensionality**: {dims}
+ 
+ **OpenAI generation model**: *text-davinci-002*
+ 
+ ---
+ 
+ ### How it Works
+ 
+ The Q&A tool takes discussions and docs from some of the best Python ML
+ libraries and collates their content into a natural-language search and Q&A tool.
+ 
+ Ask questions like **"How do I use the gradient tape in tensorflow?"** or **"What is the difference
+ between Tensorflow and PyTorch?"**, choose an answer style, and get relevant results!
+ 
+ The app is powered by OpenAI's embedding service and Pinecone's vector database. The whole process consists
+ of *three* steps:
+ 
+ **1**. Questions are fed into OpenAI's embeddings service to generate a {dims}-dimensional query vector.
+ 
+ **2**. We use Pinecone to identify similar context vectors (previously encoded from Q&A pages).
+ 
+ **3**. Relevant pages are passed alongside the question to OpenAI's generative model, which returns our answer.
+ 
+ **How do I make something like this?**
+ 
+ It's easy! Check out the [source code](https://github.com/pinecone-io/examples/tree/master/integrations/openai/beyond_search_webinar) and learn how to [integrate OpenAI and Pinecone in the docs](https://www.pinecone.io/docs/integrations/openai/)!
+ 
+ ---
+ 
+ ### Usage
+ 
+ If you'd like to restrict your search to a specific library (such as PyTorch or
+ Streamlit), you can with the *Search Options* dropdown.
+ 
+ If you'd like OpenAI to consider more or fewer pages, try changing the `top_k` slider.
+ 
+ Want to see the original sources that GPT-3 is using to generate the answer? No problem, just click on the **Sources** box.
+ """)
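+ 
+ # the three steps described above map onto this code: get_embedding() embeds
+ # the query (1), index.query() retrieves similar Q&A context (2), and
+ # openai.Completion.create() generates the final answer (3)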
+ 
+ #if style.lower() == 'conservative q&a':
+ #    search.info("*Access search options above.*")
+ 
+ if search.button("Go!") or query != "":
+     with st.spinner("Retrieving, please wait..."):
+         # lowercase relevant lib filters
+         lib_meta = [lib.lower() for lib in lib_filters.keys() if lib_filters[lib]]
+         # ask the question
+         answer, sources = answer_question(
+             index, text_map,
+             question=query,
+             instruction=instructions[style.lower()],
+             top_k=top_k,
+             domains=lib_meta
+         )
+         # display the answer
+         st.write(answer)
+         with st.expander("Sources"):
+             for source in sources:
+                 st.write(f"""
+                 {source['docs']} > {source['category']} > [{source['thread']}]({source['href']})
+                 """)