POC for passive monitoring
README.md	CHANGED
@@ -5,7 +5,7 @@ colorFrom: red
 colorTo: yellow
 sdk: gradio
 sdk_version: 3.36.1
-app_file:
+app_file: app_v1.py
 pinned: false
 ---
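Note: app_file selects the Space's entry point, so after this change the Space keeps serving the v1 chatbot (app_v1.py below) while the new passive-monitoring POC lives in app.py.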
app.py	CHANGED
@@ -1,72 +1,306 @@
(The previous 72 lines, the v1 chat app preserved unchanged as app_v1.py below, are removed and replaced wholesale by the passive-monitoring app that follows.)
import csv
import json
import logging
import os
import re
from functools import lru_cache

from llama_index.llms import OpenAI
from whoosh.query import Or, Term

THREAD_ID = "thread_id"

logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=os.environ.get("LOGLEVEL", "INFO"))
import gradio as gr
from llama_index import ServiceContext, \
    Document, GPTListIndex, VectorStoreIndex
from whoosh import fields, index
from whoosh.qparser import QueryParser

thread_index = {}
comment_index = {}
llama_cache = {}

chatgpt = OpenAI(temperature=0, model="gpt-4")
service_context = ServiceContext.from_defaults(llm=chatgpt, chunk_size=1024)

def passive_topics(index_name, query, topic, summary_type):
    # Legacy helper; the Gradio UI below wires find_topics_by_thread /
    # find_topics_by_comments instead. Looks up the channel's Whoosh index,
    # indexes the matching threads, and asks the LLM for negative topics.
    resp = search_keyword_matches(thread_index[index_name], query)
    if resp is not None:
        print(f"Found {len(resp)} matches for {query}")
        docs_list = [Document(text=match[1]) for match in resp]
        last_llama_index = GPTListIndex.from_documents(documents=docs_list, service_context=service_context)
        llama_cache[index_name] = last_llama_index
        resp = last_llama_index.as_query_engine().query(
            "What are the key negative topics from the discussion? Limit each topic to 30 characters")
        dynamic_topics = resp.response.split('\n')
        return dynamic_topics

    return []

def load_data(data_sets):
    for data_set in data_sets:
        create_thread_index(data_set)
        create_comment_index(data_set)


def create_thread_index(data_set):
    # Define a schema for the index: the thread timestamp plus the raw
    # messages_json blob, both stored so searches can return them
    schema = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT(stored=True))
    index_path = f"./text_index/{data_set}"
    # Create the index directory (and parents) if it doesn't exist
    if not os.path.exists(index_path):
        os.makedirs(index_path)
        build_index = True
    else:
        build_index = False
        print("Loading from existing thread index " + data_set)
    if build_index:
        print("Building thread index for " + data_set)
        # Create the on-disk Whoosh index
        write_ix = index.create_in(index_path, schema)

        # Create a writer object to add documents to the index
        writer = write_ix.writer()

        # Read the CSV file and add one document per thread
        with open(f'csv/{data_set}.csv', 'r') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                writer.add_document(id=row['thread_ts'], content=row['messages_json'])
        # Commit the writer and close it
        writer.commit()
        write_ix.close()
    # Open the index for reading
    read_ix = index.open_dir(index_path)
    thread_index[data_set] = read_ix


def create_comment_index(data_set):
    # Same schema as the thread index, but one document per comment so
    # keyword hits land on individual messages
    schema = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT(stored=True))
    index_path = f"./text_index/{data_set}_comments"
    # Create the index directory (and parents) if it doesn't exist
    if not os.path.exists(index_path):
        os.makedirs(index_path)
        build_index = True
    else:
        build_index = False
        print("Loading from existing comments index " + data_set)
    if build_index:
        print("Building comments index for " + data_set)
        write_ix = index.create_in(index_path, schema)

        # Create a writer object to add documents to the index
        writer = write_ix.writer()

        # Read the CSV file; every comment is indexed under its thread's id
        count = 0
        with open(f'csv/{data_set}.csv', 'r') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                comments = json.loads(row['messages_json'])
                for comment in comments:
                    writer.add_document(id=row['thread_ts'], content=comment["content"])
                    count += 1
        # Commit the writer and close it
        writer.commit()
        write_ix.close()
    # Open the index for reading
    read_ix = index.open_dir(index_path)
    comment_index[data_set] = read_ix
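Both builders read csv/<data_set>.csv and touch only two columns: thread_ts and messages_json, where messages_json holds a JSON list of objects with at least a "content" key. A minimal sketch of a compatible file, with invented sample values (the csv/ directory must already exist):

import csv
import json

row = {
    "thread_ts": "1688000000.000100",  # invented Slack thread timestamp
    "messages_json": json.dumps([
        {"content": "Auth0 login is failing again"},
        {"content": "CISO asked for a root-cause doc by Friday"},
    ]),
}

with open("csv/platform-engg_messages.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["thread_ts", "messages_json"])
    writer.writeheader()
    writer.writerow(row)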
def search_keyword_matches(ix, query_text):
    # Parse the raw keyword string into a Whoosh query over "content"
    query_parser = QueryParser("content", ix.schema)
    query = query_parser.parse(query_text)
    return execute_text_search(ix, query)


def search_thread_id_matches(ix, thread_id_list):
    # Build an OR query over the stored thread ids
    query = Or([Term('id', id_) for id_ in thread_id_list])
    return execute_text_search(ix, query)


def execute_text_search(ix, q):
    # Search the index, returning up to 20 [id, content] pairs
    with ix.searcher() as searcher:
        results = searcher.search(q, limit=20)
        if len(results) > 0:
            matches = []
            for result in results:
                matches.append([result['id'], result['content']])
            return matches
        else:
            return None
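For clarity, the two query styles above boil down to this standalone Whoosh usage (a sketch; ix is an index built by create_thread_index, and the keyword string and thread id are invented):

from whoosh.qparser import QueryParser
from whoosh.query import Or, Term

def demo_queries(ix):
    # Free-text keywords, as typed into the search box
    keyword_q = QueryParser("content", ix.schema).parse("CISO auth0")
    # Exact lookup of specific threads by their stored id
    id_q = Or([Term("id", "1688000000.000100")])
    with ix.searcher() as searcher:
        for q in (keyword_q, id_q):
            print([hit["id"] for hit in searcher.search(q, limit=5)])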
def gen_insights(index_name, topic, summary_type):
    if topic is not None and len(topic) > 0:
        resp = generate_insights(index_name, topic, summary_type)
        if resp is not None:
            return resp.response


@lru_cache(maxsize=50)
def generate_insights(index_name, topic, summary_type):
    # Requires a llama index built by a prior "Find Topics" run
    if llama_cache.get(index_name) is None:
        return None
    query = f"What is the executive summary for the topic \"{topic}\"? Highlight negative aspects in 100 words"
    if summary_type == "Actions":
        query = f"What are the recommended action items for the topic \"{topic}\"? Limit response to 100 words using bullet points"
    elif summary_type == "Followup":
        query = f"What are the recommended questions to ask team for more clarity and latest status for the topic \"{topic}\"?"
    return llama_cache[index_name].as_query_engine().query(query)
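One caveat worth knowing: @lru_cache keys on the (index_name, topic, summary_type) argument strings, not on the state of llama_cache, so a summary computed against an older llama index can still be served after a new "Find Topics" run replaces llama_cache[index_name].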
def generate_comment_insights(index_name, topic, summary_type):
    if summary_type == "Show Comments":
        return show_docs(index_name, topic)
    if summary_type == "Show Threads":
        return show_threads(index_name, topic)
    if summary_type == "Show Summary":
        return show_thread_summaries(index_name, topic)

    return "Not yet implemented"


def retrieve_llama_nodes(index_name, topic):
    llama = llama_cache.get(index_name)
    if llama is None:
        return None
    retriever = llama.as_retriever()
    return retriever.retrieve(topic)


def show_docs(index_name, topic):
    nodes = retrieve_llama_nodes(index_name, topic)
    if nodes is None:
        return "No matching documents found for the topic " + topic
    text_list = [node_with_score.node.text for node_with_score in nodes]
    return f"Total Matched Comments {len(text_list)}\n" + "\n\n==============\n".join(text_list)
def find_matching_threads(index_name, topic):
    nodes = retrieve_llama_nodes(index_name, topic)
    if nodes is None:
        return None
    # Map the retrieved nodes back to their parent threads, then re-fetch
    # the full threads from the Whoosh thread index
    thread_ids_list = [node_with_score.node.metadata[THREAD_ID] for node_with_score in nodes]
    matches = search_thread_id_matches(thread_index[index_name], thread_ids_list)
    if matches is None:
        return None
    threads = []
    for thread in matches:
        comments = json.loads(thread[1])
        thread_content = []
        for comment in comments:
            thread_content.append(comment["content"])
        threads.append("\n ->->-> \n ".join(thread_content))
    return threads


def show_threads(index_name, topic):
    threads = find_matching_threads(index_name, topic)
    if threads is None:
        return "No matching documents found for the topic " + topic
    return f"Total Threads {len(threads)}\n" + "\n\n==============\n".join(threads)


@lru_cache(maxsize=50)
def show_thread_summaries(index_name, topic):
    threads = find_matching_threads(index_name, topic)
    if threads is None:
        return "No matching documents found for the topic " + topic
    docs_list = []
    for thread in threads:
        docs_list.append(Document(text=thread))
    llama_idx = VectorStoreIndex.from_documents(documents=docs_list, service_context=service_context)
    query = f"What is the executive summary for the topic \"{topic}\"? Limit response to 100 words"
    resp = llama_idx.as_query_engine().query(query)
    return resp.response
def remove_leading_numbers(text):
    # Strip a leading "1." or "1)" style list marker from each line
    return re.sub(r'^\d+[.)]\s*', '', text, flags=re.M)


def find_topics_with_llama(index_name, query, matches):
    print(f"Found {len(matches)} matches for {query}")
    # Wrap each [thread_ts, content] match as a Document, remembering the
    # thread id in metadata so results can be traced back to their thread
    docs_list = []
    for match in matches:
        metadata = {THREAD_ID: match[0]}
        docs_list.append(Document(text=match[1], metadata=metadata))
    last_llama_index = VectorStoreIndex.from_documents(documents=docs_list, service_context=service_context)
    llama_cache[index_name] = last_llama_index
    resp = last_llama_index.as_query_engine().query(
        "What are the key negative topics from the discussion? Limit each topic to 30 characters")
    result_topics = resp.response.split('\n')
    clean_topics = [remove_leading_numbers(topic) for topic in result_topics]
    return clean_topics
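A quick illustration of the regex cleanup, since the LLM usually answers with a numbered list:

print(remove_leading_numbers("1. Auth0 outages\n2) Slow CI builds"))
# -> Auth0 outages
#    Slow CI builds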
def find_topics_by_thread(index_name, query, topic, summary_type):
    resp = search_keyword_matches(thread_index[index_name], query)
    if resp is not None:
        result_topics = find_topics_with_llama(index_name, query, resp)
        return gr.Dropdown.update(choices=result_topics, value=result_topics[0])

    return "No matches found" if resp is None else resp


def find_topics_by_comments(index_name, query, topic, summary_type):
    resp = search_keyword_matches(comment_index[index_name], query)
    if resp is not None:
        result_topics = find_topics_with_llama(index_name, query, resp)
        return gr.Dropdown.update(choices=result_topics, value=result_topics[0])

    return "No matches found" if resp is None else resp
def main_demo():
    demo = gr.Blocks()

    with demo:
        data_sets = ["platform-engg_messages", "apps-ui_messages", "ux-reviews_messages"]
        load_data(data_sets)
        with gr.Tab("Thread"):
            data_sets_dd = gr.Dropdown(data_sets,
                                       type="value", value=data_sets[0], label="Select Slack Channel")
            keyword_txt = gr.Textbox(lines=2, label="Enter keywords to search", placeholder='CISO, auth0')
            find_topics_button = gr.Button("Find Topics")
            topics_dd = gr.Dropdown([],
                                    type="value", label="Select Topic with Negative Sentiment", allow_custom_value=True)

            show_details = gr.Radio(["Summary", "Actions", "Followup"], label="Show Details")

            find_topics_button.click(find_topics_by_thread,
                                     inputs=[data_sets_dd, keyword_txt, find_topics_button, topics_dd],
                                     outputs=topics_dd)
            show_details.change(gen_insights, inputs=[data_sets_dd, topics_dd, show_details],
                                outputs=gr.Textbox(lines=11, label="Response"))

        with gr.Tab("Comment"):
            data_sets_dd = gr.Dropdown(data_sets,
                                       type="value", value=data_sets[0], label="Select Slack Channel")
            keyword_txt = gr.Textbox(lines=2, label="Enter keywords to search", placeholder='CISO, auth0')
            find_topics_button = gr.Button("Find Topics")
            topics_dd = gr.Dropdown([],
                                    type="value", label="Select Topic with Negative Sentiment", allow_custom_value=True)

            show_details = gr.Radio(["Show Comments", "Show Threads", "Show Summary"], label="Show Details")

            find_topics_button.click(find_topics_by_comments,
                                     inputs=[data_sets_dd, keyword_txt, find_topics_button, topics_dd],
                                     outputs=topics_dd)
            show_details.change(generate_comment_insights, inputs=[data_sets_dd, topics_dd, show_details],
                                outputs=gr.Textbox(lines=11, label="Response"))

    demo.launch()


main_demo()
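Two assumptions from the code above must hold to run app.py locally: the OpenAI-backed ServiceContext reads the standard OPENAI_API_KEY environment variable, and every channel listed in data_sets needs a matching csv/<channel>.csv export before the Whoosh indexes can build under ./text_index.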
app_passive.py	DELETED
@@ -1,306 +0,0 @@
(All 306 lines deleted; the contents are identical to the new app.py above, so the file was effectively renamed.)
app_v1.py	ADDED
@@ -0,0 +1,72 @@
import logging
import os
import json
from llama_index.indices.document_summary import DocumentSummaryIndexEmbeddingRetriever
from llama_index.indices.vector_store import VectorIndexRetriever
from llama_index.llms import OpenAI
from llama_index.query_engine import RetrieverQueryEngine

logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=os.environ.get("LOGLEVEL", "INFO"))
import gradio as gr
from llama_index import VectorStoreIndex, StorageContext, download_loader, load_index_from_storage, ServiceContext, \
    get_response_synthesizer

cache = {}
chatgpt = OpenAI(temperature=0, model="gpt-3.5-turbo")
service_context = ServiceContext.from_defaults(llm=chatgpt, chunk_size=1024)


def load_mapping_from_json(filepath):
    if not os.path.exists(filepath):
        return {}
    with open(filepath, 'r') as file:
        return json.load(file)


userIdMapping = load_mapping_from_json('user_id_to_name_mapping.json')


def loadData():
    # Load every persisted index under ./index and wrap it in a
    # tree-summarize query engine, keyed by the channel directory name
    index_root = "./index"
    directory_names = os.listdir(index_root)
    for directory in directory_names:
        if os.path.isdir(f"{index_root}/{directory}"):
            print("Loading from existing index " + directory)
            storage_context = StorageContext.from_defaults(persist_dir=f"{index_root}/{directory}")
            index = load_index_from_storage(storage_context)
            vector_retriever = VectorIndexRetriever(index=index, similarity_top_k=5)
            response_synthesizer = get_response_synthesizer(service_context=service_context,
                                                            response_mode="tree_summarize")
            query_engine = RetrieverQueryEngine(
                retriever=vector_retriever,
                response_synthesizer=response_synthesizer,
            )
            cache[directory] = query_engine


def chatbot(indexName, input_text):
    """
    Chatbot function that takes in a prompt and returns a response
    """
    query = "This data contains updates from multiple teams and sprints - " + input_text + " Ignore headers in the content."
    response = cache[indexName].query(query)
    answer = response.response.replace('Based on the given context information', 'Based on the available information')
    # Replace raw Slack user ids with display names
    for userId in userIdMapping:
        answer = answer.replace(userId, userIdMapping[userId])
    return answer


def main():
    loadData()
    iface = gr.Interface(fn=chatbot, inputs=[
        gr.Dropdown(list(cache.keys()),
                    type="value", value="sos", label="Select Channel"),
        gr.Textbox(lines=7, label="Ask any question", placeholder='What are the key topics?')], outputs=gr.Textbox(lines=11, label="Response"),
        title="NLP Demo for Slack Data")
    if 'LOGIN_PASS' in os.environ:
        iface.launch(auth=('axiamatic', os.environ['LOGIN_PASS']),
                     auth_message='For access, please check my Slack profile or contact me in Slack.',
                     share=False)
    else:
        iface.launch(share=False)


main()
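app_v1.py only loads pre-built indexes from ./index/<channel>. A minimal sketch of how such a directory could be produced with the same legacy llama_index API the app uses (the document text is a placeholder; "sos" matches the dropdown's default value):

from llama_index import Document, ServiceContext, VectorStoreIndex
from llama_index.llms import OpenAI

service_context = ServiceContext.from_defaults(
    llm=OpenAI(temperature=0, model="gpt-3.5-turbo"), chunk_size=1024)

docs = [Document(text="Sprint update: login latency regressed after the auth change.")]
index = VectorStoreIndex.from_documents(docs, service_context=service_context)
index.storage_context.persist(persist_dir="./index/sos")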