Spaces:
Runtime error
Runtime error
madhuroopa
committed on
Commit
·
1366204
1
Parent(s):
a67e2cf
added new application files
Browse files- app.py +304 -0
- requirements.txt +29 -0
- src/aws/resonate_aws_functions.py +369 -0
- src/clustering/resonate_bert_summarizer.py +96 -0
- src/clustering/resonate_clustering.py +179 -0
- src/clustering/resonate_semantic_search.py +66 -0
- src/langchain/resonate_langchain_functions.py +148 -0
- src/pinecone/resonate_pinecone_functions.py +422 -0
- src/utils/resonate_streamlitUtils.py +80 -0
app.py
ADDED
@@ -0,0 +1,304 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Description: This file contains the main Streamlit application for the Resonate project.
|
2 |
+
# Run command: streamlit run app.py to run the application
|
3 |
+
|
4 |
+
|
5 |
+
import os
|
6 |
+
import pandas as pd
|
7 |
+
import streamlit as st
|
8 |
+
from dotenv import load_dotenv
|
9 |
+
from streamlit import session_state as ss
|
10 |
+
from streamlit_chat import message
|
11 |
+
from src.clustering.resonate_bert_summarizer import summarize_runner
|
12 |
+
from src.clustering.resonate_clustering import Clustering
|
13 |
+
from src.langchain.resonate_langchain_functions import LangChain
|
14 |
+
from src.utils.resonate_streamlitUtils import (
|
15 |
+
aws_transcribe,
|
16 |
+
convert_video_to_audio,
|
17 |
+
pinecone_init_upsert,
|
18 |
+
transcript_text_editor_minutes_to_hhmmss,
|
19 |
+
)
|
20 |
+
|
21 |
+
|
22 |
+
def initialize_session_state():
    """Seed Streamlit session state with API keys, view flags, and helper objects.

    Idempotent: each key is only initialized when absent, so values survive
    Streamlit reruns.
    """
    # API keys: seeded from the environment on first run.
    if "api_keys" not in ss:
        ss.api_keys = {}
        ss.api_keys["openai_api_key"] = os.environ.get("OPENAI_API_KEY")
        ss.api_keys["pinecone_api_key"] = os.environ.get("PINECONE_API_KEY")
        ss.api_keys["aws_access_key"] = os.environ.get("AWS_ACCESS_KEY")
        ss.api_keys["aws_secret_access_key"] = os.environ.get("AWS_SECRET_ACCESS_KEY")
    if "add_meeting" not in ss:
        ss.add_meeting = False
    if "Clustering_obj" not in ss:
        ss.Clustering_obj = Clustering()
    # Initialize - Main Screen - Transcript Editor
    if "transcript_speaker_editor" not in ss:
        ss.transcript_speaker_editor = False
    if "transcript_text_editor" not in ss:
        ss.transcript_text_editor = False
    if "meeting_name" not in ss:
        ss.meeting_name = ""
    if "df_transcript_speaker" not in ss:
        ss.df_transcript_speaker = pd.DataFrame()
    if "df_transcript_text" not in ss:
        ss.df_transcript_text = pd.DataFrame()
    # BUG FIX: the guard previously tested "updated_df", a key that is never
    # written, so the edited transcript was wiped on every rerun. Guard on the
    # attribute that is actually assigned.
    if "updated_transcript_df_to_embed" not in ss:
        ss.updated_transcript_df_to_embed = pd.DataFrame()
    if "chat_view" not in ss:
        ss.chat_view = True
    if "langchain_obj" not in ss:
        ss.langchain_obj = LangChain()
    if "query" not in ss:
        ss.query = ""
    if "responses" not in ss:
        ss["responses"] = ["How can I assist you?"]
    if "requests" not in ss:
        ss["requests"] = []
|
57 |
+
|
58 |
+
|
59 |
+
def chat_view():
    """Render the chat pane: message history, input box, and a Clear control."""
    st.header("Chat")
    history_area = st.container()
    input_area = st.container()
    with input_area:
        query = st.text_input(
            "Chat Here",
            placeholder="Message Resonate ... ",
            value=ss.query,
            key="query_input",
        )
        if st.button("Clear"):
            # Reset both the LangChain conversation memory and the on-screen history.
            ss.langchain_obj.conversation_bufw.memory.clear()
            ss.query = ""
            ss.requests = []
            ss.responses = []
            ss["responses"] = ["How can I assist you?"]
            st.rerun()
        elif query:
            with st.spinner("typing..."):
                # Narrow retrieval to the meetings the clustering model deems relevant.
                uuid_list = ss.Clustering_obj.uuid_for_query(query=query)
                print(f"Meeting Unique ID : {uuid_list}")
                answer = ss.langchain_obj.chat(
                    query=query, in_filter=uuid_list, complete_db_flag=False
                )["response"]
                ss.requests.append(query)
                ss.responses.append(answer)
                ss.query = ""
    with history_area:
        if ss["responses"]:
            # Interleave assistant responses with the user requests that prompted them.
            for idx in range(len(ss["responses"])):
                message(ss["responses"][idx], key=str(idx))
                if idx < len(ss["requests"]):
                    message(
                        ss["requests"][idx],
                        is_user=True,
                        key=str(idx) + "_user",
                    )
|
100 |
+
|
101 |
+
|
102 |
+
def api_keys_input():
    """Sidebar form for entering, saving, and exporting API credentials."""
    # (widget label, session-state key, environment variable) per credential.
    credential_fields = [
        ("OpenAPI Key:", "openai_api_key", "OPENAI_API_KEY"),
        ("Pinecone Key:", "pinecone_api_key", "PINECONE_API_KEY"),
        ("AWS Access Key:", "aws_access_key", "AWS_ACCESS_KEY"),
        ("AWS Secret Access Key:", "aws_secret_access_key", "AWS_SECRET_ACCESS_KEY"),
    ]
    with st.form("keys_input_form"):
        entered = {}
        for label, state_key, _env_var in credential_fields:
            # Pre-fill with the previously saved value when present.
            entered[state_key] = st.text_input(
                label,
                type="password",
                value=ss.api_keys.get(state_key, ""),
            )
        save_button = st.form_submit_button("Save API Keys")
        if save_button:
            for _label, state_key, env_var in credential_fields:
                ss.api_keys[state_key] = entered[state_key]
                # Only export non-empty keys to the environment.
                if entered[state_key]:
                    os.environ[env_var] = ss.api_keys[state_key]
            st.rerun()
|
153 |
+
|
154 |
+
|
155 |
+
def add_meeting():
    """Render the meeting-upload form; on submit, store the recording and
    transcribe it with AWS Transcribe.

    Video uploads are saved under data/videoFiles/ and converted to WAV;
    audio uploads are saved directly under data/audioFiles/.
    """
    with st.form("add_meeting_form"):
        uploaded_file = st.file_uploader("Choose a file", type=["wav", "mp4"])
        # Get user input
        meeting_name = st.text_input("Enter Meeting Name:")
        save_meeting_button = st.form_submit_button("Save Meeting")
        if save_meeting_button:
            if not meeting_name:
                st.warning("Please enter Meeting Name.")
            elif uploaded_file is None:
                st.warning("Please upload a meeting recording.")
            else:
                with st.spinner("Processing..."):
                    file_name = uploaded_file.name.replace(" ", "_")
                    # BUG FIX: the original sliced file_name[:-4], which
                    # mangled ".mpeg4" names (6-char suffix). splitext handles
                    # any extension length correctly.
                    base, ext = os.path.splitext(file_name)
                    ext = ext.lower()
                    if ext in (".mp4", ".mpeg4"):
                        print("in video")
                        with open("data/videoFiles/" + file_name, "wb") as f:
                            f.write(uploaded_file.getbuffer())
                        # Convert video file to audio file
                        audio_path = "data/audioFiles/" + base + ".wav"
                        convert_video_to_audio(
                            "data/videoFiles/" + file_name, audio_path
                        )
                        file_name = base + ".wav"
                    elif ext == ".wav":
                        print("in audio")
                        with open("data/audioFiles/" + file_name, "wb") as f:
                            f.write(uploaded_file.getbuffer())
                    ss.df_transcript_speaker = aws_transcribe(file_name)
                    ss.meeting_name = meeting_name
                    ss.transcript_speaker_editor = True
|
188 |
+
|
189 |
+
|
190 |
+
def transcript_speaker_editor():
    """Form for renaming the diarized speaker labels in the transcript."""
    ss.add_meeting = False
    with st.form("transcript_speaker_editor_form"):
        st.write("Transcript Speaker Editor:")
        st.dataframe(ss.df_transcript_speaker)
        edited = ss.df_transcript_speaker.copy(deep=True)
        # One text box per distinct diarized speaker label; default keeps the
        # original label.
        renames = {}
        for label in edited["speaker_label"].unique():
            renames[label] = st.text_input(f"Edit speaker label '{label}'", label)
        # Apply the edits to the working copy.
        for old_label, new_label in renames.items():
            edited["speaker_label"] = edited["speaker_label"].replace(
                old_label, new_label
            )
        update_speaker_button = st.form_submit_button("Update Speakers")
        if update_speaker_button and edited is not None:
            ss.df_transcript_speaker = pd.DataFrame()
            ss.df_transcript_text = edited.copy(deep=True)
            del edited
            # Advance to the text-editing step.
            ss.transcript_text_editor = True
            ss.transcript_speaker_editor = False
            st.rerun()
|
217 |
+
|
218 |
+
|
219 |
+
def transcript_text_editor_update_text(row_index, new_text):
    """Overwrite the text of one transcript row in the session-state copy."""
    ss.updated_transcript_df_to_embed.at[row_index, "text"] = new_text
|
222 |
+
|
223 |
+
|
224 |
+
def transcript_text_editor():
    """Row-level transcript text editing; on finish, summarize the transcript
    and upsert it into Pinecone."""
    ss.transcript_speaker_editor = False
    st.write("Transcript Text Editor:")
    st.write(ss.df_transcript_text)
    working = ss.df_transcript_text.copy(deep=True)
    ss.updated_transcript_df_to_embed = working.copy(deep=True)
    # Display times as HH:MM:SS; only the display copy is reformatted.
    working["start_time"] = working["start_time"].apply(
        transcript_text_editor_minutes_to_hhmmss
    )
    working["end_time"] = working["end_time"].apply(
        transcript_text_editor_minutes_to_hhmmss
    )
    row_index = st.number_input(
        "Enter the row index:",
        min_value=0,
        max_value=len(working) - 1,
        value=0,
        step=1,
    )
    new_text = st.text_area("Enter the new text:", working.at[row_index, "text"])
    update_text_button_inner = st.button("Update Text")
    if update_text_button_inner:
        transcript_text_editor_update_text(row_index, new_text)
        st.success("Text updated successfully!")
    # Display the updated dataframe
    st.header("Updated Transcript")
    st.table(ss.updated_transcript_df_to_embed)
    update_text_button = st.button("Finish Transcript Editing")
    if update_text_button:
        with st.spinner("Uploading..."):
            ss.df_transcript_text = pd.DataFrame()
            meeting_summary, meeting_uuid = summarize_runner(
                ss.updated_transcript_df_to_embed
            )
            ss.Clustering_obj.create_Cluster()
            pinecone_init_upsert(
                ss.updated_transcript_df_to_embed,
                meeting_title=ss.meeting_name,
                meeting_summary=meeting_summary,
                meeting_uuid=meeting_uuid,
            )
            ss.meeting_name = "unnamed"
            st.success("Pinecone upsert completed successfully!")
        # Return to the chat view with a clean slate.
        ss.transcript_text_editor = False
        ss.updated_transcript_df_to_embed = pd.DataFrame()
        ss.chat_view = True
        st.rerun()
|
268 |
+
|
269 |
+
|
270 |
+
def init_streamlit():
    """Entry point: load config, build session state, and route between views."""
    load_dotenv("./config/.env")
    initialize_session_state()
    # Set initial state of the sidebar.
    # NOTE(review): set_page_config only runs when a Pinecone key is present —
    # presumably deliberate, but confirm; Streamlit requires it to be the
    # first st.* call.
    if ss.api_keys["pinecone_api_key"] is not None:
        st.set_page_config(
            initial_sidebar_state="collapsed",
            layout="wide",
        )
    st.title("RESONATE")
    # Initializing sidebar and its components
    with st.sidebar:
        api_keys_input()
        if st.button("Upload Meeting / Chat"):
            # Toggle between the upload flow and the chat view.
            ss.add_meeting = not ss.add_meeting
            ss.chat_view = not ss.chat_view
            ss.transcript_speaker_editor = False
            ss.transcript_text_editor = False
    if ss.add_meeting:
        add_meeting()
    if ss.transcript_speaker_editor:
        transcript_speaker_editor()
    if ss.df_transcript_text is not None and ss.transcript_text_editor:
        transcript_text_editor()
    if ss.chat_view:
        chat_view()  # Chat view
|
296 |
+
|
297 |
+
|
298 |
+
if __name__ == "__main__":
    init_streamlit()

# Sample questions for manual testing:
# What was discussed about cyberbullying?
# What is one new feature planned for GitLab's code search?
# What is the goal of defining maintainability for the new diffs architecture?
|
requirements.txt
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Core
|
2 |
+
numpy
|
3 |
+
pandas
|
4 |
+
tqdm
|
5 |
+
ipykernel
|
6 |
+
boto3
|
7 |
+
botocore
|
8 |
+
jupyter
|
9 |
+
ipywidgets
|
10 |
+
python-dotenv
|
11 |
+
scikit-learn
|
12 |
+
torch
|
13 |
+
|
14 |
+
# Project Specific
|
15 |
+
openai
|
16 |
+
langchain-community
|
17 |
+
langchain-openai
|
18 |
+
langchain
|
19 |
+
pinecone-client
|
20 |
+
streamlit
|
21 |
+
tiktoken
|
22 |
+
webvtt-py
|
23 |
+
moviepy
|
24 |
+
transformers
|
25 |
+
streamlit_chat==0.0.2.2
|
26 |
+
faiss-cpu
|
27 |
+
|
28 |
+
# LLM Eval
|
29 |
+
# trulens
|
src/aws/resonate_aws_functions.py
ADDED
@@ -0,0 +1,369 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Description: AWS utility functions for Resonate. This file contains the code to parse the AWS Transcribe output.
|
2 |
+
# Documentation: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/transcribe/client/start_transcription_job.html
|
3 |
+
|
4 |
+
import json
|
5 |
+
import os
|
6 |
+
import re
|
7 |
+
import time
|
8 |
+
import boto3
|
9 |
+
import dotenv
|
10 |
+
import pandas as pd
|
11 |
+
import webvtt
|
12 |
+
from datetime import datetime
|
13 |
+
from IPython.display import HTML, display
|
14 |
+
|
15 |
+
class resonate_aws_transcribe:
    """End-to-end AWS Transcribe pipeline.

    Uploads a local audio file to S3, runs a speaker-diarized transcription
    job, downloads the JSON/VTT results, and parses them into a
    speaker-attributed transcript DataFrame.
    """

    def create_client(
        self,
        aws_access_key: str,
        aws_secret_access_key: str,
        aws_region_name: str,
    ) -> tuple[boto3.client, boto3.client]:
        """Create and return AWS Transcribe and S3 clients for the given region."""
        session = boto3.Session(
            aws_access_key_id=aws_access_key,
            aws_secret_access_key=aws_secret_access_key,
            region_name=aws_region_name,
        )
        return session.client("transcribe"), session.client("s3")

    def create_s3_bucket(
        self, s3: boto3.client, bucket_name: str, aws_region_name: str
    ) -> bool:
        """Create an S3 bucket if it doesn't exist.

        Returns True when the bucket exists (created now or previously),
        False on any other error.
        """
        try:
            s3.create_bucket(
                Bucket=bucket_name,
                CreateBucketConfiguration={"LocationConstraint": aws_region_name},
            )
            print(f"S3 bucket '{bucket_name}' created successfully.")
            return True
        except s3.exceptions.BucketAlreadyExists:
            print(f"S3 bucket '{bucket_name}' already exists.")
            return True
        except Exception as e:
            print(f"Error creating S3 bucket '{bucket_name}': {e}")
            return False

    def upload_to_s3(
        self, s3: boto3.client, file_path: str, bucket_name: str, object_name=None
    ) -> str:
        """Upload a local file to S3 and return its s3:// URI ("" on failure).

        When object_name is omitted, the full local file path is used as the
        S3 key.
        """
        if object_name is None:
            object_name = file_path
        try:
            s3.upload_file(file_path, bucket_name, object_name)
            uri = f"s3://{bucket_name}/{object_name}"
            print(f"File '{file_path}' uploaded successfully to '{uri}'")
            return uri
        except Exception as e:
            print(
                f"Error uploading file '{file_path}' to '{bucket_name}/{object_name}': {e}"
            )
            return ""

    def download_from_s3(
        self,
        s3: boto3.client,
        object_name: str,
        bucket_name: str,
        local_directory: str,
    ) -> bool:
        """Download <object_name>.json and <object_name>.vtt from S3 into
        local_directory. Returns True only if both downloads succeed."""
        local_file_json = f"{local_directory}/{object_name}.json"
        local_file_vtt = f"{local_directory}/{object_name}.vtt"
        try:
            s3.download_file(bucket_name, object_name + ".json", local_file_json)
            print(
                f"File '{object_name}' (JSON) downloaded successfully to '{local_file_json}'"
            )
            s3.download_file(bucket_name, object_name + ".vtt", local_file_vtt)
            print(
                f"File '{object_name}' (VTT) downloaded successfully to '{local_file_vtt}'"
            )
            return True
        except Exception as e:
            print(f"Error downloading file '{object_name}' from '{bucket_name}': {e}")
            return False

    def delete_from_s3(
        self, s3: boto3.client, bucket_name: str, object_name: str
    ) -> bool:
        """Delete a single object from an S3 bucket; return True on success."""
        try:
            s3.delete_object(Bucket=bucket_name, Key=object_name)
            print(f"File '{object_name}' deleted successfully from '{bucket_name}'")
            return True
        except Exception as e:
            print(f"Error deleting file '{object_name}' from '{bucket_name}': {e}")
            return False

    def delete_s3_bucket(self, s3: boto3.client, bucket_name: str) -> bool:
        """Delete an S3 bucket along with all of its contents.

        Returns True on success, False on failure.
        """
        try:
            objects = s3.list_objects(Bucket=bucket_name).get("Contents", [])
            for obj in objects:
                s3.delete_object(Bucket=bucket_name, Key=obj["Key"])
                print(
                    f"Object '{obj['Key']}' deleted successfully from '{bucket_name}'"
                )
            s3.delete_bucket(Bucket=bucket_name)
            print(f"S3 bucket '{bucket_name}' and its contents deleted successfully.")
            return True
        except Exception as e:
            # BUG FIX: the original returned the exception object (truthy),
            # violating the declared -> bool contract.
            print(f"Error deleting S3 bucket '{bucket_name}': {e}")
            return False

    def transcribe_audio(
        self,
        transcribe_client: boto3.client,
        uri: str,
        output_bucket: str,
        transcribe_job_name: str = "job",
    ) -> dict:
        """Start a speaker-diarized transcription job for audio stored in S3.

        The job emits JSON plus a VTT subtitle file into output_bucket.
        """
        print("Calling AWS Transcribe Job...")
        response = transcribe_client.start_transcription_job(
            TranscriptionJobName=transcribe_job_name,
            LanguageCode="en-US",
            MediaFormat="wav",
            Settings={
                "ShowSpeakerLabels": True,
                "MaxSpeakerLabels": 10,
                "ChannelIdentification": False,
            },
            Media={"MediaFileUri": uri},
            Subtitles={"Formats": ["vtt"]},
            OutputBucketName=output_bucket,
        )
        return response

    def combine_files(self, file_name: str, local_directory: str) -> pd.DataFrame:
        """Merge the Transcribe JSON (speaker segments) with the VTT subtitles
        into one DataFrame of (start_time, end_time, speaker_label, text).

        Times are converted from seconds to minutes.
        """
        json_file_path = f"{local_directory}/{file_name}.json"
        with open(json_file_path, "r") as f:
            data = json.load(f)

        segments = data["results"]["speaker_labels"]["segments"]
        df = pd.DataFrame(segments)
        df["start_time"] = df["start_time"].astype(float) / 60
        df["end_time"] = df["end_time"].astype(float) / 60
        # (removed a no-op rename that mapped every column to itself)

        vtt_file_path = f"{local_directory}/{file_name}.vtt"
        subtitles = webvtt.read(vtt_file_path)

        data = [
            (
                subtitle.start_in_seconds / 60,
                subtitle.end_in_seconds / 60,
                subtitle.text.strip(),
            )
            for subtitle in subtitles
        ]
        titles = pd.DataFrame(data, columns=["start_time", "end_time", "text"])
        # Attach each subtitle to the most recent speaker segment that started
        # at or before it.
        transcript = pd.merge_asof(
            titles.sort_values("start_time"),
            df.sort_values("start_time"),
            on="start_time",
            direction="backward",
        )

        transcript = transcript.dropna(subset=["speaker_label"])
        # end_time_x is the subtitle's own end time (merge_asof suffixes the
        # overlapping column names).
        transcript = transcript[["start_time", "end_time_x", "speaker_label", "text"]]
        transcript.columns = ["start_time", "end_time", "speaker_label", "text"]

        # Reset the index
        transcript = transcript.reset_index(drop=True)

        print("Combined transcript successfully!")
        return transcript

    def aws_transcribe_parser(
        self, transcript_df: pd.DataFrame, output_filename: str
    ) -> pd.DataFrame:
        """Clean the combined transcript: strip quote/dash characters, drop
        consecutive duplicate utterances, and merge consecutive rows spoken by
        the same speaker. Also writes the result to
        ./data/transcriptFiles/<output_filename>.csv.
        """
        prev_text = None
        prev_speaker = None  # explicit init (was relied on via short-circuit)
        transcript_df["text"] = transcript_df["text"].apply(
            lambda x: re.sub(r"[\"\'\--]+", "", x)
        )

        for index, row in transcript_df.iterrows():
            if row["text"] == prev_text and row["speaker_label"] == prev_speaker:
                # Same utterance repeated by the same speaker: extend the
                # previous row and drop the duplicate.
                transcript_df.at[merge_start, "end_time"] = row["end_time"]
                transcript_df.drop(index, inplace=True)
            else:
                merge_start = index

            prev_text = row["text"]
            prev_speaker = row["speaker_label"]

        # Group consecutive rows with the same speaker and join their text.
        transcript_df["group"] = (
            transcript_df["speaker_label"] != transcript_df["speaker_label"].shift()
        ).cumsum()
        result_df = transcript_df.groupby(
            ["group", "speaker_label"], as_index=False
        ).agg({"start_time": "first", "end_time": "last", "text": " ".join})
        result_df = result_df.drop(columns=["group"])

        result_df.to_csv(
            "./data/transcriptFiles/" + output_filename + ".csv", index=False
        )
        return result_df

    def delete_local_temp_file(self, tempFiles: str) -> bool:
        """Delete the local temp .json/.vtt files for a transcription job.

        Returns True (BUG FIX: the original declared -> bool but returned None).
        """
        if os.path.exists("./data/tempFiles/" + tempFiles + ".json"):
            os.remove("./data/tempFiles/" + tempFiles + ".json")
        if os.path.exists("./data/tempFiles/" + tempFiles + ".vtt"):
            os.remove("./data/tempFiles/" + tempFiles + ".vtt")
        return True

    def runner(
        self,
        file_name: str,
        input_bucket: str,
        output_bucket: str,
        transcribe_job_name: str,
        aws_access_key: str,
        aws_secret_access_key: str,
        aws_region_name: str,
    ) -> pd.DataFrame:
        """Run the full transcription pipeline for one audio file.

        Uploads ./data/audioFiles/<file_name> to S3, transcribes it, downloads
        and parses the results, tears down all AWS resources, and returns the
        parsed transcript DataFrame.

        Raises RuntimeError if the transcription job fails.
        """
        transcribe_client, s3_client = self.create_client(
            aws_access_key=aws_access_key,
            aws_secret_access_key=aws_secret_access_key,
            aws_region_name=aws_region_name,
        )

        print("Transcribe_client created: ", transcribe_client)
        print("s3_client created: ", s3_client)

        # Create S3 buckets
        print(
            f"Create S3 Bucket {input_bucket} : ",
            self.create_s3_bucket(s3_client, input_bucket, aws_region_name),
        )
        print(
            f"Create S3 Bucket {output_bucket} : ",
            self.create_s3_bucket(s3_client, output_bucket, aws_region_name),
        )

        URI = self.upload_to_s3(
            s3_client, "./data/audioFiles/" + file_name, input_bucket
        )
        print("Upload completed now will initiate transcription job.")
        self.transcribe_audio(
            transcribe_client,
            URI,
            output_bucket,
            transcribe_job_name=transcribe_job_name,
        )

        # Poll the job until it finishes. BUG FIX: the original looped forever
        # when the job ended in FAILED; fail fast instead.
        while True:
            status = transcribe_client.get_transcription_job(
                TranscriptionJobName=transcribe_job_name
            )["TranscriptionJob"]["TranscriptionJobStatus"]
            if status == "COMPLETED":
                break
            if status == "FAILED":
                raise RuntimeError(
                    f"Transcription job '{transcribe_job_name}' failed."
                )
            time.sleep(3)

        # Download transcription job output
        print(
            "Download from S3 : ",
            self.download_from_s3(
                s3_client,
                transcribe_job_name,
                output_bucket,
                local_directory="./data/tempFiles/",
            ),
        )

        print(
            "Delete S3 Bucket Input Bucket : ",
            self.delete_s3_bucket(s3_client, input_bucket),
        )
        print(
            "Delete S3 Bucket Output Bucket: ",
            self.delete_s3_bucket(s3_client, output_bucket),
        )

        try:
            transcribe_client.delete_transcription_job(
                TranscriptionJobName=transcribe_job_name
            )
        except Exception:
            # (was a bare except, which also swallowed SystemExit/KeyboardInterrupt)
            print("Transcription Job does not exist.")

        # Close clients
        transcribe_client.close()
        s3_client.close()

        # Combine the json and vtt results to create a transcript
        df_transcript_combined = self.combine_files(
            transcribe_job_name, local_directory="./data/tempFiles/"
        )
        df_transcript_combined_parsed = self.aws_transcribe_parser(
            transcript_df=df_transcript_combined, output_filename=transcribe_job_name
        )
        print("Transcript parsed successfully")

        self.delete_local_temp_file(tempFiles=transcribe_job_name)
        return df_transcript_combined_parsed
|
343 |
+
|
344 |
+
|
345 |
+
if __name__ == "__main__":
    dotenv.load_dotenv("./config/.env")

    # Timestamped names keep bucket/job identifiers unique per run.
    current_timestamp = str.lower(datetime.now().strftime("%Y-%b-%d-%I-%M-%p"))

    aws_access_key = os.getenv("AWS_ACCESS_KEY")
    aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")
    print(aws_access_key, aws_secret_access_key)
    aws_region_name = "us-east-2"
    file_name = "test.wav"
    input_bucket = f"resonate-input-{str(current_timestamp)}"
    output_bucket = f"resonate-output-{str(current_timestamp)}"
    transcribe_job_name = f"resonate-job-{str(current_timestamp)}"

    transcriber = resonate_aws_transcribe()
    df = transcriber.runner(
        file_name=file_name,
        input_bucket=input_bucket,
        output_bucket=output_bucket,
        transcribe_job_name=transcribe_job_name,
        aws_access_key=aws_access_key,
        aws_secret_access_key=aws_secret_access_key,
        aws_region_name=aws_region_name,
    )
    print(df)
|
src/clustering/resonate_bert_summarizer.py
ADDED
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Description: The code helps generate abstractive summary for the transcript data using BART model
|
2 |
+
# Reference: https://huggingface.co/vmarklynn/bart-large-cnn-samsum-acsi-ami-v2
|
3 |
+
# Reference : https://github.com/vmarklynn/parrot/blob/main/notebooks/summarizer.ipynb
|
4 |
+
|
5 |
+
import math
|
6 |
+
import pandas as pd
|
7 |
+
import os
|
8 |
+
import uuid
|
9 |
+
from transformers import pipeline
|
10 |
+
|
11 |
+
# Summarizer setup: a BART checkpoint fine-tuned for meeting-style
# (SamSum/AMI) summarization; truncation keeps long transcripts within the
# model's input limit.
MODEL = 'vmarklynn/bart-large-cnn-samsum-acsi-ami-v2'
summarizer = pipeline("summarization", MODEL, truncation=True)
|
14 |
+
|
15 |
+
def format_text(text):
    """Render a transcript DataFrame as newline-joined "speaker: utterance" lines.

    Expects columns 'speaker_label' and 'text'; returns "" on any error.
    """
    try:
        lines = []
        for _, row in text.iterrows():
            lines.append(f"{row['speaker_label']}: {row['text']}")
        return "\n".join(lines)
    except Exception as e:
        print(f"Error formatting text: {e}")
        return ""
|
28 |
+
|
29 |
+
|
30 |
+
def summarize_text(transcript):
    """Summarize a transcript DataFrame with the BART summarizer.

    Returns {"transcription": <formatted transcript>, "summary": <summary>},
    or {} on error.
    """
    try:
        text = format_text(transcript)

        print("\n\nSummarizing Text...")
        summary = summarizer(text)[0]["summary_text"]
        # BUG FIX: the original stored the format_text FUNCTION OBJECT under
        # "transcription" instead of the formatted transcript string.
        response = {"transcription": text, "summary": summary}
        return response
    except Exception as e:
        print(f"Error summarizing text: {e}")
        return {}
|
44 |
+
|
45 |
+
|
46 |
+
def summarize_summary(summary_input):
    """Condense an already-summarized text with the BART summarizer.

    Uses a fixed 1024-token budget to derive the summary length bounds
    (10% minimum / 25% maximum of the budget). Returns
    ``{"summary": ...}`` on success or ``{}`` on error.
    """
    try:
        token_budget = 1024
        condensed = summarizer(
            summary_input,
            min_length=math.ceil(token_budget * 0.1),
            max_length=math.ceil(token_budget * 0.25),
        )[0]["summary_text"]
        return {"summary": condensed}
    except Exception as e:
        print(f"Error summarizing summary: {e}")
        return {}
|
62 |
+
|
63 |
+
|
64 |
+
def append_summary_to_csv(
    summary_text,
    csv_filename="./data/summaryFiles/abstract_summary_data.csv",
):
    """Append a summary row (fresh UUID + text) to the summary CSV.

    Improvements over the original:
    - the target directory is created if missing (the original silently
      returned False when it did not exist);
    - the CSV path is a parameter with the old hard-coded value as the
      default, so callers and tests can redirect the output.

    Returns the new meeting UUID string, or False on failure.
    """
    try:
        os.makedirs(os.path.dirname(csv_filename) or ".", exist_ok=True)
        meeting_uuid = str(uuid.uuid4())
        if os.path.exists(csv_filename):
            df = pd.read_csv(csv_filename)
        else:
            df = pd.DataFrame(columns=["uuid", "text"])
        new_row = pd.DataFrame({"uuid": [meeting_uuid], "text": [summary_text]})
        df = pd.concat([df, new_row], ignore_index=True)

        df.to_csv(csv_filename, index=False)
        return meeting_uuid
    except Exception as e:
        print(f"Error appending summary to CSV: {e}")
        return False
|
80 |
+
|
81 |
+
|
82 |
+
def summarize_runner(transcript):
    """Run the full summarization pipeline on a transcript DataFrame.

    Drops the unused ``end_time`` column (in place, mutating the caller's
    DataFrame as the original did), produces an abstractive summary,
    condenses it, and persists it to the summary CSV.

    Returns ``(final_summary, meeting_uuid)``, or ``(None, None)`` on
    failure. The original raised NameError on any error because both
    return values were only bound inside the try block.
    """
    final_summary, meeting_uuid = None, None
    try:
        transcript.drop(["end_time"], axis=1, inplace=True)
        summary_transcript = summarize_text(transcript)
        summarized_summary = summarize_summary(summary_transcript["summary"])
        final_summary = summarized_summary["summary"]
        meeting_uuid = append_summary_to_csv(final_summary)
    except Exception as e:
        print(f"Error in summarize_runner: {e}")
    return final_summary, meeting_uuid
|
92 |
+
|
93 |
+
|
94 |
+
if __name__ == "__main__":
    # Manual smoke test: summarize one bundled sample transcript.
    transcript_df = pd.read_csv(
        "./data/transcriptFiles/Social_Media_-_Ruins_your_life.csv"
    )
    summarize_runner(transcript_df)
|
src/clustering/resonate_clustering.py
ADDED
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Description:Creates clusters based on the uploaded transcripts and returns the uuid of the documents that are similar to the query.
|
2 |
+
# Reference Code: https://github.com/chakib401/smoothing_sentence_embeddings/blob/master/utils.py
|
3 |
+
|
4 |
+
'''
|
5 |
+
Paper Citation for def normalize_adj():
|
6 |
+
Fettal, Chakib, Lazhar Labiod, and Mohamed Nadif.
|
7 |
+
"More Discriminative Sentence Embeddings via Semantic Graph Smoothing."
|
8 |
+
arXiv preprint arXiv:2402.12890 (2024).
|
9 |
+
'''
|
10 |
+
|
11 |
+
import json
|
12 |
+
import os
|
13 |
+
import joblib
|
14 |
+
import numpy as np
|
15 |
+
import pandas as pd
|
16 |
+
import scipy.sparse as sp
|
17 |
+
import src.clustering.resonate_semantic_search as SemanticSearch
|
18 |
+
from dotenv import load_dotenv
|
19 |
+
from langchain_openai import OpenAIEmbeddings
|
20 |
+
from scipy.io import loadmat, savemat
|
21 |
+
from sklearn.cluster import MeanShift, estimate_bandwidth
|
22 |
+
from sklearn.neighbors import kneighbors_graph
|
23 |
+
|
24 |
+
def normalize_adj(adj, lmbda=1):
    """Normalize the adjacency matrix of the semantic graph.

    Two passes, as in Fettal et al. (2024): first a row (random-walk)
    normalization of ``adj + lmbda*I``, then a symmetric
    ``D^-1/2 A D^-1/2`` normalization after adding self-loops again.
    Returns a COO sparse matrix.
    """
    selfloops = lmbda * sp.eye(adj.shape[0])

    # Pass 1: row-normalize with self-loops.
    adj = adj + selfloops
    inv_degree = np.power(np.array(adj.sum(1)), -1).flatten()
    inv_degree[np.isinf(inv_degree)] = 0.0
    adj = sp.diags(inv_degree).dot(adj)

    # Pass 2: symmetric normalization with a second round of self-loops.
    adj = sp.coo_matrix(adj + selfloops)
    inv_sqrt_degree = np.power(np.array(adj.sum(1)), -0.5).flatten()
    inv_sqrt_degree[np.isinf(inv_sqrt_degree)] = 0.0
    d_inv_sqrt = sp.diags(inv_sqrt_degree)
    return d_inv_sqrt.dot(adj).dot(d_inv_sqrt).tocoo()
|
42 |
+
|
43 |
+
|
44 |
+
def graph_filtering(features, degree=2, lmbda=1, nn=10, alpha=0.5, t=5, method="sgc"):
    """Smooth sentence embeddings over a kNN semantic graph.

    Supports four polynomial graph filters: "sgc", "s2gc", "appnp", "dgc".
    ``nn=10`` follows the paper, so at least nn+1 feature rows are
    required — which is why 10 pre-existing transcripts are placed in
    Pinecone by the one-time script. Change ``nn`` if you change the
    number of transcripts.

    Raises:
        ValueError: for an unknown ``method``. (Bug fix — the original
        raised a plain string, which is itself a TypeError in Python 3.)
    """
    adj = kneighbors_graph(features, n_neighbors=nn, metric="cosine")
    adj = (adj + adj.T) / 2  # symmetrize the kNN graph

    S = normalize_adj(adj, lmbda)
    xx = features
    yy = features.copy()  # running sum used by the s2gc filter
    if method in ("sgc", "s2gc"):
        for _ in range(degree):
            xx = S @ xx
            yy += xx
        return xx if method == "sgc" else yy
    elif method == "appnp":
        for _ in range(degree):
            xx = (1 - alpha) * S @ xx + alpha * features
        return xx
    elif method == "dgc":
        k = degree + 1
        for _ in range(1, degree + 1):
            xx = (1 - t / k) * xx + (t / k) * (S @ xx)
        return xx
    raise ValueError(f"unrecognized filter: {method!r}")
|
78 |
+
|
79 |
+
|
80 |
+
def load_json_config(json_file_path="./config/config.json"):
    """Load the project's JSON configuration file and return it as a dict."""
    with open(json_file_path, "r") as config_file:
        return json.load(config_file)
|
84 |
+
|
85 |
+
|
86 |
+
class Clustering:
    """Builds meeting-summary clusters and answers cluster-based lookups.

    On construction, fits the cluster artifacts if they do not exist yet,
    then loads everything into an in-memory FAISS semantic-search index.
    """

    def __init__(self):
        self.api_key = os.environ.get("OPENAI_API_KEY")
        self.method = "dgc"  # graph filter used for embedding smoothing

        # Only build the clustering artifacts on first run.
        if not os.path.exists("./data/clusteringFiles/cluster_data.csv"):
            self.create_Cluster()

        self.index = self.initialize_FAISS()

    def create_embedding(self):
        """Embed every summary in abstract_summary_data.csv with OpenAI and
        save the (uuid, embedding) pairs to cluster-embedding.mat."""
        summaries = pd.read_csv("./data/summaryFiles/abstract_summary_data.csv")
        json_config = load_json_config()

        texts, ids = summaries["text"], summaries["uuid"]
        embedder = OpenAIEmbeddings(
            model=json_config["EMBEDDING_MODEL_NAME"], openai_api_key=self.api_key
        )
        vectors = embedder.embed_documents(texts)
        savemat(
            "./data/embeddingFiles/cluster-embedding.mat",
            {"uuid": ids, "text": vectors},
        )

    def create_Cluster(self):
        """Re-embed all summaries, fit MeanShift on the graph-smoothed
        embeddings, and persist both the model and per-summary labels."""
        self.create_embedding()

        mat = loadmat("./data/embeddingFiles/cluster-embedding.mat")
        raw_features = mat["text"]

        smoothed = graph_filtering(raw_features, method=self.method)
        bandwidth = estimate_bandwidth(smoothed, quantile=0.30, random_state=42)
        mean_shift = MeanShift(bandwidth=bandwidth, max_iter=900)
        mean_shift.fit(smoothed)
        joblib.dump(mean_shift, f"./data/clusteringFiles/{self.method}_model.joblib")

        print("Model saved")

        labeled = pd.read_csv(f"./data/summaryFiles/abstract_summary_data.csv")
        labeled["cluster"] = mean_shift.predict(smoothed)
        labeled.to_csv("./data/clusteringFiles/cluster_data.csv")
        print("Cluster data saved")
        self.index = self.initialize_FAISS()

    def uuid_for_query(self, query):
        """Return uuids of meetings whose cluster matches the cluster label
        predicted for *query* by the FAISS index."""
        predicted = self.index.search_query(query)
        print(f"Predicted Label : {predicted[0]}")
        clusters = pd.read_csv("./data/clusteringFiles/cluster_data.csv")
        return clusters[clusters["cluster"] == predicted[0]]["uuid"].tolist()

    def initialize_FAISS(self):
        """Load cluster_data.csv into a fresh FAISS semantic-search index."""
        embedding_model = SemanticSearch.SemanticEmbedding()
        faiss_index = SemanticSearch.FaissForQuerySearch(embedding_model)
        clusters = pd.read_csv("./data/clusteringFiles/cluster_data.csv")
        rows = zip(clusters["text"], clusters["uuid"], clusters["cluster"])
        for summary_text, meeting_uuid, label in rows:
            faiss_index.add_summary(summary_text, meeting_uuid, label)
        return faiss_index
|
165 |
+
|
166 |
+
|
167 |
+
if __name__ == "__main__":
    # Manual smoke test of cluster-based uuid lookup.
    load_dotenv("./config/.env")
    clustering = Clustering()
    print(
        clustering.uuid_for_query(
            "What is the goal of defining maintainability for the new diffs architecture?"
        )
    )
    print(
        clustering.uuid_for_query(
            "What was the design component for remote control?"
        )
    )
|
src/clustering/resonate_semantic_search.py
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Description: Using Facebook's Faiss library to perform semantic search according to the query
|
2 |
+
# Reference: https://deepnote.com/blog/semantic-search-using-faiss-and-mpnet
|
3 |
+
|
4 |
+
from transformers import AutoTokenizer, AutoModel
|
5 |
+
import torch
|
6 |
+
import torch.nn.functional as F
|
7 |
+
import faiss
|
8 |
+
|
9 |
+
class SemanticEmbedding:
    """Sentence embedder backed by a Hugging Face MPNet model."""

    def __init__(self, model_name="sentence-transformers/all-mpnet-base-v2"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def mean_pooling(self, model_output, attention_mask):
        """Average token embeddings, weighted by the attention mask so
        padding tokens do not contribute to the sentence vector."""
        # model_output[0] holds the per-token embeddings.
        token_embeddings = model_output[0]
        mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * mask, 1)
        token_counts = torch.clamp(mask.sum(1), min=1e-9)
        return summed / token_counts

    def get_embedding(self, sentences):
        """Return L2-normalized sentence embeddings as a numpy array."""
        encoded = self.tokenizer(
            sentences, padding=True, truncation=True, return_tensors="pt"
        )
        with torch.no_grad():
            model_output = self.model(**encoded)
        pooled = self.mean_pooling(model_output, encoded["attention_mask"])
        normalized = F.normalize(pooled, p=2, dim=1)
        return normalized.detach().numpy()
|
42 |
+
|
43 |
+
|
44 |
+
class FaissForQuerySearch:
    """Inner-product FAISS index that tracks the uuid and cluster label
    of every inserted meeting summary."""

    def __init__(self, model, dim=768):
        self.index = faiss.IndexFlatIP(dim)
        self.doc_map = dict()  # insertion position -> original summary text
        self.model = model
        self.ctr = 0  # next insertion position
        self.uuid = []  # uuid per inserted summary, in insertion order
        self.labels = []  # cluster label per inserted summary

    def search_query(self, query, k=1):
        """Return the cluster labels of the top-k summaries nearest to
        *query* (positions not present in doc_map are skipped)."""
        scores, positions = self.index.search(self.model.get_embedding(query), k)
        return [
            self.labels[pos]
            for pos, _ in zip(positions[0], scores[0])
            if pos in self.doc_map
        ]

    def add_summary(self, document_text, id, predicted_label):
        """Embed one summary into the index and record its uuid/label."""
        self.index.add(self.model.get_embedding(document_text))
        self.uuid.append(id)
        self.labels.append(predicted_label)
        self.doc_map[self.ctr] = document_text
        self.ctr += 1
|
src/langchain/resonate_langchain_functions.py
ADDED
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#Description: This file contains the LangChain class which is used to query the pinecone and chatbot
|
2 |
+
|
3 |
+
import os
|
4 |
+
import json
|
5 |
+
from src.pinecone.resonate_pinecone_functions import PineconeServerless
|
6 |
+
from langchain_community.callbacks import get_openai_callback
|
7 |
+
from dotenv import load_dotenv
|
8 |
+
from datetime import datetime
|
9 |
+
from langchain_openai import ChatOpenAI
|
10 |
+
from langchain.chains import ConversationChain
|
11 |
+
from langchain.chains.conversation.memory import (
|
12 |
+
ConversationSummaryBufferMemory,
|
13 |
+
)
|
14 |
+
from langchain.prompts.chat import (
|
15 |
+
HumanMessagePromptTemplate,
|
16 |
+
ChatPromptTemplate,
|
17 |
+
SystemMessagePromptTemplate,
|
18 |
+
)
|
19 |
+
|
20 |
+
def load_json_config(json_file_path="./config/config.json"):
    """Load the project's JSON configuration file and return it as a dict."""
    with open(json_file_path, "r") as config_file:
        return json.load(config_file)
|
24 |
+
|
25 |
+
|
26 |
+
class LangChain:
    """Ties Pinecone retrieval to an OpenAI chat model for meeting Q&A.

    Retrieval flow: a user query is matched against Pinecone, a window of
    surrounding conversation turns is pulled back, rendered into a text
    context, and passed with the query to a summarizing conversation chain.
    """

    def __init__(self):
        json_config = load_json_config()
        # Load local secrets only when a .env file is present (deployed
        # environments provide the variables directly).
        if os.path.exists("./config/.env"):
            load_dotenv("./config/.env")

        self.PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
        self.OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
        print("PINECONE_API_KEY: ", self.PINECONE_API_KEY)
        if self.PINECONE_API_KEY and self.OPENAI_API_KEY:
            self.pinecone = PineconeServerless()
            print("INSIDE PINECONE")
        # NOTE(review): when either key is missing, self.pinecone is never
        # set and chat() will raise AttributeError — confirm this is the
        # intended failure mode.
        self.llm_temperature = json_config["LC_LLM_TEMPERATURE"]
        self.llm_model = json_config["LC_LLM_MODEL"]
        self.llm_summary_max_token_limit = json_config[
            "LC_LLM_SUMMARY_MAX_TOKEN_LIMIT"
        ]
        self.llm_conv_buffer_memory_window = json_config[
            "LC_CONV_BUFFER_MEMORY_WINDOW"
        ]

        self.llm = ChatOpenAI(
            temperature=self.llm_temperature,
            model_name=self.llm_model,
            streaming=False,
        )

        # Conversation chain with rolling summarized memory.
        self.conversation_bufw = ConversationChain(
            llm=self.llm,
            memory=ConversationSummaryBufferMemory(
                llm=self.llm, max_token_limit=self.llm_summary_max_token_limit
            ),
        )

    def prompt(self, query, context):
        """Build the system+human message list for the LLM from the
        retrieved meeting context and the user query."""
        system_template = SystemMessagePromptTemplate.from_template(
            "As a helpful assistant, your task is to provide concise and relevant answers based on the given context, which consists of a transcript excerpt from a meeting. The format of the context is as follows:"
            "\nConversations in meeting: <meeting_title>, <meeting_date>"
            "\nStart Time - Speaker: Text"
            "\nYou may receive multiple meeting transcripts, if you feel your response requires reference to multiple meetings, feel free to mention <meeting_title> in your response."
            "Your responses should strictly adhere to the provided context. If you cannot find an answer within the given context, you may ask the user for more information. Ensure clarity by referencing the meeting_title, meeting_date, and speaker as needed."
            "Your responses should be succinct and directly address the user query using the context provided. Avoid discussing any information beyond the given context or using external sources."
            "Skip unnecessary phrases like 'Based on the context provided' and focus solely on answering the users query. No need for greetings or farewells."
            "\nContext:\n"
            "{context}"
        )

        human_template = HumanMessagePromptTemplate.from_template(
            "\nUser Query: {input}"
        )
        chat_prompt = ChatPromptTemplate.from_messages(
            [system_template, human_template]
        )
        chat_prompt_value = chat_prompt.format_prompt(context=context, input=query)
        return chat_prompt_value.to_messages()

    def query_chatbot(self, query, context):
        """Send the prompt built from *query* and *context* through the
        conversation chain and return its raw response."""
        self.messages = self.prompt(query, context)
        resp = self.conversation_bufw(self.messages)
        return resp

    def parse_conversations(self, conversations) -> str:
        """Render the retrieved conversation windows into readable text.

        Output format, per meeting (title/date come from the meeting's
        JSON metadata file)::

            Conversations in meeting: '<title>', meeting_date: <date> :
            Start Time - Speaker: Text
            ...
        """
        data = []
        for cluster_id, cluster_df in conversations.items():
            with open(f"./data/jsonMetaDataFiles/{cluster_id}.json") as f:
                meeting_data = json.load(f)
            meeting_title = meeting_data["meeting_title"]
            meeting_date = datetime.strptime(
                meeting_data["meeting_date"], "%Y%m%d%H%M%S"
            ).strftime("%Y-%m-%d %H:%M:%S")
            data.append(
                f"Conversations in meeting: '{meeting_title}', meeting_date: {meeting_date} :"
            )
            for i, row in cluster_df.iterrows():
                data.append(f"{row['start_time']} - {row['speaker']}: {row['text']}")
            data.append("\n\n")
        data = "\n".join(data)
        return data

    def chat(self, query, in_filter: list[str] = [], complete_db_flag: bool = False):
        """Answer *query* from the meeting database.

        ``in_filter`` (read-only) restricts the Pinecone search to the
        given meeting uuids; ``complete_db_flag=True`` searches the whole
        database instead.
        """
        # Future work: route "summary"-type queries separately (semantic
        # routing).
        self.pinecone.query_pinecone(query, in_filter, complete_db_flag)
        conversation = self.pinecone.query_delta_conversations()
        context = self.parse_conversations(conversation)
        print("Context: ", context)
        response = self.query_chatbot(query, context)
        return response

    def count_tokens(self, chain, query):
        """Run *chain* on *query* while printing OpenAI token usage."""
        with get_openai_callback() as callback:
            response = chain(query)
            print(f"Call Back: {callback}")
        return response
|
src/pinecone/resonate_pinecone_functions.py
ADDED
@@ -0,0 +1,422 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Description: Pinecone Serverless Class for Resonate
|
2 |
+
# Reference: https://www.pinecone.io/docs/
|
3 |
+
|
4 |
+
import datetime
|
5 |
+
import uuid
|
6 |
+
import json
|
7 |
+
import os
|
8 |
+
import time
|
9 |
+
import pandas as pd
|
10 |
+
from dotenv import load_dotenv
|
11 |
+
from langchain_openai import OpenAIEmbeddings
|
12 |
+
from pinecone import Pinecone, ServerlessSpec
|
13 |
+
|
14 |
+
def load_json_config(json_file_path="./config/config.json"):
    """Load the project's JSON configuration file and return it as a dict."""
    with open(json_file_path, "r") as config_file:
        return json.load(config_file)
|
18 |
+
|
19 |
+
|
20 |
+
class PineconeServerless:
|
21 |
+
def __init__(self) -> None:
|
22 |
+
print("Pinecone Serverless Initializing")
|
23 |
+
json_config = load_json_config()
|
24 |
+
load_dotenv("./config/.env")
|
25 |
+
|
26 |
+
self.PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
|
27 |
+
self.OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
|
28 |
+
if self.PINECONE_API_KEY is not None:
|
29 |
+
self.pinecone = Pinecone(api_key=self.PINECONE_API_KEY)
|
30 |
+
self._init_config(json_config)
|
31 |
+
self.meeting_title = None
|
32 |
+
self.base_data_path = "./data/jsonMetaDataFiles/"
|
33 |
+
self.master_json_file = f"{self.base_data_path}{self.master_json_filename}.json"
|
34 |
+
self._create_master_json()
|
35 |
+
self._create_index()
|
36 |
+
self.response = None
|
37 |
+
print("Pinecone Serverless Initialized")
|
38 |
+
|
39 |
+
def _init_config(self, json_config) -> None:
|
40 |
+
for key, value in json_config.items():
|
41 |
+
setattr(self, key.lower(), value)
|
42 |
+
|
43 |
+
def check_index_already_exists(self) -> bool:
|
44 |
+
return self.pinecone_index_name in self.pinecone.list_indexes()
|
45 |
+
|
46 |
+
def _get_vector_embedder(self):
|
47 |
+
if self.embedding_provider == "OpenAI":
|
48 |
+
return OpenAIEmbeddings(model=self.embedding_model_name)
|
49 |
+
else:
|
50 |
+
raise ValueError("Invalid Embedding Model")
|
51 |
+
|
52 |
+
def _get_index(self):
|
53 |
+
return self.pinecone.Index(self.pinecone_index_name)
|
54 |
+
|
55 |
+
def _create_index(self) -> None:
|
56 |
+
'''
|
57 |
+
Creates a new index in Pinecone if it does not exist
|
58 |
+
'''
|
59 |
+
pinecone_indexes_list = [
|
60 |
+
index.get("name")
|
61 |
+
for index in self.pinecone.list_indexes().get("indexes", [])]
|
62 |
+
|
63 |
+
if self.pinecone_index_name not in pinecone_indexes_list:
|
64 |
+
try:
|
65 |
+
self.pinecone.create_index(
|
66 |
+
name=self.pinecone_index_name,
|
67 |
+
metric=self.pinecone_metric,
|
68 |
+
dimension=self.pinecone_vector_dimension,
|
69 |
+
spec=ServerlessSpec(
|
70 |
+
cloud=self.pinecone_cloud_provider,
|
71 |
+
region=self.pinecone_region,
|
72 |
+
# pod_type="p1.x1", # Future use
|
73 |
+
),
|
74 |
+
)
|
75 |
+
|
76 |
+
while not self.pinecone.describe_index(self.pinecone_index_name).status["ready"]:
|
77 |
+
time.sleep(5)
|
78 |
+
|
79 |
+
except Exception as e:
|
80 |
+
print("Index creation failed: ", e)
|
81 |
+
|
82 |
+
def describe_index_stats(self) -> dict:
|
83 |
+
try:
|
84 |
+
index = self._get_index()
|
85 |
+
return index.describe_index_stats()
|
86 |
+
except Exception as e:
|
87 |
+
print("Index does not exist: ", e)
|
88 |
+
return {}
|
89 |
+
|
90 |
+
def _delete_index(self) -> None:
|
91 |
+
try:
|
92 |
+
self.pinecone.delete_index(self.pinecone_index_name)
|
93 |
+
except Exception as e:
|
94 |
+
print("Index does not exist: ", e)
|
95 |
+
|
96 |
+
def _create_master_json(self) -> None:
|
97 |
+
'''
|
98 |
+
Check if the master json file exists, if not, create it
|
99 |
+
'''
|
100 |
+
os.makedirs(os.path.dirname(self.base_data_path), exist_ok=True)
|
101 |
+
if not os.path.exists(self.master_json_file):
|
102 |
+
with open(self.master_json_file, "w") as file:
|
103 |
+
data = {
|
104 |
+
"index": self.pinecone_index_name,
|
105 |
+
"namespace": self.pinecone_namespace,
|
106 |
+
"last_conversation_no": 0,
|
107 |
+
"meeting_uuids": [],
|
108 |
+
"meetings": [],
|
109 |
+
}
|
110 |
+
|
111 |
+
with open(self.master_json_file, "w") as f:
|
112 |
+
json.dump(data, f, indent=4)
|
113 |
+
|
114 |
+
print(f"Created {self.master_json_file}")
|
115 |
+
|
116 |
+
def _update_master_json(
|
117 |
+
self,
|
118 |
+
meeting_uuid: str,
|
119 |
+
meeting_title: str,
|
120 |
+
last_conversation_no: int,
|
121 |
+
meeting_video_file: bool,
|
122 |
+
time_stamp: str,
|
123 |
+
) -> dict:
|
124 |
+
'''
|
125 |
+
Updates the master json file with the new meeting details
|
126 |
+
'''
|
127 |
+
with open(self.master_json_file, "r+") as f:
|
128 |
+
data = json.load(f)
|
129 |
+
data["meeting_uuids"] = list(set(data["meeting_uuids"] + [meeting_uuid]))
|
130 |
+
data["last_conversation_no"] = last_conversation_no
|
131 |
+
data["meetings"].append(
|
132 |
+
{
|
133 |
+
"meeting_uuid": meeting_uuid,
|
134 |
+
"meeting_title": meeting_title,
|
135 |
+
"meeting_date": time_stamp,
|
136 |
+
"meeting_video_file": meeting_video_file,
|
137 |
+
}
|
138 |
+
)
|
139 |
+
return data
|
140 |
+
|
141 |
+
def _get_meeting_members(self, transcript: pd.DataFrame) -> list[str]:
|
142 |
+
return list(transcript["speaker_label"].unique())
|
143 |
+
|
144 |
+
def _create_new_meeting_json(
|
145 |
+
self,
|
146 |
+
meeting_uuid: str,
|
147 |
+
meeting_title: str,
|
148 |
+
last_conversation_no: int,
|
149 |
+
meeting_members: list[str],
|
150 |
+
meeting_video_file: bool,
|
151 |
+
time_stamp: str,
|
152 |
+
meeting_summary: str,
|
153 |
+
) -> dict:
|
154 |
+
'''
|
155 |
+
Creates a new json file for the meeting details
|
156 |
+
'''
|
157 |
+
data = {
|
158 |
+
"index": self.pinecone_index_name,
|
159 |
+
"namespace": self.pinecone_namespace,
|
160 |
+
"meeting_title": meeting_title,
|
161 |
+
"meeting_uuid": meeting_uuid,
|
162 |
+
"meeting_date": time_stamp,
|
163 |
+
"last_conversation_no": last_conversation_no,
|
164 |
+
"meeting_video_file": meeting_video_file,
|
165 |
+
"meeting_members": meeting_members,
|
166 |
+
"meeting_summary": meeting_summary,
|
167 |
+
}
|
168 |
+
|
169 |
+
meeting_details_file = os.path.join(self.base_data_path, f"{meeting_uuid}.json")
|
170 |
+
with open(meeting_details_file, "w") as f:
|
171 |
+
json.dump(data, f, indent=4)
|
172 |
+
|
173 |
+
def _get_last_conversation_no(self) -> list[str]:
|
174 |
+
|
175 |
+
with open(self.master_json_file, "r") as f:
|
176 |
+
data = json.load(f)
|
177 |
+
|
178 |
+
return data["last_conversation_no"]
|
179 |
+
|
180 |
+
def _set_new_meeting_json(
|
181 |
+
self,
|
182 |
+
meeting_uuid: str,
|
183 |
+
meeting_title: str,
|
184 |
+
last_conversation_no: str,
|
185 |
+
meeting_members: list[str],
|
186 |
+
meeting_video_file: bool,
|
187 |
+
meeting_summary: str,
|
188 |
+
) -> dict:
|
189 |
+
'''
|
190 |
+
Updates the master json file with the new meeting details
|
191 |
+
'''
|
192 |
+
time_stamp = str(datetime.datetime.now().strftime("%Y%m%d%H%M%S"))
|
193 |
+
self._create_new_meeting_json(
|
194 |
+
meeting_uuid,
|
195 |
+
meeting_title,
|
196 |
+
last_conversation_no,
|
197 |
+
meeting_members,
|
198 |
+
meeting_video_file,
|
199 |
+
time_stamp,
|
200 |
+
meeting_summary,
|
201 |
+
)
|
202 |
+
data = self._update_master_json(
|
203 |
+
meeting_uuid,
|
204 |
+
meeting_title,
|
205 |
+
last_conversation_no,
|
206 |
+
meeting_video_file,
|
207 |
+
time_stamp,
|
208 |
+
)
|
209 |
+
|
210 |
+
with open(self.master_json_file, "w") as f:
|
211 |
+
json.dump(data, f, indent=4)
|
212 |
+
|
213 |
+
def _convert_to_hr_min_sec(self, time_in_minutes) -> str:
|
214 |
+
# Hr:Min:Sec
|
215 |
+
hours = int(time_in_minutes // 60)
|
216 |
+
minutes = int(time_in_minutes % 60)
|
217 |
+
seconds = int((time_in_minutes - int(time_in_minutes)) * 60)
|
218 |
+
return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
|
219 |
+
|
220 |
+
def pinecone_upsert(
|
221 |
+
self,
|
222 |
+
transcript: pd.DataFrame,
|
223 |
+
meeting_uuid: str = "",
|
224 |
+
meeting_video_file: bool = False,
|
225 |
+
meeting_title: str = "Unnamed",
|
226 |
+
meeting_summary: str = "",
|
227 |
+
) -> None:
|
228 |
+
"""
|
229 |
+
Upserts the transcript into Pinecone
|
230 |
+
"""
|
231 |
+
print("Upserting transcript into Pinecone...")
|
232 |
+
texts = []
|
233 |
+
metadatas = []
|
234 |
+
|
235 |
+
last_conversation_no = self._get_last_conversation_no()
|
236 |
+
last_conversation_no = int(last_conversation_no)
|
237 |
+
|
238 |
+
embed = self._get_vector_embedder()
|
239 |
+
meeting_members = self._get_meeting_members(transcript)
|
240 |
+
index = self._get_index()
|
241 |
+
|
242 |
+
for _, record in transcript.iterrows():
|
243 |
+
start_time = self._convert_to_hr_min_sec(record["start_time"])
|
244 |
+
|
245 |
+
metadata = {
|
246 |
+
"speaker": record["speaker_label"],
|
247 |
+
"start_time": start_time,
|
248 |
+
"text": record["text"],
|
249 |
+
"meeting_uuid": meeting_uuid,
|
250 |
+
}
|
251 |
+
texts.append(record["text"])
|
252 |
+
metadatas.append(metadata)
|
253 |
+
|
254 |
+
if len(texts) >= self.pinecone_upsert_batch_limit:
|
255 |
+
ids = list(
|
256 |
+
map(
|
257 |
+
lambda i: str(i + 1),
|
258 |
+
range(last_conversation_no, last_conversation_no + len(texts)),
|
259 |
+
)
|
260 |
+
)
|
261 |
+
last_conversation_no += len(texts)
|
262 |
+
embeds = embed.embed_documents(texts)
|
263 |
+
|
264 |
+
try:
|
265 |
+
index.upsert(
|
266 |
+
vectors=zip(ids, embeds, metadatas),
|
267 |
+
namespace=self.pinecone_namespace,
|
268 |
+
)
|
269 |
+
except Exception as e:
|
270 |
+
print("Error upserting into Pinecone: ", e)
|
271 |
+
texts = []
|
272 |
+
metadatas = []
|
273 |
+
|
274 |
+
# Upsert the remaining texts
|
275 |
+
if len(texts) > 0:
|
276 |
+
ids = list(
|
277 |
+
map(
|
278 |
+
lambda i: str(i + 1),
|
279 |
+
range(last_conversation_no, last_conversation_no + len(texts)),
|
280 |
+
)
|
281 |
+
)
|
282 |
+
last_conversation_no += len(texts)
|
283 |
+
embeds = embed.embed_documents(texts)
|
284 |
+
|
285 |
+
try:
|
286 |
+
index.upsert(
|
287 |
+
vectors=zip(ids, embeds, metadatas),
|
288 |
+
namespace=self.pinecone_namespace,
|
289 |
+
)
|
290 |
+
except Exception as e:
|
291 |
+
print("Error upserting into Pinecone: ", e)
|
292 |
+
|
293 |
+
self._set_new_meeting_json(
|
294 |
+
meeting_uuid,
|
295 |
+
meeting_title,
|
296 |
+
last_conversation_no,
|
297 |
+
meeting_members,
|
298 |
+
meeting_video_file,
|
299 |
+
meeting_summary,
|
300 |
+
)
|
301 |
+
|
302 |
+
print("Upserted transcript into Pinecone")
|
303 |
+
|
304 |
+
def _extract_id_from_response(self, response: list) -> list[int]:
|
305 |
+
if response:
|
306 |
+
return list(int(match["id"]) for match in response["matches"])
|
307 |
+
return []
|
308 |
+
|
309 |
+
def query_pinecone(
    self,
    query: str,
    in_filter: "list[str] | None" = None,
    complete_db_flag: bool = False,
) -> list:
    """
    Queries Pinecone for the given query, where in_filter is the list of
    meeting_uuids to filter the query, and if complete_db_flag is True,
    the entire database is queried.

    The raw query response is stored on ``self.response`` (consumed later
    by ``query_delta_conversations``) and also returned; on any failure
    the error is printed and an empty list is returned.

    ``in_filter`` defaults to ``None`` (treated as an empty list) instead
    of a shared mutable default argument.
    """
    # for using without clustering, set complete_db_flag to True
    try:
        index = self._get_index()
        embed = self._get_vector_embedder()

        # None => search the whole namespace; otherwise restrict to the
        # supplied meeting uuids. Named metadata_filter to avoid shadowing
        # the builtin `filter`. `in_filter or []` also tolerates callers
        # passing "" (seen in the __main__ smoke test).
        metadata_filter = (
            None if complete_db_flag else {"meeting_uuid": {"$in": in_filter or []}}
        )

        self.response = index.query(
            vector=embed.embed_documents([query])[0],
            namespace=self.pinecone_namespace,
            top_k=self.pinecone_top_k_results,
            include_metadata=True,
            filter=metadata_filter,
        )
        return self.response
    except Exception as e:
        print("Error querying Pinecone: ", e)
        return []
334 |
+
|
335 |
+
|
336 |
+
def query_delta_conversations(self) -> pd.DataFrame:
    """
    For every hit in the last Pinecone query response, fetch the
    surrounding conversation window (the "delta") and return the
    results grouped by meeting.
    """
    hit_ids = self._extract_id_from_response(self.response)
    last_no = self._get_last_conversation_no()
    index = self._get_index()
    conversation = {}

    for conv_id in hit_ids:
        # Clamp the window to [1, last_no] — conversation ids are 1-based.
        lo = max(conv_id - self.pinecone_delta_window, 1)
        hi = min(conv_id + self.pinecone_delta_window, last_no)
        window = [str(n) for n in range(lo, hi + 1)]
        try:
            print("Contextual Window Conversation IDs: ", window)
            conversation[conv_id] = index.fetch(
                ids=window, namespace=self.pinecone_namespace
            )
        except Exception as e:
            # Skip this hit but keep fetching the remaining windows.
            print("Error fetching from Pinecone for id:", conv_id, "Error:", e)

    return self._parse_fetch_conversations(conversation)
369 |
+
|
370 |
+
|
371 |
+
def _parse_fetch_conversations(self, conversation)-> dict:
|
372 |
+
'''
|
373 |
+
Parses the conversation dictionary and returns a grouped_dfs
|
374 |
+
'''
|
375 |
+
data_rows = []
|
376 |
+
for primary_hit_id, primary_hit_data in conversation.items():
|
377 |
+
for _, vector_data in primary_hit_data["vectors"].items():
|
378 |
+
id = vector_data["id"]
|
379 |
+
meeting_uuid = vector_data["metadata"]["meeting_uuid"]
|
380 |
+
speaker = vector_data["metadata"]["speaker"]
|
381 |
+
start_time = vector_data["metadata"]["start_time"]
|
382 |
+
text = vector_data["metadata"]["text"]
|
383 |
+
|
384 |
+
data_rows.append(
|
385 |
+
(primary_hit_id, id, meeting_uuid, speaker, start_time, text)
|
386 |
+
)
|
387 |
+
|
388 |
+
columns = ["primary_id", "id", "meeting_uuid", "speaker", "start_time", "text"]
|
389 |
+
delta_conversation_df = pd.DataFrame(data_rows, columns=columns)
|
390 |
+
delta_conversation_df = delta_conversation_df.sort_values(by=["id"])
|
391 |
+
delta_conversation_df = delta_conversation_df.drop_duplicates(subset=["id"])
|
392 |
+
|
393 |
+
# creating separate df for rows with same meeting_cluster_id
|
394 |
+
grouped_dfs = {
|
395 |
+
group_name: group.reset_index(drop=True, inplace=False)
|
396 |
+
for group_name, group in delta_conversation_df.groupby("meeting_uuid")
|
397 |
+
}
|
398 |
+
# return delta_conversation_df
|
399 |
+
return grouped_dfs
|
400 |
+
|
401 |
+
|
402 |
+
if __name__ == "__main__":
    # Smoke test: upsert two sample healthcare transcripts, then run a
    # free-text query against the entire index.
    pinecone = PineconeServerless()
    print(pinecone.describe_index_stats())

    for i in range(1, 3):
        print(i)
        transcript = pd.read_csv(f"./data/transcriptFiles/healthcare_{i}.csv")
        transcript.dropna(inplace=True)
        pinecone.pinecone_upsert(
            transcript,
            meeting_uuid=str(uuid.uuid4()),
            meeting_video_file=False,
            meeting_title=f"Healthcare Meeting {i}",
            meeting_summary=f"Healthcare Meeting Summary Meeting {i}",
        )
        # Give the serverless index a moment to reflect the upsert before
        # reading its stats.
        time.sleep(5)
        print(pinecone.describe_index_stats())

    query = "I am one of the directors in Wappingers Central School District."
    # in_filter is declared list[str]; pass [] rather than "" (it is
    # ignored anyway because complete_db_flag=True).
    response1 = pinecone.query_pinecone(query, [], True)
    print(response1)
|
src/utils/resonate_streamlitUtils.py
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import moviepy.editor as mp
|
2 |
+
import json
|
3 |
+
from datetime import datetime, timedelta
|
4 |
+
from src.aws.resonate_aws_functions import resonate_aws_transcribe
|
5 |
+
import os
|
6 |
+
|
7 |
+
from src.pinecone.resonate_pinecone_functions import PineconeServerless
|
8 |
+
import time
|
9 |
+
import uuid
|
10 |
+
import pandas as pd
|
11 |
+
|
12 |
+
|
13 |
+
def convert_video_to_audio(video_path, audio_path):
    """Extract the audio track of *video_path* and write it to *audio_path*.

    BUG FIX: the original leaked the VideoFileClip's file handles; the
    clip is now explicitly closed even if the audio write fails.
    """
    video_clip = mp.VideoFileClip(video_path)
    try:
        video_clip.audio.write_audiofile(audio_path)
    finally:
        video_clip.close()
18 |
+
|
19 |
+
def transcript_text_editor_minutes_to_hhmmss(minutes):
    """Render a duration given in minutes using timedelta's H:MM:SS string form."""
    return str(timedelta(minutes=minutes))
24 |
+
|
25 |
+
def load_json_config(json_file_path="./config/config.json"):
    """Load and return the JSON configuration stored at *json_file_path*."""
    # The context manager guarantees the file handle is released.
    with open(json_file_path, "r") as config_file:
        return json.load(config_file)
+
|
32 |
+
|
33 |
+
def aws_transcribe(file_name):
    """Kick off an AWS Transcribe run for *file_name* via resonate_aws_transcribe.

    Bucket and job names from the JSON config are suffixed with the current
    timestamp so repeated runs do not collide on S3/Transcribe resources.

    Returns whatever ``rat.runner`` produces on success (presumably a
    transcript DataFrame — confirm against resonate_aws_functions).
    NOTE(review): on failure the caught exception object is *returned*,
    not raised — callers must type-check the result; consider re-raising.
    """
    json_config = load_json_config()
    # Lower-cased timestamp (e.g. "2024-feb-08-09-30-am") so it stays valid
    # inside S3 bucket names.
    current_timestamp = str.lower(datetime.now().strftime("%Y-%b-%d-%I-%M-%p"))

    # Make bucket and job names unique per run.
    json_config["AWS_INPUT_BUCKET"] += f"{str(current_timestamp)}"
    json_config["AWS_OUTPUT_BUCKET"] += f"{str(current_timestamp)}"
    json_config["AWS_TRANSCRIBE_JOB_NAME"] += f"{str(current_timestamp)}"

    print(json_config)  # NOTE(review): debug print of config; consider logging instead

    try:
        rat = resonate_aws_transcribe()
        df = rat.runner(
            file_name=file_name,
            input_bucket=json_config["AWS_INPUT_BUCKET"],
            output_bucket=json_config["AWS_OUTPUT_BUCKET"],
            transcribe_job_name=json_config["AWS_TRANSCRIBE_JOB_NAME"],
            # Credentials come from the environment, not the JSON config.
            aws_access_key=os.getenv("AWS_ACCESS_KEY"),
            aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
            aws_region_name=json_config["AWS_REGION"],
        )

        return df
    except Exception as e:
        return e
59 |
+
|
60 |
+
|
61 |
+
def pinecone_init_upsert(
    df_transcript: pd.DataFrame,
    meeting_title: str,
    meeting_summary: str,
    meeting_uuid: str,
):
    """Upsert one meeting transcript into Pinecone.

    Parameters
    ----------
    df_transcript : pd.DataFrame
        Transcript rows to upsert (schema expected by PineconeServerless.pinecone_upsert).
    meeting_title, meeting_summary, meeting_uuid : str
        Metadata recorded with the meeting.

    Errors are printed and swallowed (best-effort), preserving the original
    behaviour so the calling Streamlit flow is not interrupted.
    """
    try:
        pinecone = PineconeServerless()
        pinecone.pinecone_upsert(
            df_transcript,
            meeting_uuid=meeting_uuid,
            meeting_video_file=False,
            meeting_title=meeting_title,
            meeting_summary=meeting_summary,
        )
        # Give the serverless index time to reflect the new vectors.
        time.sleep(5)
    except Exception as e:
        print("Error upserting transcript to Pinecone: ", e)
|