madhuroopa commited on
Commit
1366204
·
1 Parent(s): a67e2cf

added new application files

Browse files
app.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Description: This file contains the main Streamlit application for the Resonate project.
2
+ # Run command: streamlit run app.py to run the application
3
+
4
+
5
+ import os
6
+ import pandas as pd
7
+ import streamlit as st
8
+ from dotenv import load_dotenv
9
+ from streamlit import session_state as ss
10
+ from streamlit_chat import message
11
+ from src.clustering.resonate_bert_summarizer import summarize_runner
12
+ from src.clustering.resonate_clustering import Clustering
13
+ from src.langchain.resonate_langchain_functions import LangChain
14
+ from src.utils.resonate_streamlitUtils import (
15
+ aws_transcribe,
16
+ convert_video_to_audio,
17
+ pinecone_init_upsert,
18
+ transcript_text_editor_minutes_to_hhmmss,
19
+ )
20
+
21
+
22
def initialize_session_state():
    """Seed Streamlit session state on first run.

    Populates API keys from the environment, view-routing flags, the
    clustering/LangChain helper objects, and the working DataFrames used by
    the transcript editors. Each key is guarded so reruns do not clobber
    state the user has already modified.
    """
    # Initialize API keys in session state if not present
    if "api_keys" not in ss:
        ss.api_keys = {}
        ss.api_keys["openai_api_key"] = os.environ.get("OPENAI_API_KEY")
        ss.api_keys["pinecone_api_key"] = os.environ.get("PINECONE_API_KEY")
        ss.api_keys["aws_access_key"] = os.environ.get("AWS_ACCESS_KEY")
        ss.api_keys["aws_secret_access_key"] = os.environ.get("AWS_SECRET_ACCESS_KEY")
    if "add_meeting" not in ss:
        ss.add_meeting = False
    if "Clustering_obj" not in ss:
        ss.Clustering_obj = Clustering()
    # Main screen - transcript editors
    if "transcript_speaker_editor" not in ss:
        ss.transcript_speaker_editor = False
    if "transcript_text_editor" not in ss:
        ss.transcript_text_editor = False
    if "meeting_name" not in ss:
        ss.meeting_name = ""
    if "df_transcript_speaker" not in ss:
        ss.df_transcript_speaker = pd.DataFrame()
    if "df_transcript_text" not in ss:
        ss.df_transcript_text = pd.DataFrame()
    # Bug fix: the guard previously tested the never-assigned key "updated_df",
    # so this frame was re-created on every script rerun, discarding edits.
    if "updated_transcript_df_to_embed" not in ss:
        ss.updated_transcript_df_to_embed = pd.DataFrame()
    if "chat_view" not in ss:
        ss.chat_view = True
    if "langchain_obj" not in ss:
        ss.langchain_obj = LangChain()
    if "query" not in ss:
        ss.query = ""
    if "responses" not in ss:
        ss["responses"] = ["How can I assist you?"]
    if "requests" not in ss:
        ss["requests"] = []
57
+
58
+
59
def chat_view():
    """Render the chat pane: history, input box, and a Clear button.

    A submitted query is scoped through clustering (uuid filter), answered
    via the LangChain wrapper, and appended to the chat history. The Clear
    button wipes both the LangChain conversation buffer and the history.
    """
    st.header("Chat")
    response_container = st.container()
    textcontainer = st.container()
    with textcontainer:
        query = st.text_input(
            "Chat Here",
            placeholder="Message Resonate ... ",
            value=ss.query,
            key="query_input",
        )
        # Clear button
        if st.button("Clear"):
            ss.langchain_obj.conversation_bufw.memory.clear()  # Clear conversation buffer
            ss.query = ""
            ss.requests = []
            # ss.responses and ss["responses"] address the same session-state
            # key, so a single assignment both clears and seeds the greeting
            # (the original assigned it twice back to back).
            ss["responses"] = ["How can I assist you?"]
            st.rerun()
        elif query:
            with st.spinner("typing..."):
                uuid_list = ss.Clustering_obj.uuid_for_query(query=query)
                print(f"Meeting Unique ID : {uuid_list}")
                response = ss.langchain_obj.chat(
                    query=query, in_filter=uuid_list, complete_db_flag=False
                )
                response = response["response"]
            ss.requests.append(query)
            ss.responses.append(response)
            ss.query = ""
    with response_container:
        if ss["responses"]:
            for i in range(len(ss["responses"])):
                message(ss["responses"][i], key=str(i))
                if i < len(ss["requests"]):
                    message(
                        ss["requests"][i],
                        is_user=True,
                        key=str(i) + "_user",
                    )
100
+
101
+
102
def api_keys_input():
    """Sidebar form for entering and saving API keys.

    Saved keys are written back into session state and, when non-empty,
    exported into ``os.environ`` so downstream clients pick them up.
    """
    with st.form("keys_input_form"):
        # Pre-fill each field from session state (empty string if unset).
        openai_api_key = st.text_input(
            "OpenAPI Key:",
            type="password",
            value=ss.api_keys.get("openai_api_key", ""),
        )
        pinecone_api_key = st.text_input(
            "Pinecone Key:",
            type="password",
            value=ss.api_keys.get("pinecone_api_key", ""),
        )
        aws_access_key = st.text_input(
            "AWS Access Key:",
            type="password",
            value=ss.api_keys.get("aws_access_key", ""),
        )
        aws_secret_access_key = st.text_input(
            "AWS Secret Access Key:",
            type="password",
            value=ss.api_keys.get("aws_secret_access_key", ""),
        )
        if st.form_submit_button("Save API Keys"):
            # Persist the submitted values in session state.
            ss.api_keys["openai_api_key"] = openai_api_key
            ss.api_keys["pinecone_api_key"] = pinecone_api_key
            ss.api_keys["aws_access_key"] = aws_access_key
            ss.api_keys["aws_secret_access_key"] = aws_secret_access_key
            # Export only the keys that were actually provided.
            for env_name, key_name in (
                ("OPENAI_API_KEY", "openai_api_key"),
                ("PINECONE_API_KEY", "pinecone_api_key"),
                ("AWS_ACCESS_KEY", "aws_access_key"),
                ("AWS_SECRET_ACCESS_KEY", "aws_secret_access_key"),
            ):
                if ss.api_keys[key_name]:
                    os.environ[env_name] = ss.api_keys[key_name]
            st.rerun()
153
+
154
+
155
def add_meeting():
    """Form to upload a meeting recording (wav/mp4) and kick off transcription.

    Video uploads are written to disk and converted to wav first; the audio is
    then sent to AWS Transcribe, and the speaker-labelled transcript is stored
    in session state for the speaker editor.
    """
    with st.form("add_meeting_form"):
        uploaded_file = st.file_uploader("Choose a file", type=["wav", "mp4"])
        # Get user input
        meeting_name = st.text_input("Enter Meeting Name:")
        save_meeting_button = st.form_submit_button("Save Meeting")
        if save_meeting_button:
            if not meeting_name:
                st.warning("Please enter Meeting Name.")
            elif uploaded_file is None:
                st.warning("Please upload a meeting recording.")
            else:  # both present; the original re-tested the same conditions here
                with st.spinner("Processing..."):
                    file_name = uploaded_file.name.replace(" ", "_")
                    if file_name.endswith(".mp4") or file_name.endswith(".mpeg4"):
                        print("in video")
                        # `with` closes the file; the original's explicit close() was redundant.
                        with open("data/videoFiles/" + file_name, "wb") as f:
                            f.write(uploaded_file.getbuffer())
                        # Convert video file to audio file.
                        # NOTE(review): the [:-4] slice assumes a ".mp4" suffix;
                        # ".mpeg4" would be mis-trimmed — the uploader only
                        # permits wav/mp4, so this cannot trigger today.
                        audio_path = "data/audioFiles/" + file_name[:-4] + ".wav"
                        convert_video_to_audio(
                            "data/videoFiles/" + file_name, audio_path
                        )
                        file_name = file_name[:-4] + ".wav"
                    elif file_name.endswith(".wav"):
                        print("in audio")
                        with open("data/audioFiles/" + file_name, "wb") as f:
                            f.write(uploaded_file.getbuffer())
                    ss.df_transcript_speaker = aws_transcribe(file_name)
                    ss.meeting_name = meeting_name
                    ss.transcript_speaker_editor = True
188
+
189
+
190
def transcript_speaker_editor():
    """Form to rename AWS speaker labels; hands the edited frame to the text editor."""
    ss.add_meeting = False
    with st.form("transcript_speaker_editor_form"):
        st.write("Transcript Speaker Editor:")
        st.dataframe(ss.df_transcript_speaker)
        df = ss.df_transcript_speaker.copy(deep=True)
        # One text box per distinct speaker label, collecting the new names.
        renames = {}
        for label in df["speaker_label"].unique():
            renames[label] = st.text_input(f"Edit speaker label '{label}'", label)
        # Apply the collected replacements to the working copy.
        for old_name, new_name in renames.items():
            df["speaker_label"] = df["speaker_label"].replace(old_name, new_name)
        update_speaker_button = st.form_submit_button("Update Speakers")
        if update_speaker_button and df is not None:
            # Hand off to the text editor and clear the speaker-editor state.
            ss.df_transcript_speaker = pd.DataFrame()
            ss.df_transcript_text = df.copy(deep=True)
            del df
            ss.transcript_text_editor = True
            ss.transcript_speaker_editor = False
            st.rerun()
217
+
218
+
219
def transcript_text_editor_update_text(row_index, new_text):
    """Write new_text into the 'text' column of the row being edited."""
    ss.updated_transcript_df_to_embed.at[row_index, "text"] = new_text
222
+
223
+
224
def transcript_text_editor():
    """Row-by-row transcript text editor.

    On "Finish Transcript Editing" the edited transcript is summarized,
    re-clustered, and upserted into Pinecone, then the app returns to chat view.
    """
    ss.transcript_speaker_editor = False
    st.write("Transcript Text Editor:")
    st.write(ss.df_transcript_text)
    working = ss.df_transcript_text.copy(deep=True)
    ss.updated_transcript_df_to_embed = working.copy(deep=True)
    # Show times as HH:MM:SS while editing.
    working["start_time"] = working["start_time"].apply(
        transcript_text_editor_minutes_to_hhmmss
    )
    working["end_time"] = working["end_time"].apply(
        transcript_text_editor_minutes_to_hhmmss
    )
    row_index = st.number_input(
        "Enter the row index:",
        min_value=0,
        max_value=len(working) - 1,
        value=0,
        step=1,
    )
    new_text = st.text_area("Enter the new text:", working.at[row_index, "text"])
    if st.button("Update Text"):
        transcript_text_editor_update_text(row_index, new_text)
        st.success("Text updated successfully!")
    # Display the updated dataframe
    st.header("Updated Transcript")
    st.table(ss.updated_transcript_df_to_embed)
    if st.button("Finish Transcript Editing"):
        with st.spinner("Uploading..."):
            ss.df_transcript_text = pd.DataFrame()
            meeting_summary, meeting_uuid = summarize_runner(
                ss.updated_transcript_df_to_embed
            )
            ss.Clustering_obj.create_Cluster()
            pinecone_init_upsert(
                ss.updated_transcript_df_to_embed,
                meeting_title=ss.meeting_name,
                meeting_summary=meeting_summary,
                meeting_uuid=meeting_uuid,
            )
            ss.meeting_name = "unnamed"
            st.success("Pinecone upsert completed successfully!")
            ss.transcript_text_editor = False
            ss.updated_transcript_df_to_embed = pd.DataFrame()
            ss.chat_view = True
            st.rerun()
268
+
269
+
270
def init_streamlit():
    """App entry point: load env config, seed session state, route between views."""
    load_dotenv("./config/.env")
    initialize_session_state()
    # Collapse the sidebar once a Pinecone key is already configured.
    if ss.api_keys["pinecone_api_key"] is not None:
        st.set_page_config(
            initial_sidebar_state="collapsed",
            layout="wide",
        )
    st.title("RESONATE")
    # Sidebar: API key entry plus the view toggle.
    with st.sidebar:
        api_keys_input()
        if st.button("Upload Meeting / Chat"):
            ss.add_meeting = not ss.add_meeting
            ss.chat_view = not ss.chat_view
            ss.transcript_speaker_editor = False
            ss.transcript_text_editor = False
    # Main-screen routing driven by the session-state flags.
    if ss.add_meeting:
        add_meeting()
    if ss.transcript_speaker_editor:
        transcript_speaker_editor()
    if ss.df_transcript_text is not None and ss.transcript_text_editor:
        transcript_text_editor()
    if ss.chat_view:
        chat_view()  # Chat view
296
+
297
+
298
if __name__ == "__main__":
    init_streamlit()

# Sample questions for manual testing:
# What was discussed about cyberbullying?
# What is one new feature planned for GitLab's code search?
# What is the goal of defining maintainability for the new diffs architecture?
requirements.txt ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core
2
+ numpy
3
+ pandas
4
+ tqdm
5
+ ipykernel
6
+ boto3
7
+ botocore
8
+ jupyter
9
+ ipywidgets
10
+ python-dotenv
11
+ scikit-learn
12
+ torch
13
+
14
+ # Project Specific
15
+ openai
16
+ langchain-community
17
+ langchain-openai
18
+ langchain
19
+ pinecone-client
20
+ streamlit
21
+ tiktoken
22
+ webvtt-py
23
+ moviepy
24
+ transformers
25
+ streamlit_chat==0.0.2.2
26
+ faiss-cpu
27
+
28
+ # LLM Eval
29
+ # trulens
src/aws/resonate_aws_functions.py ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Description: AWS utility functions for Resonate. This file contains the code to parse the AWS Transcribe output.
2
+ # Documentation: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/transcribe/client/start_transcription_job.html
3
+
4
+ import json
5
+ import os
6
+ import re
7
+ import time
8
+ import boto3
9
+ import dotenv
10
+ import pandas as pd
11
+ import webvtt
12
+ from datetime import datetime
13
+ from IPython.display import HTML, display
14
+
15
class resonate_aws_transcribe:
    """End-to-end AWS Transcribe pipeline.

    Uploads an audio file to S3, runs a speaker-diarized transcription job,
    downloads and merges the JSON + VTT outputs into a transcript DataFrame,
    and cleans up the AWS resources it created.
    """

    def create_client(
        self,
        aws_access_key: str,
        aws_secret_access_key: str,
        aws_region_name: str,
    ) -> tuple[boto3.client, boto3.client]:
        """
        Create and return AWS Transcribe and S3 clients with the specified AWS region.
        """
        session = boto3.Session(
            aws_access_key_id=aws_access_key,
            aws_secret_access_key=aws_secret_access_key,
            region_name=aws_region_name,
        )
        return session.client("transcribe"), session.client("s3")

    def create_s3_bucket(
        self, s3: boto3.client, bucket_name: str, aws_region_name: str
    ) -> bool:
        """
        Create an S3 bucket if it doesn't exist; return True on success or if it already exists.
        """
        try:
            s3.create_bucket(
                Bucket=bucket_name,
                CreateBucketConfiguration={"LocationConstraint": aws_region_name},
            )
            print(f"S3 bucket '{bucket_name}' created successfully.")
            return True
        except s3.exceptions.BucketAlreadyExists:
            print(f"S3 bucket '{bucket_name}' already exists.")
            return True
        except Exception as e:
            print(f"Error creating S3 bucket '{bucket_name}': {e}")
            return False

    def upload_to_s3(
        self, s3: boto3.client, file_path: str, bucket_name: str, object_name=None
    ) -> str:
        """
        Upload the audio file to the S3 bucket; return its s3:// URI, or "" on failure.
        """
        if object_name is None:
            object_name = file_path

        try:
            s3.upload_file(file_path, bucket_name, object_name)
            uri = f"s3://{bucket_name}/{object_name}"
            print(f"File '{file_path}' uploaded successfully to '{uri}'")
            return uri

        except Exception as e:
            print(
                f"Error uploading file '{file_path}' to '{bucket_name}/{object_name}': {e}"
            )
            return ""

    def download_from_s3(
        self,
        s3: boto3.client,
        object_name: str,
        bucket_name: str,
        local_directory: str,
    ) -> bool:
        """
        Download the .json and .vtt transcription outputs from S3 into a local directory.
        """
        local_file_json = f"{local_directory}/{object_name}.json"
        local_file_vtt = f"{local_directory}/{object_name}.vtt"

        try:
            s3.download_file(bucket_name, object_name + ".json", local_file_json)
            print(f"File '{object_name}' (JSON) downloaded successfully to '{local_file_json}'")

            s3.download_file(bucket_name, object_name + ".vtt", local_file_vtt)
            print(f"File '{object_name}' (VTT) downloaded successfully to '{local_file_vtt}'")
            return True
        except Exception as e:
            print(f"Error downloading file '{object_name}' from '{bucket_name}': {e}")
            return False

    def delete_from_s3(
        self, s3: boto3.client, bucket_name: str, object_name: str
    ) -> bool:
        """
        Delete one object from an S3 bucket.
        """
        try:
            s3.delete_object(Bucket=bucket_name, Key=object_name)
            print(f"File '{object_name}' deleted successfully from '{bucket_name}'")
            return True
        except Exception as e:
            print(f"Error deleting file '{object_name}' from '{bucket_name}': {e}")
            return False

    def delete_s3_bucket(self, s3: boto3.client, bucket_name: str) -> bool:
        """
        Delete an S3 bucket along with its contents.
        """
        try:
            objects = s3.list_objects(Bucket=bucket_name).get("Contents", [])
            for obj in objects:
                s3.delete_object(Bucket=bucket_name, Key=obj["Key"])
                print(
                    f"Object '{obj['Key']}' deleted successfully from '{bucket_name}'"
                )

            s3.delete_bucket(Bucket=bucket_name)
            print(f"S3 bucket '{bucket_name}' and its contents deleted successfully.")
            return True
        except Exception as e:
            # Bug fix: previously returned the exception object itself, which is
            # truthy and violates the declared bool return type.
            print(f"Error deleting S3 bucket '{bucket_name}': {e}")
            return False

    def transcribe_audio(
        self,
        transcribe_client: boto3.client,
        uri: str,
        output_bucket: str,
        transcribe_job_name: str = "job",
    ) -> dict:
        """
        Start a transcription job for audio stored in S3 (speaker labels on, VTT subtitles).
        """
        print("Calling AWS Transcribe Job...")
        response = transcribe_client.start_transcription_job(
            TranscriptionJobName=transcribe_job_name,
            LanguageCode="en-US",
            MediaFormat="wav",
            Settings={
                "ShowSpeakerLabels": True,
                "MaxSpeakerLabels": 10,
                "ChannelIdentification": False,
            },
            Media={"MediaFileUri": uri},
            Subtitles={"Formats": ["vtt"]},
            OutputBucketName=output_bucket,
        )
        return response

    def combine_files(self, file_name: str, local_directory: str) -> pd.DataFrame:
        """
        Merge the Transcribe JSON (speaker segments) with the VTT subtitles into one
        DataFrame of start/end times (in minutes), speaker labels, and text.
        """
        json_file_path = f"{local_directory}/{file_name}.json"
        with open(json_file_path, "r") as f:
            data = json.load(f)

        segments = data["results"]["speaker_labels"]["segments"]
        df = pd.DataFrame(segments)
        # Segment times arrive as strings of seconds; convert to float minutes.
        df["start_time"] = df["start_time"].astype(float) / 60
        df["end_time"] = df["end_time"].astype(float) / 60
        # (The original's column rename mapped every column to itself — removed as a no-op.)

        vtt_file_path = f"{local_directory}/{file_name}.vtt"
        subtitles = webvtt.read(vtt_file_path)

        data = [
            (
                subtitle.start_in_seconds / 60,
                subtitle.end_in_seconds / 60,
                subtitle.text.strip(),
            )
            for subtitle in subtitles
        ]
        titles = pd.DataFrame(data, columns=["start_time", "end_time", "text"])
        # Attach each subtitle to the most recent speaker segment at or before it.
        transcript = pd.merge_asof(
            titles.sort_values("start_time"),
            df.sort_values("start_time"),
            on="start_time",
            direction="backward",
        )

        transcript = transcript.dropna(subset=["speaker_label"])
        transcript = transcript[["start_time", "end_time_x", "speaker_label", "text"]]
        transcript.columns = ["start_time", "end_time", "speaker_label", "text"]

        # Reset the index
        transcript = transcript.reset_index(drop=True)

        print("Combined transcript successfully!")
        return transcript

    def aws_transcribe_parser(
        self, transcript_df: pd.DataFrame, output_filename: str
    ) -> pd.DataFrame:
        """
        Parse the AWS Transcribe output: strip quote/dash characters, drop duplicated
        consecutive rows, merge runs of the same speaker, and persist the result to
        ./data/transcriptFiles/<output_filename>.csv.
        """
        prev_text = None  # sentinel: the first row can never match the "previous" row
        transcript_df["text"] = transcript_df["text"].apply(
            lambda x: re.sub(r"[\"\'\--]+", "", x)
        )

        for index, row in transcript_df.iterrows():
            # `and` short-circuits, so prev_speaker is only read after the first
            # iteration has assigned it.
            if row["text"] == prev_text and row["speaker_label"] == prev_speaker:
                transcript_df.at[merge_start, "end_time"] = row["end_time"]
                transcript_df.drop(index, inplace=True)
            else:
                merge_start = index

            prev_text = row["text"]
            prev_speaker = row["speaker_label"]

        # Group consecutive rows with the same speaker and concatenate their text.
        transcript_df["group"] = (
            transcript_df["speaker_label"] != transcript_df["speaker_label"].shift()
        ).cumsum()
        result_df = transcript_df.groupby(
            ["group", "speaker_label"], as_index=False
        ).agg({"start_time": "first", "end_time": "last", "text": " ".join})
        result_df = result_df.drop(columns=["group"])

        result_df.to_csv(
            "./data/transcriptFiles/" + output_filename + ".csv", index=False
        )
        return result_df

    def delete_local_temp_file(self, tempFiles: str) -> bool:
        """
        Delete the local temporary .json/.vtt files for a job; return True when done.
        """
        if os.path.exists("./data/tempFiles/" + tempFiles + ".json"):
            os.remove("./data/tempFiles/" + tempFiles + ".json")

        if os.path.exists("./data/tempFiles/" + tempFiles + ".vtt"):
            os.remove("./data/tempFiles/" + tempFiles + ".vtt")
        # Bug fix: previously fell off the end and returned None despite -> bool.
        return True

    def runner(
        self,
        file_name: str,
        input_bucket: str,
        output_bucket: str,
        transcribe_job_name: str,
        aws_access_key: str,
        aws_secret_access_key: str,
        aws_region_name: str,
    ) -> pd.DataFrame:
        """
        Run the full transcription pipeline for one audio file and return the
        parsed transcript DataFrame. (Annotation fixed: the original said -> None
        but returned the DataFrame.)
        """
        transcribe_client, s3_client = self.create_client(
            aws_access_key=aws_access_key,
            aws_secret_access_key=aws_secret_access_key,
            aws_region_name=aws_region_name,
        )

        print("Transcribe_client created: ", transcribe_client)
        print("s3_client created: ", s3_client)

        # Create S3 buckets
        print(
            f"Create S3 Bucket {input_bucket} : ",
            self.create_s3_bucket(s3_client, input_bucket, aws_region_name),
        )
        print(
            f"Create S3 Bucket {output_bucket} : ",
            self.create_s3_bucket(s3_client, output_bucket, aws_region_name),
        )

        URI = self.upload_to_s3(
            s3_client, "./data/audioFiles/" + file_name, input_bucket
        )
        print("Upload completed now will initiate transcription job.")
        self.transcribe_audio(
            transcribe_client,
            URI,
            output_bucket,
            transcribe_job_name=transcribe_job_name,
        )

        # Poll until the job finishes; fail fast on FAILED instead of looping forever
        # (the original only tested for COMPLETED and would hang on a failed job).
        while True:
            status = transcribe_client.get_transcription_job(
                TranscriptionJobName=transcribe_job_name
            )["TranscriptionJob"]["TranscriptionJobStatus"]
            if status == "COMPLETED":
                break
            if status == "FAILED":
                raise RuntimeError(
                    f"Transcription job '{transcribe_job_name}' failed."
                )
            time.sleep(3)

        # Download transcription job output
        print(
            "Download from S3 : ",
            self.download_from_s3(
                s3_client,
                transcribe_job_name,
                output_bucket,
                local_directory="./data/tempFiles/",
            ),
        )

        print(
            "Delete S3 Bucket Input Bucket : ",
            self.delete_s3_bucket(s3_client, input_bucket),
        )
        print(
            "Delete S3 Bucket Output Bucket: ",
            self.delete_s3_bucket(s3_client, output_bucket),
        )

        try:
            transcribe_client.delete_transcription_job(
                TranscriptionJobName=transcribe_job_name
            )
        except Exception:  # narrowed from a bare except
            print("Transcription Job does not exist.")

        # Close clients
        transcribe_client.close()
        s3_client.close()

        # combine the json and vtt results to create a transcript
        df_transcript_combined = self.combine_files(
            transcribe_job_name, local_directory="./data/tempFiles/"
        )
        df_transcript_combined_parsed = self.aws_transcribe_parser(
            transcript_df=df_transcript_combined, output_filename=transcribe_job_name
        )
        print("Transcript parsed successfully")

        self.delete_local_temp_file(tempFiles=transcribe_job_name)
        return df_transcript_combined_parsed
343
+
344
+
345
+ if __name__ == "__main__":
346
+ dotenv.load_dotenv("./config/.env")
347
+
348
+ current_timestamp = str.lower(datetime.now().strftime("%Y-%b-%d-%I-%M-%p"))
349
+
350
+ aws_access_key = os.getenv("AWS_ACCESS_KEY")
351
+ aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")
352
+ print(aws_access_key, aws_secret_access_key)
353
+ aws_region_name = "us-east-2"
354
+ file_name = "test.wav"
355
+ input_bucket = f"resonate-input-{str(current_timestamp)}"
356
+ output_bucket = f"resonate-output-{str(current_timestamp)}"
357
+ transcribe_job_name = f"resonate-job-{str(current_timestamp)}"
358
+
359
+ rat = resonate_aws_transcribe()
360
+ df = rat.runner(
361
+ file_name=file_name,
362
+ input_bucket=input_bucket,
363
+ output_bucket=output_bucket,
364
+ transcribe_job_name=transcribe_job_name,
365
+ aws_access_key=aws_access_key,
366
+ aws_secret_access_key=aws_secret_access_key,
367
+ aws_region_name=aws_region_name,
368
+ )
369
+ print(df)
src/clustering/resonate_bert_summarizer.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Description: The code helps generate abstractive summary for the transcript data using BART model
2
+ # Reference: https://huggingface.co/vmarklynn/bart-large-cnn-samsum-acsi-ami-v2
3
+ # Reference : https://github.com/vmarklynn/parrot/blob/main/notebooks/summarizer.ipynb
4
+
5
+ import math
6
+ import pandas as pd
7
+ import os
8
+ import uuid
9
+ from transformers import pipeline
10
+
11
# Summarization pipeline built on a BART checkpoint fine-tuned on meeting data.
MODEL = 'vmarklynn/bart-large-cnn-samsum-acsi-ami-v2'
summarizer = pipeline("summarization", MODEL, truncation=True)
14
+
15
def format_text(text):
    """Render a transcript DataFrame as newline-joined 'speaker: text' lines.

    Returns "" (after printing the error) if the input cannot be iterated.
    """
    try:
        lines = [
            "{}: {}".format(row["speaker_label"], row["text"])
            for _, row in text.iterrows()
        ]
        return "\n".join(lines)
    except Exception as e:
        print(f"Error formatting text: {e}")
        return ""
28
+
29
+
30
def summarize_text(transcript):
    """Generate an abstractive summary of a transcript DataFrame with BART.

    Returns {"transcription": <formatted transcript>, "summary": <summary>},
    or {} if formatting/summarization fails.
    """
    try:
        text = format_text(transcript)

        print("\n\nSummarizing Text...")
        summary = summarizer(text)[0]["summary_text"]
        # Bug fix: the original stored the format_text *function object* under
        # "transcription" instead of the formatted string.
        response = {"transcription": text, "summary": summary}
        return response
    except Exception as e:
        print(f"Error summarizing text: {e}")
        return {}
44
+
45
+
46
def summarize_summary(summary_input):
    """Condense an already-summarized text further with the BART model.

    Output length is bounded to 10%-25% of a fixed 1024-token budget.
    Returns {"summary": <text>} or {} on error.
    """
    try:
        word_count = 1024
        lower_bound = math.ceil(int(word_count) * 0.1)
        upper_bound = math.ceil(int(word_count) * 0.25)
        condensed = summarizer(
            summary_input,
            min_length=lower_bound,
            max_length=upper_bound,
        )[0]["summary_text"]
        return {"summary": condensed}
    except Exception as e:
        print(f"Error summarizing summary: {e}")
        return {}
62
+
63
+
64
def append_summary_to_csv(summary_text):
    """Append a (uuid, summary) row to the running summary CSV.

    Returns the new uuid string, or False if reading/writing the CSV fails.
    """
    try:
        csv_filename = "./data/summaryFiles/abstract_summary_data.csv"
        meeting_uuid = str(uuid.uuid4())
        if os.path.exists(csv_filename):
            existing = pd.read_csv(csv_filename)
        else:
            existing = pd.DataFrame(columns=["uuid", "text"])
        new_row = pd.DataFrame({"uuid": [meeting_uuid], "text": [summary_text]})
        combined = pd.concat([existing, new_row], ignore_index=True)

        combined.to_csv(csv_filename, index=False)
        return meeting_uuid
    except Exception as e:
        print(f"Error appending summary to CSV: {e}")
        return False
80
+
81
+
82
def summarize_runner(transcript):
    """Summarize a transcript DataFrame and persist the summary.

    Returns (final_summary, meeting_uuid); (None, None) when any step fails.
    """
    # Bug fix: these were unbound when an exception fired, so the final
    # `return` raised NameError instead of reporting the failure.
    final_summary, meeting_uuid = None, None
    try:
        # Drop end_time on a copy so the caller's DataFrame is not mutated
        # (the original used inplace=True on the shared frame).
        trimmed = transcript.drop(["end_time"], axis=1)
        summary_transcript = summarize_text(trimmed)
        summarized_summary = summarize_summary(summary_transcript["summary"])
        final_summary = summarized_summary["summary"]
        meeting_uuid = append_summary_to_csv(final_summary)
    except Exception as e:
        print(f"Error in summarize_runner: {e}")
    return final_summary, meeting_uuid
92
+
93
+
94
if __name__ == "__main__":
    # Smoke test against a sample transcript.
    sample = pd.read_csv("./data/transcriptFiles/Social_Media_-_Ruins_your_life.csv")
    summarize_runner(sample)
src/clustering/resonate_clustering.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Description:Creates clusters based on the uploaded transcripts and returns the uuid of the documents that are similar to the query.
2
+ # Reference Code: https://github.com/chakib401/smoothing_sentence_embeddings/blob/master/utils.py
3
+
4
+ '''
5
+ Paper Citation for def normalize_adj():
6
+ Fettal, Chakib, Lazhar Labiod, and Mohamed Nadif.
7
+ "More Discriminative Sentence Embeddings via Semantic Graph Smoothing."
8
+ arXiv preprint arXiv:2402.12890 (2024).
9
+ '''
10
+
11
+ import json
12
+ import os
13
+ import joblib
14
+ import numpy as np
15
+ import pandas as pd
16
+ import scipy.sparse as sp
17
+ import src.clustering.resonate_semantic_search as SemanticSearch
18
+ from dotenv import load_dotenv
19
+ from langchain_openai import OpenAIEmbeddings
20
+ from scipy.io import loadmat, savemat
21
+ from sklearn.cluster import MeanShift, estimate_bandwidth
22
+ from sklearn.neighbors import kneighbors_graph
23
+
24
def normalize_adj(adj, lmbda=1):
    """Normalize the adjacency matrix of the semantic graph.

    Adds lmbda*I and row-normalizes, then adds lmbda*I again and applies
    symmetric D^{-1/2} A D^{-1/2} normalization; returns a COO sparse matrix.
    """
    # First pass: self-loops + row normalization.
    adj = adj + lmbda * sp.eye(adj.shape[0])
    row_totals = np.array(adj.sum(1))
    inv = np.power(row_totals, -1).flatten()
    inv[np.isinf(inv)] = 0.0
    adj = sp.diags(inv).dot(adj)

    # Second pass: self-loops + symmetric normalization.
    adj = adj + lmbda * sp.eye(adj.shape[0])
    adj = sp.coo_matrix(adj)
    totals = np.array(adj.sum(1))
    inv_sqrt = np.power(totals, -0.5).flatten()
    inv_sqrt[np.isinf(inv_sqrt)] = 0.0
    d_half = sp.diags(inv_sqrt)
    return d_half.dot(adj).dot(d_half).tocoo()
42
+
43
+
44
def graph_filtering(features, degree=2, lmbda=1, nn=10, alpha=0.5, t=5, method="sgc"):
    """Smooth sentence embeddings with one of four polynomial graph filters.

    Builds a symmetrized cosine k-NN graph over the feature vectors (nn=10 as
    in the cited paper — which is why at least 10 transcripts must pre-exist),
    normalizes it, and applies the chosen propagation scheme:
    "sgc", "s2gc", "appnp", or "dgc".

    Raises ValueError for an unrecognized method.
    """
    adj = kneighbors_graph(features, n_neighbors=nn, metric="cosine")
    adj = (adj + adj.T) / 2  # symmetrize

    S = normalize_adj(adj, lmbda)
    xx = features
    yy = features.copy()
    if method in ["sgc", "s2gc"]:
        for _ in range(degree):
            xx = S @ xx
            yy += xx
        if method == "sgc":
            return xx
        elif method == "s2gc":
            return yy
    elif method == "appnp":
        for _ in range(degree):
            xx = (1 - alpha) * S @ xx + alpha * features
        return xx
    elif method == "dgc":
        k = degree + 1
        for _ in range(1, degree + 1):
            xx = (1 - t / k) * xx + (t / k) * (S @ xx)
        return xx
    else:
        # Bug fix: `raise "unrecognized filter"` raised a str, which is itself
        # a TypeError in Python 3; raise a proper exception instead.
        raise ValueError("unrecognized filter")
78
+
79
+
80
def load_json_config(json_file_path="./config/config.json"):
    """Load and return the JSON configuration file as a dict."""
    with open(json_file_path, "r") as config_file:
        return json.load(config_file)
84
+
85
+
86
class Clustering:
    """Clusters meeting summaries and maps a query to uuids of similar meetings.

    Embeds the summaries with OpenAI, smooths the embeddings with graph
    filtering, fits a MeanShift model, and exposes a FAISS-backed lookup from
    a free-text query to the uuids in the matching cluster.
    """

    def __init__(self):
        self.api_key = os.environ.get("OPENAI_API_KEY")
        self.method = "dgc"  # graph-filtering variant used before clustering

        # Build the cluster model on first run, when no cached labels exist.
        if not os.path.exists("./data/clusteringFiles/cluster_data.csv"):
            self.create_Cluster()

        self.index = self.initialize_FAISS()

    def create_embedding(self):
        """Embed every meeting summary and cache the vectors.

        1. Embeds all rows of abstract_summary_data.csv.
        2. Saves the embeddings to cluster-embedding.mat keyed as {uuid, text}.
        """
        data = pd.read_csv("./data/summaryFiles/abstract_summary_data.csv")
        json_config = load_json_config()

        texts, ids = data["text"], data["uuid"]  # renamed: `id` shadowed the builtin
        # embedding model
        embed = OpenAIEmbeddings(
            model=json_config["EMBEDDING_MODEL_NAME"], openai_api_key=self.api_key
        )
        embeddings = embed.embed_documents(texts)
        savemat(
            "./data/embeddingFiles/cluster-embedding.mat",
            {"uuid": ids, "text": embeddings},
        )

    def create_Cluster(self):
        """Re-embed the summaries, fit MeanShift on the smoothed vectors, and
        persist both the model (joblib) and the per-summary labels (CSV)."""
        self.create_embedding()

        data = loadmat("./data/embeddingFiles/cluster-embedding.mat")
        raw_features = data["text"]

        features = graph_filtering(raw_features, method=self.method)
        ibandwidth = estimate_bandwidth(features, quantile=0.30, random_state=42)
        msclustering = MeanShift(bandwidth=ibandwidth, max_iter=900)
        msclustering.fit(features)
        model_path = f"./data/clusteringFiles/{self.method}_model.joblib"
        joblib.dump(msclustering, model_path)

        print("Model saved")

        # Plain string (the original used an f-string with no placeholders).
        df = pd.read_csv("./data/summaryFiles/abstract_summary_data.csv")
        df["cluster"] = msclustering.predict(features)
        df.to_csv("./data/clusteringFiles/cluster_data.csv")
        print("Cluster data saved")
        self.index = self.initialize_FAISS()

    def uuid_for_query(self, query):
        """Return the uuids of meetings in the same cluster as the query."""
        query_cluster_label = self.index.search_query(query)
        print(f"Predicted Label : {query_cluster_label[0]}")
        df = pd.read_csv("./data/clusteringFiles/cluster_data.csv")
        filtered_uuids = df[df["cluster"] == query_cluster_label[0]]["uuid"].tolist()
        return filtered_uuids

    def initialize_FAISS(self):
        """Build a FAISS index over the clustered summaries and return it."""
        model = SemanticSearch.SemanticEmbedding()
        index = SemanticSearch.FaissForQuerySearch(model)
        data = pd.read_csv("./data/clusteringFiles/cluster_data.csv")
        for text, uuid, label in zip(data["text"], data["uuid"], data["cluster"]):
            index.add_summary(text, uuid, label)
        return index
165
+
166
+
167
if __name__ == "__main__":
    # Manual smoke test: print the uuid filters for two sample queries.
    load_dotenv("./config/.env")
    Clustering_obj = Clustering()
    for question in (
        "What is the goal of defining maintainability for the new diffs architecture?",
        "What was the design component for remote control?",
    ):
        print(Clustering_obj.uuid_for_query(question))
src/clustering/resonate_semantic_search.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Description: Using Facebook's Faiss library to perform semantic search according to the query
2
+ # Reference: https://deepnote.com/blog/semantic-search-using-faiss-and-mpnet
3
+
4
+ from transformers import AutoTokenizer, AutoModel
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import faiss
8
+
9
class SemanticEmbedding:
    """Sentence embedder backed by a HuggingFace transformer.

    Embeddings are mean-pooled over tokens (mask-aware) and L2-normalised,
    so inner products between vectors behave as cosine similarities.
    """

    def __init__(self, model_name="sentence-transformers/all-mpnet-base-v2"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def mean_pooling(self, model_output, attention_mask):
        """Average token embeddings, counting only positions kept by *attention_mask*."""
        # First element of model_output contains all token embeddings.
        token_embeddings = model_output[0]
        mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * mask, 1)
        counts = torch.clamp(mask.sum(1), min=1e-9)  # guard against empty masks
        return summed / counts

    def get_embedding(self, sentences):
        """Embed *sentences* and return a numpy array of unit-norm vectors."""
        encoded = self.tokenizer(
            sentences, padding=True, truncation=True, return_tensors="pt"
        )
        with torch.no_grad():
            model_output = self.model(**encoded)
        pooled = self.mean_pooling(model_output, encoded["attention_mask"])
        normalised = F.normalize(pooled, p=2, dim=1)
        return normalised.detach().numpy()
42
+
43
+
44
class FaissForQuerySearch:
    """Thin wrapper around a FAISS inner-product index with uuid/label bookkeeping.

    Positions in the FAISS index line up with ``self.uuid`` / ``self.labels``;
    ``self.doc_map`` stores the original text keyed by insertion position.
    """

    def __init__(self, model, dim=768):
        self.index = faiss.IndexFlatIP(dim)
        self.doc_map = dict()  # position -> original document text
        self.model = model
        self.ctr = 0  # next insertion position
        self.uuid = []
        self.labels = []

    def search_query(self, query, k=1):
        """Return the cluster labels of the top-*k* matches for *query*."""
        scores, positions = self.index.search(self.model.get_embedding(query), k)
        hits = zip(positions[0], scores[0])
        # FAISS pads missing results with -1; filtering on doc_map drops them.
        return [self.labels[pos] for pos, _ in hits if pos in self.doc_map]

    def add_summary(self, document_text, id, predicted_label):
        """Embed *document_text* and register it with its uuid and predicted label."""
        self.index.add(self.model.get_embedding(document_text))
        self.uuid.append(id)
        self.labels.append(predicted_label)
        self.doc_map[self.ctr] = document_text  # keep the original text around
        self.ctr += 1
src/langchain/resonate_langchain_functions.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Description: This file contains the LangChain class which is used to query the pinecone and chatbot
2
+
3
+ import os
4
+ import json
5
+ from src.pinecone.resonate_pinecone_functions import PineconeServerless
6
+ from langchain_community.callbacks import get_openai_callback
7
+ from dotenv import load_dotenv
8
+ from datetime import datetime
9
+ from langchain_openai import ChatOpenAI
10
+ from langchain.chains import ConversationChain
11
+ from langchain.chains.conversation.memory import (
12
+ ConversationSummaryBufferMemory,
13
+ )
14
+ from langchain.prompts.chat import (
15
+ HumanMessagePromptTemplate,
16
+ ChatPromptTemplate,
17
+ SystemMessagePromptTemplate,
18
+ )
19
+
20
def load_json_config(json_file_path="./config/config.json"):
    """Load and return the project JSON configuration as a dict."""
    with open(json_file_path, "r") as config_file:
        return json.load(config_file)
24
+
25
+
26
class LangChain:
    """Chat interface: retrieves meeting context from Pinecone and answers
    user queries through an OpenAI chat model with summary-buffer memory."""

    def __init__(self):
        """Read config and API keys, connect to Pinecone, build the LLM chain."""
        json_config = load_json_config()
        # check if the .env file exists using os
        if os.path.exists("./config/.env"):
            load_dotenv("./config/.env")

        self.PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
        self.OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
        # Security: never print the key itself, only whether it is configured.
        print("PINECONE_API_KEY set: ", bool(self.PINECONE_API_KEY))
        if self.PINECONE_API_KEY and self.OPENAI_API_KEY:
            self.pinecone = PineconeServerless()
            print("INSIDE PINECONE")
        self.llm_temperature = json_config["LC_LLM_TEMPERATURE"]
        self.llm_model = json_config["LC_LLM_MODEL"]
        self.llm_summary_max_token_limit = json_config[
            "LC_LLM_SUMMARY_MAX_TOKEN_LIMIT"
        ]
        self.llm_conv_buffer_memory_window = json_config[
            "LC_CONV_BUFFER_MEMORY_WINDOW"
        ]

        self.llm = ChatOpenAI(
            temperature=self.llm_temperature,
            model_name=self.llm_model,
            streaming=False,
        )

        # Conversation chain with rolling-summary memory so long chats stay
        # under the configured token limit.
        self.conversation_bufw = ConversationChain(
            llm=self.llm,
            memory=ConversationSummaryBufferMemory(
                llm=self.llm, max_token_limit=self.llm_summary_max_token_limit
            ),
        )

    def prompt(self, query, context):
        """Build the system+human message list for *query* grounded in *context*."""
        system_template = SystemMessagePromptTemplate.from_template(
            "As a helpful assistant, your task is to provide concise and relevant answers based on the given context, which consists of a transcript excerpt from a meeting. The format of the context is as follows:"
            "\nConversations in meeting: <meeting_title>, <meeting_date>"
            "\nStart Time - Speaker: Text"
            "\nYou may receive multiple meeting transcripts, if you feel your response requires reference to multiple meetings, feel free to mention <meeting_title> in your response."
            "Your responses should strictly adhere to the provided context. If you cannot find an answer within the given context, you may ask the user for more information. Ensure clarity by referencing the meeting_title, meeting_date, and speaker as needed."
            "Your responses should be succinct and directly address the user query using the context provided. Avoid discussing any information beyond the given context or using external sources."
            "Skip unnecessary phrases like 'Based on the context provided' and focus solely on answering the users query. No need for greetings or farewells."
            "\nContext:\n"
            "{context}"
        )

        human_template = HumanMessagePromptTemplate.from_template("\nUser Query: {input}")
        chat_prompt = ChatPromptTemplate.from_messages(
            [system_template, human_template]
        )
        chat_prompt_value = chat_prompt.format_prompt(context=context, input=query)
        return chat_prompt_value.to_messages()

    def query_chatbot(self, query, context):
        """Query the chatbot with the given query and context."""
        self.messages = self.prompt(query, context)
        resp = self.conversation_bufw(self.messages)
        return resp

    def parse_conversations(self, conversations) -> str:
        """Parse the conversations and return the data in a readable format.

        Format:
        Conversations in meeting: <meeting_title1>
        Start Time - Speaker: Text
        Start Time - Speaker: Text
        ...
        (blank separator)
        Conversations in meeting: <meeting_title2>
        ...
        """
        data = []
        for cluster_id, cluster_df in conversations.items():
            # Each cluster_id has a sidecar JSON file with meeting metadata.
            with open(f"./data/jsonMetaDataFiles/{cluster_id}.json") as f:
                meeting_data = json.load(f)
            meeting_title = meeting_data["meeting_title"]
            meeting_date = datetime.strptime(meeting_data["meeting_date"], '%Y%m%d%H%M%S').strftime('%Y-%m-%d %H:%M:%S')
            data.append(f"Conversations in meeting: '{meeting_title}', meeting_date: {meeting_date} :")
            for i, row in cluster_df.iterrows():
                data.append(
                    f"{row['start_time']} - {row['speaker']}: {row['text']}"
                )
            data.append("\n\n")
        data = "\n".join(data)
        return data

    def chat(self, query, in_filter: "list[str] | None" = None, complete_db_flag: bool = False):
        """Primary chat function to query the pinecone and chatbot.

        If in_filter is provided, the query will be filtered based on the in_filter.
        If complete_db_flag is True, the query will be searched in the complete database.
        """
        # Bug fix: the original used a mutable default argument (in_filter=[]).
        # Normalise None to an empty filter list instead.
        if in_filter is None:
            in_filter = []
        self.pinecone.query_pinecone(query, in_filter, complete_db_flag)
        conversation = self.pinecone.query_delta_conversations()
        context = self.parse_conversations(conversation)
        print("Context: ", context)
        response = self.query_chatbot(query, context)
        return response

    def count_tokens(self, chain, query):
        """Run *chain* on *query* while reporting OpenAI token usage/cost."""
        with get_openai_callback() as callback:
            response = chain(query)
            print(f"Call Back: {callback}")
            return response
src/pinecone/resonate_pinecone_functions.py ADDED
@@ -0,0 +1,422 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Description: Pinecone Serverless Class for Resonate
2
+ # Reference: https://www.pinecone.io/docs/
3
+
4
+ import datetime
5
+ import uuid
6
+ import json
7
+ import os
8
+ import time
9
+ import pandas as pd
10
+ from dotenv import load_dotenv
11
+ from langchain_openai import OpenAIEmbeddings
12
+ from pinecone import Pinecone, ServerlessSpec
13
+
14
def load_json_config(json_file_path="./config/config.json"):
    """Read the project JSON configuration file and return its contents."""
    with open(json_file_path, "r") as config_file:
        return json.load(config_file)
18
+
19
+
20
class PineconeServerless:
    """Manage a Pinecone serverless index of meeting-transcript vectors.

    Besides upserting/querying vectors, this class mirrors upsert state into
    on-disk JSON files: one "master" file (index-wide bookkeeping) plus one
    per-meeting metadata file under ``./data/jsonMetaDataFiles/``.
    """

    def __init__(self) -> None:
        print("Pinecone Serverless Initializing")
        json_config = load_json_config()
        load_dotenv("./config/.env")

        self.PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
        self.OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
        if self.PINECONE_API_KEY is not None:
            self.pinecone = Pinecone(api_key=self.PINECONE_API_KEY)
        # Every config key becomes a lowercase instance attribute (see _init_config);
        # self.master_json_filename / self.pinecone_* below come from the config.
        self._init_config(json_config)
        self.meeting_title = None
        self.base_data_path = "./data/jsonMetaDataFiles/"
        self.master_json_file = f"{self.base_data_path}{self.master_json_filename}.json"
        self._create_master_json()
        self._create_index()
        self.response = None  # caches the most recent query_pinecone() response
        print("Pinecone Serverless Initialized")

    def _init_config(self, json_config) -> None:
        # Expose each config key as a lowercase attribute,
        # e.g. "PINECONE_INDEX_NAME" -> self.pinecone_index_name.
        for key, value in json_config.items():
            setattr(self, key.lower(), value)

    def check_index_already_exists(self) -> bool:
        # NOTE(review): list_indexes() returns an object of index descriptions,
        # not plain names — confirm this membership test behaves as intended.
        return self.pinecone_index_name in self.pinecone.list_indexes()

    def _get_vector_embedder(self):
        # Only OpenAI embeddings are supported for now.
        if self.embedding_provider == "OpenAI":
            return OpenAIEmbeddings(model=self.embedding_model_name)
        else:
            raise ValueError("Invalid Embedding Model")

    def _get_index(self):
        # Handle to the configured Pinecone index.
        return self.pinecone.Index(self.pinecone_index_name)

    def _create_index(self) -> None:
        '''
        Creates a new index in Pinecone if it does not exist
        '''
        pinecone_indexes_list = [
            index.get("name")
            for index in self.pinecone.list_indexes().get("indexes", [])]

        if self.pinecone_index_name not in pinecone_indexes_list:
            try:
                self.pinecone.create_index(
                    name=self.pinecone_index_name,
                    metric=self.pinecone_metric,
                    dimension=self.pinecone_vector_dimension,
                    spec=ServerlessSpec(
                        cloud=self.pinecone_cloud_provider,
                        region=self.pinecone_region,
                        # pod_type="p1.x1", # Future use
                    ),
                )

                # Block until the new index reports ready.
                while not self.pinecone.describe_index(self.pinecone_index_name).status["ready"]:
                    time.sleep(5)

            except Exception as e:
                print("Index creation failed: ", e)

    def describe_index_stats(self) -> dict:
        """Return index statistics, or an empty dict if the index is missing."""
        try:
            index = self._get_index()
            return index.describe_index_stats()
        except Exception as e:
            print("Index does not exist: ", e)
            return {}

    def _delete_index(self) -> None:
        # Best-effort deletion; a missing index is logged, not raised.
        try:
            self.pinecone.delete_index(self.pinecone_index_name)
        except Exception as e:
            print("Index does not exist: ", e)

    def _create_master_json(self) -> None:
        '''
        Check if the master json file exists, if not, create it
        '''
        os.makedirs(os.path.dirname(self.base_data_path), exist_ok=True)
        if not os.path.exists(self.master_json_file):
            # NOTE(review): the file is opened twice in "w" mode here; the outer
            # handle is never written to. Harmless but redundant — confirm.
            with open(self.master_json_file, "w") as file:
                data = {
                    "index": self.pinecone_index_name,
                    "namespace": self.pinecone_namespace,
                    "last_conversation_no": 0,
                    "meeting_uuids": [],
                    "meetings": [],
                }

                with open(self.master_json_file, "w") as f:
                    json.dump(data, f, indent=4)

            print(f"Created {self.master_json_file}")

    def _update_master_json(
        self,
        meeting_uuid: str,
        meeting_title: str,
        last_conversation_no: int,
        meeting_video_file: bool,
        time_stamp: str,
    ) -> dict:
        '''
        Updates the master json file with the new meeting details
        '''
        # Returns the updated dict; the caller is responsible for writing it back.
        with open(self.master_json_file, "r+") as f:
            data = json.load(f)
            data["meeting_uuids"] = list(set(data["meeting_uuids"] + [meeting_uuid]))
            data["last_conversation_no"] = last_conversation_no
            data["meetings"].append(
                {
                    "meeting_uuid": meeting_uuid,
                    "meeting_title": meeting_title,
                    "meeting_date": time_stamp,
                    "meeting_video_file": meeting_video_file,
                }
            )
            return data

    def _get_meeting_members(self, transcript: pd.DataFrame) -> list[str]:
        # Unique speaker labels present in the transcript.
        return list(transcript["speaker_label"].unique())

    def _create_new_meeting_json(
        self,
        meeting_uuid: str,
        meeting_title: str,
        last_conversation_no: int,
        meeting_members: list[str],
        meeting_video_file: bool,
        time_stamp: str,
        meeting_summary: str,
    ) -> dict:
        '''
        Creates a new json file for the meeting details
        '''
        data = {
            "index": self.pinecone_index_name,
            "namespace": self.pinecone_namespace,
            "meeting_title": meeting_title,
            "meeting_uuid": meeting_uuid,
            "meeting_date": time_stamp,
            "last_conversation_no": last_conversation_no,
            "meeting_video_file": meeting_video_file,
            "meeting_members": meeting_members,
            "meeting_summary": meeting_summary,
        }

        meeting_details_file = os.path.join(self.base_data_path, f"{meeting_uuid}.json")
        with open(meeting_details_file, "w") as f:
            json.dump(data, f, indent=4)

    def _get_last_conversation_no(self) -> int:
        # Read the running conversation counter from the master json file.
        with open(self.master_json_file, "r") as f:
            data = json.load(f)

        return data["last_conversation_no"]

    def _set_new_meeting_json(
        self,
        meeting_uuid: str,
        meeting_title: str,
        last_conversation_no: str,
        meeting_members: list[str],
        meeting_video_file: bool,
        meeting_summary: str,
    ) -> dict:
        '''
        Updates the master json file with the new meeting details
        '''
        time_stamp = str(datetime.datetime.now().strftime("%Y%m%d%H%M%S"))
        self._create_new_meeting_json(
            meeting_uuid,
            meeting_title,
            last_conversation_no,
            meeting_members,
            meeting_video_file,
            time_stamp,
            meeting_summary,
        )
        data = self._update_master_json(
            meeting_uuid,
            meeting_title,
            last_conversation_no,
            meeting_video_file,
            time_stamp,
        )

        # Persist the updated master record returned by _update_master_json.
        with open(self.master_json_file, "w") as f:
            json.dump(data, f, indent=4)

    def _convert_to_hr_min_sec(self, time_in_minutes) -> str:
        # Format a duration given in (possibly fractional) minutes as Hr:Min:Sec.
        hours = int(time_in_minutes // 60)
        minutes = int(time_in_minutes % 60)
        seconds = int((time_in_minutes - int(time_in_minutes)) * 60)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d}"

    def pinecone_upsert(
        self,
        transcript: pd.DataFrame,
        meeting_uuid: str = "",
        meeting_video_file: bool = False,
        meeting_title: str = "Unnamed",
        meeting_summary: str = "",
    ) -> None:
        """
        Upserts the transcript into Pinecone
        """
        # Vector ids continue from the master file's last_conversation_no, so
        # ids stay globally sequential across meetings.
        print("Upserting transcript into Pinecone...")
        texts = []
        metadatas = []

        last_conversation_no = self._get_last_conversation_no()
        last_conversation_no = int(last_conversation_no)

        embed = self._get_vector_embedder()
        meeting_members = self._get_meeting_members(transcript)
        index = self._get_index()

        for _, record in transcript.iterrows():
            start_time = self._convert_to_hr_min_sec(record["start_time"])

            metadata = {
                "speaker": record["speaker_label"],
                "start_time": start_time,
                "text": record["text"],
                "meeting_uuid": meeting_uuid,
            }
            texts.append(record["text"])
            metadatas.append(metadata)

            # Flush a full batch once the configured limit is reached.
            if len(texts) >= self.pinecone_upsert_batch_limit:
                ids = list(
                    map(
                        lambda i: str(i + 1),
                        range(last_conversation_no, last_conversation_no + len(texts)),
                    )
                )
                last_conversation_no += len(texts)
                embeds = embed.embed_documents(texts)

                try:
                    index.upsert(
                        vectors=zip(ids, embeds, metadatas),
                        namespace=self.pinecone_namespace,
                    )
                except Exception as e:
                    print("Error upserting into Pinecone: ", e)
                texts = []
                metadatas = []

        # Upsert the remaining texts
        if len(texts) > 0:
            ids = list(
                map(
                    lambda i: str(i + 1),
                    range(last_conversation_no, last_conversation_no + len(texts)),
                )
            )
            last_conversation_no += len(texts)
            embeds = embed.embed_documents(texts)

            try:
                index.upsert(
                    vectors=zip(ids, embeds, metadatas),
                    namespace=self.pinecone_namespace,
                )
            except Exception as e:
                print("Error upserting into Pinecone: ", e)

        # Record the meeting (and the advanced conversation counter) on disk.
        self._set_new_meeting_json(
            meeting_uuid,
            meeting_title,
            last_conversation_no,
            meeting_members,
            meeting_video_file,
            meeting_summary,
        )

        print("Upserted transcript into Pinecone")

    def _extract_id_from_response(self, response: list) -> list[int]:
        # response is the dict-like Pinecone query result; pull the integer ids
        # of all matches (empty list when there was no response).
        if response:
            return list(int(match["id"]) for match in response["matches"])
        return []

    def query_pinecone(
        self, query: str, in_filter: list[str] = [], complete_db_flag: bool = False
    ) -> list:
        """
        Queries Pinecone for the given query, where in_filter is the list of meeting_uuids to filter the query
        and if complete_db_flag is True, the entire database is queried
        """
        # for using without clustering, complete_db_flag to True
        # NOTE(review): in_filter has a mutable default ([]); it is not mutated
        # here, but a None default would be safer.
        try:
            index = self._get_index()
            embed = self._get_vector_embedder()

            filter = None if complete_db_flag else {"meeting_uuid": {"$in": in_filter}}

            # The response is cached on self for query_delta_conversations().
            self.response = index.query(
                vector=embed.embed_documents([query])[0],
                namespace=self.pinecone_namespace,
                top_k=self.pinecone_top_k_results,
                include_metadata=True,
                filter=filter,
            )
            return self.response
        except Exception as e:
            print("Error querying Pinecone: ", e)
            return []

    def query_delta_conversations(self) -> pd.DataFrame:
        """
        Queries Pinecone for the given query and returns the delta conversations (conversation window around the query result)
        """
        ids = self._extract_id_from_response(self.response)
        last_conversation_no = self._get_last_conversation_no()
        index = self._get_index()
        conversation = {}

        for id in ids:
            # Clamp the +/- delta window to [1, last_conversation_no].
            left = (
                id - self.pinecone_delta_window
                if id - self.pinecone_delta_window > 0
                else 1
            )
            right = (
                id + self.pinecone_delta_window
                if id + self.pinecone_delta_window <= last_conversation_no
                else last_conversation_no
            )
            window = [str(i) for i in range(left, right + 1)]
            try:
                print("Contextual Window Conversation IDs: ", window)
                fetch_response = index.fetch(
                    ids=window, namespace=self.pinecone_namespace
                )
                conversation[id] = fetch_response
            except Exception as e:
                print("Error fetching from Pinecone for id:", id, "Error:", e)
                continue
        return self._parse_fetch_conversations(conversation)

    def _parse_fetch_conversations(self, conversation)-> dict:
        '''
        Parses the conversation dictionary and returns a grouped_dfs
        '''
        # Flatten every fetched vector's metadata into rows, de-duplicate by id,
        # then split into one DataFrame per meeting_uuid.
        data_rows = []
        for primary_hit_id, primary_hit_data in conversation.items():
            for _, vector_data in primary_hit_data["vectors"].items():
                id = vector_data["id"]
                meeting_uuid = vector_data["metadata"]["meeting_uuid"]
                speaker = vector_data["metadata"]["speaker"]
                start_time = vector_data["metadata"]["start_time"]
                text = vector_data["metadata"]["text"]

                data_rows.append(
                    (primary_hit_id, id, meeting_uuid, speaker, start_time, text)
                )

        columns = ["primary_id", "id", "meeting_uuid", "speaker", "start_time", "text"]
        delta_conversation_df = pd.DataFrame(data_rows, columns=columns)
        # NOTE(review): "id" is a string column here, so this sort is
        # lexicographic, not numeric — confirm intended ordering.
        delta_conversation_df = delta_conversation_df.sort_values(by=["id"])
        delta_conversation_df = delta_conversation_df.drop_duplicates(subset=["id"])

        # creating separate df for rows with same meeting_cluster_id
        grouped_dfs = {
            group_name: group.reset_index(drop=True, inplace=False)
            for group_name, group in delta_conversation_df.groupby("meeting_uuid")
        }
        return grouped_dfs
400
+
401
+
402
if __name__ == "__main__":
    # Manual smoke test: upsert two sample healthcare transcripts, then query.
    pinecone = PineconeServerless()
    print(pinecone.describe_index_stats())

    for meeting_no in (1, 2):
        print(meeting_no)
        transcript = pd.read_csv(f"./data/transcriptFiles/healthcare_{meeting_no}.csv")
        transcript.dropna(inplace=True)
        pinecone.pinecone_upsert(
            transcript,
            meeting_uuid=str(uuid.uuid4()),
            meeting_video_file=False,
            meeting_title=f"Healthcare Meeting {meeting_no}",
            meeting_summary=f"Healthcare Meeting Summary Meeting {meeting_no}",
        )
        time.sleep(5)
        print(pinecone.describe_index_stats())

    query = "I am one of the directors in Wappingers Central School District."
    response1 = pinecone.query_pinecone(query, "", True)
    print(response1)
src/utils/resonate_streamlitUtils.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import moviepy.editor as mp
2
+ import json
3
+ from datetime import datetime, timedelta
4
+ from src.aws.resonate_aws_functions import resonate_aws_transcribe
5
+ import os
6
+
7
+ from src.pinecone.resonate_pinecone_functions import PineconeServerless
8
+ import time
9
+ import uuid
10
+ import pandas as pd
11
+
12
+
13
def convert_video_to_audio(video_path, audio_path):
    """Extract the audio track from *video_path* and write it to *audio_path*."""
    video_audio_track = mp.VideoFileClip(video_path).audio
    video_audio_track.write_audiofile(audio_path)
17
+
18
+
19
def transcript_text_editor_minutes_to_hhmmss(minutes):
    """Render a duration given in minutes via timedelta's string form.

    NOTE(review): str(timedelta) yields e.g. '0:05:00' (no zero-padded hour)
    and '1 day, 0:00:00' beyond 24h — confirm this matches the UI expectation
    implied by the function name.
    """
    return str(timedelta(minutes=minutes))
23
+
24
+
25
def load_json_config(json_file_path="./config/config.json"):
    """Return the parsed contents of the project JSON configuration file."""
    # Context manager guarantees the handle is closed after parsing.
    with open(json_file_path, "r") as config_file:
        return json.load(config_file)
31
+
32
+
33
def aws_transcribe(file_name):
    """Run an AWS Transcribe job for *file_name* and return the transcript DataFrame.

    Bucket and job names from the config are suffixed with a timestamp so each
    run uses unique AWS resources.
    """
    json_config = load_json_config()
    current_timestamp = str.lower(datetime.now().strftime("%Y-%b-%d-%I-%M-%p"))

    # Make this run's buckets and transcribe job name unique.
    json_config["AWS_INPUT_BUCKET"] += f"{str(current_timestamp)}"
    json_config["AWS_OUTPUT_BUCKET"] += f"{str(current_timestamp)}"
    json_config["AWS_TRANSCRIBE_JOB_NAME"] += f"{str(current_timestamp)}"

    print(json_config)

    try:
        rat = resonate_aws_transcribe()
        df = rat.runner(
            file_name=file_name,
            input_bucket=json_config["AWS_INPUT_BUCKET"],
            output_bucket=json_config["AWS_OUTPUT_BUCKET"],
            transcribe_job_name=json_config["AWS_TRANSCRIBE_JOB_NAME"],
            aws_access_key=os.getenv("AWS_ACCESS_KEY"),
            aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
            aws_region_name=json_config["AWS_REGION"],
        )

        return df
    except Exception as e:
        # NOTE(review): the exception object is *returned*, not raised, so
        # callers must type-check the result to detect failure — confirm
        # this contract is intended before changing it.
        return e
59
+
60
+
61
def pinecone_init_upsert(
    df_transcript: pd.DataFrame,
    meeting_title: str,
    meeting_summary: str,
    meeting_uuid: str,
):
    """Create a PineconeServerless client and upsert one meeting transcript.

    Errors are logged to stdout rather than raised (best-effort upsert).
    """
    try:
        serverless_client = PineconeServerless()
        serverless_client.pinecone_upsert(
            df_transcript,
            meeting_uuid=meeting_uuid,
            meeting_video_file=False,
            meeting_title=meeting_title,
            meeting_summary=meeting_summary,
        )
        # Give the serverless index a moment to reflect the new vectors.
        time.sleep(5)
    except Exception as e:
        print("Error upserting transcript to Pinecone: ", e)