tykiww committed
Commit 262ffbf · verified · 1 Parent(s): bad8824

Create transcripts.py

Files changed (1):
  utilities/transcripts.py +258 -0
utilities/transcripts.py ADDED
@@ -0,0 +1,258 @@
+ # Imports for Transcript Loader
+ import os
+ import re
+ from datetime import datetime
+ 
+ import webvtt
+ from llama_index.core import Document
+ 
+ 
+ # Imports for Document Embedder
+ import gc
+ 
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+ from pinecone.grpc import PineconeGRPC
+ from pinecone import ServerlessSpec
+ 
+ from llama_index.vector_stores.pinecone import PineconeVectorStore
+ from llama_index.core.node_parser import SemanticSplitterNodeParser
+ from llama_index.core.ingestion import IngestionPipeline
+ 
+ 
+ class VTTTranscriptLoader:
+     """
+     VTT file ingestion and cleaning. This was done because VTT files
+     are not recognized by LlamaIndex. The output should mirror that of
+     any document loader from LlamaIndex or LangChain.
+     """
+ 
+     def __init__(self, file_path):
+         self.fp = file_path
+         self.data = None
+ 
+     def open_vtt(self, file_path, plaintext=True):
+         """Read VTT file."""
+         if plaintext:
+             with open(file_path, "r") as f:
+                 data = f.readlines()
+         else:
+             data = webvtt.read(file_path)
+         return data
+ 
+     def extract_speaker_name(self, text):
+         """Extracts the speaker name from a VTT caption."""
+         match = re.search(r"<v (.*?)>", text)
+         if match:
+             return match.group(1)
+         else:
+             return None
+ 
+     def extract_speaker_words(self, captions):
+         """Extracts the speaker text from a VTT caption."""
+         return [caption.text for caption in captions]
+ 
+     def merge_speaker_words(self, words, speakers, split=True):
+         """Joins speaker names with their words."""
+         # Extract speaker names (lines without a <v ...> tag are dropped,
+         # which assumes every caption carries a speaker tag; otherwise the
+         # zip below may misalign)
+         speaker_list = [self.extract_speaker_name(line) for line in speakers if self.extract_speaker_name(line)]
+         # Extract words
+         words_list = self.extract_speaker_words(words)
+         # Combine speaker names and words
+         combined_list = list(zip(speaker_list, words_list))
+         # Return the combined list as a single string if split is False
+         if not split:
+             combined_list = '\n'.join([f"{name}: '{text}'" for name, text in combined_list])
+         return combined_list, speaker_list
+ 
+     def get_metadata(self, speaker_list, file_path):
+         """Generates metadata for the transcript."""
+         # Meeting length
+         time_format = "%H:%M:%S.%f"
+         sess = self.open_vtt(file_path, plaintext=False)
+ 
+         dt1 = datetime.strptime(sess[0].start, time_format)
+         dt2 = datetime.strptime(sess[-1].end, time_format)
+ 
+         minutes = (dt2 - dt1).total_seconds() / 60
+ 
+         # Meeting date (from a YYYY-MM-DD or YYYY_MM_DD stamp in the filename)
+         match = re.search(r"\d{4}[-_]\d{2}[-_]\d{2}", file_path)
+         if match:
+             date_str = match.group().replace('_', '-')
+             date_obj = datetime.strptime(date_str, "%Y-%m-%d").date()
+         else:
+             date_obj = None
+ 
+         # Pull dictionary here
+         output = {
+             'title': file_path,
+             'duration': minutes,
+             'meeting_date': date_obj.strftime("%Y-%m-%d") if date_obj else None,
+             'speakers': list(set(speaker_list)),
+         }
+ 
+         return output
+ 
+     def manual_document(self, output, metadata):
+         """Create document manually."""
+         document = Document(text=output)
+         document.metadata = metadata
+         return document
+ 
+     def process_file(self, file_path):
+         """Processes a single VTT file and returns the combined speaker names and words."""
+         # Get words as webvtt captions
+         words = self.open_vtt(file_path, plaintext=False)
+         # Get speaker lines as plaintext
+         speaker = self.open_vtt(file_path, plaintext=True)
+         # Combine speaker names and words
+         output, speaker_list = self.merge_speaker_words(words, speaker, split=False)
+         # Get session data as dictionary
+         metadata = self.get_metadata(speaker_list, file_path)
+ 
+         return self.manual_document(output, metadata)
+ 
+     def load(self):
+         """Processes all VTT files in the directory or the single file and returns a list of results."""
+         results = []
+         if os.path.isdir(self.fp):
+             for root, _, files in os.walk(self.fp):
+                 for file in files:
+                     if file.endswith('.vtt'):
+                         file_path = os.path.join(root, file)
+                         transcript = self.process_file(file_path)
+                         results.append(transcript)
+         else:
+             transcript = self.process_file(self.fp)
+             results.append(transcript)
+         return results
+ 
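+ 
+ # Example (hypothetical path): load every .vtt transcript under a directory
+ # into llama_index Documents.
+ #
+ #     loader = VTTTranscriptLoader("meetings/")
+ #     docs = loader.load()  # -> list of Document objects
+ 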
+ class DocumentEmbedder:
+     """
+     Takes a document and embeds it directly into a Pinecone data store.
+     The process retrieves, cleans, embeds, and sends the documents to the
+     vector store.
+ 
+     Currently supports Hugging Face embeddings only. Gotta keep things cheap.
+     """
+ 
+     def __init__(self, api_keys, files, embedding, index_name):
+         # api keys
+         self.pinecone_api_key = api_keys['pinecone']
+         self.openai_api_key = api_keys['openai']
+         self.huggingface_api_key = api_keys['huggingface']
+         # pinecone
+         self.embedding = embedding
+         self.vector_db = index_name
+         # basic items
+         self.files = files
+ 
+     def clean_text(self, content: str) -> str:
+         """
+         Remove unwanted characters and patterns in text input.
+         :param content: Text input.
+         :return: Cleaned version of original text input.
+         """
+ 
+         # Fix hyphenated words broken by newline
+         content = re.sub(r'(\w+)-\n(\w+)', r'\1\2', content)
+ 
+         # Remove specific unwanted patterns and characters
+         unwanted_patterns = [
+             "\\n", " —", "——————————", "—————————", "—————",
+             r'\\u[\dA-Fa-f]{4}', r'\uf075', r'\uf0b7'
+         ]
+         for pattern in unwanted_patterns:
+             content = re.sub(pattern, "", content)
+ 
+         # Fix improperly spaced hyphenated words and normalize whitespace
+         content = re.sub(r'(\w)\s*-\s*(\w)', r'\1-\2', content)
+         content = re.sub(r'\s+', ' ', content)
+ 
+         return content
+ 
+     def create_embedder(self):
+         """Get the right embedding model."""
+         embedding = HuggingFaceEmbedding(model_name=self.embedding)
+         return embedding
+ 
+     def pinecone_pipeline(self, embedding):
+         """Initialize the Pinecone connection, vector store, and ingestion pipeline."""
+ 
+         # connect
+         pc = PineconeGRPC(api_key=self.pinecone_api_key)
+ 
+         # Create the index if it does not already exist (exact-name match)
+         indexes = [i.name for i in pc.list_indexes()]
+         index_exists = self.vector_db in indexes
+ 
+         if index_exists:
+             print("Index already exists")
+         else:
+             print("Creating index")
+             pc.create_index(
+                 self.vector_db,
+                 dimension=768,  # must match the embedding model's output size
+                 metric="cosine",
+                 spec=ServerlessSpec(cloud="aws", region="us-east-1"),
+             )
+ 
+         # Initialize the index
+         pinecone_index = pc.Index(self.vector_db)
+ 
+         # Initialize the VectorStore
+         vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
+ 
+         # Create the pipeline (abstracts away the need to adaptively process and batch)
+         pipeline = IngestionPipeline(
+             transformations=[
+                 # creating appropriate chunks and cutoffs (this needs to be worked on).
+                 SemanticSplitterNodeParser(
+                     buffer_size=10,  # 1 = each sentence is a node
+                     breakpoint_percentile_threshold=95,
+                     embed_model=embedding,
+                 ),
+                 embedding,
+             ],
+             vector_store=vector_store
+         )
+ 
+         return pipeline
+ 
+     def embed(self):
+         """Strings the process above together to embed and upsert directly to Pinecone."""
+ 
+         # read files
+         print("reading files")
+         results = self.files
+ 
+         # Call clean function
+         print("cleaning files")
+         for d in range(len(results)):
+             results[d].text = self.clean_text(results[d].text)
+ 
+         # set up embedder
+         print("retrieving embedder")
+         embedder = self.create_embedder()
+ 
+         # set up pinecone pipeline
+         print("initializing pinecone db")
+         pipeline = self.pinecone_pipeline(embedder)
+ 
+         # run pinecone in batches (of 1) for memory preservation
+         print("reading into pinecone db")
+         batchsize = 1
+         for i in range(0, len(results), batchsize):
+             gc.collect()
+             pipeline.run(documents=results[i:i + batchsize])
+             print("completed batch %s" % ((i + batchsize) // batchsize))
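
For reference, a minimal end-to-end sketch of how the two classes chain together. The directory, model name, index name, and key values below are illustrative assumptions, not part of this commit; the model shown outputs 768-dim vectors, matching dimension=768 in pinecone_pipeline.

    # Hypothetical usage; paths, keys, and names are placeholders.
    api_keys = {"pinecone": "...", "openai": "...", "huggingface": "..."}

    docs = VTTTranscriptLoader("meetings/").load()

    embedder = DocumentEmbedder(
        api_keys=api_keys,
        files=docs,
        embedding="BAAI/bge-base-en-v1.5",  # assumed 768-dim Hugging Face model
        index_name="meeting-transcripts",
    )
    embedder.embed()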