tykiww committed
Commit 262ffbf · verified · 1 Parent(s): bad8824

Create transcripts.py

Files changed (1):
  utilities/transcripts.py +258 -0
utilities/transcripts.py ADDED
@@ -0,0 +1,258 @@
+ # Imports for Transcript Loader
+ import os
+ import re
+ from datetime import datetime
+ 
+ import webvtt
+ from llama_index.core import Document
+ 
+ 
+ # Imports for Document Embedder
+ import gc
+ 
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+ from pinecone.grpc import PineconeGRPC
+ from pinecone import ServerlessSpec
+ 
+ from llama_index.vector_stores.pinecone import PineconeVectorStore
+ from llama_index.core.node_parser import SemanticSplitterNodeParser
+ from llama_index.core.ingestion import IngestionPipeline
+ 
+ 
+ class VTTTranscriptLoader:
+     """
+     VTT file ingestion and cleaning. This was done because VTT files
+     are not recognized by LlamaIndex. The output should mirror that of
+     any document loader from LlamaIndex or LangChain.
+     """
+ 
+     def __init__(self, file_path):
+         self.fp = file_path
+         self.data = None
+ 
+     def open_vtt(self, file_path, plaintext=True):
+         """Read VTT file."""
+         if plaintext:
+             with open(file_path, "r") as f:
+                 data = f.readlines()
+         else:
+             data = webvtt.read(file_path)
+         return data
+ 
+     def extract_speaker_name(self, text):
+         """Extracts the speaker name from a VTT caption."""
+         match = re.search(r"<v (.*?)>", text)
+         if match:
+             return match.group(1)
+         else:
+             return None
+ 
+     def extract_speaker_words(self, captions):
+         """Extracts the speaker text from a VTT caption."""
+         return [caption.text for caption in captions]
+ 
+     def merge_speaker_words(self, words, speakers, split=True):
+         """Joins speaker names with their words."""
+         # Extract speaker names (lines without a <v ...> tag are dropped,
+         # which assumes every caption carries a speaker tag; otherwise the
+         # zip below may misalign)
+         speaker_list = [self.extract_speaker_name(line) for line in speakers if self.extract_speaker_name(line)]
+         # Extract words
+         words_list = self.extract_speaker_words(words)
+         # Combine speaker names and words
+         combined_list = list(zip(speaker_list, words_list))
+         # Return the combined list as a single string if split is False
+         if not split:
+             combined_list = '\n'.join([f"{name}: '{text}'" for name, text in combined_list])
+         return combined_list, speaker_list
+ 
+     def get_metadata(self, speaker_list, file_path):
+         """Generates metadata for the transcript."""
+         # Meeting length
+         time_format = "%H:%M:%S.%f"
+         sess = self.open_vtt(file_path, plaintext=False)
+ 
+         dt1 = datetime.strptime(sess[0].start, time_format)
+         dt2 = datetime.strptime(sess[-1].end, time_format)
+ 
+         minutes = (dt2 - dt1).total_seconds() / 60
+ 
+         # Meeting date (from a YYYY-MM-DD or YYYY_MM_DD stamp in the filename)
+         match = re.search(r"\d{4}[-_]\d{2}[-_]\d{2}", file_path)
+         if match:
+             date_str = match.group().replace('_', '-')
+             date_obj = datetime.strptime(date_str, "%Y-%m-%d").date()
+         else:
+             date_obj = None
+ 
+         # Pull dictionary here
+         output = {
+             'title': file_path,
+             'duration': minutes,
+             'meeting_date': date_obj.strftime("%Y-%m-%d") if date_obj else None,
+             'speakers': list(set(speaker_list)),
+         }
+ 
+         return output
+ 
+     def manual_document(self, output, metadata):
+         """Create document manually."""
+         document = Document(text=output)
+         document.metadata = metadata
+         return document
+ 
+     def process_file(self, file_path):
+         """Processes a single VTT file and returns the combined speaker names and words."""
+         # Get words as webvtt captions
+         words = self.open_vtt(file_path, plaintext=False)
+         # Get speaker lines as plaintext
+         speaker = self.open_vtt(file_path, plaintext=True)
+         # Combine speaker names and words
+         output, speaker_list = self.merge_speaker_words(words, speaker, split=False)
+         # Get session data as dictionary
+         metadata = self.get_metadata(speaker_list, file_path)
+ 
+         return self.manual_document(output, metadata)
+ 
+     def load(self):
+         """Processes all VTT files in the directory or the single file and returns a list of results."""
+         results = []
+         if os.path.isdir(self.fp):
+             for root, _, files in os.walk(self.fp):
+                 for file in files:
+                     if file.endswith('.vtt'):
+                         file_path = os.path.join(root, file)
+                         transcript = self.process_file(file_path)
+                         results.append(transcript)
+         else:
+             transcript = self.process_file(self.fp)
+             results.append(transcript)
+         return results
+ 
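+ 
+ # Example (hypothetical path): load every .vtt transcript under a directory
+ # into llama_index Documents.
+ #
+ #     loader = VTTTranscriptLoader("meetings/")
+ #     docs = loader.load()  # -> list of Document objects
+ 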
+ class DocumentEmbedder:
+     """
+     Takes a document and embeds it directly into a Pinecone data store.
+     The process retrieves, cleans, embeds, and sends the documents to the
+     vector store.
+ 
+     Currently supports Hugging Face embeddings only. Gotta keep things cheap.
+     """
+ 
+     def __init__(self, api_keys, files, embedding, index_name):
+         # api keys
+         self.pinecone_api_key = api_keys['pinecone']
+         self.openai_api_key = api_keys['openai']
+         self.huggingface_api_key = api_keys['huggingface']
+         # pinecone
+         self.embedding = embedding
+         self.vector_db = index_name
+         # basic items
+         self.files = files
+ 
+     def clean_text(self, content: str) -> str:
+         """
+         Remove unwanted characters and patterns in text input.
+         :param content: Text input.
+         :return: Cleaned version of original text input.
+         """
+ 
+         # Fix hyphenated words broken by newline
+         content = re.sub(r'(\w+)-\n(\w+)', r'\1\2', content)
+ 
+         # Remove specific unwanted patterns and characters
+         unwanted_patterns = [
+             "\\n", " —", "——————————", "—————————", "—————",
+             r'\\u[\dA-Fa-f]{4}', r'\uf075', r'\uf0b7'
+         ]
+         for pattern in unwanted_patterns:
+             content = re.sub(pattern, "", content)
+ 
+         # Fix improperly spaced hyphenated words and normalize whitespace
+         content = re.sub(r'(\w)\s*-\s*(\w)', r'\1-\2', content)
+         content = re.sub(r'\s+', ' ', content)
+ 
+         return content
+ 
+     def create_embedder(self):
+         """Get the right embedding model."""
+         embedding = HuggingFaceEmbedding(model_name=self.embedding)
+         return embedding
+ 
+     def pinecone_pipeline(self, embedding):
+         """Initialize the Pinecone connection, vector store, and ingestion pipeline."""
+ 
+         # connect
+         pc = PineconeGRPC(api_key=self.pinecone_api_key)
+ 
+         # Create the index if it does not already exist (exact-name match)
+         indexes = [i.name for i in pc.list_indexes()]
+         index_exists = self.vector_db in indexes
+ 
+         if index_exists:
+             print("Index already exists")
+         else:
+             print("Creating index")
+             pc.create_index(
+                 self.vector_db,
+                 dimension=768,  # must match the embedding model's output size
+                 metric="cosine",
+                 spec=ServerlessSpec(cloud="aws", region="us-east-1"),
+             )
+ 
+         # Initialize the index
+         pinecone_index = pc.Index(self.vector_db)
+ 
+         # Initialize the VectorStore
+         vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
+ 
+         # Create the pipeline (abstracts away the need to adaptively process and batch)
+         pipeline = IngestionPipeline(
+             transformations=[
+                 # creating appropriate chunks and cutoffs (this needs to be worked on).
+                 SemanticSplitterNodeParser(
+                     buffer_size=10,  # 1 = each sentence is a node
+                     breakpoint_percentile_threshold=95,
+                     embed_model=embedding,
+                 ),
+                 embedding,
+             ],
+             vector_store=vector_store
+         )
+ 
+         return pipeline
+ 
+     def embed(self):
+         """Strings the process above together to embed and upsert directly to Pinecone."""
+ 
+         # read files
+         print("reading files")
+         results = self.files
+ 
+         # Call clean function
+         print("cleaning files")
+         for d in range(len(results)):
+             results[d].text = self.clean_text(results[d].text)
+ 
+         # set up embedder
+         print("retrieving embedder")
+         embedder = self.create_embedder()
+ 
+         # set up pinecone pipeline
+         print("initializing pinecone db")
+         pipeline = self.pinecone_pipeline(embedder)
+ 
+         # run pinecone in batches (of 1) for memory preservation
+         print("reading into pinecone db")
+         batchsize = 1
+         for i in range(0, len(results), batchsize):
+             gc.collect()
+             pipeline.run(documents=results[i:i + batchsize])
+             print("completed batch %s" % ((i + batchsize) // batchsize))
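
For reference, a minimal end-to-end sketch of how the two classes chain together. The directory, model name, index name, and key values below are illustrative assumptions, not part of this commit; the model shown outputs 768-dim vectors, matching dimension=768 in pinecone_pipeline.

    # Hypothetical usage; paths, keys, and names are placeholders.
    api_keys = {"pinecone": "...", "openai": "...", "huggingface": "..."}

    docs = VTTTranscriptLoader("meetings/").load()

    embedder = DocumentEmbedder(
        api_keys=api_keys,
        files=docs,
        embedding="BAAI/bge-base-en-v1.5",  # assumed 768-dim Hugging Face model
        index_name="meeting-transcripts",
    )
    embedder.embed()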