Nuno Machado committed on
Commit 37deedc
1 Parent(s): f9ad220

Add dataset preprocessing options

requirements_cpu.txt ADDED
@@ -0,0 +1,7 @@
+ gradio
+ transformers
+ faiss-cpu
+ datasets
+ sentence-transformers
+ data-cache
+ torch
requirements_gpu.txt ADDED
@@ -0,0 +1,7 @@
+ gradio
+ transformers
+ faiss-gpu
+ datasets
+ sentence-transformers
+ data-cache
+ torch
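
The two requirement sets differ only in the FAISS build: on a CPU-only machine the environment can be created with pip install -r requirements_cpu.txt, while requirements_gpu.txt swaps faiss-cpu for faiss-gpu on machines with a CUDA-capable GPU.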
utils/__init__.py ADDED
File without changes
utils/dataset_loader.py ADDED
@@ -0,0 +1,41 @@
+ import pandas as pd
+ import numpy as np
+
+ from datasets import load_dataset
+
+
+ def convert_embeddings(embeddings_str):
+     embeddings = [np.fromstring(e[1:-1], sep=' ') for e in embeddings_str]
+     return embeddings
+
+
+ class DatasetLoader:
+     @staticmethod
+     def load_from_file(path: str, to_pandas=False):
+         if to_pandas:
+             return pd.read_csv(path)
+         else:
+             return load_dataset("csv", data_files=path)['train']
+
+     @staticmethod
+     def load_from_file_with_embeddings(path: str, to_pandas=False):
+         dataset = load_dataset("csv", data_files=path)['train']
+         new_dataset = dataset.map(
+             lambda example: {
+                 'embeddings': convert_embeddings(example['embeddings'])
+             },
+             batched=True,
+         )
+         if to_pandas:
+             return new_dataset.to_pandas()
+         else:
+             return new_dataset
+
+     @staticmethod
+     def load_from_huggingface(dataset_name: str, split: str, to_pandas=False):
+         # load dataset from HuggingFace hub
+         dataset = load_dataset(dataset_name, split=split)
+         if to_pandas:
+             return dataset.to_pandas()
+         else:
+             return dataset
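
For context, here is a short retrieval sketch of how these loaders can be used downstream (not part of the commit): it assumes a CSV with the transcript columns used elsewhere in this commit plus a string-serialized embeddings column, and it uses the FAISS helpers from the datasets library, which the faiss-cpu/faiss-gpu requirement suggests; the file path and model name are placeholders.

import numpy as np
from sentence_transformers import SentenceTransformer

from utils.dataset_loader import DatasetLoader

# hypothetical CSV with columns: id, guest, title, text, start, end, embeddings
dataset = DatasetLoader.load_from_file_with_embeddings("data/transcripts_with_embeddings.csv")

# build an in-memory FAISS index over the parsed embedding vectors
dataset.add_faiss_index(column="embeddings")

# embed the query with the same model that produced the stored embeddings (model name assumed)
model = SentenceTransformer("all-MiniLM-L6-v2")
query = model.encode("open source machine learning").astype(np.float32)

# fetch the 5 most similar passages
scores, examples = dataset.get_nearest_examples("embeddings", query, k=5)
for score, text in zip(scores, examples["text"]):
    print(f"{score:.2f}  {text[:80]}")

One small caveat in convert_embeddings: np.fromstring with a sep argument is deprecated in recent NumPy releases, and np.array(e[1:-1].split(), dtype=float) is the warning-free equivalent.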
utils/dataset_preprocessing.py ADDED
@@ -0,0 +1,82 @@
+ import re
+ import pandas as pd
+
+
+ def is_valid_sentence(text):
+     # check if text starts with a capitalized letter and ends with a punctuation mark
+     return re.match(r"^[A-Z].*", text) and re.match(r".*[.?!]\s*$", text)
+
+
+ def group_by_sentences(df):
+     """Group episode transcript passages by sentences.
+     """
+     episode_id = None
+     guest = None
+     title = None
+     text = ""
+     start = None
+     end = None
+     new_rows = []
+
+     for i, row in df.iterrows():
+         # continue previous sentence that wasn't complete
+         if episode_id == row["id"]:
+             # append the current text to the previous text and merge timestamps
+             text += " " + row["text"]
+             end = row["end"]
+         else:
+             # otherwise, create a new row and reset variables
+             episode_id = row["id"]
+             guest = row["guest"]
+             title = row["title"]
+             text = row["text"]
+             start = row["start"]
+             end = row["end"]
+
+         if is_valid_sentence(text):
+             # add new sentence if valid and reset id
+             new_rows.append([episode_id, guest, title, text, start, end])
+             episode_id = None
+
+     # add the last row to the new_rows list
+     # new_rows.append([id, guest, title, text, start, end])
+
+     # create a new dataframe with the new rows
+     new_df = pd.DataFrame(new_rows, columns=df.columns)
+
+     return new_df
+
+
+ def group_by_chunks(df, chunk_size=10):
+     """Group episode transcript passages by chunks (where each chunk is a series of 'chunk_size' contiguous passages).
+     """
+     # create an empty DataFrame with the same columns as the input df
+     chunked_df = pd.DataFrame(columns=df.columns)
+
+     # group the dataframe by episodes
+     grouped_df = df.groupby('id')
+
+     # iterate over the groups
+     for id_val, id_df in grouped_df:
+         # iterate over the rows in chunks of size chunk_size
+         for i in range(0, len(id_df), chunk_size):
+             chunk = id_df.iloc[i:i+chunk_size]
+
+             # concatenate the text values and update the start and end values for the current chunk
+             text = " ".join(chunk['text'])
+             start = chunk['start'].iat[0]
+             end = chunk['end'].iat[-1]
+
+             # create a new row for the current chunk with the concatenated text and updated start and end values
+             new_row = pd.DataFrame({
+                 'id': [chunk['id'].iat[0]],
+                 'guest': [chunk['guest'].iat[0]],
+                 'title': [chunk['title'].iat[0]],
+                 'text': [text], 'start': [start],
+                 'end': [end]
+             })
+
+             # add the new row to the chunked_df DataFrame
+             chunked_df = pd.concat([chunked_df, new_row], ignore_index=True)
+
+     return chunked_df
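
To make the two grouping strategies concrete, here is a small round trip over a toy transcript DataFrame (a sketch only, not part of the commit: the column names match those used above, but the sample rows and the chunk_size=2 value are invented for illustration).

import pandas as pd

from utils.dataset_preprocessing import group_by_sentences, group_by_chunks

# toy transcript: three consecutive passages from one episode
df = pd.DataFrame({
    "id":    ["ep1", "ep1", "ep1"],
    "guest": ["Ada Lovelace"] * 3,
    "title": ["On computing"] * 3,
    "text":  ["Welcome to the show,", "today we talk about engines.", "Thanks for having me!"],
    "start": [0.0, 4.2, 9.8],
    "end":   [4.2, 9.8, 12.5],
})

# merge passages until each row starts with a capital letter and ends in ., ? or !
sentences = group_by_sentences(df)
print(sentences[["text", "start", "end"]])
# expected: two rows, the first spanning the first two passages (0.0 -> 9.8)

# alternatively, merge every 2 contiguous passages regardless of punctuation
chunks = group_by_chunks(df, chunk_size=2)
print(chunks[["text", "start", "end"]])
# expected: two rows, the second containing only the final passage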