Nuno Machado committed on
Commit 37deedc
1 Parent(s): f9ad220

Add dataset preprocessing options

requirements_cpu.txt ADDED
@@ -0,0 +1,7 @@
+ gradio
+ transformers
+ faiss-cpu
+ datasets
+ sentence-transformers
+ data-cache
+ torch
requirements_gpu.txt ADDED
@@ -0,0 +1,7 @@
+ gradio
+ transformers
+ faiss-gpu
+ datasets
+ sentence-transformers
+ data-cache
+ torch
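
The two requirement sets differ only in the FAISS build: on a CPU-only machine the environment can be created with pip install -r requirements_cpu.txt, while requirements_gpu.txt swaps faiss-cpu for faiss-gpu on machines with a CUDA-capable GPU.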
utils/__init__.py ADDED
File without changes
utils/dataset_loader.py ADDED
@@ -0,0 +1,41 @@
+ import pandas as pd
+ import numpy as np
+
+ from datasets import load_dataset
+
+
+ def convert_embeddings(embeddings_str):
+     embeddings = [np.fromstring(e[1:-1], sep=' ') for e in embeddings_str]
+     return embeddings
+
+
+ class DatasetLoader:
+     @staticmethod
+     def load_from_file(path: str, to_pandas=False):
+         if to_pandas:
+             return pd.read_csv(path)
+         else:
+             return load_dataset("csv", data_files=path)['train']
+
+     @staticmethod
+     def load_from_file_with_embeddings(path: str, to_pandas=False):
+         dataset = load_dataset("csv", data_files=path)['train']
+         new_dataset = dataset.map(
+             lambda example: {
+                 'embeddings': convert_embeddings(example['embeddings'])
+             },
+             batched=True,
+         )
+         if to_pandas:
+             return new_dataset.to_pandas()
+         else:
+             return new_dataset
+
+     @staticmethod
+     def load_from_huggingface(dataset_name: str, split: str, to_pandas=False):
+         # load dataset from HuggingFace hub
+         dataset = load_dataset(dataset_name, split=split)
+         if to_pandas:
+             return dataset.to_pandas()
+         else:
+             return dataset
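
For context, here is a short retrieval sketch of how these loaders can be used downstream (not part of the commit): it assumes a CSV with the transcript columns used elsewhere in this commit plus a string-serialized embeddings column, and it uses the FAISS helpers from the datasets library, which the faiss-cpu/faiss-gpu requirement suggests; the file path and model name are placeholders.

import numpy as np
from sentence_transformers import SentenceTransformer

from utils.dataset_loader import DatasetLoader

# hypothetical CSV with columns: id, guest, title, text, start, end, embeddings
dataset = DatasetLoader.load_from_file_with_embeddings("data/transcripts_with_embeddings.csv")

# build an in-memory FAISS index over the parsed embedding vectors
dataset.add_faiss_index(column="embeddings")

# embed the query with the same model that produced the stored embeddings (model name assumed)
model = SentenceTransformer("all-MiniLM-L6-v2")
query = model.encode("open source machine learning").astype(np.float32)

# fetch the 5 most similar passages
scores, examples = dataset.get_nearest_examples("embeddings", query, k=5)
for score, text in zip(scores, examples["text"]):
    print(f"{score:.2f}  {text[:80]}")

One small caveat in convert_embeddings: np.fromstring with a sep argument is deprecated in recent NumPy releases, and np.array(e[1:-1].split(), dtype=float) is the warning-free equivalent.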
utils/dataset_preprocessing.py ADDED
@@ -0,0 +1,82 @@
+ import re
+ import pandas as pd
+
+
+ def is_valid_sentence(text):
+     # check if text starts with a capitalized letter and ends with a punctuation mark
+     return re.match(r"^[A-Z].*", text) and re.match(r".*[.?!]\s*$", text)
+
+
+ def group_by_sentences(df):
+     """Group episode transcript passages by sentences.
+     """
+     episode_id = None
+     guest = None
+     title = None
+     text = ""
+     start = None
+     end = None
+     new_rows = []
+
+     for i, row in df.iterrows():
+         # continue previous sentence that wasn't complete
+         if episode_id == row["id"]:
+             # append the current text to the previous text and merge timestamps
+             text += " " + row["text"]
+             end = row["end"]
+         else:
+             # otherwise, create a new row and reset variables
+             episode_id = row["id"]
+             guest = row["guest"]
+             title = row["title"]
+             text = row["text"]
+             start = row["start"]
+             end = row["end"]
+
+         if is_valid_sentence(text):
+             # add new sentence if valid and reset id
+             new_rows.append([episode_id, guest, title, text, start, end])
+             episode_id = None
+
+     # add the last row to the new_rows list
+     # new_rows.append([id, guest, title, text, start, end])
+
+     # create a new dataframe with the new rows
+     new_df = pd.DataFrame(new_rows, columns=df.columns)
+
+     return new_df
+
+
+ def group_by_chunks(df, chunk_size=10):
+     """Group episode transcript passages by chunks (where each chunk is a series of 'chunk_size' contiguous passages).
+     """
+     # create an empty DataFrame with the same columns as the input df
+     chunked_df = pd.DataFrame(columns=df.columns)
+
+     # group the dataframe by episodes
+     grouped_df = df.groupby('id')
+
+     # iterate over the groups
+     for id_val, id_df in grouped_df:
+         # iterate over the rows in chunks of size chunk_size
+         for i in range(0, len(id_df), chunk_size):
+             chunk = id_df.iloc[i:i+chunk_size]
+
+             # concatenate the text values and update the start and end values for the current chunk
+             text = " ".join(chunk['text'])
+             start = chunk['start'].iat[0]
+             end = chunk['end'].iat[-1]
+
+             # create a new row for the current chunk with the concatenated text and updated start and end values
+             new_row = pd.DataFrame({
+                 'id': [chunk['id'].iat[0]],
+                 'guest': [chunk['guest'].iat[0]],
+                 'title': [chunk['title'].iat[0]],
+                 'text': [text], 'start': [start],
+                 'end': [end]
+             })
+
+             # add the new row to the chunked_df DataFrame
+             chunked_df = pd.concat([chunked_df, new_row], ignore_index=True)
+
+     return chunked_df
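
To make the two grouping strategies concrete, here is a small round trip over a toy transcript DataFrame (a sketch only, not part of the commit: the column names match those used above, but the sample rows and the chunk_size=2 value are invented for illustration).

import pandas as pd

from utils.dataset_preprocessing import group_by_sentences, group_by_chunks

# toy transcript: three consecutive passages from one episode
df = pd.DataFrame({
    "id":    ["ep1", "ep1", "ep1"],
    "guest": ["Ada Lovelace"] * 3,
    "title": ["On computing"] * 3,
    "text":  ["Welcome to the show,", "today we talk about engines.", "Thanks for having me!"],
    "start": [0.0, 4.2, 9.8],
    "end":   [4.2, 9.8, 12.5],
})

# merge passages until each row starts with a capital letter and ends in ., ? or !
sentences = group_by_sentences(df)
print(sentences[["text", "start", "end"]])
# expected: two rows, the first spanning the first two passages (0.0 -> 9.8)

# alternatively, merge every 2 contiguous passages regardless of punctuation
chunks = group_by_chunks(df, chunk_size=2)
print(chunks[["text", "start", "end"]])
# expected: two rows, the second containing only the final passage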