Nuno Machado committed
Commit 37deedc
1 Parent: f9ad220

Add dataset preprocessing options
Files changed:
- requirements_cpu.txt (+7, -0)
- requirements_gpu.txt (+7, -0)
- utils/__init__.py (+0, -0)
- utils/dataset_loader.py (+41, -0)
- utils/dataset_preprocessing.py (+82, -0)
requirements_cpu.txt
ADDED
@@ -0,0 +1,7 @@
+gradio
+transformers
+faiss-cpu
+datasets
+sentence-transformers
+data-cache
+torch
requirements_gpu.txt
ADDED
@@ -0,0 +1,7 @@
+gradio
+transformers
+faiss-gpu
+datasets
+sentence-transformers
+data-cache
+torch
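The two requirements files differ only in the faiss wheel (faiss-cpu vs. faiss-gpu). A minimal install sketch (not part of the commit), written in Python so it stays self-contained and assuming pip is available:

import subprocess
import sys

# Install the CPU variant; swap in requirements_gpu.txt on a CUDA machine.
subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "-r", "requirements_cpu.txt"]
)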
utils/__init__.py
ADDED
File without changes
utils/dataset_loader.py
ADDED
@@ -0,0 +1,41 @@
+import pandas as pd
+import numpy as np
+
+from datasets import load_dataset
+
+
+def convert_embeddings(embeddings_str):
+    embeddings = [np.fromstring(e[1:-1], sep=' ') for e in embeddings_str]
+    return embeddings
+
+
+class DatasetLoader:
+    @staticmethod
+    def load_from_file(path: str, to_pandas=False):
+        if to_pandas:
+            return pd.read_csv(path)
+        else:
+            return load_dataset("csv", data_files=path)['train']
+
+    @staticmethod
+    def load_from_file_with_embeddings(path: str, to_pandas=False):
+        dataset = load_dataset("csv", data_files=path)['train']
+        new_dataset = dataset.map(
+            lambda example: {
+                'embeddings': convert_embeddings(example['embeddings'])
+            },
+            batched=True,
+        )
+        if to_pandas:
+            return new_dataset.to_pandas()
+        else:
+            return new_dataset
+
+    @staticmethod
+    def load_from_huggingface(dataset_name: str, split: str, to_pandas=False):
+        # load dataset from HuggingFace hub
+        dataset = load_dataset(dataset_name, split=split)
+        if to_pandas:
+            return dataset.to_pandas()
+        else:
+            return dataset
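A minimal usage sketch for the loader (not part of the commit; the file paths are hypothetical). load_from_file_with_embeddings expects an 'embeddings' column holding strings such as "[0.1 0.2 0.3]", which convert_embeddings parses back into NumPy arrays batch by batch:

from utils.dataset_loader import DatasetLoader

# plain CSV -> datasets.Dataset (set to_pandas=True for a DataFrame instead)
ds = DatasetLoader.load_from_file("data/episodes.csv")

# CSV with a serialized 'embeddings' column -> arrays restored via np.fromstring
ds_emb = DatasetLoader.load_from_file_with_embeddings("data/episodes_embedded.csv")

# any dataset on the HuggingFace hub, here returned as a pandas DataFrame
df = DatasetLoader.load_from_huggingface("squad", split="train", to_pandas=True)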
utils/dataset_preprocessing.py
ADDED
@@ -0,0 +1,82 @@
+import re
+import pandas as pd
+
+
+def is_valid_sentence(text):
+    # check if text starts with a capitalized letter and ends with a punctuation mark
+    return re.match(r"^[A-Z].*", text) and re.match(r".*[.?!]\s*$", text)
+
+
+def group_by_sentences(df):
+    """Group episode transcript passages by sentences.
+    """
+    episode_id = None
+    guest = None
+    title = None
+    text = ""
+    start = None
+    end = None
+    new_rows = []
+
+    for i, row in df.iterrows():
+        # continue previous sentence that wasn't complete
+        if episode_id == row["id"]:
+            # append the current text to the previous text and merge timestamps
+            text += " " + row["text"]
+            end = row["end"]
+        else:
+            # otherwise, create a new row and reset variables
+            episode_id = row["id"]
+            guest = row["guest"]
+            title = row["title"]
+            text = row["text"]
+            start = row["start"]
+            end = row["end"]
+
+        if is_valid_sentence(text):
+            # add new sentence if valid and reset id
+            new_rows.append([episode_id, guest, title, text, start, end])
+            episode_id = None
+
+    # add the last row to the new_rows list
+    # new_rows.append([id, guest, title, text, start, end])
+
+    # create a new dataframe with the new rows
+    new_df = pd.DataFrame(new_rows, columns=df.columns)
+
+    return new_df
+
+
+def group_by_chunks(df, chunk_size=10):
+    """Group episode transcript passages by chunks (where each chunk is a series of 'chunk_size' contiguous passages).
+    """
+    # create an empty DataFrame with the same columns as the input df
+    chunked_df = pd.DataFrame(columns=df.columns)
+
+    # group the dataframe by episodes
+    grouped_df = df.groupby('id')
+
+    # iterate over the groups
+    for id_val, id_df in grouped_df:
+        # iterate over the rows in chunks of size chunk_size
+        for i in range(0, len(id_df), chunk_size):
+            chunk = id_df.iloc[i:i+chunk_size]
+
+            # concatenate the text values and update the start and end values for the current chunk
+            text = " ".join(chunk['text'])
+            start = chunk['start'].iat[0]
+            end = chunk['end'].iat[-1]
+
+            # create a new row for the current chunk with the concatenated text and updated start and end values
+            new_row = pd.DataFrame({
+                'id': [chunk['id'].iat[0]],
+                'guest': [chunk['guest'].iat[0]],
+                'title': [chunk['title'].iat[0]],
+                'text': [text], 'start': [start],
+                'end': [end]
+            })
+
+            # add the new row to the chunked_df DataFrame
+            chunked_df = pd.concat([chunked_df, new_row], ignore_index=True)
+
+    return chunked_df
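A toy example (not part of the commit) showing how the two grouping strategies behave; the column names follow the code above. Note that group_by_sentences drops a trailing passage that never completes a valid sentence, an edge case the commented-out append at the end of its loop hints at:

import pandas as pd
from utils.dataset_preprocessing import group_by_sentences, group_by_chunks

df = pd.DataFrame({
    "id":    ["ep1", "ep1", "ep1"],
    "guest": ["Ada", "Ada", "Ada"],
    "title": ["Episode 1"] * 3,
    "text":  ["This passage spans", "two rows.", "This one does not."],
    "start": [0.0, 5.0, 10.0],
    "end":   [5.0, 10.0, 15.0],
})

# merges the first two passages into one complete sentence:
# "This passage spans two rows." (0.0-10.0), then "This one does not." (10.0-15.0)
sentences = group_by_sentences(df)

# merges every 2 contiguous passages per episode, regardless of sentence boundaries
chunks = group_by_chunks(df, chunk_size=2)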