# lex_fridman_podcast_semantic_search/utils/dataset_preprocessing.py
import re
import pandas as pd


def is_valid_sentence(text):
    # a valid sentence starts with a capital letter and ends with a
    # sentence-terminating punctuation mark (. ? !)
    return bool(re.match(r"^[A-Z]", text) and re.search(r"[.?!]\s*$", text))
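

# For illustration, a couple of hypothetical inputs (not from the dataset):
#   is_valid_sentence("Where does consciousness come from?")  -> True
#   is_valid_sentence("and then we talked about")             -> False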


def group_by_sentences(df):
    """Group episode transcript passages into complete sentences."""
    episode_id = None
    guest = None
    title = None
    text = ""
    start = None
    end = None
    new_rows = []

    for _, row in df.iterrows():
        if episode_id == row["id"]:
            # continue the previous, incomplete sentence: append the current
            # text and extend the end timestamp
            text += " " + row["text"]
            end = row["end"]
        else:
            # otherwise, start a new sentence from the current passage
            episode_id = row["id"]
            guest = row["guest"]
            title = row["title"]
            text = row["text"]
            start = row["start"]
            end = row["end"]

        if is_valid_sentence(text):
            # the sentence is complete: store it and reset the episode id so
            # the next passage starts a fresh sentence
            new_rows.append([episode_id, guest, title, text, start, end])
            episode_id = None

    # note: trailing text that never forms a valid sentence is discarded

    return pd.DataFrame(new_rows, columns=df.columns)
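

# Usage sketch (hypothetical rows, assuming the id/guest/title/text/start/end
# schema used above):
#
#   df = pd.DataFrame(
#       [["ep1", "Guest", "Title", "This is the first", 0.0, 2.5],
#        ["ep1", "Guest", "Title", "half of a sentence.", 2.5, 5.0]],
#       columns=["id", "guest", "title", "text", "start", "end"],
#   )
#   group_by_sentences(df)
#   # -> one row: "This is the first half of a sentence.", start=0.0, end=5.0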


def group_by_chunks(df, chunk_size=10):
    """Group episode transcript passages into chunks, where each chunk is a
    series of `chunk_size` contiguous passages from the same episode.
    """
    new_rows = []

    # group the dataframe by episode so chunks never cross episode boundaries
    for _, episode_df in df.groupby("id"):
        # walk over the episode's passages in steps of chunk_size
        for i in range(0, len(episode_df), chunk_size):
            chunk = episode_df.iloc[i:i + chunk_size]
            # concatenate the chunk's text and merge its start/end timestamps
            new_rows.append({
                "id": chunk["id"].iat[0],
                "guest": chunk["guest"].iat[0],
                "title": chunk["title"].iat[0],
                "text": " ".join(chunk["text"]),
                "start": chunk["start"].iat[0],
                "end": chunk["end"].iat[-1],
            })

    return pd.DataFrame(new_rows, columns=df.columns)
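

# Minimal self-check with hypothetical sample data (not part of the real
# dataset); run this module directly to see both groupings in action:
if __name__ == "__main__":
    sample = pd.DataFrame(
        [
            ["ep1", "Guest A", "Episode 1", "Hello and welcome", 0.0, 1.5],
            ["ep1", "Guest A", "Episode 1", "to the show.", 1.5, 3.0],
            ["ep1", "Guest A", "Episode 1", "Let's get started.", 3.0, 5.0],
            ["ep2", "Guest B", "Episode 2", "Another episode begins.", 0.0, 2.0],
        ],
        columns=["id", "guest", "title", "text", "start", "end"],
    )
    print(group_by_sentences(sample))
    print(group_by_chunks(sample, chunk_size=2))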