Spaces:

nmac
/

lex_fridman_podcast_semantic_search

Runtime error

lex_fridman_podcast_semantic_search / utils /dataset_preprocessing.py

Nuno Machado

Add dataset preprocessing options

37deedc over 1 year ago

2.76 kB

	import re
	import pandas as pd


	def is_valid_sentence(text):
	# check if text starts with a capitalized letter and ends with a punctuation mark
	return re.match(r"^[A-Z].", text) and re.match(r".[.?!]\s*$", text)


	def group_by_sentences(df):
	"""Group episode transcript passages by sentences.
	"""
	episode_id = None
	guest = None
	title = None
	text = ""
	start = None
	end = None
	new_rows = []

	for i, row in df.iterrows():
	# continue previous sentence that wasn't complete
	if episode_id == row["id"]:
	# append the current text to the previous text and merge timestamps
	text += " " + row["text"]
	end = row["end"]
	else:
	# otherwise, create a new row and reset variables
	episode_id = row["id"]
	guest = row["guest"]
	title = row["title"]
	text = row["text"]
	start = row["start"]
	end = row["end"]

	if is_valid_sentence(text):
	# add new sentence if valid and reset id
	new_rows.append([episode_id, guest, title, text, start, end])
	episode_id = None

	# add the last row to the new_rows list
	# new_rows.append([id, guest, title, text, start, end])

	# create a new dataframe with the new rows
	new_df = pd.DataFrame(new_rows, columns=df.columns)

	return new_df


	def group_by_chunks(df, chunk_size=10):
	"""Group episode transcript passages by chunks (where each chunk is a series of 'chunk_size' contiguous passages).
	"""
	# create an empty DataFrame with the same columns as the input df
	chunked_df = pd.DataFrame(columns=df.columns)

	# group the dataframe by episodes
	grouped_df = df.groupby('id')

	# iterate over the groups
	for id_val, id_df in grouped_df:
	# iterate over the rows in chunks of size chunk_size
	for i in range(0, len(id_df), chunk_size):
	chunk = id_df.iloc[i:i+chunk_size]

	# concatenate the text values and update the start and end values for the current chunk
	text = " ".join(chunk['text'])
	start = chunk['start'].iat[0]
	end = chunk['end'].iat[-1]

	# create a new row for the current chunk with the concatenated text and updated start and end values
	new_row = pd.DataFrame({
	'id': [chunk['id'].iat[0]],
	'guest': [chunk['guest'].iat[0]],
	'title': [chunk['title'].iat[0]],
	'text': [text], 'start': [start],
	'end': [end]
	})

	# add the new row to the chunked_df DataFrame
	chunked_df = pd.concat([chunked_df, new_row], ignore_index=True)

	return chunked_df