import pandas as pd

from utilities.praw_downloader import praw_downloader
from utilities.praw_processor import preprocess_praw_data


def get_latest_data() -> pd.DataFrame:
    """Download the latest Reddit submissions via PRAW and preprocess them into a DataFrame."""
    submissions = praw_downloader()
    df = preprocess_praw_data(submissions=submissions)
    return df


def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
    """
    For each id, creates a new row with the longest content and the highest score
    from the available rows with the same id. Adds a boolean column 'updated'
    indicating whether the row was updated.

    Parameters:
    - df (pd.DataFrame): The input DataFrame with columns 'id', 'content', and 'score'.

    Returns:
    - pd.DataFrame: A DataFrame with unique ids, where each id is associated
                    with the longest content available and the highest score from
                    potentially different rows, and a boolean column 'updated'.
    """
    original_df = df.copy()

    # Work on a copy so the helper column does not leak into the caller's DataFrame.
    df = df.copy()
    df['content_length'] = df['content'].str.len()

    # For each id, pick the row holding the longest content.
    idx_longest_content = df.groupby('id')['content_length'].idxmax().values
    df_longest_content = df.loc[idx_longest_content][['id', 'content']]

    # For each id, pick the row holding the highest score.
    idx_highest_score = df.groupby('id')['score'].idxmax().values
    df_highest_score = df.loc[idx_highest_score][['id', 'score']]

    # Combine the longest content and the highest score per id.
    df_merged = pd.merge(df_longest_content, df_highest_score, on='id')

    # Bring back the remaining columns and flag rows whose content or score changed.
    df_merged = df_merged.merge(original_df, on='id', suffixes=('', '_original'))
    df_merged['updated'] = (df_merged['content'] != df_merged['content_original']) | (
            df_merged['score'] != df_merged['score_original'])

    # Keep a single row per id and drop the helper comparison columns.
    df_merged.drop_duplicates(subset='id', inplace=True)
    df_merged.drop(columns=['content_original', 'score_original'], inplace=True)

    return df_merged


def merge_data(old_df: pd.DataFrame, new_df: pd.DataFrame) -> pd.DataFrame:
    """
    Merges two dataframes, sorts them by 'date_utc', and marks new IDs.

    The function first marks rows from the new dataframe, then concatenates the old and new dataframes.
    It sorts the resulting dataframe by the 'date_utc' column. Rows from the new dataframe that are not
    in the old dataframe are marked as 'new'.

    Args:
    - old_df (pd.DataFrame): The original dataframe.
    - new_df (pd.DataFrame): The new dataframe to be merged with the original dataframe.

    Returns:
    - pd.DataFrame: The merged, sorted, and marked dataframe.
    """
    old_df['new'] = False
    new_df['new'] = True

    df = pd.concat([old_df, new_df], ignore_index=True).sort_values(by='date_utc').reset_index(drop=True)

    # Ids that appear more than once were already present in the old data, so they
    # are not 'new'. This must run before duplicates are collapsed below, otherwise
    # the duplicated() check would see only unique ids and become a no-op.
    df['new'] = df['new'] & ~df['id'].duplicated(keep=False)

    # Collapse duplicate ids, keeping the longest content and the highest score per id.
    df = filter_redundant_ids(df)

    return df


if __name__ == '__main__':
    data = {
        'id': [1, 1, 2, 2, 3],
        'content': ['short', 'much longer content', 'mid', 'size', 'constant'],
        'score': [10, 5, 7, 9, 6],
        'another_column': ['a', 'a', 'b', 'b', 'c']
    }

    df = pd.DataFrame(data)

    print("Original DataFrame:")
    print(df)
    print("\nFiltered DataFrame:")
    print(filter_redundant_ids(df))
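    # A minimal sketch of merge_data on two tiny frames. The rows below are made-up
    # illustration data (not real Reddit submissions) and assume only the columns
    # used above: 'id', 'content', 'score', plus a sortable 'date_utc'.
    old = pd.DataFrame({
        'id': [1, 2],
        'content': ['short', 'mid'],
        'score': [10, 7],
        'date_utc': pd.to_datetime(['2023-01-01', '2023-01-02']),
    })
    new = pd.DataFrame({
        'id': [2, 3],
        'content': ['mid, but longer now', 'brand new post'],
        'score': [9, 6],
        'date_utc': pd.to_datetime(['2023-01-03', '2023-01-04']),
    })

    print("\nMerged DataFrame (id 2 is updated, id 3 is new):")
    print(merge_data(old, new))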