import pandas as pd

from utilities.praw_downloader import praw_downloader
from utilities.praw_processor import preprocess_praw_data


def get_latest_data() -> pd.DataFrame:
    """Download the latest submissions via PRAW and return them as a preprocessed DataFrame."""
    submissions = praw_downloader()
    df = preprocess_praw_data(submissions=submissions)
    return df


def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
    """
    For each id, create a single row that combines the longest content and the highest score
    found among the rows sharing that id.

    Parameters:
    - df (pd.DataFrame): The input DataFrame with columns 'id', 'content', and 'score'.

    Returns:
    - pd.DataFrame: A DataFrame with unique ids, where each id is associated
                    with the longest content available and the highest score from
                    potentially different rows.
    """
    # Work on a copy so the temporary helper column does not leak into the caller's DataFrame
    df = df.copy()
    df['content_length'] = df['content'].str.len()

    # Find the row with the longest content for each 'id'
    idx_longest_content = df.groupby('id')['content_length'].idxmax().values
    df_longest_content = df.loc[idx_longest_content].drop(columns=['score'])

    # Find the row with the highest score for each 'id'
    idx_highest_score = df.groupby('id')['score'].idxmax().values
    df_highest_score = df.loc[idx_highest_score][['id', 'score']]

    # Merge the two views on 'id' and drop the temporary length column
    df_merged = pd.merge(df_longest_content, df_highest_score, on='id')
    return df_merged.drop(columns=['content_length'])


def merge_and_filter_data(old_df: pd.DataFrame, new_df: pd.DataFrame) -> pd.DataFrame:
    """
    Merges two dataframes, sorts them by 'date_utc', and filters out redundant IDs.

    The function first concatenates the old and new dataframes. Then, it sorts the
    resulting dataframe by the 'date_utc' column. Finally, it filters out redundant IDs
    using the `filter_redundant_ids` function.

    Args:
    - old_df (pd.DataFrame): The original dataframe.
    - new_df (pd.DataFrame): The new dataframe to be merged with the original dataframe.

    Returns:
    - pd.DataFrame: The merged, sorted, and filtered dataframe.
    """
    # Concatenate old and new dataframes, sort by 'date_utc', and reset the index
    df = pd.concat([old_df, new_df], ignore_index=True).sort_values(by='date_utc').reset_index(drop=True)

    # Filter out redundant IDs
    df = filter_redundant_ids(df)
    return df


if __name__ == '__main__':
    # Mock data
    data = {
        'id': [1, 1, 2, 2, 3],
        'content': ['short', 'much longer content', 'mid', 'size', 'constant'],
        'score': [10, 5, 7, 9, 6],
        'another_column': ['a', 'a', 'b', 'b', 'c']
    }
    df = pd.DataFrame(data)

    print("Original DataFrame:")
    print(df)
    print("\nFiltered DataFrame:")
    print(filter_redundant_ids(df))
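
    # A minimal sketch of merge_and_filter_data on mock "old" and "new" frames (not part of the
    # original demo): the columns 'id', 'content', 'score', and 'date_utc' match what the
    # functions above expect; the values themselves are invented purely for illustration.
    old_df = pd.DataFrame({
        'id': [1, 2],
        'content': ['short', 'mid'],
        'score': [10, 7],
        'date_utc': pd.to_datetime(['2023-01-01', '2023-01-02']),
    })
    new_df = pd.DataFrame({
        'id': [2, 3],
        'content': ['mid but longer', 'constant'],
        'score': [9, 6],
        'date_utc': pd.to_datetime(['2023-01-03', '2023-01-04']),
    })

    print("\nMerged and filtered DataFrame:")
    # Expect one row per id: id 2 keeps the longer content ('mid but longer') and the higher score (9)
    print(merge_and_filter_data(old_df, new_df))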