import pandas as pd

from utilities.praw_downloader import praw_downloader
from utilities.praw_processor import preprocess_praw_data


def get_latest_data() -> pd.DataFrame:
    """Download the latest submissions via PRAW and preprocess them into a DataFrame."""
    submissions = praw_downloader()
    df = preprocess_praw_data(submissions=submissions)
    return df


def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
    """
    Removes rows with redundant ids, retaining the one with the longest content.

    Parameters:
    - df (pd.DataFrame): The input DataFrame with columns 'id' and 'content'.

    Returns:
    - pd.DataFrame: A filtered DataFrame with unique ids, where each id is associated
      with the longest content available.
    """
    # Work on a copy so the caller's DataFrame is not mutated
    df = df.copy()

    # Create a column for content length
    df['content_length'] = df['content'].str.len()

    # Use groupby to get the index of the row with the longest content for each 'id'
    idx_to_keep = df.groupby('id')['content_length'].idxmax().values

    # Filter the DataFrame to only keep those rows
    df_filtered = df.loc[idx_to_keep]

    # Drop the helper 'content_length' column
    df_filtered = df_filtered.drop(columns=['content_length'])

    return df_filtered
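
# Sketch of the mechanics above on hypothetical data (not part of the pipeline):
# for a df with id ['a', 'a', 'b'] and content ['short', 'much longer text', 'solo'],
# groupby('id')['content_length'].idxmax() yields indices [1, 2], so df.loc[[1, 2]]
# keeps the longest-content row for 'a' and the only row for 'b'.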


def merge_and_filter_data(old_df: pd.DataFrame) -> pd.DataFrame:
    """
    Merges the provided DataFrame with the latest data, sorts the result by
    'date_utc', filters out redundant ids, and returns the merged and filtered
    DataFrame.

    Args:
    - old_df (pd.DataFrame): The existing data to be merged with the latest data.

    Returns:
    - pd.DataFrame: The merged and filtered DataFrame.
    """
    latest_df = get_latest_data()
    df = pd.concat([old_df, latest_df], ignore_index=True).sort_values(by='date_utc').reset_index(drop=True)
    df = filter_redundant_ids(df)
    return df
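

if __name__ == "__main__":
    # Minimal smoke test with hypothetical data, not part of the pipeline.
    # Running it assumes PRAW credentials are configured so praw_downloader()
    # can reach Reddit; the inline rows stand in for a previously saved dataset.
    existing = pd.DataFrame({
        'id': ['abc'],
        'content': ['an earlier submission'],
        'date_utc': [pd.Timestamp('2023-01-01')],
    })
    merged = merge_and_filter_data(existing)
    print(merged.head())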