import pandas as pd

from utilities.praw_downloader import praw_downloader
from utilities.praw_processor import preprocess_praw_data


def get_latest_data() -> pd.DataFrame:
    """Download the latest submissions via PRAW and preprocess them into a DataFrame."""
    submissions = praw_downloader()
    df = preprocess_praw_data(submissions=submissions)
    return df


def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
    """
    Removes rows with redundant ids, retaining the one with the longest content.

    Parameters:
    - df (pd.DataFrame): The input DataFrame with columns 'id' and 'content'.

    Returns:
    - pd.DataFrame: A filtered DataFrame with unique ids, where each id is associated
      with the longest content available.
    """
    # Work on a copy so the caller's DataFrame is not mutated
    df = df.copy()

    # Create a column for content length
    df['content_length'] = df['content'].str.len()

    # Use groupby to get the index of the row with the longest content for each 'id'
    idx_to_keep = df.groupby('id')['content_length'].idxmax().values

    # Filter the DataFrame to only keep those rows
    df_filtered = df.loc[idx_to_keep]

    # Drop the helper 'content_length' column
    df_filtered = df_filtered.drop(columns=['content_length'])

    return df_filtered
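
# Sketch of the mechanics above on hypothetical data (not part of the pipeline):
# for a df with id ['a', 'a', 'b'] and content ['short', 'much longer text', 'solo'],
# groupby('id')['content_length'].idxmax() yields indices [1, 2], so df.loc[[1, 2]]
# keeps the longest-content row for 'a' and the only row for 'b'.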


def merge_and_filter_data(old_df: pd.DataFrame) -> pd.DataFrame:
    """
    Merges the provided DataFrame with the latest data, sorts the result by
    'date_utc', filters out redundant ids, and returns the merged and filtered
    DataFrame.

    Args:
    - old_df (pd.DataFrame): The existing data to be merged with the latest data.

    Returns:
    - pd.DataFrame: The merged and filtered DataFrame.
    """
    latest_df = get_latest_data()
    df = pd.concat([old_df, latest_df], ignore_index=True).sort_values(by='date_utc').reset_index(drop=True)
    df = filter_redundant_ids(df)
    return df
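

if __name__ == "__main__":
    # Minimal smoke test with hypothetical data, not part of the pipeline.
    # Running it assumes PRAW credentials are configured so praw_downloader()
    # can reach Reddit; the inline rows stand in for a previously saved dataset.
    existing = pd.DataFrame({
        'id': ['abc'],
        'content': ['an earlier submission'],
        'date_utc': [pd.Timestamp('2023-01-01')],
    })
    merged = merge_and_filter_data(existing)
    print(merged.head())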