import pandas as pd

from utilities.praw_downloader import praw_downloader
from utilities.praw_processor import preprocess_praw_data


def get_latest_data() -> pd.DataFrame:
    """Download the latest Reddit submissions via PRAW and preprocess them into a DataFrame."""
    submissions = praw_downloader()
    df = preprocess_praw_data(submissions=submissions)
    return df


def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
    """
    For each id, creates a new row with the longest content and the highest score
    from the available rows with the same id. Adds a boolean column 'updated'
    indicating whether the row was updated.

    Parameters:
    - df (pd.DataFrame): The input DataFrame with columns 'id', 'content', and 'score'.

    Returns:
    - pd.DataFrame: A DataFrame with unique ids, where each id is associated
                    with the longest content available and the highest score from
                    potentially different rows, and a boolean column 'updated'.
    """
    original_df = df.copy()

    # Work on a copy so the helper column does not leak into the caller's DataFrame.
    df = df.copy()
    df['content_length'] = df['content'].str.len()

    # For each id, pick the row holding the longest content.
    idx_longest_content = df.groupby('id')['content_length'].idxmax().values
    df_longest_content = df.loc[idx_longest_content][['id', 'content']]

    # For each id, pick the row holding the highest score.
    idx_highest_score = df.groupby('id')['score'].idxmax().values
    df_highest_score = df.loc[idx_highest_score][['id', 'score']]

    # Combine the longest content and the highest score per id.
    df_merged = pd.merge(df_longest_content, df_highest_score, on='id')

    # Bring back the remaining columns and flag rows whose content or score changed.
    df_merged = df_merged.merge(original_df, on='id', suffixes=('', '_original'))
    df_merged['updated'] = (df_merged['content'] != df_merged['content_original']) | (
            df_merged['score'] != df_merged['score_original'])

    # Keep a single row per id and drop the helper comparison columns.
    df_merged.drop_duplicates(subset='id', inplace=True)
    df_merged.drop(columns=['content_original', 'score_original'], inplace=True)

    return df_merged


def merge_data(old_df: pd.DataFrame, new_df: pd.DataFrame) -> pd.DataFrame:
    """
    Merges two dataframes, sorts them by 'date_utc', and marks new IDs.

    The function first marks rows from the new dataframe, then concatenates the old and new dataframes.
    It sorts the resulting dataframe by the 'date_utc' column. Rows from the new dataframe that are not
    in the old dataframe are marked as 'new'.

    Args:
    - old_df (pd.DataFrame): The original dataframe.
    - new_df (pd.DataFrame): The new dataframe to be merged with the original dataframe.

    Returns:
    - pd.DataFrame: The merged, sorted, and marked dataframe.
    """
    old_df['new'] = False
    new_df['new'] = True

    df = pd.concat([old_df, new_df], ignore_index=True).sort_values(by='date_utc').reset_index(drop=True)

    # Ids that appear more than once were already present in the old data, so they
    # are not 'new'. This must run before duplicates are collapsed below, otherwise
    # the duplicated() check would see only unique ids and become a no-op.
    df['new'] = df['new'] & ~df['id'].duplicated(keep=False)

    # Collapse duplicate ids, keeping the longest content and the highest score per id.
    df = filter_redundant_ids(df)

    return df


if __name__ == '__main__':
    data = {
        'id': [1, 1, 2, 2, 3],
        'content': ['short', 'much longer content', 'mid', 'size', 'constant'],
        'score': [10, 5, 7, 9, 6],
        'another_column': ['a', 'a', 'b', 'b', 'c']
    }

    df = pd.DataFrame(data)

    print("Original DataFrame:")
    print(df)
    print("\nFiltered DataFrame:")
    print(filter_redundant_ids(df))
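    # A minimal sketch of merge_data on two tiny frames. The rows below are made-up
    # illustration data (not real Reddit submissions) and assume only the columns
    # used above: 'id', 'content', 'score', plus a sortable 'date_utc'.
    old = pd.DataFrame({
        'id': [1, 2],
        'content': ['short', 'mid'],
        'score': [10, 7],
        'date_utc': pd.to_datetime(['2023-01-01', '2023-01-02']),
    })
    new = pd.DataFrame({
        'id': [2, 3],
        'content': ['mid, but longer now', 'brand new post'],
        'score': [9, 6],
        'date_utc': pd.to_datetime(['2023-01-03', '2023-01-04']),
    })

    print("\nMerged DataFrame (id 2 is updated, id 3 is new):")
    print(merge_data(old, new))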