File size: 3,788 Bytes
285612d
 
 
 
 
 
 
 
 
 
 
 
 
 
d0c9304
76a52b4
 
285612d
 
d0c9304
285612d
 
d0c9304
 
76a52b4
285612d
 
76a52b4
 
 
285612d
 
 
d0c9304
 
76a52b4
285612d
d0c9304
 
 
285612d
d0c9304
 
285612d
76a52b4
 
 
 
 
 
 
285612d
76a52b4
 
 
 
5d9e0b8
 
bcf2055
285612d
bcf2055
5d9e0b8
bcf2055
 
 
285612d
 
5d9e0b8
 
285612d
 
bcf2055
285612d
 
bcf2055
 
 
 
5d9e0b8
 
 
bcf2055
285612d
bcf2055
 
 
 
285612d
d0c9304
 
 
 
 
5d9e0b8
 
 
 
d0c9304
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import pandas as pd

from utilities.praw_downloader import praw_downloader
from utilities.praw_processor import preprocess_praw_data


def get_latest_data():
    """Download the latest Reddit submissions via PRAW and preprocess them.

    Returns:
    - pd.DataFrame: The preprocessed submission data.
    """
    return preprocess_praw_data(submissions=praw_downloader())


def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
    """
    For each id, creates a new row with the longest content and the highest score
    from the available rows with the same id. Adds a boolean column 'updated'
    indicating whether the row was updated.

    The input DataFrame is not modified.

    Parameters:
    - df (pd.DataFrame): The input DataFrame with columns 'id', 'content', and 'score'.

    Returns:
    - pd.DataFrame: A DataFrame with unique ids, where each id is associated
                    with the longest content available and the highest score from
                    potentially different rows, and a boolean column 'updated'.
    """

    # Untouched snapshot used for the 'updated' comparison below.
    original_df = df.copy()

    # Work on a separate copy so the caller's DataFrame is never mutated
    # (a previous version leaked a 'content_length' column into the input).
    work = df.copy()

    # Create a column for content length
    work['content_length'] = work['content'].str.len()

    # Find row with the longest content for each 'id'
    idx_longest_content = work.groupby('id')['content_length'].idxmax().values
    df_longest_content = work.loc[idx_longest_content][['id', 'content']]

    # Find row with the highest score for each 'id'
    idx_highest_score = work.groupby('id')['score'].idxmax().values
    df_highest_score = work.loc[idx_highest_score][['id', 'score']]

    # Merge the two DataFrames on 'id' to pair best content with best score
    df_merged = pd.merge(df_longest_content, df_highest_score, on='id')

    # Compare the chosen content/score against every original row for that id;
    # 'updated' is True when either value differs from the original row's.
    df_merged = df_merged.merge(original_df, on='id', suffixes=('', '_original'))
    df_merged['updated'] = (df_merged['content'] != df_merged['content_original']) | (
            df_merged['score'] != df_merged['score_original'])

    # Keep only the first row per id (the comparison above fans out one row
    # per original occurrence of the id).
    df_merged.drop_duplicates(subset='id', inplace=True)

    # Drop the comparison-only columns
    df_merged.drop(columns=['content_original', 'score_original'], inplace=True)

    return df_merged


def merge_data(old_df: pd.DataFrame, new_df: pd.DataFrame) -> pd.DataFrame:
    """
    Merges two dataframes, sorts them by 'date_utc', and marks new IDs.

    The function first marks rows from the new dataframe, then concatenates the old and new dataframes.
    It sorts the resulting dataframe by the 'date_utc' column. Rows from the new dataframe that are not
    in the old dataframe are marked as 'new'. Neither input dataframe is modified.

    Args:
    - old_df (pd.DataFrame): The original dataframe.
    - new_df (pd.DataFrame): The new dataframe to be merged with the original dataframe.

    Returns:
    - pd.DataFrame: The merged, sorted, and marked dataframe.
    """

    # Copy first so the 'new' marker column is not written into the caller's
    # DataFrames (a previous version mutated both arguments in place).
    old_df = old_df.copy()
    new_df = new_df.copy()

    # Mark rows in old and new dataframes
    old_df['new'] = False
    new_df['new'] = True

    # Concatenate old and new dataframes, sort by 'date_utc', and reset index
    df = pd.concat([old_df, new_df], ignore_index=True).sort_values(by='date_utc').reset_index(drop=True)

    # Collapse duplicate ids to a single best row per id
    df = filter_redundant_ids(df)

    # NOTE(review): filter_redundant_ids already returns unique ids, so
    # duplicated(keep=False) is all-False here and this reduces to keeping the
    # surviving row's 'new' flag unchanged — confirm this matches the intent
    # of marking only ids absent from old_df.
    df['new'] = df['new'] & ~df['id'].duplicated(keep=False)

    return df


if __name__ == '__main__':
    # Demonstrate filter_redundant_ids on a small hand-built example.
    sample = pd.DataFrame({
        'id': [1, 1, 2, 2, 3],
        'content': ['short', 'much longer content', 'mid', 'size', 'constant'],
        'score': [10, 5, 7, 9, 6],
        'another_column': ['a', 'a', 'b', 'b', 'c'],
    })

    print("Original DataFrame:")
    print(sample)
    print("\nFiltered DataFrame:")
    print(filter_redundant_ids(sample))