File size: 2,736 Bytes
285612d
 
 
 
 
 
 
 
 
 
 
 
 
 
d0c9304
 
285612d
 
d0c9304
285612d
 
d0c9304
 
 
285612d
 
 
 
 
d0c9304
 
5d9e0b8
285612d
d0c9304
 
 
285612d
d0c9304
 
285612d
d0c9304
285612d
 
5d9e0b8
 
 
285612d
5d9e0b8
 
 
 
 
285612d
 
5d9e0b8
 
285612d
 
5d9e0b8
285612d
 
5d9e0b8
 
 
 
285612d
 
d0c9304
 
 
 
 
5d9e0b8
 
 
 
d0c9304
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import pandas as pd

from utilities.praw_downloader import praw_downloader
from utilities.praw_processor import preprocess_praw_data


def get_latest_data():
    """Download the latest submissions via PRAW and return them preprocessed.

    Returns:
    - pd.DataFrame: the preprocessed submission data.
    """
    raw_submissions = praw_downloader()
    return preprocess_praw_data(submissions=raw_submissions)


def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
    """
    For each id, creates a new row with the longest content and the highest score
    from the available rows with the same id.

    Parameters:
    - df (pd.DataFrame): The input DataFrame with columns 'id', 'content', and 'score'.

    Returns:
    - pd.DataFrame: A DataFrame with unique ids, where each id is associated
                    with the longest content available and the highest score from
                    potentially different rows.
    """

    # Work on a copy: the original implementation wrote a 'content_length'
    # helper column straight into the caller's DataFrame (a surprising side
    # effect that also accumulated across repeated calls).
    df = df.copy()

    # Temporary column used only to rank rows by content length.
    df['content_length'] = df['content'].str.len()

    # Row with the longest content for each 'id'. Drop 'score' (it will come
    # from the highest-scoring row instead) and the helper column, which the
    # original version leaked into the returned result.
    idx_longest_content = df.groupby('id')['content_length'].idxmax()
    df_longest_content = df.loc[idx_longest_content].drop(
        columns=['score', 'content_length']
    )

    # Row with the highest score for each 'id'; keep only the key and score.
    idx_highest_score = df.groupby('id')['score'].idxmax()
    df_highest_score = df.loc[idx_highest_score, ['id', 'score']]

    # Combine longest content with highest score, matched on 'id'.
    return pd.merge(df_longest_content, df_highest_score, on='id')




def merge_and_filter_data(old_df: pd.DataFrame, new_df: pd.DataFrame) -> pd.DataFrame:
    """
    Merges two dataframes, sorts them by 'date_utc', and filters out redundant IDs.

    The old and new dataframes are concatenated, sorted chronologically on the
    'date_utc' column, and then deduplicated per id via `filter_redundant_ids`.

    Args:
    - old_df (pd.DataFrame): The original dataframe.
    - new_df (pd.DataFrame): The new dataframe to be merged with the original dataframe.

    Returns:
    - pd.DataFrame: The merged, sorted, and filtered dataframe.
    """

    # Stack old and new rows into a single frame with a fresh index.
    combined = pd.concat([old_df, new_df], ignore_index=True)

    # Chronological order, index rebuilt after the sort.
    combined = combined.sort_values(by='date_utc').reset_index(drop=True)

    # Collapse duplicate ids into one best row each.
    return filter_redundant_ids(combined)


if __name__ == '__main__':
    # Small hand-built fixture: ids 1 and 2 appear twice with differing
    # content lengths and scores, id 3 appears once.
    sample_rows = {
        'id': [1, 1, 2, 2, 3],
        'content': ['short', 'much longer content', 'mid', 'size', 'constant'],
        'score': [10, 5, 7, 9, 6],
        'another_column': ['a', 'a', 'b', 'b', 'c']
        }
    df = pd.DataFrame(sample_rows)

    # Show the data before and after deduplication.
    print("Original DataFrame:")
    print(df)
    print("\nFiltered DataFrame:")
    print(filter_redundant_ids(df))