derek-thomas HF staff committed on
Commit
76a52b4
1 Parent(s): 9a66c2f

Updating merge function to show if a row was updated

Browse files
Files changed (1) hide show
  1. utilities/data_collator.py +18 -4
utilities/data_collator.py CHANGED
@@ -13,7 +13,8 @@ def get_latest_data():
13
  def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
14
  """
15
  For each id, creates a new row with the longest content and the highest score
16
- from the available rows with the same id.
 
17
 
18
  Parameters:
19
  - df (pd.DataFrame): The input DataFrame with columns 'id', 'content', and 'score'.
@@ -21,15 +22,18 @@ def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
21
  Returns:
22
  - pd.DataFrame: A DataFrame with unique ids, where each id is associated
23
  with the longest content available and the highest score from
24
- potentially different rows.
25
  """
26
 
 
 
 
27
  # Create a column for content length
28
  df['content_length'] = df['content'].str.len()
29
 
30
  # Find row with the longest content for each 'id'
31
  idx_longest_content = df.groupby('id')['content_length'].idxmax().values
32
- df_longest_content = df.loc[idx_longest_content].drop(columns=['score'])
33
 
34
  # Find row with the highest score for each 'id'
35
  idx_highest_score = df.groupby('id')['score'].idxmax().values
@@ -38,9 +42,19 @@ def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
38
  # Merge the two DataFrames on 'id'
39
  df_merged = pd.merge(df_longest_content, df_highest_score, on='id')
40
 
41
- return df_merged
42
 
 
 
 
 
 
 
 
43
 
 
 
 
 
44
 
45
 
46
  def merge_and_filter_data(old_df: pd.DataFrame, new_df: pd.DataFrame) -> pd.DataFrame:
 
13
  def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
14
  """
15
  For each id, creates a new row with the longest content and the highest score
16
+ from the available rows with the same id. Adds a boolean column 'updated'
17
+ indicating whether the row was updated.
18
 
19
  Parameters:
20
  - df (pd.DataFrame): The input DataFrame with columns 'id', 'content', and 'score'.
 
22
  Returns:
23
  - pd.DataFrame: A DataFrame with unique ids, where each id is associated
24
  with the longest content available and the highest score from
25
+ potentially different rows, and a boolean column 'updated'.
26
  """
27
 
28
+ # Create a copy of the original DataFrame to avoid modifying it directly
29
+ original_df = df.copy()
30
+
31
  # Create a column for content length
32
  df['content_length'] = df['content'].str.len()
33
 
34
  # Find row with the longest content for each 'id'
35
  idx_longest_content = df.groupby('id')['content_length'].idxmax().values
36
+ df_longest_content = df.loc[idx_longest_content][['id', 'content']]
37
 
38
  # Find row with the highest score for each 'id'
39
  idx_highest_score = df.groupby('id')['score'].idxmax().values
 
42
  # Merge the two DataFrames on 'id'
43
  df_merged = pd.merge(df_longest_content, df_highest_score, on='id')
44
 
 
45
 
46
+ # Check if the content or score was updated for each id
47
+ df_merged = df_merged.merge(original_df, on='id', suffixes=('', '_original'))
48
+ df_merged['updated'] = (df_merged['content'] != df_merged['content_original']) | (
49
+ df_merged['score'] != df_merged['score_original'])
50
+
51
+ # Drop duplicates to keep only the rows with longest content and highest score
52
+ df_merged.drop_duplicates(subset='id', inplace=True)
53
 
54
+ # Drop original content and score columns
55
+ df_merged.drop(columns=['content_original', 'score_original'], inplace=True)
56
+
57
+ return df_merged
58
 
59
 
60
  def merge_and_filter_data(old_df: pd.DataFrame, new_df: pd.DataFrame) -> pd.DataFrame: