derek-thomas HF staff committed on
Commit
04fde16
1 Parent(s): 1756d68

Updating "updated" to exclude score updates

Browse files
Files changed (1) hide show
  1. utilities/data_processing.py +8 -7
utilities/data_processing.py CHANGED
@@ -23,27 +23,28 @@ def data_processing(df: pd.DataFrame) -> pd.DataFrame:
23
  df['content_length'] = df['content'].str.len()
24
 
25
  # Find row with the longest content for each 'id'
26
- idx_longest_content = df.groupby('id')['content_length'].idxmax().values
27
  df_longest_content = df.loc[idx_longest_content][['id', 'content']]
28
 
29
  # Find row with the highest score for each 'id'
30
- idx_highest_score = df.groupby('id')['score'].idxmax().values
31
  df_highest_score = df.loc[idx_highest_score][['id', 'score']]
32
 
33
  # Merge the two DataFrames on 'id'
34
  df_merged = pd.merge(df_longest_content, df_highest_score, on='id')
35
 
36
- # Check if the content or score was updated for each id
37
  df_merged = df_merged.merge(original_df, on='id', suffixes=('', '_original'))
38
- df_merged['updated'] = (df_merged['content'] != df_merged['content_original']) | (
39
- df_merged['score'] != df_merged['score_original'])
40
 
41
- # Drop duplicates to keep only the rows with longest content and highest score
42
- df_merged.drop_duplicates(subset='id', inplace=True)
43
 
44
  # Drop original content and score columns
45
  df_merged.drop(columns=['content_original', 'score_original'], inplace=True)
46
 
 
 
 
47
  return df_merged
48
 
49
 
 
23
  df['content_length'] = df['content'].str.len()
24
 
25
  # Find row with the longest content for each 'id'
26
+ idx_longest_content = df.groupby('id')['content_length'].idxmax()
27
  df_longest_content = df.loc[idx_longest_content][['id', 'content']]
28
 
29
  # Find row with the highest score for each 'id'
30
+ idx_highest_score = df.groupby('id')['score'].idxmax()
31
  df_highest_score = df.loc[idx_highest_score][['id', 'score']]
32
 
33
  # Merge the two DataFrames on 'id'
34
  df_merged = pd.merge(df_longest_content, df_highest_score, on='id')
35
 
36
+ # Merge with original DataFrame to compare content and score
37
  df_merged = df_merged.merge(original_df, on='id', suffixes=('', '_original'))
 
 
38
 
39
+ # Check if the content was updated for each id (score changes are ignored)
40
+ df_merged['updated'] = (df_merged['content'] != df_merged['content_original'])
41
 
42
  # Drop original content and score columns
43
  df_merged.drop(columns=['content_original', 'score_original'], inplace=True)
44
 
45
+ # Drop duplicates to keep only the rows with longest content and highest score
46
+ df_merged.drop_duplicates(subset='id', inplace=True)
47
+
48
  return df_merged
49
 
50