derek-thomas HF staff committed on
Commit
bcf2055
1 Parent(s): 76a52b4

Adding code to show new rows

Browse files
main.py CHANGED
@@ -7,7 +7,7 @@ import schedule
7
  from datasets import Dataset, DatasetDict, load_dataset
8
  from huggingface_hub import login
9
 
10
- from utilities.data_collator import get_latest_data, merge_and_filter_data
11
  from utilities.my_logger import setup_logger
12
  from utilities.praw_downloader import dummy_data
13
  from utilities.readme_update import update_readme
@@ -57,10 +57,13 @@ def main():
57
 
58
  # Get Latest Data and merge with historic data
59
  new_df = get_latest_data()
 
 
60
  if 'train' in dataset.keys():
61
  old_df = dataset['train'].to_pandas() if 'train' in dataset.keys() else pd.DataFrame()
62
- df = merge_and_filter_data(old_df=old_df, new_df=new_df)
63
  new_rows = len(df) - len(old_df)
 
64
  else:
65
  df = new_df
66
  new_rows = len(new_df)
@@ -80,6 +83,7 @@ def schedule_periodic_task():
80
  """
81
  Schedule the main task to run at the user-defined frequency
82
  """
 
83
  if frequency == 'hourly':
84
  logger.info(f'Scheduling tasks to run every hour at the top of the hour')
85
  schedule.every().hour.at(":00").do(main)
 
7
  from datasets import Dataset, DatasetDict, load_dataset
8
  from huggingface_hub import login
9
 
10
+ from utilities.data_collator import get_latest_data, merge_data
11
  from utilities.my_logger import setup_logger
12
  from utilities.praw_downloader import dummy_data
13
  from utilities.readme_update import update_readme
 
57
 
58
  # Get Latest Data and merge with historic data
59
  new_df = get_latest_data()
60
+
61
+ # Using dataset from hub
62
  if 'train' in dataset.keys():
63
  old_df = dataset['train'].to_pandas() if 'train' in dataset.keys() else pd.DataFrame()
64
+ df = merge_data(old_df=old_df, new_df=new_df)
65
  new_rows = len(df) - len(old_df)
66
+ # New dataset
67
  else:
68
  df = new_df
69
  new_rows = len(new_df)
 
83
  """
84
  Schedule the main task to run at the user-defined frequency
85
  """
86
+ main()
87
  if frequency == 'hourly':
88
  logger.info(f'Scheduling tasks to run every hour at the top of the hour')
89
  schedule.every().hour.at(":00").do(main)
utilities/data_collator.py CHANGED
@@ -42,7 +42,6 @@ def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
42
  # Merge the two DataFrames on 'id'
43
  df_merged = pd.merge(df_longest_content, df_highest_score, on='id')
44
 
45
-
46
  # Check if the content or score was updated for each id
47
  df_merged = df_merged.merge(original_df, on='id', suffixes=('', '_original'))
48
  df_merged['updated'] = (df_merged['content'] != df_merged['content_original']) | (
@@ -57,27 +56,35 @@ def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
57
  return df_merged
58
 
59
 
60
- def merge_and_filter_data(old_df: pd.DataFrame, new_df: pd.DataFrame) -> pd.DataFrame:
61
  """
62
- Merges two dataframes, sorts them by 'date_utc', and filters out redundant IDs.
63
 
64
- The function first concatenates the old and new dataframes. Then, it sorts the
65
- resulting dataframe by the 'date_utc' column. Finally, it filters out redundant IDs
66
- using the `filter_redundant_ids` function.
67
 
68
  Args:
69
  - old_df (pd.DataFrame): The original dataframe.
70
  - new_df (pd.DataFrame): The new dataframe to be merged with the original dataframe.
71
 
72
  Returns:
73
- - pd.DataFrame: The merged, sorted, and filtered dataframe.
74
  """
75
 
 
 
 
 
76
  # Concatenate old and new dataframes, sort by 'date_utc', and reset index
77
  df = pd.concat([old_df, new_df], ignore_index=True).sort_values(by='date_utc').reset_index(drop=True)
78
 
79
- # Filter out redundant IDs
80
  df = filter_redundant_ids(df)
 
 
 
 
81
  return df
82
 
83
 
 
42
  # Merge the two DataFrames on 'id'
43
  df_merged = pd.merge(df_longest_content, df_highest_score, on='id')
44
 
 
45
  # Check if the content or score was updated for each id
46
  df_merged = df_merged.merge(original_df, on='id', suffixes=('', '_original'))
47
  df_merged['updated'] = (df_merged['content'] != df_merged['content_original']) | (
 
56
  return df_merged
57
 
58
 
59
def merge_data(old_df: pd.DataFrame, new_df: pd.DataFrame) -> pd.DataFrame:
    """
    Merges two dataframes, sorts them by 'date_utc', and marks new IDs.

    Rows originating from `new_df` whose 'id' does not also appear in
    `old_df` end up with `new == True`; all other rows get `new == False`.

    Args:
    - old_df (pd.DataFrame): The original dataframe.
    - new_df (pd.DataFrame): The new dataframe to be merged with the original dataframe.

    Returns:
    - pd.DataFrame: The merged, sorted, and marked dataframe.
    """
    # Mark rows in old and new dataframes. Use .assign() so we work on
    # copies — the previous version wrote the 'new' column into the
    # caller's dataframes in place, an unexpected side effect.
    old_df = old_df.assign(new=False)
    new_df = new_df.assign(new=True)

    # Concatenate old and new dataframes, sort by 'date_utc', and reset index
    df = pd.concat([old_df, new_df], ignore_index=True).sort_values(by='date_utc').reset_index(drop=True)

    # Collapse rows that share an 'id' (helper defined elsewhere in this module).
    df = filter_redundant_ids(df)

    # Identify new rows: flagged as new AND whose id occurs exactly once
    # in the combined frame (an id present in both old and new is not new).
    # NOTE(review): filter_redundant_ids appears to leave one row per id,
    # which would make this duplicated() mask all-False; if the intent was
    # to detect ids shared between old_df and new_df, the mask should be
    # computed BEFORE filtering — confirm against callers.
    df['new'] = df['new'] & ~df['id'].duplicated(keep=False)

    return df
89
 
90
 
utilities/praw_downloader.py CHANGED
@@ -15,6 +15,7 @@ reddit_pull_limit = int(os.getenv("REDDIT_PULL_LIMIT"))
15
 
16
  # Dummy row for when we create a new repo
17
  dummy_data = {
 
18
  "content": ["This is a sample post content. Just for demonstration purposes!"],
19
  "poster": ["sampleUser123"],
20
  "date_utc": [datetime.strptime("2023-10-26 14:30:45", '%Y-%m-%d %H:%M:%S')],
@@ -22,7 +23,8 @@ dummy_data = {
22
  "title": ["Sample Post Title: How to Use Hugging Face?"],
23
  "score": [457],
24
  "permalink": ["/r/sampleSubreddit/comments/sampleID/sample_post_title_how_to_use_hugging_face/"],
25
- "id": ['id']
 
26
  }
27
 
28
 
 
15
 
16
  # Dummy row for when we create a new repo
17
  dummy_data = {
18
+ "id": ['id'],
19
  "content": ["This is a sample post content. Just for demonstration purposes!"],
20
  "poster": ["sampleUser123"],
21
  "date_utc": [datetime.strptime("2023-10-26 14:30:45", '%Y-%m-%d %H:%M:%S')],
 
23
  "title": ["Sample Post Title: How to Use Hugging Face?"],
24
  "score": [457],
25
  "permalink": ["/r/sampleSubreddit/comments/sampleID/sample_post_title_how_to_use_hugging_face/"],
26
+ "updated": False,
27
+ "new": False,
28
  }
29
 
30