derek-thomas HF staff committed on
Commit
bcf2055
1 Parent(s): 76a52b4

Adding code to show new rows

Browse files
main.py CHANGED
@@ -7,7 +7,7 @@ import schedule
7
  from datasets import Dataset, DatasetDict, load_dataset
8
  from huggingface_hub import login
9
 
10
- from utilities.data_collator import get_latest_data, merge_and_filter_data
11
  from utilities.my_logger import setup_logger
12
  from utilities.praw_downloader import dummy_data
13
  from utilities.readme_update import update_readme
@@ -57,10 +57,13 @@ def main():
57
 
58
  # Get Latest Data and merge with historic data
59
  new_df = get_latest_data()
 
 
60
  if 'train' in dataset.keys():
61
  old_df = dataset['train'].to_pandas() if 'train' in dataset.keys() else pd.DataFrame()
62
- df = merge_and_filter_data(old_df=old_df, new_df=new_df)
63
  new_rows = len(df) - len(old_df)
 
64
  else:
65
  df = new_df
66
  new_rows = len(new_df)
@@ -80,6 +83,7 @@ def schedule_periodic_task():
80
  """
81
  Schedule the main task to run at the user-defined frequency
82
  """
 
83
  if frequency == 'hourly':
84
  logger.info(f'Scheduling tasks to run every hour at the top of the hour')
85
  schedule.every().hour.at(":00").do(main)
 
7
  from datasets import Dataset, DatasetDict, load_dataset
8
  from huggingface_hub import login
9
 
10
+ from utilities.data_collator import get_latest_data, merge_data
11
  from utilities.my_logger import setup_logger
12
  from utilities.praw_downloader import dummy_data
13
  from utilities.readme_update import update_readme
 
57
 
58
  # Get Latest Data and merge with historic data
59
  new_df = get_latest_data()
60
+
61
+ # Using dataset from hub
62
  if 'train' in dataset.keys():
63
  old_df = dataset['train'].to_pandas() if 'train' in dataset.keys() else pd.DataFrame()
64
+ df = merge_data(old_df=old_df, new_df=new_df)
65
  new_rows = len(df) - len(old_df)
66
+ # New dataset
67
  else:
68
  df = new_df
69
  new_rows = len(new_df)
 
83
  """
84
  Schedule the main task to run at the user-defined frequency
85
  """
86
+ main()
87
  if frequency == 'hourly':
88
  logger.info(f'Scheduling tasks to run every hour at the top of the hour')
89
  schedule.every().hour.at(":00").do(main)
utilities/data_collator.py CHANGED
@@ -42,7 +42,6 @@ def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
42
  # Merge the two DataFrames on 'id'
43
  df_merged = pd.merge(df_longest_content, df_highest_score, on='id')
44
 
45
-
46
  # Check if the content or score was updated for each id
47
  df_merged = df_merged.merge(original_df, on='id', suffixes=('', '_original'))
48
  df_merged['updated'] = (df_merged['content'] != df_merged['content_original']) | (
@@ -57,27 +56,35 @@ def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
57
  return df_merged
58
 
59
 
60
- def merge_and_filter_data(old_df: pd.DataFrame, new_df: pd.DataFrame) -> pd.DataFrame:
61
  """
62
- Merges two dataframes, sorts them by 'date_utc', and filters out redundant IDs.
63
 
64
- The function first concatenates the old and new dataframes. Then, it sorts the
65
- resulting dataframe by the 'date_utc' column. Finally, it filters out redundant IDs
66
- using the `filter_redundant_ids` function.
67
 
68
  Args:
69
  - old_df (pd.DataFrame): The original dataframe.
70
  - new_df (pd.DataFrame): The new dataframe to be merged with the original dataframe.
71
 
72
  Returns:
73
- - pd.DataFrame: The merged, sorted, and filtered dataframe.
74
  """
75
 
 
 
 
 
76
  # Concatenate old and new dataframes, sort by 'date_utc', and reset index
77
  df = pd.concat([old_df, new_df], ignore_index=True).sort_values(by='date_utc').reset_index(drop=True)
78
 
79
- # Filter out redundant IDs
80
  df = filter_redundant_ids(df)
 
 
 
 
81
  return df
82
 
83
 
 
42
  # Merge the two DataFrames on 'id'
43
  df_merged = pd.merge(df_longest_content, df_highest_score, on='id')
44
 
 
45
  # Check if the content or score was updated for each id
46
  df_merged = df_merged.merge(original_df, on='id', suffixes=('', '_original'))
47
  df_merged['updated'] = (df_merged['content'] != df_merged['content_original']) | (
 
56
  return df_merged
57
 
58
 
59
def merge_data(old_df: pd.DataFrame, new_df: pd.DataFrame) -> pd.DataFrame:
    """
    Merges two dataframes, sorts them by 'date_utc', and marks new IDs.

    Rows originating from `new_df` whose 'id' does not also appear in
    `old_df` end up with `new == True`; all other rows get `new == False`.

    Args:
    - old_df (pd.DataFrame): The original dataframe.
    - new_df (pd.DataFrame): The new dataframe to be merged with the original dataframe.

    Returns:
    - pd.DataFrame: The merged, sorted, and marked dataframe.
    """
    # Mark rows in old and new dataframes. Use .assign() so we work on
    # copies — the previous version wrote the 'new' column into the
    # caller's dataframes in place, an unexpected side effect.
    old_df = old_df.assign(new=False)
    new_df = new_df.assign(new=True)

    # Concatenate old and new dataframes, sort by 'date_utc', and reset index
    df = pd.concat([old_df, new_df], ignore_index=True).sort_values(by='date_utc').reset_index(drop=True)

    # Collapse rows that share an 'id' (helper defined elsewhere in this module).
    df = filter_redundant_ids(df)

    # Identify new rows: flagged as new AND whose id occurs exactly once
    # in the combined frame (an id present in both old and new is not new).
    # NOTE(review): filter_redundant_ids appears to leave one row per id,
    # which would make this duplicated() mask all-False; if the intent was
    # to detect ids shared between old_df and new_df, the mask should be
    # computed BEFORE filtering — confirm against callers.
    df['new'] = df['new'] & ~df['id'].duplicated(keep=False)

    return df
89
 
90
 
utilities/praw_downloader.py CHANGED
@@ -15,6 +15,7 @@ reddit_pull_limit = int(os.getenv("REDDIT_PULL_LIMIT"))
15
 
16
  # Dummy row for when we create a new repo
17
  dummy_data = {
 
18
  "content": ["This is a sample post content. Just for demonstration purposes!"],
19
  "poster": ["sampleUser123"],
20
  "date_utc": [datetime.strptime("2023-10-26 14:30:45", '%Y-%m-%d %H:%M:%S')],
@@ -22,7 +23,8 @@ dummy_data = {
22
  "title": ["Sample Post Title: How to Use Hugging Face?"],
23
  "score": [457],
24
  "permalink": ["/r/sampleSubreddit/comments/sampleID/sample_post_title_how_to_use_hugging_face/"],
25
- "id": ['id']
 
26
  }
27
 
28
 
 
15
 
16
  # Dummy row for when we create a new repo
17
  dummy_data = {
18
+ "id": ['id'],
19
  "content": ["This is a sample post content. Just for demonstration purposes!"],
20
  "poster": ["sampleUser123"],
21
  "date_utc": [datetime.strptime("2023-10-26 14:30:45", '%Y-%m-%d %H:%M:%S')],
 
23
  "title": ["Sample Post Title: How to Use Hugging Face?"],
24
  "score": [457],
25
  "permalink": ["/r/sampleSubreddit/comments/sampleID/sample_post_title_how_to_use_hugging_face/"],
26
+ "updated": False,
27
+ "new": False,
28
  }
29
 
30