derek-thomas committed
Commit 2703fdd • Parent: 826ed51

Refactoring to make it clear which functions are user-defined

main.py CHANGED
@@ -4,12 +4,11 @@ from datetime import datetime
 
 import pandas as pd
 import schedule
-from datasets import Dataset, DatasetDict, load_dataset
+from datasets import Dataset, load_dataset
 from huggingface_hub import login
 
-from utilities.data_collator import get_latest_data, merge_data
+from utilities.user_defined_functions import get_latest_data, merge_data
 from utilities.my_logger import setup_logger
-from utilities.praw_downloader import dummy_data
 from utilities.readme_update import update_readme
 
 # Set dataset name, path to README.md, and existing dataset details
@@ -29,40 +28,6 @@ login(auth_token, add_to_git_credential=True)
 logger = setup_logger(__name__)
 
 
-def load_or_create_dataset():
-    """
-    Loads an existing dataset from the Hugging Face hub or creates a new one if it doesn't exist.
-
-    This function attempts to load a dataset specified by 'dataset_name'. If the dataset is not found,
-    it creates a new dataset with 'dummy_data', pushes it to the Hugging Face hub, and then reloads it.
-    After reloading, the dummy data is removed from the dataset.
-
-    Returns:
-        dataset (DatasetDict): The loaded or newly created dataset.
-
-    Raises:
-        FileNotFoundError: If the dataset cannot be loaded or created.
-    """
-    # Load the existing dataset from the Hugging Face hub or create a new one
-    try:
-        dataset = load_dataset(dataset_name)
-        logger.debug("Loading existing dataset")
-    except FileNotFoundError:
-        logger.warning("Creating new dataset")
-
-        # Creating Initial Repo
-        dataset = DatasetDict()
-        dataset['train'] = Dataset.from_dict(dummy_data)
-        dataset.push_to_hub(repo_id=dataset_name, token=auth_token)
-
-        # Pulling from Initial Repo
-        dataset = load_dataset(dataset_name)
-
-        # Remove dummy data
-        del dataset['train']
-    return dataset
-
-
 def main():
     date = datetime.now().strftime('%Y-%m-%d')
     logger.warning(f"Running main function for date: {date}")
utilities/{data_collator.py → data_processing.py} RENAMED
@@ -1,16 +1,7 @@
 import pandas as pd
 
-from utilities.praw_downloader import praw_downloader
-from utilities.praw_processor import preprocess_praw_data
 
-
-def get_latest_data():
-    submissions = praw_downloader()
-    df = preprocess_praw_data(submissions=submissions)
-    return df
-
-
-def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
+def data_processing(df: pd.DataFrame) -> pd.DataFrame:
     """
     For each id, creates a new row with the longest content and the highest score
     from the available rows with the same id. Adds a boolean column 'updated'
@@ -56,38 +47,6 @@ def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
     return df_merged
 
 
-def merge_data(old_df: pd.DataFrame, new_df: pd.DataFrame) -> pd.DataFrame:
-    """
-    Merges two dataframes, sorts them by 'date_utc', and marks new IDs.
-
-    The function first marks rows from the new dataframe, then concatenates the old and new dataframes.
-    It sorts the resulting dataframe by the 'date_utc' column. Rows from the new dataframe that are not
-    in the old dataframe are marked as 'new'.
-
-    Args:
-    - old_df (pd.DataFrame): The original dataframe.
-    - new_df (pd.DataFrame): The new dataframe to be merged with the original dataframe.
-
-    Returns:
-    - pd.DataFrame: The merged, sorted, and marked dataframe.
-    """
-
-    # Mark rows in old and new dataframes
-    old_df['new'] = False
-    new_df['new'] = True
-
-    # Concatenate old and new dataframes, sort by 'date_utc', and reset index
-    df = pd.concat([old_df, new_df], ignore_index=True).sort_values(by='date_utc').reset_index(drop=True)
-
-    # Optional: If you have a function to filter redundant IDs, you can use it here
-    df = filter_redundant_ids(df)
-
-    # Identify new rows (present in new_df but not in old_df)
-    df['new'] = df['new'] & ~df['id'].duplicated(keep=False)
-
-    return df
-
-
 if __name__ == '__main__':
     # Mock data
     data = {
@@ -102,4 +61,4 @@ if __name__ == '__main__':
     print("Original DataFrame:")
     print(df)
     print("\nFiltered DataFrame:")
-    print(filter_redundant_ids(df))
+    print(data_processing(df))
utilities/praw_downloader.py CHANGED
@@ -13,20 +13,6 @@ logger = setup_logger(__name__)
 subreddit_var = os.getenv("SUBREDDIT")
 reddit_pull_limit = int(os.getenv("REDDIT_PULL_LIMIT"))
 
-# Dummy row for when we create a new repo
-dummy_data = {
-    "id": ['id'],
-    "content": ["This is a sample post content. Just for demonstration purposes!"],
-    "poster": ["sampleUser123"],
-    "date_utc": [datetime.strptime("2023-10-26 14:30:45", '%Y-%m-%d %H:%M:%S')],
-    "flair": ["Discussion"],
-    "title": ["Sample Post Title: How to Use Hugging Face?"],
-    "score": [457],
-    "permalink": ["/r/sampleSubreddit/comments/sampleID/sample_post_title_how_to_use_hugging_face/"],
-    "updated": False,
-    "new": False,
-}
-
 
 def get_reddit_instance() -> praw.Reddit:
     """Initialize and return a Reddit instance using PRAW."""
utilities/user_defined_functions.py ADDED
@@ -0,0 +1,95 @@
+from datetime import datetime
+
+import pandas as pd
+from datasets import Dataset, DatasetDict, load_dataset
+
+from main import auth_token, dataset_name, logger
+from utilities.data_processing import data_processing
+from utilities.praw_downloader import praw_downloader
+from utilities.praw_processor import preprocess_praw_data
+
+# Dummy row for when we create a new repo
+dummy_data = {
+    "id": ['id'],
+    "content": ["This is a sample post content. Just for demonstration purposes!"],
+    "poster": ["sampleUser123"],
+    "date_utc": [datetime.strptime("2023-10-26 14:30:45", '%Y-%m-%d %H:%M:%S')],
+    "flair": ["Discussion"],
+    "title": ["Sample Post Title: How to Use Hugging Face?"],
+    "score": [457],
+    "permalink": ["/r/sampleSubreddit/comments/sampleID/sample_post_title_how_to_use_hugging_face/"],
+    "updated": False,
+    "new": False,
+}
+
+
+def load_or_create_dataset():
+    """
+    Loads an existing dataset from the Hugging Face hub or creates a new one if it doesn't exist.
+
+    This function attempts to load a dataset specified by 'dataset_name'. If the dataset is not found,
+    it creates a new dataset with 'dummy_data', pushes it to the Hugging Face hub, and then reloads it.
+    After reloading, the dummy data is removed from the dataset.
+
+    Returns:
+        dataset (DatasetDict): The loaded or newly created dataset.
+
+    Raises:
+        FileNotFoundError: If the dataset cannot be loaded or created.
+    """
+    # Load the existing dataset from the Hugging Face hub or create a new one
+    try:
+        dataset = load_dataset(dataset_name)
+        logger.debug("Loading existing dataset")
+    except FileNotFoundError:
+        logger.warning("Creating new dataset")
+
+        # Creating Initial Repo
+        dataset = DatasetDict()
+        dataset['train'] = Dataset.from_dict(dummy_data)
+        dataset.push_to_hub(repo_id=dataset_name, token=auth_token)
+
+        # Pulling from Initial Repo
+        dataset = load_dataset(dataset_name)
+
+        # Remove dummy data
+        del dataset['train']
+    return dataset
+
+
+def merge_data(old_df: pd.DataFrame, new_df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Merges two dataframes, sorts them by 'date_utc', and marks new IDs.
+
+    The function first marks rows from the new dataframe, then concatenates the old and new dataframes.
+    It sorts the resulting dataframe by the 'date_utc' column. Rows from the new dataframe that are not
+    in the old dataframe are marked as 'new'.
+
+    Args:
+    - old_df (pd.DataFrame): The original dataframe.
+    - new_df (pd.DataFrame): The new dataframe to be merged with the original dataframe.
+
+    Returns:
+    - pd.DataFrame: The merged, sorted, and marked dataframe.
+    """
+
+    # Mark rows in old and new dataframes
+    old_df['new'] = False
+    new_df['new'] = True
+
+    # Concatenate old and new dataframes, sort by 'date_utc', and reset index
+    df = pd.concat([old_df, new_df], ignore_index=True).sort_values(by='date_utc').reset_index(drop=True)
+
+    # Process data accordingly
+    df = data_processing(df)
+
+    # Identify new rows (present in new_df but not in old_df)
+    df['new'] = df['new'] & ~df['id'].duplicated(keep=False)
+
+    return df
+
+
+def get_latest_data():
+    submissions = praw_downloader()
+    df = preprocess_praw_data(submissions=submissions)
+    return df
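As a hedged usage example of the relocated merge_data, the snippet below reuses the module's own dummy_data so the column schema matches what data_processing expects. The second submission and its values are invented for illustration, and it assumes the environment variables and Hugging Face credentials required by main.py are configured, since importing this module also imports main.

# Illustration only -- the second submission and its values are made up.
import pandas as pd

from utilities.user_defined_functions import dummy_data, merge_data

old_df = pd.DataFrame(dummy_data)

new_row = dict(dummy_data)
new_row["id"] = ["id2"]                        # a previously unseen submission
new_row["title"] = ["Another Sample Post"]
new_df = pd.DataFrame(new_row)

merged = merge_data(old_df=old_df.copy(), new_df=new_df.copy())
print(merged[["id", "new"]])                   # only "id2" should be marked as new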