derek-thomas HF staff committed on
Commit
61f9cd0
1 Parent(s): 04fde16

Refactoring updates and minor fixes

Browse files
Files changed (2) hide show
  1. main.py +3 -5
  2. utilities/user_defined_functions.py +6 -8
main.py CHANGED
@@ -4,10 +4,9 @@ from datetime import datetime
4
 
5
  import pandas as pd
6
  import schedule
7
- from datasets import Dataset, load_dataset
8
- from huggingface_hub import login
9
 
10
- from utilities.user_defined_functions import get_latest_data, merge_data
11
  from utilities.my_logger import setup_logger
12
  from utilities.readme_update import update_readme
13
 
@@ -30,9 +29,8 @@ logger = setup_logger(__name__)
30
  def main():
31
  date = datetime.now().strftime('%Y-%m-%d')
32
  logger.warning(f"Running main function for date: {date}")
33
- dataset = load_dataset()
34
 
35
- # Get Latest Data and merge with historic data
36
  new_df = get_latest_data()
37
 
38
  # Using dataset from hub
 
4
 
5
  import pandas as pd
6
  import schedule
7
+ from datasets import Dataset
 
8
 
9
+ from utilities.user_defined_functions import get_latest_data, merge_data, load_or_create_dataset
10
  from utilities.my_logger import setup_logger
11
  from utilities.readme_update import update_readme
12
 
 
29
  def main():
30
  date = datetime.now().strftime('%Y-%m-%d')
31
  logger.warning(f"Running main function for date: {date}")
32
+ dataset = load_or_create_dataset()
33
 
 
34
  new_df = get_latest_data()
35
 
36
  # Using dataset from hub
utilities/user_defined_functions.py CHANGED
@@ -24,7 +24,8 @@ auth_token = os.environ["HUGGINGFACE_AUTH_TOKEN"]
24
  login(auth_token, add_to_git_credential=True)
25
 
26
  logger = setup_logger(__name__)
27
- # Dummy row for when we create a new repo
 
28
  dummy_data = {
29
  "id": ['id'],
30
  "content": ["This is a sample post content. Just for demonstration purposes!"],
@@ -34,8 +35,8 @@ dummy_data = {
34
  "title": ["Sample Post Title: How to Use Hugging Face?"],
35
  "score": [457],
36
  "permalink": ["/r/sampleSubreddit/comments/sampleID/sample_post_title_how_to_use_hugging_face/"],
37
- "updated": False,
38
- "new": False,
39
  }
40
 
41
 
@@ -89,10 +90,7 @@ def merge_data(old_df: pd.DataFrame, new_df: pd.DataFrame) -> pd.DataFrame:
89
  - pd.DataFrame: The merged, sorted, and marked dataframe.
90
  """
91
 
92
- # Mark rows in old and new dataframes
93
- old_df['new'] = False
94
- new_df['new'] = True
95
-
96
  # Concatenate old and new dataframes, sort by 'date_utc', and reset index
97
  df = pd.concat([old_df, new_df], ignore_index=True).sort_values(by='date_utc').reset_index(drop=True)
98
 
@@ -100,7 +98,7 @@ def merge_data(old_df: pd.DataFrame, new_df: pd.DataFrame) -> pd.DataFrame:
100
  df = data_processing(df)
101
 
102
  # Identify new rows (present in new_df but not in old_df)
103
- df['new'] = df['new'] & ~df['id'].duplicated(keep=False)
104
 
105
  return df
106
 
 
24
  login(auth_token, add_to_git_credential=True)
25
 
26
  logger = setup_logger(__name__)
27
+
28
+ # Dummy row for when we create a new repo; make sure every value is wrapped in a list
29
  dummy_data = {
30
  "id": ['id'],
31
  "content": ["This is a sample post content. Just for demonstration purposes!"],
 
35
  "title": ["Sample Post Title: How to Use Hugging Face?"],
36
  "score": [457],
37
  "permalink": ["/r/sampleSubreddit/comments/sampleID/sample_post_title_how_to_use_hugging_face/"],
38
+ "updated": [False],
39
+ "new": [False],
40
  }
41
 
42
 
 
90
  - pd.DataFrame: The merged, sorted, and marked dataframe.
91
  """
92
 
93
+ old_df.drop(columns=['new', 'updated'], inplace=True)
 
 
 
94
  # Concatenate old and new dataframes, sort by 'date_utc', and reset index
95
  df = pd.concat([old_df, new_df], ignore_index=True).sort_values(by='date_utc').reset_index(drop=True)
96
 
 
98
  df = data_processing(df)
99
 
100
  # Identify new rows (present in new_df but not in old_df)
101
+ df['new'] = df['id'].apply(lambda x: x in set(new_df['id']) - set(old_df['id']))
102
 
103
  return df
104