derek-thomas HF staff committed on
Commit
e77b07f
·
1 Parent(s): ce8a94b

Adding remove filtered rows functionality

Browse files
filter_ids.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [
2
+ "qirdpd"
3
+ ]
main.py CHANGED
@@ -6,7 +6,7 @@ import pandas as pd
6
  import schedule
7
  from datasets import Dataset
8
 
9
- from utilities.user_defined_functions import get_latest_data, merge_data, load_or_create_dataset
10
  from utilities.my_logger import setup_logger
11
  from utilities.readme_update import update_dataset_readme
12
 
@@ -44,6 +44,7 @@ def main():
44
  df['new'] = True
45
  df['updated'] = False
46
  new_rows = len(new_df)
 
47
  dataset['train'] = Dataset.from_pandas(df, preserve_index=False)
48
 
49
  # Update README
 
6
  import schedule
7
  from datasets import Dataset
8
 
9
+ from utilities.user_defined_functions import get_latest_data, merge_data, load_or_create_dataset, remove_filtered_rows
10
  from utilities.my_logger import setup_logger
11
  from utilities.readme_update import update_dataset_readme
12
 
 
44
  df['new'] = True
45
  df['updated'] = False
46
  new_rows = len(new_df)
47
+ df = remove_filtered_rows(df)
48
  dataset['train'] = Dataset.from_pandas(df, preserve_index=False)
49
 
50
  # Update README
utilities/user_defined_functions.py CHANGED
@@ -1,5 +1,6 @@
1
  import os
2
  from datetime import datetime
 
3
 
4
  import pandas as pd
5
  from datasets import Dataset, DatasetDict, load_dataset, DownloadMode
@@ -104,6 +105,24 @@ def merge_data(old_df: pd.DataFrame, new_df: pd.DataFrame) -> pd.DataFrame:
104
  return df
105
 
106
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  def get_latest_data():
108
  submissions = praw_downloader()
109
  df = preprocess_praw_data(submissions=submissions)
 
1
  import os
2
  from datetime import datetime
3
+ import json
4
 
5
  import pandas as pd
6
  from datasets import Dataset, DatasetDict, load_dataset, DownloadMode
 
105
  return df
106
 
107
 
108
def remove_filtered_rows(df: pd.DataFrame) -> pd.DataFrame:
    """
    Removes rows from the DataFrame where the 'id' is present in filter_ids.json.

    The filter file is opened through the relative path 'filter_ids.json', so it
    is resolved against the current working directory of the running process.

    :param df: Input DataFrame to be filtered. Must contain an 'id' column.
    :return: DataFrame with rows containing IDs present in filter_ids.json removed.
    :raises FileNotFoundError: If filter_ids.json does not exist in the working directory.
    :raises json.JSONDecodeError: If filter_ids.json does not contain valid JSON.
    :raises KeyError: If the DataFrame has no 'id' column.
    """
    # Load filter IDs from JSON file. The encoding is pinned to UTF-8 so the
    # file is decoded identically on every platform instead of depending on
    # the locale's default encoding.
    with open('filter_ids.json', 'r', encoding='utf-8') as file:
        filter_ids = json.load(file)

    # Remove the rows with IDs present in filter_ids
    filtered_df = df[~df['id'].isin(filter_ids)]
    logger.info(f"Filtered {len(df) - len(filtered_df)} rows from the DataFrame")

    return filtered_df
124
+
125
+
126
  def get_latest_data():
127
  submissions = praw_downloader()
128
  df = preprocess_praw_data(submissions=submissions)