Commit
·
e77b07f
1
Parent(s):
ce8a94b
Adding remove filtered rows functionality
Browse files- filter_ids.json +3 -0
- main.py +2 -1
- utilities/user_defined_functions.py +19 -0
filter_ids.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
"qirdpd"
|
3 |
+
]
|
main.py
CHANGED
@@ -6,7 +6,7 @@ import pandas as pd
|
|
6 |
import schedule
|
7 |
from datasets import Dataset
|
8 |
|
9 |
-
from utilities.user_defined_functions import get_latest_data, merge_data, load_or_create_dataset
|
10 |
from utilities.my_logger import setup_logger
|
11 |
from utilities.readme_update import update_dataset_readme
|
12 |
|
@@ -44,6 +44,7 @@ def main():
|
|
44 |
df['new'] = True
|
45 |
df['updated'] = False
|
46 |
new_rows = len(new_df)
|
|
|
47 |
dataset['train'] = Dataset.from_pandas(df, preserve_index=False)
|
48 |
|
49 |
# Update README
|
|
|
6 |
import schedule
|
7 |
from datasets import Dataset
|
8 |
|
9 |
+
from utilities.user_defined_functions import get_latest_data, merge_data, load_or_create_dataset, remove_filtered_rows
|
10 |
from utilities.my_logger import setup_logger
|
11 |
from utilities.readme_update import update_dataset_readme
|
12 |
|
|
|
44 |
df['new'] = True
|
45 |
df['updated'] = False
|
46 |
new_rows = len(new_df)
|
47 |
+
df = remove_filtered_rows(df)
|
48 |
dataset['train'] = Dataset.from_pandas(df, preserve_index=False)
|
49 |
|
50 |
# Update README
|
utilities/user_defined_functions.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
import os
|
2 |
from datetime import datetime
|
|
|
3 |
|
4 |
import pandas as pd
|
5 |
from datasets import Dataset, DatasetDict, load_dataset, DownloadMode
|
@@ -104,6 +105,24 @@ def merge_data(old_df: pd.DataFrame, new_df: pd.DataFrame) -> pd.DataFrame:
|
|
104 |
return df
|
105 |
|
106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
def get_latest_data():
|
108 |
submissions = praw_downloader()
|
109 |
df = preprocess_praw_data(submissions=submissions)
|
|
|
1 |
import os
|
2 |
from datetime import datetime
|
3 |
+
import json
|
4 |
|
5 |
import pandas as pd
|
6 |
from datasets import Dataset, DatasetDict, load_dataset, DownloadMode
|
|
|
105 |
return df
|
106 |
|
107 |
|
108 |
+
def remove_filtered_rows(df: pd.DataFrame) -> pd.DataFrame:
    """
    Removes rows from the DataFrame where the 'id' is present in filter_ids.json.

    The filter list is re-read from 'filter_ids.json' on every call. The path is
    relative, so it is resolved against the current working directory of the process.

    :param df: Input DataFrame to be filtered. Must contain an 'id' column.
    :return: DataFrame with rows containing IDs present in filter_ids.json removed.
    :raises FileNotFoundError: If filter_ids.json cannot be found.
    :raises json.JSONDecodeError: If filter_ids.json does not contain valid JSON.
    :raises KeyError: If the DataFrame has no 'id' column.
    """
    # Load filter IDs from JSON file. JSON is UTF-8 by specification, so pin the
    # encoding rather than depending on the platform's locale default.
    with open('filter_ids.json', 'r', encoding='utf-8') as file:
        filter_ids = json.load(file)

    # Keep only the rows whose 'id' is NOT listed in filter_ids
    filtered_df = df[~df['id'].isin(filter_ids)]
    logger.info(f"Filtered {len(df) - len(filtered_df)} rows from the DataFrame")

    return filtered_df
|
124 |
+
|
125 |
+
|
126 |
def get_latest_data():
|
127 |
submissions = praw_downloader()
|
128 |
df = preprocess_praw_data(submissions=submissions)
|