zhoudanxie
committed on
Commit
•
4ea5901
1
Parent(s):
16160fe
Fix fr_tracking data errors
Browse files
Drop duplicates in fr_tracking data if any
- modules/significant.py +7 -0
modules/significant.py
CHANGED
@@ -48,6 +48,13 @@ def read_csv_data(
|
|
48 |
except UnicodeDecodeError:
|
49 |
df_pd = pd_read_csv(url, usecols=cols, encoding="latin")
|
50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
df_pd.loc[:, "publication_dt"] = to_datetime(df_pd["publication_date"], format="mixed", dayfirst=False, yearfirst=False)
|
52 |
max_date = max(df_pd.loc[:, "publication_dt"].to_list()).date()
|
53 |
#print(max_date)
|
|
|
48 |
except UnicodeDecodeError:
|
49 |
df_pd = pd_read_csv(url, usecols=cols, encoding="latin")
|
50 |
|
51 |
+
# drop duplicates if any (to avoid a dashboard crash; the original data needs to be revised)
|
52 |
+
if len(df_pd[df_pd.duplicated(subset=['document_number'],keep=False)])>0:
|
53 |
+
df_pd=df_pd.sort_values(['document_number','publication_date','significant','3(f)(1) significant','Major']).\
|
54 |
+
drop_duplicates(subset=['document_number'],keep='last',ignore_index=True)
|
55 |
+
else:
|
56 |
+
pass
|
57 |
+
|
58 |
df_pd.loc[:, "publication_dt"] = to_datetime(df_pd["publication_date"], format="mixed", dayfirst=False, yearfirst=False)
|
59 |
max_date = max(df_pd.loc[:, "publication_dt"].to_list()).date()
|
60 |
#print(max_date)
|