Upload standardize-us-states.py
Browse files- standardize-us-states.py +43 -0
standardize-us-states.py
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import re
|
3 |
+
|
4 |
+
# Full state name -> two-letter postal abbreviation (50 states plus
# Washington DC). get_state() iterates this mapping, so the insertion
# order below is kept exactly as it was.
state_abbreviations = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virginia': 'VA',
    'Washington DC': 'DC',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY',
}
|
17 |
+
|
18 |
+
# Load the source CSV, then drop repeated rows: two rows count as
# duplicates when both their 'Username' and 'Content' values are equal.
df = pd.read_csv('data/2019-climate-all.csv')
df = df.drop_duplicates(subset=['Username', 'Content'])
|
22 |
+
|
23 |
+
def get_state(location):
    """Return the US state named in a free-text location string.

    Matching is done on whole words, in this order of precedence:

    1. Washington DC ("Washington DC", "DC" or "D.C", any case).
    2. A full state name, any case, longest names first so that
       "West Virginia" is not reported as "Virginia".
    3. A two-letter abbreviation written in upper case ("TX").
    4. A two-letter abbreviation in any case ("tx"), kept as a last
       resort so every location that matched before still matches.

    Args:
        location: Raw location text; non-string values (e.g. NaN from
            pandas) are treated as "no location".

    Returns:
        The matching key of ``state_abbreviations`` (a full state name, or
        'Washington DC'), or None when nothing matches.
    """
    if not isinstance(location, str):
        return None

    # check for DC first, so "Washington DC" is never reported as the state
    if re.search(r'\b(Washington DC|DC|D\.C)\b', location, re.IGNORECASE):
        return 'Washington DC'

    # Full names win over abbreviations: several abbreviations are ordinary
    # words ("in", "or", "me", "hi", "ok"), so "Somewhere in Texas" must
    # resolve to Texas rather than Indiana. Longest names are tried first so
    # a name that contains another ("West Virginia") is matched as a whole.
    for state in sorted(state_abbreviations, key=len, reverse=True):
        if re.search(rf'\b{re.escape(state)}\b', location, re.IGNORECASE):
            return state

    # An upper-case abbreviation ("Austin, TX") is a much stronger signal
    # than a lower-case word that merely looks like one.
    for state, abbrev in state_abbreviations.items():
        if re.search(rf'\b{re.escape(abbrev)}\b', location):
            return state

    # Last resort: abbreviation in any case ("austin, tx"). This can still
    # mistake a common word for a state, but it only runs when nothing more
    # specific was found.
    for state, abbrev in state_abbreviations.items():
        if re.search(rf'\b{re.escape(abbrev)}\b', location, re.IGNORECASE):
            return state

    return None
|
37 |
+
|
38 |
+
# Resolve each row's 'User Location' through get_state and store the
# result in a new 'Filtered Location' column.
df['Filtered Location'] = df['User Location'].apply(get_state)

# Keep only the rows whose 'Filtered Location' is neither missing nor an
# empty string.
has_location = df['Filtered Location'].notna()
is_not_blank = df['Filtered Location'] != ''
filtered_df = df.loc[has_location & is_not_blank]

# Write the kept rows out without the index column.
filtered_df.to_csv('data/2019-climate-usa-redo.csv', index=False)
|