Spaces:
Running
Running
some fixes
Browse files
- requirements.txt +2 -1
- src/donorframe.py +1 -1
- src/handlers.py +30 -23
- src/tranformers/state_transforms.py +2 -0
- tests/donorframe_test.py +5 -0
requirements.txt
CHANGED
@@ -12,4 +12,5 @@ usaddress-scourgify
|
|
12 |
openpyxl
|
13 |
xlrd
|
14 |
bs4
|
15 |
-
chardet
|
|
|
|
12 |
openpyxl
|
13 |
xlrd
|
14 |
bs4
|
15 |
+
chardet
|
16 |
+
pytest
|
src/donorframe.py
CHANGED
@@ -174,7 +174,7 @@ class DonorFrame:
|
|
174 |
return self.curr_data
|
175 |
|
176 |
logger.info(
|
177 |
-
f"Produced final dataframe with {data_for_preferred.count().iloc[0]} unique donors."
|
178 |
)
|
179 |
logger.info(f"Transformation took {round(transform_time, 6)} seconds.")
|
180 |
|
|
|
174 |
return self.curr_data
|
175 |
|
176 |
logger.info(
|
177 |
+
f"Produced final dataframe with {data_for_preferred.count().iloc[0]:,} unique donors."
|
178 |
)
|
179 |
logger.info(f"Transformation took {round(transform_time, 6)} seconds.")
|
180 |
|
src/handlers.py
CHANGED
@@ -93,12 +93,14 @@ class Ingester:
|
|
93 |
def ingest(self) -> pd.DataFrame:
|
94 |
if self.file_type == ".xml":
|
95 |
return self._parse_xml()
|
96 |
-
elif self.file_type
|
97 |
return self._parse_csv()
|
98 |
elif self.file_type == ".html":
|
99 |
return self._parse_html()
|
100 |
elif self.file_type in [".xls", ".xlsx"]:
|
101 |
return self._parse_excel()
|
|
|
|
|
102 |
|
103 |
def _get_encoding(self):
|
104 |
with open(self.path, "rb") as f:
|
@@ -109,33 +111,30 @@ class Ingester:
|
|
109 |
sniffer = Sniffer()
|
110 |
with open(self.path, newline="") as csvfile:
|
111 |
to_detect = csvfile.read(1024)
|
112 |
-
return sniffer.sniff(to_detect)
|
113 |
|
114 |
def _detect_row_skip(self) -> int:
|
115 |
rs = 0
|
116 |
-
while len(pd.read_csv(self.path, skiprows=rs, nrows=1).columns) < 4:
|
117 |
rs += 1
|
118 |
return rs
|
119 |
|
120 |
-
def
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
)
|
137 |
-
except ParserError:
|
138 |
-
tmp_file = pd.read_csv(
|
139 |
self.path,
|
140 |
index_col=False,
|
141 |
encoding="unicode_escape",
|
@@ -145,6 +144,14 @@ class Ingester:
|
|
145 |
quotechar='"',
|
146 |
on_bad_lines="skip",
|
147 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
148 |
return tmp_file
|
149 |
|
150 |
def _parse_excel(self):
|
@@ -252,11 +259,11 @@ class Polisher:
|
|
252 |
input_df[col] = (
|
253 |
input_df[col].astype(str).map(lambda x: x.lower().strip())
|
254 |
)
|
|
|
255 |
elif col == "state":
|
256 |
input_df[col] = (
|
257 |
input_df[col].astype(str).map(lambda x: x.upper().strip())
|
258 |
)
|
259 |
-
|
260 |
return input_df
|
261 |
|
262 |
def _dedupe(self, input_df: pd.DataFrame) -> pd.DataFrame:
|
|
|
93 |
def ingest(self) -> pd.DataFrame:
|
94 |
if self.file_type == ".xml":
|
95 |
return self._parse_xml()
|
96 |
+
elif self.file_type in [".csv", ".txt"]:
|
97 |
return self._parse_csv()
|
98 |
elif self.file_type == ".html":
|
99 |
return self._parse_html()
|
100 |
elif self.file_type in [".xls", ".xlsx"]:
|
101 |
return self._parse_excel()
|
102 |
+
else:
|
103 |
+
raise OSError(f"File extension {self.file_type} not supported.")
|
104 |
|
105 |
def _get_encoding(self):
|
106 |
with open(self.path, "rb") as f:
|
|
|
111 |
sniffer = Sniffer()
|
112 |
with open(self.path, newline="") as csvfile:
|
113 |
to_detect = csvfile.read(1024)
|
114 |
+
return sniffer.sniff(to_detect).delimiter
|
115 |
|
116 |
def _detect_row_skip(self) -> int:
|
117 |
rs = 0
|
118 |
+
while len(pd.read_csv(self.path, sep=self.delimiter, skiprows=rs, nrows=1).columns) < 4:
|
119 |
rs += 1
|
120 |
return rs
|
121 |
|
122 |
+
def get_base_csv_params(self) -> dict:
|
123 |
+
return {
|
124 |
+
"filepath_or_buffer": self.path,
|
125 |
+
"sep": self.delimiter,
|
126 |
+
"skiprows": self._detect_row_skip(),
|
127 |
+
"index_col": False,
|
128 |
+
"skip_blank_lines": True,
|
129 |
+
"skipinitialspace": True,
|
130 |
+
"escapechar": "\\",
|
131 |
+
}
|
132 |
+
|
133 |
+
def _parse_csv_encoding_errors(self):
|
134 |
+
return pd.read_csv(**self.get_base_csv_params(), encoding_errors="ignore")
|
135 |
+
|
136 |
+
def _parse_csv_parser_error(self):
|
137 |
+
return pd.read_csv(
|
|
|
|
|
|
|
138 |
self.path,
|
139 |
index_col=False,
|
140 |
encoding="unicode_escape",
|
|
|
144 |
quotechar='"',
|
145 |
on_bad_lines="skip",
|
146 |
)
|
147 |
+
|
148 |
+
def _parse_csv(self):
|
149 |
+
try:
|
150 |
+
tmp_file = pd.read_csv(**self.get_base_csv_params())
|
151 |
+
except UnicodeDecodeError:
|
152 |
+
tmp_file = self._parse_csv_encoding_errors()
|
153 |
+
except ParserError:
|
154 |
+
tmp_file = self._parse_csv_parser_error()
|
155 |
return tmp_file
|
156 |
|
157 |
def _parse_excel(self):
|
|
|
259 |
input_df[col] = (
|
260 |
input_df[col].astype(str).map(lambda x: x.lower().strip())
|
261 |
)
|
262 |
+
input_df[col] = input_df[col].str.replace(r"[^a-z0-9 ]", "", regex=True)
|
263 |
elif col == "state":
|
264 |
input_df[col] = (
|
265 |
input_df[col].astype(str).map(lambda x: x.upper().strip())
|
266 |
)
|
|
|
267 |
return input_df
|
268 |
|
269 |
def _dedupe(self, input_df: pd.DataFrame) -> pd.DataFrame:
|
src/tranformers/state_transforms.py
CHANGED
@@ -305,6 +305,8 @@ class LA_C(StateTransformer):
|
|
305 |
|
306 |
class MA(StateTransformer):
|
307 |
def format_data(self, input_df: pd.DataFrame, **kwargs) -> str:
|
|
|
|
|
308 |
input_df["donation_amount"] = input_df["donation_amount"].replace(
|
309 |
r"$|,", "", regex=True
|
310 |
)
|
|
|
305 |
|
306 |
class MA(StateTransformer):
|
307 |
def format_data(self, input_df: pd.DataFrame, **kwargs) -> str:
|
308 |
+
#### temporary fix for column issue
|
309 |
+
input_df.rename(columns={'full_name': 'last_name'}, inplace=True)
|
310 |
input_df["donation_amount"] = input_df["donation_amount"].replace(
|
311 |
r"$|,", "", regex=True
|
312 |
)
|
tests/donorframe_test.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from src.donorframe import DonorFrame
|
2 |
+
import pytest
|
3 |
+
|
4 |
+
def test():
|
5 |
+
pass
|