jer164 committed on
Commit
d90274f
·
1 Parent(s): 0e90b68

some fixes

Browse files
requirements.txt CHANGED
@@ -12,4 +12,5 @@ usaddress-scourgify
12
  openpyxl
13
  xlrd
14
  bs4
15
- chardet
 
 
12
  openpyxl
13
  xlrd
14
  bs4
15
+ chardet
16
+ pytest
src/donorframe.py CHANGED
@@ -174,7 +174,7 @@ class DonorFrame:
174
  return self.curr_data
175
 
176
  logger.info(
177
- f"Produced final dataframe with {data_for_preferred.count().iloc[0]} unique donors."
178
  )
179
  logger.info(f"Transformation took {round(transform_time, 6)} seconds.")
180
 
 
174
  return self.curr_data
175
 
176
  logger.info(
177
+ f"Produced final dataframe with {data_for_preferred.count().iloc[0]:,} unique donors."
178
  )
179
  logger.info(f"Transformation took {round(transform_time, 6)} seconds.")
180
 
src/handlers.py CHANGED
@@ -93,12 +93,14 @@ class Ingester:
93
  def ingest(self) -> pd.DataFrame:
94
  if self.file_type == ".xml":
95
  return self._parse_xml()
96
- elif self.file_type == ".csv":
97
  return self._parse_csv()
98
  elif self.file_type == ".html":
99
  return self._parse_html()
100
  elif self.file_type in [".xls", ".xlsx"]:
101
  return self._parse_excel()
 
 
102
 
103
  def _get_encoding(self):
104
  with open(self.path, "rb") as f:
@@ -109,33 +111,30 @@ class Ingester:
109
  sniffer = Sniffer()
110
  with open(self.path, newline="") as csvfile:
111
  to_detect = csvfile.read(1024)
112
- return sniffer.sniff(to_detect)
113
 
114
  def _detect_row_skip(self) -> int:
115
  rs = 0
116
- while len(pd.read_csv(self.path, skiprows=rs, nrows=1).columns) < 4:
117
  rs += 1
118
  return rs
119
 
120
- def _parse_csv(self):
121
- try:
122
- tmp_file = pd.read_csv(
123
- self.path,
124
- skiprows=self._detect_row_skip(),
125
- index_col=False,
126
- skip_blank_lines=True,
127
- skipinitialspace=True,
128
- escapechar="\\",
129
- )
130
- except UnicodeDecodeError:
131
- tmp_file = pd.read_csv(
132
- self.path,
133
- skiprows=1,
134
- index_col=False,
135
- encoding_errors="ignore",
136
- )
137
- except ParserError:
138
- tmp_file = pd.read_csv(
139
  self.path,
140
  index_col=False,
141
  encoding="unicode_escape",
@@ -145,6 +144,14 @@ class Ingester:
145
  quotechar='"',
146
  on_bad_lines="skip",
147
  )
 
 
 
 
 
 
 
 
148
  return tmp_file
149
 
150
  def _parse_excel(self):
@@ -252,11 +259,11 @@ class Polisher:
252
  input_df[col] = (
253
  input_df[col].astype(str).map(lambda x: x.lower().strip())
254
  )
 
255
  elif col == "state":
256
  input_df[col] = (
257
  input_df[col].astype(str).map(lambda x: x.upper().strip())
258
  )
259
-
260
  return input_df
261
 
262
  def _dedupe(self, input_df: pd.DataFrame) -> pd.DataFrame:
 
93
  def ingest(self) -> pd.DataFrame:
94
  if self.file_type == ".xml":
95
  return self._parse_xml()
96
+ elif self.file_type in [".csv", ".txt"]:
97
  return self._parse_csv()
98
  elif self.file_type == ".html":
99
  return self._parse_html()
100
  elif self.file_type in [".xls", ".xlsx"]:
101
  return self._parse_excel()
102
+ else:
103
+ raise OSError(f"File extension {self.file_type} not supported.")
104
 
105
  def _get_encoding(self):
106
  with open(self.path, "rb") as f:
 
111
  sniffer = Sniffer()
112
  with open(self.path, newline="") as csvfile:
113
  to_detect = csvfile.read(1024)
114
+ return sniffer.sniff(to_detect).delimiter
115
 
116
  def _detect_row_skip(self) -> int:
117
  rs = 0
118
+ while len(pd.read_csv(self.path, sep=self.delimiter, skiprows=rs, nrows=1).columns) < 4:
119
  rs += 1
120
  return rs
121
 
122
+ def get_base_csv_params(self) -> dict:
123
+ return {
124
+ "filepath_or_buffer": self.path,
125
+ "sep": self.delimiter,
126
+ "skiprows": self._detect_row_skip(),
127
+ "index_col": False,
128
+ "skip_blank_lines": True,
129
+ "skipinitialspace": True,
130
+ "escapechar": "\\",
131
+ }
132
+
133
+ def _parse_csv_encoding_errors(self):
134
+ return pd.read_csv(**self.get_base_csv_params(), encoding_errors="ignore")
135
+
136
+ def _parse_csv_parser_error(self):
137
+ return pd.read_csv(
 
 
 
138
  self.path,
139
  index_col=False,
140
  encoding="unicode_escape",
 
144
  quotechar='"',
145
  on_bad_lines="skip",
146
  )
147
+
148
+ def _parse_csv(self):
149
+ try:
150
+ tmp_file = pd.read_csv(**self.get_base_csv_params())
151
+ except UnicodeDecodeError:
152
+ tmp_file = self._parse_csv_encoding_errors()
153
+ except ParserError:
154
+ tmp_file = self._parse_csv_parser_error()
155
  return tmp_file
156
 
157
  def _parse_excel(self):
 
259
  input_df[col] = (
260
  input_df[col].astype(str).map(lambda x: x.lower().strip())
261
  )
262
+ input_df[col] = input_df[col].str.replace(r"[^a-z0-9 ]", "", regex=True)
263
  elif col == "state":
264
  input_df[col] = (
265
  input_df[col].astype(str).map(lambda x: x.upper().strip())
266
  )
 
267
  return input_df
268
 
269
  def _dedupe(self, input_df: pd.DataFrame) -> pd.DataFrame:
src/tranformers/state_transforms.py CHANGED
@@ -305,6 +305,8 @@ class LA_C(StateTransformer):
305
 
306
  class MA(StateTransformer):
307
  def format_data(self, input_df: pd.DataFrame, **kwargs) -> str:
 
 
308
  input_df["donation_amount"] = input_df["donation_amount"].replace(
309
  r"$|,", "", regex=True
310
  )
 
305
 
306
  class MA(StateTransformer):
307
  def format_data(self, input_df: pd.DataFrame, **kwargs) -> str:
308
+ #### temporary fix for column issue
309
+ input_df.rename(columns={'full_name': 'last_name'}, inplace=True)
310
  input_df["donation_amount"] = input_df["donation_amount"].replace(
311
  r"$|,", "", regex=True
312
  )
tests/donorframe_test.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from src.donorframe import DonorFrame
2
+ import pytest
3
+
4
+ def test():
5
+ pass