jer164 committed on
Commit
3eb7933
·
1 Parent(s): cb2c213
app.py CHANGED
@@ -164,7 +164,8 @@ app_ui = ui.page_fluid(
164
  },
165
  },
166
  ),
167
- ui.input_text("recipient_name", "Filter Recipient (TN Only)", ""),
 
168
  ui_card(
169
  "File Facts:",
170
  ui.output_text("total_donors"),
@@ -199,6 +200,8 @@ def server(input, output, session):
199
 
200
  @reactive.Calc
201
  def get_file_name():
 
 
202
  paths = [file["name"] for file in input.donor_file()]
203
  tmp_path = paths[0].split(".")[0]
204
  return tmp_path
@@ -206,16 +209,24 @@ def server(input, output, session):
206
  @reactive.Calc
207
  def compile_donors():
208
  file: list[FileInfo] | None = input.donor_file()
209
- if file is None:
 
210
  return pd.DataFrame()
211
- paths = [file["datapath"] for file in input.donor_file()]
212
- files = [DonorFrame(path, input.source(), recip_filter()) for path in paths]
213
- print(files[0].recipient_name)
214
- try:
215
- dataframes = [file.format_donors(export=False) for file in files]
216
- except KeyError:
217
- raise SilentException()
218
- return pd.concat(dataframes, ignore_index=True)
 
 
 
 
 
 
 
219
 
220
  @output
221
  @render.data_frame
 
164
  },
165
  },
166
  ),
167
+ ui.panel_conditional("input.source === 'VA'", ui.input_text("recipient_name", "Filter Recipient (TN Only)", "")),
168
+ ui.panel_conditional("input.source === 'VA'", ui.input_text("va_report_link", "Contributions URL", "")),
169
  ui_card(
170
  "File Facts:",
171
  ui.output_text("total_donors"),
 
200
 
201
  @reactive.Calc
202
  def get_file_name():
203
+ if input.source() == 'VA':
204
+ return "va_candidate_donors"
205
  paths = [file["name"] for file in input.donor_file()]
206
  tmp_path = paths[0].split(".")[0]
207
  return tmp_path
 
209
  @reactive.Calc
210
  def compile_donors():
211
  file: list[FileInfo] | None = input.donor_file()
212
+ url = input.va_report_link()
213
+ if file is None and not url:
214
  return pd.DataFrame()
215
+ if file:
216
+ paths = [file["datapath"] for file in input.donor_file()]
217
+ files = [DonorFrame(path, input.source(), recip_filter()) for path in paths]
218
+ print(files[0].recipient_name)
219
+ try:
220
+ dataframes = [file.format_donors(export=False) for file in files]
221
+ except KeyError:
222
+ raise SilentException()
223
+ return pd.concat(dataframes, ignore_index=True)
224
+ elif url and not file:
225
+ try:
226
+ dataframe = DonorFrame(data_path=url, source='VA').format_donors(export=False)
227
+ except ValueError as e:
228
+ raise e
229
+ return dataframe
230
 
231
  @output
232
  @render.data_frame
requirements.txt CHANGED
@@ -13,4 +13,6 @@ openpyxl
13
  xlrd
14
  bs4
15
  chardet
16
- pytest
 
 
 
13
  xlrd
14
  bs4
15
  chardet
16
+ pytest
17
+ xmltodict
18
+ tqdm
src/add_funcs.py CHANGED
@@ -1,11 +1,17 @@
 
 
 
1
  import re
2
  import usaddress
 
3
  import pandas as pd
4
  from scourgify import normalize_address_record
5
  import warnings
6
- warnings.filterwarnings(category=DeprecationWarning, action='ignore')
7
  from bs4 import BeautifulSoup, SoupStrainer
8
- from lxml import etree, objectify
 
 
 
9
  ### need to refactor these into a special class
10
 
11
 
@@ -58,39 +64,66 @@ def nevada(input_path):
58
 
59
  return nv_df
60
 
 
61
 
62
- def virginia(input_xml):
63
- metadata = input_xml
64
- parser = etree.XMLParser(remove_blank_text=True)
65
- tree = etree.parse(metadata, parser)
66
- root = tree.getroot()
 
67
 
68
- ####
69
- for elem in root.getiterator():
70
- if not hasattr(elem.tag, "find"):
71
- continue # guard for Comment tags
72
- i = elem.tag.find("}")
73
- if i >= 0:
74
- elem.tag = elem.tag[i + 1 :]
75
- objectify.deannotate(root, cleanup_namespaces=True)
76
- ####
77
 
78
- to_parse = etree.tostring(tree, encoding='utf-8', method='xml')
 
79
  try:
80
- df = pd.read_xml(to_parse, xpath="//LiA")
81
- df = df.drop("Contributor", axis=1)
82
- df_2 = pd.read_xml(to_parse, xpath="//Contributor")
83
- df_3 = pd.read_xml(to_parse, xpath="//ScheduleA//LiA//Contributor//Address")
84
- final_df = pd.concat(
85
- [df, df_2.loc[:, ["FirstName", "LastName", "IsIndividual"]], df_3],
86
- axis="columns",
87
- )
88
- final_df.columns = [col.lower() for col in final_df.columns]
89
-
90
- return final_df
91
- except ValueError:
92
- raise Exception("No valid ScheduleA donations.")
93
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
  def missouri(input_path):
96
  import pandas as pd
 
1
+ import grequests
2
+ import requests
3
+ from requests import Response
4
  import re
5
  import usaddress
6
+ from typing import List, Dict
7
  import pandas as pd
8
  from scourgify import normalize_address_record
9
  import warnings
 
10
  from bs4 import BeautifulSoup, SoupStrainer
11
+ from bs4.element import Tag
12
+ warnings.filterwarnings(category=DeprecationWarning, action='ignore')
13
+ import xmltodict
14
+ from tqdm import tqdm
15
  ### need to refactor these into a special class
16
 
17
 
 
64
 
65
  return nv_df
66
 
67
+ #### virginia parsers
68
 
69
def get_downloadable_reports(link: str) -> List[str]:
    """Collect the XML download URLs of the reports listed on a page.

    The page at ``link`` is fetched and the element with id
    ``ScheduledReports`` is searched for anchors titled
    "Click to view report". The last path segment of each anchor's href is
    substituted into the ReportXML URL template.

    Args:
        link: URL of the page that lists the reports.

    Returns:
        One ReportXML URL per matching anchor, in page order.

    Raises:
        ValueError: if the page has no ``ScheduledReports`` element.
    """
    va_base_url = "https://cfreports.elections.virginia.gov/Report/ReportXML/{report_id}"
    resp = requests.get(link)
    container = BeautifulSoup(resp.content, "html.parser").find("div", {"id": "ScheduledReports"})
    if container is None:
        # ``find`` returns None when nothing matches; fail with a clear message
        # instead of an AttributeError on ``None.find_all``.
        raise ValueError("No scheduled reports section found at the supplied URL.")
    anchors = container.find_all("a", {"title": "Click to view report"})
    return [va_base_url.format(report_id=extract_va_report_id(anchor)) for anchor in anchors]
75
 
76
def extract_va_report_id(tag: Tag) -> str:
    """Return the text after the last "/" in the tag's ``href`` attribute."""
    href = str(tag['href'])
    _, _, report_id = href.rpartition("/")
    return report_id
 
 
 
 
 
 
 
78
 
79
def get_contributions_from_report(report: Response) -> List[Dict]:
    """Extract the ScheduleA ``LiA`` entries from one report response.

    The response body is parsed once with xmltodict. The root element is
    looked up as ``Report`` and, if that key is absent, as ``FinalReport``.

    Args:
        report: HTTP response whose ``content`` is the report XML.

    Returns:
        A list of ``LiA`` dicts, or ``None`` when the report has no
        ``ScheduleA`` section or the section has no ``LiA`` entries.

    Raises:
        KeyError: if the document has neither a ``Report`` nor a
            ``FinalReport`` root.
    """
    parsed = xmltodict.parse(report.content)
    try:
        body = parsed["Report"]
    except KeyError:
        body = parsed["FinalReport"]

    schedule_a = body.get("ScheduleA", None)
    if not schedule_a:
        return None

    entries = schedule_a.get("LiA")
    if entries is None:
        return None
    # xmltodict yields a single dict (not a one-item list) when an element
    # occurs only once; wrap it so callers can always extend() with entries.
    if isinstance(entries, dict):
        return [entries]
    return entries
88
+
89
def make_all_requests(report_urls: List[str]) -> list:
    """Fetch each URL in order and return the responses in the same order.

    The zero-based position of each URL is printed just before it is fetched.
    """
    def fetch(position, url):
        print(position)
        return requests.get(url)

    return [fetch(position, url) for position, url in enumerate(report_urls)]
95
+
96
def parse_va_xml(xml_dict: Dict[str,str]) -> Dict:
    """Flatten one parsed ``LiA`` entry into a donor record.

    Only entries whose ``Contributor`` has ``@IsIndividual == 'true'`` are
    converted. Missing name, address or amount fields become ``None`` rather
    than raising, so one malformed entry cannot abort the whole batch.

    Args:
        xml_dict: One ``LiA`` entry as a dict; any other type is ignored.

    Returns:
        A dict with keys ``first_name``, ``last_name``, ``addr1``, ``addr2``,
        ``city``, ``state``, ``zip``, ``donation_amount`` and
        ``donation_date``; or ``None`` when the input is not a dict, has no
        usable ``Contributor``, or the contributor is not an individual.
    """
    if not isinstance(xml_dict, dict):
        return None

    contributor = xml_dict.get("Contributor")
    if not isinstance(contributor, dict):
        return None
    if contributor.get("@IsIndividual") != 'true':
        return None

    # An absent or empty address element still yields a record, with the
    # address fields left as None.
    address = contributor.get("Address")
    if not isinstance(address, dict):
        address = {}

    return {
        'first_name': contributor.get("FirstName", None),
        'last_name': contributor.get("LastName", None),
        'addr1': address.get("Line1", None),
        'addr2': address.get("Line2", None),
        'city': address.get("City", None),
        'state': address.get("State", None),
        'zip': address.get("ZipCode", None),
        'donation_amount': xml_dict.get("Amount", None),
        'donation_date': xml_dict.get("TransactionDate"),
    }
111
+
112
def virginia(report_url: str) -> pd.DataFrame:
    """Scrape the reports listed at ``report_url`` into a donor table.

    Every listed report is downloaded, its ScheduleA entries are collected,
    and each entry is flattened with ``parse_va_xml``. Entries that do not
    produce a record are dropped.

    Args:
        report_url: URL of the page listing the reports.

    Returns:
        A DataFrame with one row per record produced by ``parse_va_xml``.

    Raises:
        ValueError: if no entry in any report produces a record.
    """
    report_links = get_downloadable_reports(report_url)
    # Named ``responses`` so the ``requests`` module is not shadowed locally.
    responses = make_all_requests(report_links)

    contributions = []
    for response in responses:
        entries = get_contributions_from_report(response)
        if entries:
            contributions.extend(entries)

    parsed = (parse_va_xml(entry) for entry in contributions if entry)
    # Drop the None results *before* the emptiness check, so a candidate with
    # no usable rows raises instead of returning a frame without columns.
    records = [record for record in parsed if record]
    if not records:
        raise ValueError("No valid records for this candidate.")
    return pd.DataFrame.from_records(records)
124
+
125
+
126
+ ### html parsers for Kansas and Missouri
127
 
128
  def missouri(input_path):
129
  import pandas as pd
src/handlers.py CHANGED
@@ -10,6 +10,7 @@ import chardet
10
  import re
11
  from typing import Optional
12
  from pandas.errors import EmptyDataError
 
13
 
14
  class ColumnCoder:
15
  def __init__(self, input_df: Optional[pd.DataFrame] = pd.DataFrame()):
@@ -81,17 +82,19 @@ class ColumnSelector:
81
 
82
  class Ingester:
83
  def __init__(self, path: str, source):
84
- self.path = Path(path)
85
  self.encoding = self._get_encoding()
86
  self.delimiter = self._detect_delimiter()
87
  self.source = source
88
 
89
  @property
90
  def file_type(self):
91
- return self.path.suffix
 
 
92
 
93
  def ingest(self) -> pd.DataFrame:
94
- if self.file_type == ".xml":
95
  return self._parse_xml()
96
  elif self.file_type in [".csv", ".txt"]:
97
  return self._parse_csv()
@@ -103,12 +106,14 @@ class Ingester:
103
  raise OSError(f"File extension {self.file_type} not supported.")
104
 
105
  def _get_encoding(self):
 
 
106
  with open(self.path, "rb") as f:
107
  to_detect = f.read()
108
  return chardet.detect(to_detect)["encoding"]
109
 
110
  def _detect_delimiter(self) -> str:
111
- if self.file_type in ['.xlsx', '.xls']:
112
  return None
113
  if self.file_type == ['.txt']:
114
  return "\t"
 
10
  import re
11
  from typing import Optional
12
  from pandas.errors import EmptyDataError
13
+ import validators
14
 
15
  class ColumnCoder:
16
  def __init__(self, input_df: Optional[pd.DataFrame] = pd.DataFrame()):
 
82
 
83
  class Ingester:
84
  def __init__(self, path: str, source):
85
+ self.path = path if source == 'VA' else Path(path)
86
  self.encoding = self._get_encoding()
87
  self.delimiter = self._detect_delimiter()
88
  self.source = source
89
 
90
  @property
91
  def file_type(self):
92
+ if isinstance(self.path, Path):
93
+ return self.path.suffix
94
+ return "url"
95
 
96
  def ingest(self) -> pd.DataFrame:
97
+ if self.file_type == "url":
98
  return self._parse_xml()
99
  elif self.file_type in [".csv", ".txt"]:
100
  return self._parse_csv()
 
106
  raise OSError(f"File extension {self.file_type} not supported.")
107
 
108
  def _get_encoding(self):
109
+ if self.file_type == 'url':
110
+ return None
111
  with open(self.path, "rb") as f:
112
  to_detect = f.read()
113
  return chardet.detect(to_detect)["encoding"]
114
 
115
  def _detect_delimiter(self) -> str:
116
+ if self.file_type in ['.xlsx', '.xls', 'url']:
117
  return None
118
  if self.file_type == ['.txt']:
119
  return "\t"
src/tranformers/state_transforms.py CHANGED
@@ -619,7 +619,7 @@ class TX(StateTransformer):
619
 
620
  class VA(StateTransformer):
621
  def format_data(self, input_df: pd.DataFrame, **kwargs) -> pd.DataFrame:
622
- input_df = input_df[input_df["isindividual"] == True]
623
  return input_df
624
 
625
 
 
619
 
620
  class VA(StateTransformer):
621
  def format_data(self, input_df: pd.DataFrame, **kwargs) -> pd.DataFrame:
622
+ input_df['zip'] = input_df['zip'].str[:5]
623
  return input_df
624
 
625