Spaces:
Running
Running
virjinyuh
Browse files- app.py +21 -10
- requirements.txt +3 -1
- src/add_funcs.py +64 -31
- src/handlers.py +9 -4
- src/tranformers/state_transforms.py +1 -1
app.py
CHANGED
@@ -164,7 +164,8 @@ app_ui = ui.page_fluid(
|
|
164 |
},
|
165 |
},
|
166 |
),
|
167 |
-
ui.input_text("recipient_name", "Filter Recipient (TN Only)", ""),
|
|
|
168 |
ui_card(
|
169 |
"File Facts:",
|
170 |
ui.output_text("total_donors"),
|
@@ -199,6 +200,8 @@ def server(input, output, session):
|
|
199 |
|
200 |
@reactive.Calc
|
201 |
def get_file_name():
|
|
|
|
|
202 |
paths = [file["name"] for file in input.donor_file()]
|
203 |
tmp_path = paths[0].split(".")[0]
|
204 |
return tmp_path
|
@@ -206,16 +209,24 @@ def server(input, output, session):
|
|
206 |
@reactive.Calc
|
207 |
def compile_donors():
|
208 |
file: list[FileInfo] | None = input.donor_file()
|
209 |
-
|
|
|
210 |
return pd.DataFrame()
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
219 |
|
220 |
@output
|
221 |
@render.data_frame
|
|
|
164 |
},
|
165 |
},
|
166 |
),
|
167 |
+
ui.panel_conditional("input.source === 'VA'", ui.input_text("recipient_name", "Filter Recipient (TN Only)", "")),
|
168 |
+
ui.panel_conditional("input.source === 'VA'", ui.input_text("va_report_link", "Contributions URL", "")),
|
169 |
ui_card(
|
170 |
"File Facts:",
|
171 |
ui.output_text("total_donors"),
|
|
|
200 |
|
201 |
@reactive.Calc
def get_file_name():
    """Return the base name (no extension) used to label the current dataset.

    When the selected source is 'VA' a fixed name is returned.  Otherwise the
    name of the first uploaded file is used with its final extension removed.
    """
    if input.source() == 'VA':
        return "va_candidate_donors"
    first_upload_name = input.donor_file()[0]["name"]
    # Strip only the last extension so "donors.2023.csv" keeps "donors.2023"
    # (splitting on the first dot would truncate it to "donors").
    stem, _dot, _ext = first_upload_name.rpartition(".")
    # rpartition leaves the stem empty when there is no usable prefix;
    # fall back to the full name in that case.
    return stem or first_upload_name
|
|
|
209 |
@reactive.Calc
def compile_donors():
    """Build the donor table from the uploaded files or the Virginia report URL.

    Uploaded files take precedence: each one is wrapped in a ``DonorFrame``
    and the formatted frames are concatenated.  If nothing was uploaded but a
    URL was entered, the URL is ingested as a 'VA' source.  With neither
    input an empty ``DataFrame`` is returned.

    Raises:
        SilentException: when formatting an uploaded file raises ``KeyError``.
        ValueError: propagated unchanged from the URL ingestion path.
    """
    uploads: list[FileInfo] | None = input.donor_file()
    url = input.va_report_link()

    if uploads:
        donor_frames = [
            DonorFrame(upload["datapath"], input.source(), recip_filter())
            for upload in uploads
        ]
        try:
            dataframes = [frame.format_donors(export=False) for frame in donor_frames]
        except KeyError:
            raise SilentException()
        return pd.concat(dataframes, ignore_index=True)

    if url:
        return DonorFrame(data_path=url, source='VA').format_donors(export=False)

    # Covers both "no upload yet" (None) and an empty upload list, so callers
    # always receive a DataFrame rather than an implicit None.
    return pd.DataFrame()
|
230 |
|
231 |
@output
|
232 |
@render.data_frame
|
requirements.txt
CHANGED
@@ -13,4 +13,6 @@ openpyxl
|
|
13 |
xlrd
|
14 |
bs4
|
15 |
chardet
|
16 |
-
pytest
|
|
|
|
|
|
13 |
xlrd
|
14 |
bs4
|
15 |
chardet
|
16 |
+
pytest
|
17 |
+
xmltodict
|
18 |
+
tqdm
|
src/add_funcs.py
CHANGED
@@ -1,11 +1,17 @@
|
|
|
|
|
|
|
|
1 |
import re
|
2 |
import usaddress
|
|
|
3 |
import pandas as pd
|
4 |
from scourgify import normalize_address_record
|
5 |
import warnings
|
6 |
-
warnings.filterwarnings(category=DeprecationWarning, action='ignore')
|
7 |
from bs4 import BeautifulSoup, SoupStrainer
|
8 |
-
from
|
|
|
|
|
|
|
9 |
### need to refactor these into a special class
|
10 |
|
11 |
|
@@ -58,39 +64,66 @@ def nevada(input_path):
|
|
58 |
|
59 |
return nv_df
|
60 |
|
|
|
61 |
|
62 |
-
def
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
|
|
67 |
|
68 |
-
|
69 |
-
|
70 |
-
if not hasattr(elem.tag, "find"):
|
71 |
-
continue # guard for Comment tags
|
72 |
-
i = elem.tag.find("}")
|
73 |
-
if i >= 0:
|
74 |
-
elem.tag = elem.tag[i + 1 :]
|
75 |
-
objectify.deannotate(root, cleanup_namespaces=True)
|
76 |
-
####
|
77 |
|
78 |
-
|
|
|
79 |
try:
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
|
95 |
def missouri(input_path):
|
96 |
import pandas as pd
|
|
|
1 |
+
import grequests
|
2 |
+
import requests
|
3 |
+
from requests import Response
|
4 |
import re
|
5 |
import usaddress
|
6 |
+
from typing import List, Dict
|
7 |
import pandas as pd
|
8 |
from scourgify import normalize_address_record
|
9 |
import warnings
|
|
|
10 |
from bs4 import BeautifulSoup, SoupStrainer
|
11 |
+
from bs4.element import Tag
|
12 |
+
warnings.filterwarnings(category=DeprecationWarning, action='ignore')
|
13 |
+
import xmltodict
|
14 |
+
from tqdm import tqdm
|
15 |
### need to refactor these into a special class
|
16 |
|
17 |
|
|
|
64 |
|
65 |
return nv_df
|
66 |
|
67 |
+
#### virginia parsers
|
68 |
|
69 |
+
def get_downloadable_reports(link: str) -> List[str]:
    """Return the XML download URL for every report linked from ``link``.

    The page at ``link`` is fetched and the element with id
    ``ScheduledReports`` is searched for anchors titled
    "Click to view report"; each anchor's report id is substituted into the
    ReportXML URL template.

    Args:
        link: URL of the page listing the reports.

    Returns:
        One ReportXML URL per report anchor; an empty list when the page has
        no ``ScheduledReports`` section.

    Raises:
        requests.HTTPError: if the page request returns an error status.
    """
    va_base_url = "https://cfreports.elections.virginia.gov/Report/ReportXML/{report_id}"
    # A timeout keeps a stalled server from hanging the caller indefinitely.
    resp = requests.get(link, timeout=30)
    resp.raise_for_status()
    container = BeautifulSoup(resp.content, "html.parser").find("div", {"id": "ScheduledReports"})
    if container is None:
        # Not a report listing page (or its layout changed): nothing to download.
        return []
    anchors = container.find_all("a", {"title": "Click to view report"})
    ids = [extract_va_report_id(anchor) for anchor in anchors]
    # Skip anchors that did not yield an id rather than building a broken URL.
    return [va_base_url.format(report_id=rep_id) for rep_id in ids if rep_id]
|
75 |
|
76 |
+
def extract_va_report_id(tag: "Tag") -> str:
    """Return the last path segment of the tag's ``href`` (the report id).

    Query strings, fragments and a trailing slash are ignored so that
    ``/Report/Index/123/`` and ``/Report/Index/123?x=1`` both yield ``"123"``.

    Args:
        tag: an anchor element (or any mapping) exposing an ``href`` entry.

    Returns:
        The final path component of the href; an empty string if the href
        has no path.
    """
    href = str(tag['href']).strip()
    # Remove anything after the path before taking the last segment.
    path = href.split("#", 1)[0].split("?", 1)[0].rstrip("/")
    return path.rsplit("/", 1)[-1]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
|
79 |
+
def get_contributions_from_report(report: Response) -> List[Dict]:
    """Extract the Schedule A line items from one downloaded XML report.

    The document root is looked up under ``Report`` first and ``FinalReport``
    second; the contributions are the ``LiA`` entries of its ``ScheduleA``
    element.

    Args:
        report: HTTP response whose body is the report XML.

    Returns:
        A list with one dict per ``LiA`` element, or ``None`` when the report
        has no recognised root, no ``ScheduleA`` section, or no ``LiA``
        entries.
    """
    # Parse once; the root-element lookup below does not need a second pass.
    parsed = xmltodict.parse(report.content)
    root = parsed.get("Report") or parsed.get("FinalReport")
    if not isinstance(root, dict):
        return None
    schedule_a = root.get("ScheduleA")
    if not isinstance(schedule_a, dict):
        return None
    line_items = schedule_a.get("LiA")
    if not line_items:
        return None
    # xmltodict yields a bare dict (not a one-element list) when an element
    # occurs only once; wrap it so callers can always iterate over records.
    if isinstance(line_items, dict):
        return [line_items]
    return line_items
|
88 |
+
|
89 |
+
def make_all_requests(report_urls: List[str], timeout: float = 30.0) -> list:
    """Download every report URL in order and return the responses.

    Args:
        report_urls: URLs to fetch, one GET request each.
        timeout: seconds to wait on each request before giving up, so a
            single unresponsive download cannot block the caller forever.

    Returns:
        The response objects, in the same order as ``report_urls``.
    """
    return [requests.get(url, timeout=timeout) for url in report_urls]
|
95 |
+
|
96 |
+
def parse_va_xml(xml_dict: Dict) -> Dict:
    """Flatten one parsed contribution record into a donor row.

    Only records whose ``Contributor`` has ``@IsIndividual == 'true'`` are
    converted.  Missing name, address, amount or date fields become ``None``
    instead of raising.

    Args:
        xml_dict: one contribution entry as a nested dict.

    Returns:
        A dict with keys ``first_name``, ``last_name``, ``addr1``, ``addr2``,
        ``city``, ``state``, ``zip``, ``donation_amount`` and
        ``donation_date``; or ``None`` when the input is not a dict, has no
        contributor, or the contributor is not an individual.
    """
    if not isinstance(xml_dict, dict):
        return None
    contributor = xml_dict.get('Contributor')
    if not isinstance(contributor, dict):
        return None
    if contributor.get('@IsIndividual') != 'true':
        return None
    # An absent or empty address element should blank the address fields,
    # not abort the whole record.
    address = contributor.get('Address')
    if not isinstance(address, dict):
        address = {}
    return {
        'first_name': contributor.get("FirstName", None),
        'last_name': contributor.get("LastName", None),
        'addr1': address.get("Line1", None),
        'addr2': address.get("Line2", None),
        'city': address.get("City", None),
        'state': address.get("State", None),
        'zip': address.get("ZipCode", None),
        'donation_amount': xml_dict.get("Amount", None),
        'donation_date': xml_dict.get("TransactionDate"),
    }
|
111 |
+
|
112 |
+
def virginia(report_url: str) -> pd.DataFrame:
    """Build a donor table from every report linked on a Virginia report page.

    Each linked report is downloaded, its contribution entries are collected,
    and the entries that parse to a donor row are assembled into a frame.

    Args:
        report_url: URL of the page listing the candidate's reports.

    Returns:
        A DataFrame with one row per parsed contribution.

    Raises:
        ValueError: if no contribution in any report produces a donor row.
    """
    report_links = get_downloadable_reports(report_url)
    # Named ``responses`` so the ``requests`` module is not shadowed locally.
    responses = make_all_requests(report_links)

    contributions = []
    for response in responses:
        line_items = get_contributions_from_report(response)
        if not line_items:
            continue
        # A lone contribution may arrive as a single dict; extending with it
        # directly would add its keys (strings) instead of the record.
        if isinstance(line_items, dict):
            line_items = [line_items]
        contributions.extend(line_items)

    # Drop unparseable / non-individual entries *before* the emptiness check,
    # otherwise a list of only None values would slip past it.
    records = [
        row
        for row in (parse_va_xml(item) for item in contributions if item)
        if row
    ]
    if not records:
        raise ValueError("No valid records for this candidate.")
    return pd.DataFrame.from_records(records)
|
124 |
+
|
125 |
+
|
126 |
+
### html parsers for Kansas and Missouri
|
127 |
|
128 |
def missouri(input_path):
|
129 |
import pandas as pd
|
src/handlers.py
CHANGED
@@ -10,6 +10,7 @@ import chardet
|
|
10 |
import re
|
11 |
from typing import Optional
|
12 |
from pandas.errors import EmptyDataError
|
|
|
13 |
|
14 |
class ColumnCoder:
|
15 |
def __init__(self, input_df: Optional[pd.DataFrame] = pd.DataFrame()):
|
@@ -81,17 +82,19 @@ class ColumnSelector:
|
|
81 |
|
82 |
class Ingester:
|
83 |
def __init__(self, path: str, source):
|
84 |
-
self.path = Path(path)
|
85 |
self.encoding = self._get_encoding()
|
86 |
self.delimiter = self._detect_delimiter()
|
87 |
self.source = source
|
88 |
|
89 |
@property
|
90 |
def file_type(self):
|
91 |
-
|
|
|
|
|
92 |
|
93 |
def ingest(self) -> pd.DataFrame:
|
94 |
-
if self.file_type == "
|
95 |
return self._parse_xml()
|
96 |
elif self.file_type in [".csv", ".txt"]:
|
97 |
return self._parse_csv()
|
@@ -103,12 +106,14 @@ class Ingester:
|
|
103 |
raise OSError(f"File extension {self.file_type} not supported.")
|
104 |
|
105 |
def _get_encoding(self):
|
|
|
|
|
106 |
with open(self.path, "rb") as f:
|
107 |
to_detect = f.read()
|
108 |
return chardet.detect(to_detect)["encoding"]
|
109 |
|
110 |
def _detect_delimiter(self) -> str:
|
111 |
-
if self.file_type in ['.xlsx', '.xls']:
|
112 |
return None
|
113 |
if self.file_type == ['.txt']:
|
114 |
return "\t"
|
|
|
10 |
import re
|
11 |
from typing import Optional
|
12 |
from pandas.errors import EmptyDataError
|
13 |
+
import validators
|
14 |
|
15 |
class ColumnCoder:
|
16 |
def __init__(self, input_df: Optional[pd.DataFrame] = pd.DataFrame()):
|
|
|
82 |
|
83 |
class Ingester:
|
84 |
def __init__(self, path: str, source):
|
85 |
+
self.path = path if source == 'VA' else Path(path)
|
86 |
self.encoding = self._get_encoding()
|
87 |
self.delimiter = self._detect_delimiter()
|
88 |
self.source = source
|
89 |
|
90 |
@property
|
91 |
def file_type(self):
|
92 |
+
if isinstance(self.path, Path):
|
93 |
+
return self.path.suffix
|
94 |
+
return "url"
|
95 |
|
96 |
def ingest(self) -> pd.DataFrame:
|
97 |
+
if self.file_type == "url":
|
98 |
return self._parse_xml()
|
99 |
elif self.file_type in [".csv", ".txt"]:
|
100 |
return self._parse_csv()
|
|
|
106 |
raise OSError(f"File extension {self.file_type} not supported.")
|
107 |
|
108 |
def _get_encoding(self):
|
109 |
+
if self.file_type == 'url':
|
110 |
+
return None
|
111 |
with open(self.path, "rb") as f:
|
112 |
to_detect = f.read()
|
113 |
return chardet.detect(to_detect)["encoding"]
|
114 |
|
115 |
def _detect_delimiter(self) -> str:
|
116 |
+
if self.file_type in ['.xlsx', '.xls', 'url']:
|
117 |
return None
|
118 |
if self.file_type == ['.txt']:
|
119 |
return "\t"
|
src/tranformers/state_transforms.py
CHANGED
@@ -619,7 +619,7 @@ class TX(StateTransformer):
|
|
619 |
|
620 |
class VA(StateTransformer):
|
621 |
def format_data(self, input_df: pd.DataFrame, **kwargs) -> pd.DataFrame:
|
622 |
-
input_df = input_df[
|
623 |
return input_df
|
624 |
|
625 |
|
|
|
619 |
|
620 |
class VA(StateTransformer):
|
621 |
def format_data(self, input_df: pd.DataFrame, **kwargs) -> pd.DataFrame:
|
622 |
+
input_df['zip'] = input_df['zip'].str[:5]
|
623 |
return input_df
|
624 |
|
625 |
|