seanpedrickcase committed · 8d1cc2b · 1 parent: 36bca81
Updated gradio and requirements. Hopefully fixed duplicate results issue. General code clean up
Files changed:
- .dockerignore +1 -1
- .gitignore +1 -0
- Dockerfile +11 -10
- README.md +2 -2
- app.py +8 -4
- requirements.txt +3 -4
- tools/aws_functions.py +2 -2
- tools/matcher_funcs.py +73 -41
.dockerignore
CHANGED
@@ -8,8 +8,8 @@
 *.env
 *.zip
 test/*
-nnet_model/*
 deprecated_models/*
+experiments/*
 .ipynb_checkpoints/*
 orchestration/*
 .vscode/*
.gitignore
CHANGED
@@ -11,4 +11,5 @@ test/*
 deprecated_models/*
 .ipynb_checkpoints/*
 orchestration/*
+experiments/*
 .vscode/*
Dockerfile
CHANGED
@@ -1,23 +1,24 @@
-FROM public.ecr.aws/docker/library/python:3.11.
-# FROM public.ecr.aws/docker/library/python:3.10.13-slim
+FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
 
 WORKDIR /src
 
 COPY requirements.txt .
 
-RUN pip install -r requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+
+RUN pip install --no-cache-dir gradio==4.31.0
 
 # Set up a new user named "user" with user ID 1000
-
+RUN useradd -m -u 1000 user
 
 # Change ownership of /home/user directory
-
+RUN chown -R user:user /home/user
 
-# Create the
-
+# Create the output files directory and set its permissions
+RUN mkdir -p /home/user/output && chown -R user:user /home/user/output
 
 # Switch to the "user" user
-
+USER user
 
 # Set home to the user's home directory
 ENV HOME=/home/user \
@@ -37,7 +38,7 @@ ENV HOME=/home/user \
 WORKDIR $HOME/app
 
 # Copy the current directory contents into the container at $HOME/app setting the owner to the user
-
-COPY . $HOME/app
+COPY --chown=user . $HOME/app
+#COPY . $HOME/app
 
 CMD ["python", "app.py"]
README.md
CHANGED
@@ -4,10 +4,10 @@ emoji: 🌍
 colorFrom: purple
 colorTo: gray
 sdk: gradio
-sdk_version: 4.
+sdk_version: 4.31.0
 app_file: app.py
 pinned: false
-license:
+license: apache-2.0
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
CHANGED
@@ -159,9 +159,9 @@ def create_batch_ranges(df:PandasDataFrame, ref_df:PandasDataFrame, batch_size:i
     return lengths_df
 
 
-def run_matcher(in_text, in_file, in_ref, data_state:PandasDataFrame, results_data_state:PandasDataFrame, ref_data_state:PandasDataFrame, in_colnames:List[str], in_refcol:List[str], in_joincol:List[str], in_existing:List[str], in_api:str, in_api_key:str, InitMatch:MatcherClass = InitMatch, progress=gr.Progress()):
+def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame, results_data_state:PandasDataFrame, ref_data_state:PandasDataFrame, in_colnames:List[str], in_refcol:List[str], in_joincol:List[str], in_existing:List[str], in_api:str, in_api_key:str, InitMatch:MatcherClass = InitMatch, progress=gr.Progress()):
     '''
-    Split search and reference data into batches. Loop and run through the match script.
+    Split search and reference data into batches. Loop and run through the match script for each batch of data.
     '''
 
     overall_tic = time.perf_counter()
@@ -318,8 +318,12 @@ def run_matcher(in_text, in_file, in_ref, data_state:PandasDataFrame, results_da
         else:
             summary_of_summaries, BatchMatch_out = run_match_batch(BatchMatch, n, number_of_batches)
 
+        print("BatchMatch_out match shape: ", BatchMatch_out.results_on_orig_df.shape)
+
         OutputMatch = combine_two_matches(OutputMatch, BatchMatch_out, "All up to and including batch " + str(n+1))
 
+        print("Output results match shape: ", OutputMatch.results_on_orig_df.shape)
+
         n += 1
 
     if in_api==True:
@@ -389,8 +393,8 @@ with block:
 
     gr.Markdown(
     """
-    ## Choose reference file
-    Fuzzy matching will work on any address format, but the neural network will only work with the LLPG LPI format, e.g. with columns SaoText, SaoStartNumber etc.. This joins on the UPRN column. If any of these are different for you,
+    ## Choose reference file / call API
+    Upload a reference file to match against, or alternatively call the Addressbase API (requires API key). Fuzzy matching will work on any address format, but the neural network will only work with the LLPG LPI format, e.g. with columns SaoText, SaoStartNumber etc.. This joins on the UPRN column. If any of these are different for you,
     open 'Custom reference file format or join columns' below.
     """)
 
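The shape logging added above targets the duplicate-results issue named in the commit message: if combining batches ever re-adds rows, the accumulated output grows past the input row count. Below is a minimal, self-contained sketch of the same batch-accumulate pattern on toy data; all names here are illustrative stand-ins, not app.py's real API.

import pandas as pd

# Toy stand-in for run_matcher's batch loop: split the search set into batches,
# "match" each batch, and accumulate results, logging shapes as the diff now does.
search_df = pd.DataFrame({"address": [f"{i} High Street" for i in range(10)]})
batch_size = 4

batch_results = []
for start in range(0, len(search_df), batch_size):
    batch_out = search_df.iloc[start:start + batch_size].assign(full_match=True)
    print("batch result shape:", batch_out.shape)  # mirrors "BatchMatch_out match shape"
    batch_results.append(batch_out)

output = pd.concat(batch_results, ignore_index=True)
print("combined result shape:", output.shape)  # mirrors "Output results match shape"
assert len(output) == len(search_df)  # duplicated rows would make this fail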
requirements.txt
CHANGED
@@ -1,6 +1,5 @@
-#fuzzywuzzy==0.18.0
 numpy==1.26.2
-pandas==2.2.
+pandas==2.2.2
 rapidfuzz==3.8.1
 torch==2.2.1
 recordlinkage==0.16
@@ -8,6 +7,6 @@ pyap==0.3.1
 pytest==7.4.3
 pyarrow==14.0.1
 openpyxl==3.1.2
-gradio==4.
-boto3==1.34.
+gradio==4.31.0
+boto3==1.34.103
 polars==0.20.19
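Note that gradio is now pinned to 4.31.0 in three places at once (requirements.txt, the Dockerfile, and the README front matter), and pandas/boto3 gain full patch versions. A quick way to confirm a built environment actually matches the pins; this assumes the packages are importable and relies only on the standard __version__ attribute each exposes:

# Sanity check that the environment matches the new pins
# (assumes the pinned packages are installed).
import boto3
import gradio
import pandas

expected = {"gradio": "4.31.0", "pandas": "2.2.2", "boto3": "1.34.103"}
for name, module in [("gradio", gradio), ("pandas", pandas), ("boto3", boto3)]:
    print(name, module.__version__, "expected", expected[name])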
tools/aws_functions.py
CHANGED
@@ -6,11 +6,11 @@ import os
 
 PandasDataFrame = Type[pd.DataFrame]
 
-bucket_name = os.environ['ADDRESS_MATCHER_BUCKET']
-
 try:
+    bucket_name = os.environ['ADDRESS_MATCHER_BUCKET']
     session = boto3.Session(profile_name="default")
 except Exception as e:
+    bucket_name = ''
     print(e)
 
 # sts = session.client("sts")
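The change above moves the os.environ lookup inside the try block, so a missing ADDRESS_MATCHER_BUCKET variable no longer raises an uncaught KeyError at import time; bucket_name instead falls back to an empty string alongside the failed session setup. A self-contained sketch of the same fallback pattern:

import os

# Same pattern as the diff: resolve configuration inside try/except so a
# missing environment variable degrades gracefully instead of crashing on import.
try:
    bucket_name = os.environ["ADDRESS_MATCHER_BUCKET"]  # KeyError if unset
except Exception as e:
    bucket_name = ""
    print(e)

print("bucket_name:", repr(bucket_name))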
tools/matcher_funcs.py
CHANGED
@@ -24,14 +24,13 @@ run_fuzzy_match = True
 run_nnet_match = True
 run_standardise = True
 
-from tools.preparation import prepare_search_address_string, prepare_search_address,
-from tools.standardise import standardise_wrapper_func
+from tools.preparation import prepare_search_address_string, prepare_search_address, extract_street_name
 from tools.fuzzy_match import string_match_by_post_code_multiple, _create_fuzzy_match_results_output, join_to_orig_df
 
 # Neural network functions
 ### Predict function for imported model
 from tools.model_predict import full_predict_func, full_predict_torch, post_predict_clean
-from tools.recordlinkage_funcs import score_based_match
+from tools.recordlinkage_funcs import score_based_match
 from tools.gradio import initial_data_load
 
 # API functions
@@ -43,7 +42,7 @@ from tools.constants import max_predict_len, MatcherClass
 
 # Load in data functions
 
-def detect_file_type(filename):
+def detect_file_type(filename:str) -> str:
     """Detect the file type based on its extension."""
     if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
         return 'csv'
@@ -54,8 +53,8 @@ def detect_file_type(filename):
     else:
         raise ValueError("Unsupported file type.")
 
-def read_file(filename):
-    """Read the file based on its detected type."""
+def read_file(filename:str) -> PandasDataFrame:
+    """Read the file based on its detected type and convert to Pandas Dataframe. Supports csv, xlsx, and parquet."""
     file_type = detect_file_type(filename)
 
     if file_type == 'csv':
@@ -65,7 +64,11 @@ def read_file(filename):
     elif file_type == 'parquet':
         return pd.read_parquet(filename)
 
-def get_file_name(in_name):
+def get_file_name(in_name:str) -> str:
+    '''
+    Get the name of a file from a string using the re package.
+    '''
+
     # Corrected regex pattern
     match = re.search(r'\\(?!.*\\)(.*)', in_name)
     if match:
@@ -98,12 +101,10 @@ def filter_not_matched(
     matched_results_success = matched_results[matched_results["full_match"]==True]
 
     # Filter search_df
-    #print(search_df.columns)
-    #print(key_col)
 
-    matched = search_df[key_col].astype(str).isin(matched_results_success[key_col].astype(str))
+    matched = search_df[key_col].astype(str).isin(matched_results_success[key_col].astype(str))
 
-    return search_df.iloc[np.where(~matched)[0]]
+    return search_df.iloc[np.where(~matched)[0]]
 
 def run_all_api_calls(in_api_key:str, Matcher:MatcherClass, query_type:str, progress=gr.Progress()):
     if in_api_key == "":
@@ -136,9 +137,9 @@ def run_all_api_calls(in_api_key:str, Matcher:MatcherClass, query_type:str, prog
     api_output_folder = check_and_create_api_folder()
 
     # Check if the file exists
-    print("Matcher file name: ", Matcher.file_name)
+    #print("Matcher file name: ", Matcher.file_name)
     search_file_name_without_extension = re.sub(r'\.[^.]+$', '', Matcher.file_name)
 
     api_ref_save_loc = api_output_folder + search_file_name_without_extension + "_api_" + today_month_rev + "_" + query_type + "_ckpt"
     print("API reference save location: ", api_ref_save_loc)
 
@@ -688,18 +689,20 @@ def load_matcher_data(in_text, in_file, in_ref, data_state, results_data_state,
     print("Shape of ref_df after filtering is: ", Matcher.ref_df.shape)
     print("Shape of search_df after filtering is: ", Matcher.search_df.shape)
 
-    Matcher.match_outputs_name = "diagnostics_initial_" + today_rev + ".csv"
-    Matcher.results_orig_df_name = "results_initial_" + today_rev + ".csv"
+    Matcher.match_outputs_name = "output/diagnostics_initial_" + today_rev + ".csv"
+    Matcher.results_orig_df_name = "output/results_initial_" + today_rev + ".csv"
 
     Matcher.match_results_output.to_csv(Matcher.match_outputs_name, index = None)
     Matcher.results_on_orig_df.to_csv(Matcher.results_orig_df_name, index = None)
 
     return Matcher
 
-#
 
-# Run batch of matches
-def run_match_batch(InitialMatch, batch_n, total_batches, progress=gr.Progress()):
+# Run a match run for a single batch
+def run_match_batch(InitialMatch:MatcherClass, batch_n:int, total_batches:int, progress=gr.Progress()):
+    '''
+    Over-arching function for running a single batch of data through the full matching process. Calls fuzzy matching, then neural network match functions in order. It outputs a summary of the match, and a MatcherClass with the matched data included.
+    '''
+
     if run_fuzzy_match == True:
 
         overall_tic = time.perf_counter()
@@ -717,8 +720,13 @@ def run_match_batch(InitialMatch, batch_n, total_batches, progress=gr.Progress()
         message = "Nothing to match! Aborting address check."
         print(message)
         return message, InitialMatch
+
+    print("FuzzyNotStdMatch shape before combine two matches: ", FuzzyNotStdMatch.results_on_orig_df.shape)
 
     FuzzyNotStdMatch = combine_two_matches(InitialMatch, FuzzyNotStdMatch, df_name)
+
+    print("InitialMatch shape: ", InitialMatch.results_on_orig_df.shape)
+    print("FuzzyNotStdMatch shape: ", FuzzyNotStdMatch.results_on_orig_df.shape)
 
     if (len(FuzzyNotStdMatch.search_df_not_matched) == 0) | (sum(FuzzyNotStdMatch.match_results_output[FuzzyNotStdMatch.match_results_output['full_match']==False]['fuzzy_score'])==0):
         overall_toc = time.perf_counter()
@@ -867,12 +875,14 @@ def orchestrate_match_run(Matcher, standardise = False, nnet = False, file_stub=
     # Save to file
     Matcher.results_on_orig_df = results_on_orig_df
 
+    print("Results output in orchestrate match run shape: ", Matcher.results_on_orig_df.shape)
+
     Matcher.summary = summary
 
     Matcher.output_summary = create_match_summary(Matcher.match_results_output, df_name = df_name)
 
-    Matcher.match_outputs_name = "diagnostics_" + file_stub + today_rev + ".csv"
-    Matcher.results_orig_df_name = "results_" + file_stub + today_rev + ".csv"
+    Matcher.match_outputs_name = "output/diagnostics_" + file_stub + today_rev + ".csv"
+    Matcher.results_orig_df_name = "output/results_" + file_stub + today_rev + ".csv"
 
     Matcher.match_results_output.to_csv(Matcher.match_outputs_name, index = None)
     Matcher.results_on_orig_df.to_csv(Matcher.results_orig_df_name, index = None)
@@ -963,6 +973,8 @@ def full_fuzzy_match(search_df:PandasDataFrame,
     if type(search_df) != str:
         results_on_orig_df = join_to_orig_df(match_results_output, search_df_cleaned, search_df_key_field, new_join_col)
     else: results_on_orig_df = match_results_output
+
+    print("results_on_orig_df in fuzzy_match shape: ", results_on_orig_df.shape)
 
     return diag_shortlist, diag_best_match, match_results_output, results_on_orig_df, summary, search_address_cols
 
@@ -1016,7 +1028,7 @@ def full_fuzzy_match(search_df:PandasDataFrame,
         fuzzy_match_limit, search_df_cleaned, search_df_key_field, new_join_col, standardise, blocker_col = "Street")
     match_results_output_st['match_method'] = "Fuzzy match - street"
 
-    match_results_output_st_out =
+    match_results_output_st_out = combine_dfs_and_remove_dups(match_results_output, match_results_output_st, index_col = search_df_key_field)
 
     match_results_output = match_results_output_st_out
 
@@ -1027,6 +1039,8 @@ def full_fuzzy_match(search_df:PandasDataFrame,
     if type(search_df) != str:
         results_on_orig_df = join_to_orig_df(match_results_output, search_df_cleaned, search_df_key_field, new_join_col)
     else: results_on_orig_df = match_results_output
+
+    print("results_on_orig_df in fuzzy_match shape: ", results_on_orig_df.shape)
 
     return diag_shortlist, diag_best_match, match_results_output, results_on_orig_df, summary, search_address_cols
 
@@ -1148,7 +1162,7 @@ def full_nn_match(ref_address_cols:List[str],
     else:
         matched_output_SBM_pc["match_method"] = "Neural net - Postcode"
 
-    match_results_output_final_pc =
+    match_results_output_final_pc = combine_dfs_and_remove_dups(match_results, matched_output_SBM_pc, index_col = search_df_key_field)
 
     summary_pc = create_match_summary(match_results_output_final_pc, df_name = "NNet blocked by Postcode " + df_name)
     print(summary_pc)
@@ -1171,7 +1185,7 @@ def full_nn_match(ref_address_cols:List[str],
 
     ### Join together old match df with new (model) match df
 
-    match_results_output_final_st =
+    match_results_output_final_st = combine_dfs_and_remove_dups(match_results_output_final_pc, matched_output_SBM_st, index_col = search_df_key_field)
 
     summary_street = create_match_summary(match_results_output_final_st, df_name = "NNet blocked by Street " + df_name)
     print(summary_street)
@@ -1196,44 +1210,58 @@ def full_nn_match(ref_address_cols:List[str],
 
 
 # Combiner/summary functions
-def
+def combine_dfs_and_remove_dups(orig_df:PandasDataFrame, new_df:PandasDataFrame, index_col:str = "search_orig_address", match_address_series:str = "full_match", keep_only_duplicated:bool = False) -> PandasDataFrame:
+    '''
+    Combine two Pandas dataframes and remove duplicates according to a specified 'index' column. Data is sorted with matched addresses first, non-matched second, so that the duplicate removal gets rid of rows that are not matched in the case of address duplicates.
+    '''
 
-
-
+    # If one of the dataframes is empty, break
+    if (orig_df.empty) & (new_df.empty):
+        return orig_df
 
-
+    combined_std_not_matches = pd.concat([orig_df, new_df])#, ignore_index=True)
 
-
+    # If no results were combined
+    if combined_std_not_matches.empty:
        combined_std_not_matches[match_address_series] = False
 
        if "full_address" in combined_std_not_matches.columns:
-            combined_std_not_matches[
+            combined_std_not_matches[index_col] = combined_std_not_matches["full_address"]
        combined_std_not_matches["fuzzy_score"] = 0
        return combined_std_not_matches
+
+    # Convert index_col to string to ensure indexes from different sources are being compared correctly
+    combined_std_not_matches[index_col] = combined_std_not_matches[index_col].astype(str)
 
-    combined_std_not_matches = combined_std_not_matches.sort_values([
+    combined_std_not_matches = combined_std_not_matches.sort_values([index_col, match_address_series], ascending=[True, False])
 
    if keep_only_duplicated == True:
-        combined_std_not_matches = combined_std_not_matches[combined_std_not_matches.duplicated(
-
-    combined_std_not_matches_no_dups = combined_std_not_matches.drop_duplicates(
+        combined_std_not_matches = combined_std_not_matches[combined_std_not_matches.duplicated(index_col)]
+
+    combined_std_not_matches_no_dups = combined_std_not_matches.drop_duplicates(index_col).sort_index()
 
    return combined_std_not_matches_no_dups
 
-def combine_two_matches(OrigMatchClass, NewMatchClass, df_name):
+def combine_two_matches(OrigMatchClass:MatcherClass, NewMatchClass:MatcherClass, df_name:str) -> MatcherClass:
+
+    '''
+    Combine two MatcherClass objects to retain newest matches and drop duplicate addresses.
+    '''
 
    today_rev = datetime.now().strftime("%Y%m%d")
 
-    NewMatchClass.match_results_output =
+    NewMatchClass.match_results_output = combine_dfs_and_remove_dups(OrigMatchClass.match_results_output, NewMatchClass.match_results_output, index_col = NewMatchClass.search_df_key_field)
 
-    NewMatchClass.results_on_orig_df =
-
+    NewMatchClass.results_on_orig_df = combine_dfs_and_remove_dups(OrigMatchClass.pre_filter_search_df, NewMatchClass.results_on_orig_df, index_col = NewMatchClass.search_df_key_field, match_address_series = 'Matched with reference address')
 
    # Filter out search results where a match was found
    NewMatchClass.pre_filter_search_df = NewMatchClass.results_on_orig_df
 
    found_index = NewMatchClass.results_on_orig_df.loc[NewMatchClass.results_on_orig_df["Matched with reference address"] == True, NewMatchClass.search_df_key_field].astype(int)
-    #print(found_index)[NewMatchClass.search_df_key_field]
 
    key_field_values = NewMatchClass.search_df_not_matched[NewMatchClass.search_df_key_field].astype(int) # Assuming list conversion is suitable
    rows_to_drop = key_field_values[key_field_values.isin(found_index)].tolist()
@@ -1272,8 +1300,8 @@ def combine_two_matches(OrigMatchClass, NewMatchClass, df_name):
    ### Rejoin the excluded matches onto the output file
    #NewMatchClass.results_on_orig_df = pd.concat([NewMatchClass.results_on_orig_df, NewMatchClass.excluded_df])
 
-    NewMatchClass.match_outputs_name = "
-    NewMatchClass.results_orig_df_name = "
+    NewMatchClass.match_outputs_name = "output/diagnostics_" + today_rev + ".csv" # + NewMatchClass.file_name + "_"
+    NewMatchClass.results_orig_df_name = "output/results_" + today_rev + ".csv" # + NewMatchClass.file_name + "_"
 
    # Only keep essential columns
    essential_results_cols = [NewMatchClass.search_df_key_field, "Excluded from search", "Matched with reference address", "ref_index", "Reference matched address", "Reference file"]
@@ -1284,7 +1312,11 @@ def combine_two_matches(OrigMatchClass, NewMatchClass, df_name):
 
    return NewMatchClass
 
-def create_match_summary(match_results_output:PandasDataFrame, df_name:str):
+def create_match_summary(match_results_output:PandasDataFrame, df_name:str) -> str:
+
+    '''
+    Create a text summary of the matching process results to export to a text box or log file.
+    '''
 
    # Check if match_results_output is a dictionary-like object and has the key 'full_match'
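The core of the duplicate-results fix is visible in combine_dfs_and_remove_dups above: within each key, rows are sorted matched-first, so drop_duplicates keeps the matched row and discards the unmatched duplicate. A self-contained illustration of that sort-then-dedup technique on toy data (column and key names here are illustrative):

import pandas as pd

# Demonstrates the dedup strategy used by combine_dfs_and_remove_dups.
orig_df = pd.DataFrame({"key": ["1", "2", "3"], "full_match": [False, False, True]})
new_df = pd.DataFrame({"key": ["1", "2"], "full_match": [True, False]})

combined = pd.concat([orig_df, new_df])
combined["key"] = combined["key"].astype(str)  # align key dtypes across sources

# Matched rows sort first within each key (ascending=False on the boolean column),
# so drop_duplicates keeps the match when a key appears in both inputs.
combined = combined.sort_values(["key", "full_match"], ascending=[True, False])
deduped = combined.drop_duplicates("key").sort_index()

print(deduped)  # key 1 keeps its new matched row; keys 2 and 3 keep one row each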