seanpedrickcase committed
Commit 8d1cc2b
1 Parent(s): 36bca81

Updated gradio and requirements. Hopefully fixed duplicate results issue. General code clean up

Files changed (8)
  1. .dockerignore +1 -1
  2. .gitignore +1 -0
  3. Dockerfile +11 -10
  4. README.md +2 -2
  5. app.py +8 -4
  6. requirements.txt +3 -4
  7. tools/aws_functions.py +2 -2
  8. tools/matcher_funcs.py +73 -41
.dockerignore CHANGED
@@ -8,8 +8,8 @@
 *.env
 *.zip
 test/*
-nnet_model/*
 deprecated_models/*
+experiments/*
 .ipynb_checkpoints/*
 orchestration/*
 .vscode/*
.gitignore CHANGED
@@ -11,4 +11,5 @@ test/*
 deprecated_models/*
 .ipynb_checkpoints/*
 orchestration/*
+experiments/*
 .vscode/*
Dockerfile CHANGED
@@ -1,23 +1,24 @@
-FROM public.ecr.aws/docker/library/python:3.11.8-slim-bookworm
-# FROM public.ecr.aws/docker/library/python:3.10.13-slim
+FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm

 WORKDIR /src

 COPY requirements.txt .

-RUN pip install -r requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+
+RUN pip install --no-cache-dir gradio==4.31.0

 # Set up a new user named "user" with user ID 1000
-#RUN useradd -m -u 1000 user
+RUN useradd -m -u 1000 user

 # Change ownership of /home/user directory
-#RUN chown -R user:user /home/user
+RUN chown -R user:user /home/user

-# Create the temp files directory and set its permissions
-#RUN mkdir -p /home/user/tmp && chown -R user:user /home/user/tmp
+# Create the output files directory and set its permissions
+RUN mkdir -p /home/user/output && chown -R user:user /home/user/output

 # Switch to the "user" user
-#USER user
+USER user

 # Set home to the user's home directory
 ENV HOME=/home/user \
@@ -37,7 +38,7 @@ ENV HOME=/home/user \
 WORKDIR $HOME/app

 # Copy the current directory contents into the container at $HOME/app setting the owner to the user
-#COPY --chown=user . $HOME/app
-COPY . $HOME/app
+COPY --chown=user . $HOME/app
+#COPY . $HOME/app

 CMD ["python", "app.py"]
README.md CHANGED
@@ -4,10 +4,10 @@ emoji: 🌍
 colorFrom: purple
 colorTo: gray
 sdk: gradio
-sdk_version: 4.20.1
+sdk_version: 4.31.0
 app_file: app.py
 pinned: false
-license: mit
+license: apache-2.0
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -159,9 +159,9 @@ def create_batch_ranges(df:PandasDataFrame, ref_df:PandasDataFrame, batch_size:i
     return lengths_df


-def run_matcher(in_text, in_file, in_ref, data_state:PandasDataFrame, results_data_state:PandasDataFrame, ref_data_state:PandasDataFrame, in_colnames:List[str], in_refcol:List[str], in_joincol:List[str], in_existing:List[str], in_api:str, in_api_key:str, InitMatch:MatcherClass = InitMatch, progress=gr.Progress()):
+def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame, results_data_state:PandasDataFrame, ref_data_state:PandasDataFrame, in_colnames:List[str], in_refcol:List[str], in_joincol:List[str], in_existing:List[str], in_api:str, in_api_key:str, InitMatch:MatcherClass = InitMatch, progress=gr.Progress()):
     '''
-    Split search and reference data into batches. Loop and run through the match script.
+    Split search and reference data into batches. Loop and run through the match script for each batch of data.
     '''

     overall_tic = time.perf_counter()
@@ -318,8 +318,12 @@ def run_matcher(in_text, in_file, in_ref, data_state:PandasDataFrame, results_da
         else:
             summary_of_summaries, BatchMatch_out = run_match_batch(BatchMatch, n, number_of_batches)

+        print("BatchMatch_out match shape: ", BatchMatch_out.results_on_orig_df.shape)
+
         OutputMatch = combine_two_matches(OutputMatch, BatchMatch_out, "All up to and including batch " + str(n+1))

+        print("Output results match shape: ", OutputMatch.results_on_orig_df.shape)
+
         n += 1

     if in_api==True:
@@ -389,8 +393,8 @@ with block:

     gr.Markdown(
     """
-    ## Choose reference file
-    Fuzzy matching will work on any address format, but the neural network will only work with the LLPG LPI format, e.g. with columns SaoText, SaoStartNumber etc.. This joins on the UPRN column. If any of these are different for you,
+    ## Choose reference file / call API
+    Upload a reference file to match against, or alternatively call the Addressbase API (requires API key). Fuzzy matching will work on any address format, but the neural network will only work with the LLPG LPI format, e.g. with columns SaoText, SaoStartNumber etc.. This joins on the UPRN column. If any of these are different for you,
     open 'Custom reference file format or join columns' below.
     """)
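
The new shape print-outs in run_matcher are there to watch for the duplicate-results problem as batches are combined: the combined results_on_orig_df should never hold more rows than the original search data. A sketch of that check, assuming the same attribute names as above (the assertion itself is not in the commit):

```python
def check_no_duplicate_rows(output_match, search_df) -> None:
    # Hypothetical sanity check mirroring the printed shapes: after combining a batch,
    # there should be at most one result row per input address.
    combined_rows = output_match.results_on_orig_df.shape[0]
    input_rows = search_df.shape[0]
    assert combined_rows <= input_rows, (
        f"Duplicate results suspected: {combined_rows} result rows for {input_rows} input rows"
    )
```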
 
requirements.txt CHANGED
@@ -1,6 +1,5 @@
-#fuzzywuzzy==0.18.0
 numpy==1.26.2
-pandas==2.2.1
+pandas==2.2.2
 rapidfuzz==3.8.1
 torch==2.2.1
 recordlinkage==0.16
@@ -8,6 +7,6 @@ pyap==0.3.1
 pytest==7.4.3
 pyarrow==14.0.1
 openpyxl==3.1.2
-gradio==4.20.1
-boto3==1.34.63
+gradio==4.31.0
+boto3==1.34.103
 polars==0.20.19
tools/aws_functions.py CHANGED
@@ -6,11 +6,11 @@ import os

 PandasDataFrame = Type[pd.DataFrame]

-bucket_name = os.environ['ADDRESS_MATCHER_BUCKET']
-
 try:
+    bucket_name = os.environ['ADDRESS_MATCHER_BUCKET']
     session = boto3.Session(profile_name="default")
 except Exception as e:
+    bucket_name = ''
     print(e)

 # sts = session.client("sts")
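
Moving the ADDRESS_MATCHER_BUCKET lookup inside the try block lets the app start when the environment variable is not set (e.g. local runs that never touch S3), with bucket_name falling back to an empty string. An equivalent fallback, shown only for comparison rather than as the repo's code, is:

```python
import os

# Fall back to an empty bucket name when ADDRESS_MATCHER_BUCKET is not set,
# so runs without AWS credentials still start.
bucket_name = os.environ.get("ADDRESS_MATCHER_BUCKET", "")
```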
tools/matcher_funcs.py CHANGED
@@ -24,14 +24,13 @@ run_fuzzy_match = True
 run_nnet_match = True
 run_standardise = True

-from tools.preparation import prepare_search_address_string, prepare_search_address, prepare_ref_address, check_no_number_addresses, extract_street_name, remove_non_postal
-from tools.standardise import standardise_wrapper_func
+from tools.preparation import prepare_search_address_string, prepare_search_address, extract_street_name
 from tools.fuzzy_match import string_match_by_post_code_multiple, _create_fuzzy_match_results_output, join_to_orig_df

 # Neural network functions
 ### Predict function for imported model
 from tools.model_predict import full_predict_func, full_predict_torch, post_predict_clean
-from tools.recordlinkage_funcs import score_based_match, check_matches_against_fuzzy
+from tools.recordlinkage_funcs import score_based_match
 from tools.gradio import initial_data_load

 # API functions
@@ -43,7 +42,7 @@ from tools.constants import max_predict_len, MatcherClass

 # Load in data functions

-def detect_file_type(filename):
+def detect_file_type(filename:str) -> str:
     """Detect the file type based on its extension."""
     if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
         return 'csv'
@@ -54,8 +53,8 @@ def detect_file_type(filename):
     else:
         raise ValueError("Unsupported file type.")

-def read_file(filename):
-    """Read the file based on its detected type."""
+def read_file(filename:str) -> PandasDataFrame:
+    """Read the file based on its detected type and convert to Pandas Dataframe. Supports csv, xlsx, and parquet."""
     file_type = detect_file_type(filename)

     if file_type == 'csv':
@@ -65,7 +64,11 @@ def read_file(filename):
     elif file_type == 'parquet':
         return pd.read_parquet(filename)

-def get_file_name(in_name):
+def get_file_name(in_name:str) -> str:
+    '''
+    Get the name of a file from a string using the re package.
+    '''
+
     # Corrected regex pattern
     match = re.search(r'\\(?!.*\\)(.*)', in_name)
     if match:
@@ -98,12 +101,10 @@ def filter_not_matched(
     matched_results_success = matched_results[matched_results["full_match"]==True]

     # Filter search_df
-    #print(search_df.columns)
-    #print(key_col)

-    matched = search_df[key_col].astype(str).isin(matched_results_success[key_col].astype(str))#.drop(['level_0','index'], axis = 1, errors="ignore").reset_index() #
+    matched = search_df[key_col].astype(str).isin(matched_results_success[key_col].astype(str))

-    return search_df.iloc[np.where(~matched)[0]] # search_df[~matched]
+    return search_df.iloc[np.where(~matched)[0]]

 def run_all_api_calls(in_api_key:str, Matcher:MatcherClass, query_type:str, progress=gr.Progress()):
     if in_api_key == "":
@@ -136,9 +137,9 @@ def run_all_api_calls(in_api_key:str, Matcher:MatcherClass, query_type:str, prog
     api_output_folder = check_and_create_api_folder()

     # Check if the file exists
-    print("Matcher file name: ", Matcher.file_name)
+    #print("Matcher file name: ", Matcher.file_name)
     search_file_name_without_extension = re.sub(r'\.[^.]+$', '', Matcher.file_name)
-    #print("Search file name without extension: ", search_file_name_without_extension)
+
     api_ref_save_loc = api_output_folder + search_file_name_without_extension + "_api_" + today_month_rev + "_" + query_type + "_ckpt"
     print("API reference save location: ", api_ref_save_loc)

@@ -688,18 +689,20 @@ def load_matcher_data(in_text, in_file, in_ref, data_state, results_data_state,
     print("Shape of ref_df after filtering is: ", Matcher.ref_df.shape)
     print("Shape of search_df after filtering is: ", Matcher.search_df.shape)

-    Matcher.match_outputs_name = "diagnostics_initial_" + today_rev + ".csv"
-    Matcher.results_orig_df_name = "results_initial_" + today_rev + ".csv"
+    Matcher.match_outputs_name = "output/diagnostics_initial_" + today_rev + ".csv"
+    Matcher.results_orig_df_name = "output/results_initial_" + today_rev + ".csv"

     Matcher.match_results_output.to_csv(Matcher.match_outputs_name, index = None)
     Matcher.results_on_orig_df.to_csv(Matcher.results_orig_df_name, index = None)

     return Matcher

-# DF preparation functions

-# Run batch of matches
-def run_match_batch(InitialMatch, batch_n, total_batches, progress=gr.Progress()):
+# Run a match run for a single batch
+def run_match_batch(InitialMatch:MatcherClass, batch_n:int, total_batches:int, progress=gr.Progress()):
+    '''
+    Over-arching function for running a single batch of data through the full matching process. Calls fuzzy matching, then neural network match functions in order. It outputs a summary of the match, and a MatcherClass with the matched data included.
+    '''
+
     if run_fuzzy_match == True:

         overall_tic = time.perf_counter()
@@ -717,8 +720,13 @@ def run_match_batch(InitialMatch, batch_n, total_batches, progress=gr.Progress()
             message = "Nothing to match! Aborting address check."
             print(message)
             return message, InitialMatch
+
+        print("FuzzyNotStdMatch shape before combine two matches: ", FuzzyNotStdMatch.results_on_orig_df.shape)

         FuzzyNotStdMatch = combine_two_matches(InitialMatch, FuzzyNotStdMatch, df_name)
+
+        print("InitialMatch shape: ", InitialMatch.results_on_orig_df.shape)
+        print("FuzzyNotStdMatch shape: ", FuzzyNotStdMatch.results_on_orig_df.shape)

         if (len(FuzzyNotStdMatch.search_df_not_matched) == 0) | (sum(FuzzyNotStdMatch.match_results_output[FuzzyNotStdMatch.match_results_output['full_match']==False]['fuzzy_score'])==0):
             overall_toc = time.perf_counter()
@@ -867,12 +875,14 @@ def orchestrate_match_run(Matcher, standardise = False, nnet = False, file_stub=
     # Save to file
     Matcher.results_on_orig_df = results_on_orig_df

+    print("Results output in orchestrate match run shape: ", Matcher.results_on_orig_df.shape)
+
     Matcher.summary = summary

     Matcher.output_summary = create_match_summary(Matcher.match_results_output, df_name = df_name)

-    Matcher.match_outputs_name = "diagnostics_" + file_stub + today_rev + ".csv"
-    Matcher.results_orig_df_name = "results_" + file_stub + today_rev + ".csv"
+    Matcher.match_outputs_name = "output/diagnostics_" + file_stub + today_rev + ".csv"
+    Matcher.results_orig_df_name = "output/results_" + file_stub + today_rev + ".csv"

     Matcher.match_results_output.to_csv(Matcher.match_outputs_name, index = None)
     Matcher.results_on_orig_df.to_csv(Matcher.results_orig_df_name, index = None)
@@ -963,6 +973,8 @@ def full_fuzzy_match(search_df:PandasDataFrame,
     if type(search_df) != str:
         results_on_orig_df = join_to_orig_df(match_results_output, search_df_cleaned, search_df_key_field, new_join_col)
     else: results_on_orig_df = match_results_output
+
+    print("results_on_orig_df in fuzzy_match shape: ", results_on_orig_df.shape)

     return diag_shortlist, diag_best_match, match_results_output, results_on_orig_df, summary, search_address_cols

@@ -1016,7 +1028,7 @@ def full_fuzzy_match(search_df:PandasDataFrame,
             fuzzy_match_limit, search_df_cleaned, search_df_key_field, new_join_col, standardise, blocker_col = "Street")
         match_results_output_st['match_method'] = "Fuzzy match - street"

-        match_results_output_st_out = combine_std_df_remove_dups(match_results_output, match_results_output_st, orig_addr_col = search_df_key_field)
+        match_results_output_st_out = combine_dfs_and_remove_dups(match_results_output, match_results_output_st, index_col = search_df_key_field)

         match_results_output = match_results_output_st_out

@@ -1027,6 +1039,8 @@ def full_fuzzy_match(search_df:PandasDataFrame,
     if type(search_df) != str:
         results_on_orig_df = join_to_orig_df(match_results_output, search_df_cleaned, search_df_key_field, new_join_col)
     else: results_on_orig_df = match_results_output
+
+    print("results_on_orig_df in fuzzy_match shape: ", results_on_orig_df.shape)

     return diag_shortlist, diag_best_match, match_results_output, results_on_orig_df, summary, search_address_cols

@@ -1148,7 +1162,7 @@ def full_nn_match(ref_address_cols:List[str],
     else:
         matched_output_SBM_pc["match_method"] = "Neural net - Postcode"

-    match_results_output_final_pc = combine_std_df_remove_dups(match_results, matched_output_SBM_pc, orig_addr_col = search_df_key_field)
+    match_results_output_final_pc = combine_dfs_and_remove_dups(match_results, matched_output_SBM_pc, index_col = search_df_key_field)

     summary_pc = create_match_summary(match_results_output_final_pc, df_name = "NNet blocked by Postcode " + df_name)
     print(summary_pc)
@@ -1171,7 +1185,7 @@ def full_nn_match(ref_address_cols:List[str],

     ### Join together old match df with new (model) match df

-    match_results_output_final_st = combine_std_df_remove_dups(match_results_output_final_pc,matched_output_SBM_st, orig_addr_col = search_df_key_field)
+    match_results_output_final_st = combine_dfs_and_remove_dups(match_results_output_final_pc,matched_output_SBM_st, index_col = search_df_key_field)

     summary_street = create_match_summary(match_results_output_final_st, df_name = "NNet blocked by Street " + df_name)
     print(summary_street)
@@ -1196,44 +1210,58 @@ def full_nn_match(ref_address_cols:List[str],


 # Combiner/summary functions
-def combine_std_df_remove_dups(df_not_std, df_std, orig_addr_col = "search_orig_address", match_address_series = "full_match", keep_only_duplicated = False):
+def combine_dfs_and_remove_dups(orig_df:PandasDataFrame, new_df:PandasDataFrame, index_col:str = "search_orig_address", match_address_series:str = "full_match", keep_only_duplicated:bool = False) -> PandasDataFrame:

-    if (df_not_std.empty) & (df_std.empty):
-        return df_not_std
+    '''
+    Combine two Pandas dataframes and remove duplicates according to a specified 'index' column. Data is sorted with matched addresses first, non-matched second, so that the duplicate removal gets rid of rows that are not matched in the case of address duplicates.
+    '''

-    combined_std_not_matches = pd.concat([df_not_std, df_std])#, ignore_index=True)
+    # If one of the dataframes is empty, break
+    if (orig_df.empty) & (new_df.empty):
+        return orig_df

-    if combined_std_not_matches.empty: #| ~(match_address_series in combined_std_not_matches.columns) | ~(orig_addr_col in combined_std_not_matches.columns):
+    combined_std_not_matches = pd.concat([orig_df, new_df])#, ignore_index=True)
+
+    # If no results were combined
+    if combined_std_not_matches.empty:
         combined_std_not_matches[match_address_series] = False

         if "full_address" in combined_std_not_matches.columns:
-            combined_std_not_matches[orig_addr_col] = combined_std_not_matches["full_address"]
+            combined_std_not_matches[index_col] = combined_std_not_matches["full_address"]
         combined_std_not_matches["fuzzy_score"] = 0
         return combined_std_not_matches

-    combined_std_not_matches = combined_std_not_matches.sort_values([orig_addr_col, match_address_series], ascending=False)
+    # Convert index_col to string to ensure indexes from different sources are being compared correctly
+    combined_std_not_matches[index_col] = combined_std_not_matches[index_col].astype(str)
+
+    combined_std_not_matches = combined_std_not_matches.sort_values([index_col, match_address_series], ascending=[True, False])

     if keep_only_duplicated == True:
-        combined_std_not_matches = combined_std_not_matches[combined_std_not_matches.duplicated(orig_addr_col)]
-
-    combined_std_not_matches_no_dups = combined_std_not_matches.drop_duplicates(orig_addr_col).sort_index()
+        combined_std_not_matches = combined_std_not_matches[combined_std_not_matches.duplicated(index_col)]
+
+    combined_std_not_matches_no_dups = combined_std_not_matches.drop_duplicates(index_col).sort_index()

     return combined_std_not_matches_no_dups

-def combine_two_matches(OrigMatchClass, NewMatchClass, df_name):
+def combine_two_matches(OrigMatchClass:MatcherClass, NewMatchClass:MatcherClass, df_name:str) -> MatcherClass:
+
+    '''
+    Combine two MatcherClass objects to retain newest matches and drop duplicate addresses.
+    '''

     today_rev = datetime.now().strftime("%Y%m%d")

-    NewMatchClass.match_results_output = combine_std_df_remove_dups(OrigMatchClass.match_results_output, NewMatchClass.match_results_output, orig_addr_col = NewMatchClass.search_df_key_field)
+    NewMatchClass.match_results_output = combine_dfs_and_remove_dups(OrigMatchClass.match_results_output, NewMatchClass.match_results_output, index_col = NewMatchClass.search_df_key_field)

-    NewMatchClass.results_on_orig_df = combine_std_df_remove_dups(OrigMatchClass.pre_filter_search_df, NewMatchClass.results_on_orig_df, orig_addr_col = NewMatchClass.search_df_key_field, match_address_series = 'Matched with reference address')
-
+    NewMatchClass.results_on_orig_df = combine_dfs_and_remove_dups(OrigMatchClass.pre_filter_search_df, NewMatchClass.results_on_orig_df, index_col = NewMatchClass.search_df_key_field, match_address_series = 'Matched with reference address')

     # Filter out search results where a match was found
     NewMatchClass.pre_filter_search_df = NewMatchClass.results_on_orig_df

     found_index = NewMatchClass.results_on_orig_df.loc[NewMatchClass.results_on_orig_df["Matched with reference address"] == True, NewMatchClass.search_df_key_field].astype(int)
-    #print(found_index)[NewMatchClass.search_df_key_field]

     key_field_values = NewMatchClass.search_df_not_matched[NewMatchClass.search_df_key_field].astype(int) # Assuming list conversion is suitable
     rows_to_drop = key_field_values[key_field_values.isin(found_index)].tolist()
@@ -1272,8 +1300,8 @@ def combine_two_matches(OrigMatchClass, NewMatchClass, df_name):
     ### Rejoin the excluded matches onto the output file
     #NewMatchClass.results_on_orig_df = pd.concat([NewMatchClass.results_on_orig_df, NewMatchClass.excluded_df])

-    NewMatchClass.match_outputs_name = "match_results_output_std_" + today_rev + ".csv" # + NewMatchClass.file_name + "_"
-    NewMatchClass.results_orig_df_name = "results_on_orig_df_std_" + today_rev + ".csv" # + NewMatchClass.file_name + "_"
+    NewMatchClass.match_outputs_name = "output/diagnostics_" + today_rev + ".csv" # + NewMatchClass.file_name + "_"
+    NewMatchClass.results_orig_df_name = "output/results_" + today_rev + ".csv" # + NewMatchClass.file_name + "_"

     # Only keep essential columns
     essential_results_cols = [NewMatchClass.search_df_key_field, "Excluded from search", "Matched with reference address", "ref_index", "Reference matched address", "Reference file"]
@@ -1284,7 +1312,11 @@ def combine_two_matches(OrigMatchClass, NewMatchClass, df_name):

     return NewMatchClass

-def create_match_summary(match_results_output:PandasDataFrame, df_name:str):
+def create_match_summary(match_results_output:PandasDataFrame, df_name:str) -> str:
+
+    '''
+    Create a text summary of the matching process results to export to a text box or log file.
+    '''

     # Check if match_results_output is a dictionary-like object and has the key 'full_match'
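
The duplicate-results fix comes down to three changes inside combine_dfs_and_remove_dups: the key column is cast to str so keys coming from different sources compare equal, rows are sorted with matched addresses first within each key, and drop_duplicates then keeps a single row per key. A self-contained sketch of that behaviour with made-up data (illustrative only, not the repo's test suite):

```python
import pandas as pd

# Two result sets for the same addresses: the first pass found no match for key 2,
# the second pass did. The key arrives as int in one frame and str in the other,
# which is the mismatch the astype(str) cast guards against.
orig_df = pd.DataFrame({"key": [1, 2], "full_match": [True, False]})
new_df = pd.DataFrame({"key": ["2"], "full_match": [True]})

combined = pd.concat([orig_df, new_df])
combined["key"] = combined["key"].astype(str)                      # align key types
combined = combined.sort_values(["key", "full_match"], ascending=[True, False])
deduped = combined.drop_duplicates("key").sort_index()             # keep the matched row per key

print(deduped)  # one row per address, with the successful match for key 2 retained
```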