seanpedrickcase committed
Commit 4300019
1 Parent(s): 8d1cc2b

Some code rearranged. Fixed API call paths for Linux systems

Files changed (3):
  1. app.py +6 -341
  2. tools/fuzzy_match.py +6 -2
  3. tools/matcher_funcs.py +340 -37
app.py CHANGED
@@ -1,27 +1,20 @@
-# Load in packages, variables for fuzzy matching
 import os
 from datetime import datetime
 from pathlib import Path
-import time
-import copy
 import gradio as gr
-import re
-#import polars as pl
+import pandas as pd
 
-from tools.constants import *
-from tools.matcher_funcs import load_matcher_data, run_match_batch, combine_two_matches, create_match_summary
+from tools.matcher_funcs import run_matcher
 from tools.gradio import initial_data_load
 from tools.aws_functions import load_data_from_aws
-from tools.preparation import prepare_search_address_string, prepare_search_address, prepare_ref_address, remove_non_postal, check_no_number_addresses
-from tools.standardise import standardise_wrapper_func
 
 import warnings
+# Remove warnings from print statements
 warnings.filterwarnings("ignore", 'This pattern is interpreted as a regular expression')
 warnings.filterwarnings("ignore", 'Downcasting behavior')
 warnings.filterwarnings("ignore", 'A value is trying to be set on a copy of a slice from a DataFrame')
 warnings.filterwarnings("ignore")
 
-
 today = datetime.now().strftime("%d%m%Y")
 today_rev = datetime.now().strftime("%Y%m%d")
 
@@ -32,335 +25,7 @@ output_folder = base_folder/"Output/"
 diagnostics_folder = base_folder/"Diagnostics/"
 prep_folder = base_folder/"Helper functions/"
 
-def create_simple_batch_ranges(df:PandasDataFrame, ref_df:PandasDataFrame, batch_size:int, ref_batch_size:int):
-    #print("Search df batch size: ", batch_size)
-    #print("ref_df df batch size: ", ref_batch_size)
-
-    total_rows = df.shape[0]
-    ref_total_rows = ref_df.shape[0]
-
-    # Creating bottom and top limits for search data
-    search_ranges = []
-    for start in range(0, total_rows, batch_size):
-        end = min(start + batch_size - 1, total_rows - 1) # Adjusted to get the top limit
-        search_ranges.append((start, end))
-
-    # Creating bottom and top limits for reference data
-    ref_ranges = []
-    for start in range(0, ref_total_rows, ref_batch_size):
-        end = min(start + ref_batch_size - 1, ref_total_rows - 1) # Adjusted to get the top limit
-        ref_ranges.append((start, end))
-
-    # Create DataFrame with combinations of search_range and ref_range
-    result_data = []
-    for search_range in search_ranges:
-        for ref_range in ref_ranges:
-            result_data.append((search_range, ref_range))
-
-    range_df = pd.DataFrame(result_data, columns=['search_range', 'ref_range'])
-
-    return range_df
-
-
-def create_batch_ranges(df:PandasDataFrame, ref_df:PandasDataFrame, batch_size:int, ref_batch_size:int, search_postcode_col:str, ref_postcode_col:str):
-    '''
-    Create batches of address indexes for search and reference dataframes based on shortened postcodes.
-    '''
-
-    # If df sizes are smaller than the batch size limits, no need to run through everything
-    if len(df) < batch_size and len(ref_df) < ref_batch_size:
-        print("Dataframe sizes are smaller than maximum batch sizes, no need to split data.")
-        lengths_df = pd.DataFrame(data={'search_range':[df.index.tolist()], 'ref_range':[ref_df.index.tolist()], 'batch_length':len(df), 'ref_length':len(ref_df)})
-        return lengths_df
-
-    #df.index = df[search_postcode_col]
-
-    df['index'] = df.index
-    ref_df['index'] = ref_df.index
-
-    # Remove the last character of postcode
-    df['postcode_minus_last_character'] = df[search_postcode_col].str.lower().str.strip().str.replace("\s+", "", regex=True).str[:-1]
-    ref_df['postcode_minus_last_character'] = ref_df[ref_postcode_col].str.lower().str.strip().str.replace("\s+", "", regex=True).str[:-1]
-
-    unique_postcodes = df['postcode_minus_last_character'][df['postcode_minus_last_character'].str.len()>=4].unique().tolist()
-
-    df = df.set_index('postcode_minus_last_character')
-    ref_df = ref_df.set_index('postcode_minus_last_character')
-
-    df = df.sort_index()
-    ref_df = ref_df.sort_index()
-
-    #df.to_csv("batch_search_df.csv")
-
-    # Overall batch variables
-    batch_indexes = []
-    ref_indexes = []
-    batch_lengths = []
-    ref_lengths = []
-
-    # Current batch variables for loop
-    current_batch = []
-    current_ref_batch = []
-    current_batch_length = []
-    current_ref_length = []
-
-    unique_postcodes_iterator = unique_postcodes.copy()
-
-    while unique_postcodes_iterator:
-
-        unique_postcodes_loop = unique_postcodes_iterator.copy()
-
-        #print("Current loop postcodes: ", unique_postcodes_loop)
-
-        for current_postcode in unique_postcodes_loop:
-
-            if len(current_batch) >= batch_size or len(current_ref_batch) >= ref_batch_size:
-                print("Batch length reached - breaking")
-                break
-
-            try:
-                current_postcode_search_data_add = df.loc[[current_postcode]]#[df['postcode_minus_last_character'].isin(current_postcode)]
-                current_postcode_ref_data_add = ref_df.loc[[current_postcode]]#[ref_df['postcode_minus_last_character'].isin(current_postcode)]
-
-                #print(current_postcode_search_data_add)
-
-                if not current_postcode_search_data_add.empty:
-                    current_batch.extend(current_postcode_search_data_add['index'])
-
-                if not current_postcode_ref_data_add.empty:
-                    current_ref_batch.extend(current_postcode_ref_data_add['index'])
-
-            except:
-                #print("postcode not found: ", current_postcode)
-                pass
-
-            unique_postcodes_iterator.remove(current_postcode)
-
-        # Append the batch data to the master lists and reset lists
-        batch_indexes.append(current_batch)
-        ref_indexes.append(current_ref_batch)
-
-        current_batch_length = len(current_batch)
-        current_ref_length = len(current_ref_batch)
-
-        batch_lengths.append(current_batch_length)
-        ref_lengths.append(current_ref_length)
-
-        current_batch = []
-        current_ref_batch = []
-        current_batch_length = []
-        current_ref_length = []
-
-    # Create df to store lengths
-    lengths_df = pd.DataFrame(data={'search_range':batch_indexes, 'ref_range':ref_indexes, 'batch_length':batch_lengths, 'ref_length':ref_lengths})
-
-    return lengths_df
-
-
-def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame, results_data_state:PandasDataFrame, ref_data_state:PandasDataFrame, in_colnames:List[str], in_refcol:List[str], in_joincol:List[str], in_existing:List[str], in_api:str, in_api_key:str, InitMatch:MatcherClass = InitMatch, progress=gr.Progress()):
-    '''
-    Split search and reference data into batches. Loop and run through the match script for each batch of data.
-    '''
-
-    overall_tic = time.perf_counter()
-
-    # Load in initial data. This will filter to relevant addresses in the search and reference datasets that can potentially be matched, and will pull in API data if asked for.
-    InitMatch = load_matcher_data(in_text, in_file, in_ref, data_state, results_data_state, ref_data_state, in_colnames, in_refcol, in_joincol, in_existing, InitMatch, in_api, in_api_key)
-
-    if InitMatch.search_df.empty or InitMatch.ref_df.empty:
-        out_message = "Nothing to match!"
-        print(out_message)
-        return out_message, [InitMatch.results_orig_df_name, InitMatch.match_outputs_name]
-
-    # Run initial address preparation and standardisation processes
-    # Prepare address format
-
-    # Polars implementation not yet finalised
-    #InitMatch.search_df = pl.from_pandas(InitMatch.search_df)
-    #InitMatch.ref_df = pl.from_pandas(InitMatch.ref_df)
-
-    # Prepare all search addresses
-    if type(InitMatch.search_df) == str:
-        InitMatch.search_df_cleaned, InitMatch.search_df_key_field, InitMatch.search_address_cols = prepare_search_address_string(InitMatch.search_df)
-    else:
-        InitMatch.search_df_cleaned = prepare_search_address(InitMatch.search_df, InitMatch.search_address_cols, InitMatch.search_postcode_col, InitMatch.search_df_key_field)
-
-    # Remove addresses that are not postal addresses
-    InitMatch.search_df_cleaned = remove_non_postal(InitMatch.search_df_cleaned, "full_address")
-
-    # Remove addresses that have no numbers in from consideration
-    InitMatch.search_df_cleaned = check_no_number_addresses(InitMatch.search_df_cleaned, "full_address")
-
-    # Initial preparation of reference addresses
-    InitMatch.ref_df_cleaned = prepare_ref_address(InitMatch.ref_df, InitMatch.ref_address_cols, InitMatch.new_join_col)
-
-    # Sort dataframes by postcode - will allow for more efficient matching process if using multiple batches
-    #InitMatch.search_df_cleaned = InitMatch.search_df_cleaned.sort_values(by="postcode")
-    #InitMatch.ref_df_cleaned = InitMatch.ref_df_cleaned.sort_values(by="Postcode")
-
-    # Polars implementation - not finalised
-    #InitMatch.search_df_cleaned = InitMatch.search_df_cleaned.to_pandas()
-    #InitMatch.ref_df_cleaned = InitMatch.ref_df_cleaned.to_pandas()
-
-    # Standardise addresses
-    # Standardise - minimal
-
-    tic = time.perf_counter()
-    InitMatch.search_df_after_stand, InitMatch.ref_df_after_stand = standardise_wrapper_func(
-        InitMatch.search_df_cleaned.copy(),
-        InitMatch.ref_df_cleaned.copy(),
-        standardise = False,
-        filter_to_lambeth_pcodes=filter_to_lambeth_pcodes,
-        match_task="fuzzy") # InitMatch.search_df_after_stand_series, InitMatch.ref_df_after_stand_series
-
-    toc = time.perf_counter()
-    print(f"Performed the minimal standardisation step in {toc - tic:0.1f} seconds")
-
-    # Standardise - full
-    tic = time.perf_counter()
-    InitMatch.search_df_after_full_stand, InitMatch.ref_df_after_full_stand = standardise_wrapper_func(
-        InitMatch.search_df_cleaned.copy(),
-        InitMatch.ref_df_cleaned.copy(),
-        standardise = True,
-        filter_to_lambeth_pcodes=filter_to_lambeth_pcodes,
-        match_task="fuzzy") # , InitMatch.search_df_after_stand_series_full_stand, InitMatch.ref_df_after_stand_series_full_stand
-
-    toc = time.perf_counter()
-    print(f"Performed the full standardisation step in {toc - tic:0.1f} seconds")
-
-    # Determine length of search df to create batches to send through the functions.
-    #try:
-    range_df = create_batch_ranges(InitMatch.search_df_cleaned.copy(), InitMatch.ref_df_cleaned.copy(), batch_size, ref_batch_size, "postcode", "Postcode")
-    #except:
-    #    range_df = create_simple_batch_ranges(InitMatch.search_df_cleaned, InitMatch.ref_df_cleaned, batch_size, #ref_batch_size)
-
-    print("Batches to run in this session: ", range_df)
-
-    OutputMatch = copy.copy(InitMatch)
-
-    n = 0
-    number_of_batches = range_df.shape[0]
-
-    for row in progress.tqdm(range(0,len(range_df)), desc= "Running through batches", unit="batches", total=number_of_batches):
-        print("Running batch ", str(n+1))
-
-        search_range = range_df.iloc[row]['search_range']
-        ref_range = range_df.iloc[row]['ref_range']
-
-        #print("search_range: ", search_range)
-        #pd.DataFrame(search_range).to_csv("search_range.csv")
-        #print("ref_range: ", ref_range)
-
-        BatchMatch = copy.copy(InitMatch)
-
-        # Subset the search and reference dfs based on current batch ranges
-        # BatchMatch.search_df = BatchMatch.search_df.iloc[search_range[0]:search_range[1] + 1,:].reset_index(drop=True)
-        # BatchMatch.search_df_not_matched = BatchMatch.search_df.copy()
-        # BatchMatch.search_df_cleaned = BatchMatch.search_df_cleaned.iloc[search_range[0]:search_range[1] + 1,:].reset_index(drop=True)
-        # BatchMatch.ref_df = BatchMatch.ref_df.iloc[ref_range[0]:ref_range[1] + 1,:].reset_index(drop=True)
-        # BatchMatch.ref_df_cleaned = BatchMatch.ref_df_cleaned.iloc[ref_range[0]:ref_range[1] + 1,:].reset_index(drop=True)
-
-        # BatchMatch.search_df_after_stand_series = BatchMatch.search_df_after_stand_series.iloc[search_range[0]:search_range[1] + 1]
-        # BatchMatch.ref_df_after_stand_series = BatchMatch.ref_df_after_stand_series.iloc[ref_range[0]:ref_range[1] + 1]
-        # BatchMatch.search_df_after_stand_series_full_stand = BatchMatch.search_df_after_stand_series_full_stand.iloc[search_range[0]:search_range[1] + 1]
-        # BatchMatch.ref_df_after_stand_series_full_stand = BatchMatch.ref_df_after_stand_series_full_stand.iloc[ref_range[0]:ref_range[1] + 1]
-
-        # BatchMatch.search_df_after_stand = BatchMatch.search_df_after_stand.iloc[search_range[0]:search_range[1] + 1,:].reset_index(drop=True)
-        # BatchMatch.ref_df_after_stand = BatchMatch.ref_df_after_stand.iloc[ref_range[0]:ref_range[1] + 1,:].reset_index(drop=True)
-        # BatchMatch.search_df_after_full_stand = BatchMatch.search_df_after_full_stand.iloc[search_range[0]:search_range[1] + 1,:].reset_index(drop=True)
-        # BatchMatch.ref_df_after_full_stand = BatchMatch.ref_df_after_full_stand.iloc[ref_range[0]:ref_range[1] + 1,:].reset_index(drop=True)
-
-        BatchMatch.search_df = BatchMatch.search_df[BatchMatch.search_df.index.isin(search_range)].reset_index(drop=True)
-        BatchMatch.search_df_not_matched = BatchMatch.search_df.copy()
-        BatchMatch.search_df_cleaned = BatchMatch.search_df_cleaned[BatchMatch.search_df_cleaned.index.isin(search_range)].reset_index(drop=True)
-
-        BatchMatch.ref_df = BatchMatch.ref_df[BatchMatch.ref_df.index.isin(ref_range)].reset_index(drop=True)
-        BatchMatch.ref_df_cleaned = BatchMatch.ref_df_cleaned[BatchMatch.ref_df_cleaned.index.isin(ref_range)].reset_index(drop=True)
-
-        # Dataframes after standardisation process
-        BatchMatch.search_df_after_stand = BatchMatch.search_df_after_stand[BatchMatch.search_df_after_stand.index.isin(search_range)].reset_index(drop=True)
-        BatchMatch.search_df_after_full_stand = BatchMatch.search_df_after_full_stand[BatchMatch.search_df_after_full_stand.index.isin(search_range)].reset_index(drop=True)
-
-        ### Create lookup lists for fuzzy matches
-        # BatchMatch.search_df_after_stand_series = BatchMatch.search_df_after_stand.copy().set_index('postcode_search')['search_address_stand']
-        # BatchMatch.search_df_after_stand_series_full_stand = BatchMatch.search_df_after_full_stand.copy().set_index('postcode_search')['search_address_stand']
-        # BatchMatch.search_df_after_stand_series = BatchMatch.search_df_after_stand_series.sort_index()
-        # BatchMatch.search_df_after_stand_series_full_stand = BatchMatch.search_df_after_stand_series_full_stand.sort_index()
-
-        #BatchMatch.search_df_after_stand.reset_index(inplace=True, drop = True)
-        #BatchMatch.search_df_after_full_stand.reset_index(inplace=True, drop = True)
-
-        BatchMatch.ref_df_after_stand = BatchMatch.ref_df_after_stand[BatchMatch.ref_df_after_stand.index.isin(ref_range)].reset_index(drop=True)
-        BatchMatch.ref_df_after_full_stand = BatchMatch.ref_df_after_full_stand[BatchMatch.ref_df_after_full_stand.index.isin(ref_range)].reset_index(drop=True)
-
-        # BatchMatch.ref_df_after_stand_series = BatchMatch.ref_df_after_stand.copy().set_index('postcode_search')['ref_address_stand']
-        # BatchMatch.ref_df_after_stand_series_full_stand = BatchMatch.ref_df_after_full_stand.copy().set_index('postcode_search')['ref_address_stand']
-        # BatchMatch.ref_df_after_stand_series = BatchMatch.ref_df_after_stand_series.sort_index()
-        # BatchMatch.ref_df_after_stand_series_full_stand = BatchMatch.ref_df_after_stand_series_full_stand.sort_index()
-
-        # BatchMatch.ref_df_after_stand.reset_index(inplace=True, drop=True)
-        # BatchMatch.ref_df_after_full_stand.reset_index(inplace=True, drop=True)
-
-        # Match the data, unless the search or reference dataframes are empty
-        if BatchMatch.search_df.empty or BatchMatch.ref_df.empty:
-            out_message = "Nothing to match for batch: " + str(n)
-            print(out_message)
-            BatchMatch_out = BatchMatch
-            BatchMatch_out.results_on_orig_df = pd.DataFrame(data={"index":BatchMatch.search_df.index,
-                                                                   "Excluded from search":False,
-                                                                   "Matched with reference address":False})
-        else:
-            summary_of_summaries, BatchMatch_out = run_match_batch(BatchMatch, n, number_of_batches)
-
-        print("BatchMatch_out match shape: ", BatchMatch_out.results_on_orig_df.shape)
-
-        OutputMatch = combine_two_matches(OutputMatch, BatchMatch_out, "All up to and including batch " + str(n+1))
-
-        print("Output results match shape: ", OutputMatch.results_on_orig_df.shape)
-
-        n += 1
-
-    if in_api==True:
-        OutputMatch.results_on_orig_df['Matched with reference address'] = OutputMatch.results_on_orig_df['Matched with reference address'].replace({1:True, 0:False})
-        OutputMatch.results_on_orig_df['Excluded from search'] = OutputMatch.results_on_orig_df['Excluded from search'].replace('nan', False).fillna(False)
-
-    # Remove any duplicates from reference df, prioritise successful matches
-    OutputMatch.results_on_orig_df = OutputMatch.results_on_orig_df.sort_values(by=["index", "Matched with reference address"], ascending=[True,False]).drop_duplicates(subset="index")
-
-    overall_toc = time.perf_counter()
-    time_out = f"The overall match (all batches) took {overall_toc - overall_tic:0.1f} seconds"
-
-    print(OutputMatch.output_summary)
-
-    if OutputMatch.output_summary == "":
-        OutputMatch.output_summary = "No matches were found."
-
-    fuzzy_not_std_output = OutputMatch.match_results_output.copy()
-    fuzzy_not_std_output_mask = ~(fuzzy_not_std_output["match_method"].str.contains("Fuzzy match")) | (fuzzy_not_std_output["standardised_address"] == True)
-    fuzzy_not_std_output.loc[fuzzy_not_std_output_mask, "full_match"] = False
-    fuzzy_not_std_summary = create_match_summary(fuzzy_not_std_output, "Fuzzy not standardised")
-
-    fuzzy_std_output = OutputMatch.match_results_output.copy()
-    fuzzy_std_output_mask = fuzzy_std_output["match_method"].str.contains("Fuzzy match")
-    fuzzy_std_output.loc[fuzzy_std_output_mask == False, "full_match"] = False
-    fuzzy_std_summary = create_match_summary(fuzzy_std_output, "Fuzzy standardised")
-
-    nnet_std_output = OutputMatch.match_results_output.copy()
-    nnet_std_summary = create_match_summary(nnet_std_output, "Neural net standardised")
-
-    final_summary = fuzzy_not_std_summary + "\n" + fuzzy_std_summary + "\n" + nnet_std_summary + "\n" + time_out
-
-    return final_summary, [OutputMatch.results_orig_df_name, OutputMatch.match_outputs_name]
-
 # Create the gradio interface
-
 block = gr.Blocks(theme = gr.themes.Base())
 
 with block:
@@ -401,8 +66,8 @@ with block:
     in_ref = gr.File(label="Input reference addresses from file", file_count= "multiple")
 
     with gr.Accordion("Use Addressbase API instead of reference file", open = False):
-        in_api = gr.Dropdown(label="Choose API type", multiselect=False, value=None, choices=["Postcode", "UPRN"]) #choices=["Address", "Postcode", "UPRN"])
-        in_api_key = gr.Textbox(label="Addressbase API key")
+        in_api = gr.Dropdown(label="Choose API type", multiselect=False, value=None, choices=["Postcode"])#["Postcode", "UPRN"]) #choices=["Address", "Postcode", "UPRN"])
+        in_api_key = gr.Textbox(label="Addressbase API key", type='password')
 
     with gr.Accordion("Custom reference file format or join columns (i.e. not LLPG LPI format)", open = False):
        in_refcol = gr.Dropdown(choices=[], multiselect=True, label="Select columns that make up the reference address. Make sure postcode is at the end")
@@ -439,7 +104,7 @@ with block:
 #block.queue().launch(debug=True) # root_path="/address-match", debug=True, server_name="0.0.0.0",
 
 # Simple run for AWS server
-block.queue().launch(ssl_verify=False) # root_path="/address-match", debug=True, server_name="0.0.0.0", server_port=7861
+block.queue().launch(ssl_verify=False, inbrowser=True) # root_path="/address-match", debug=True, server_name="0.0.0.0", server_port=7861
 
 # Download OpenSSL from here:
 # Running on local server with https: https://discuss.huggingface.co/t/how-to-run-gradio-with-0-0-0-0-and-https/38003 or https://dev.to/rajshirolkar/fastapi-over-https-for-development-on-windows-2p7d
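With run_matcher now imported from tools.matcher_funcs, app.py reduces to UI definition plus the launch call, which this commit extends with inbrowser=True. A minimal runnable sketch of just that launch configuration (the gr.Markdown placeholder is illustrative, not from the commit):

import gradio as gr

block = gr.Blocks(theme=gr.themes.Base())
with block:
    gr.Markdown("Address matcher")  # placeholder for the real UI components

# Mirrors the commit's launch line: queue requests, skip SSL verification
# (useful behind a self-signed certificate), and open a local browser tab.
block.queue().launch(ssl_verify=False, inbrowser=True)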
tools/fuzzy_match.py CHANGED
@@ -169,7 +169,11 @@ def string_match_by_post_code_multiple(match_address_series:PandasSeries, refere
 
     return out_frame
 
-def _create_fuzzy_match_results_output(results, search_df_after_stand, ref_df_cleaned, ref_df_after_stand, fuzzy_match_limit, search_df_cleaned, search_df_key_field, new_join_col, standardise, blocker_col):
+def _create_fuzzy_match_results_output(results:PandasDataFrame, search_df_after_stand:PandasDataFrame, ref_df_cleaned:PandasDataFrame, ref_df_after_stand:PandasDataFrame, fuzzy_match_limit:int, search_df_cleaned:PandasDataFrame, search_df_key_field:str, new_join_col:str, standardise:bool, blocker_col:str):
+
+    '''
+    Take fuzzy match outputs, create shortlist dataframes, rearrange, return diagnostics and shortlist dataframes for export
+    '''
 
     ## Diagnostics
 
@@ -227,7 +231,7 @@ def _create_fuzzy_match_results_output(results, search_df_after_stand, ref_df_cl
 
     return match_results_output, diag_shortlist, diag_best_match
 
-def create_diag_shortlist(results_df, matched_col, fuzzy_match_limit, blocker_col, fuzzy_col="fuzzy_score", search_mod_address = "search_mod_address", resolve_tie_breaks=True, no_number_fuzzy_match_limit=no_number_fuzzy_match_limit):
+def create_diag_shortlist(results_df:PandasDataFrame, matched_col:str, fuzzy_match_limit:int, blocker_col:str, fuzzy_col:str="fuzzy_score", search_mod_address:str = "search_mod_address", resolve_tie_breaks:bool=True, no_number_fuzzy_match_limit:int=no_number_fuzzy_match_limit) -> PandasDataFrame:
     '''
     Create a shortlist of the best matches from a list of suggested matches
     '''
 
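The annotations added here use PandasDataFrame and PandasSeries aliases rather than importing pandas types inline. Their real definitions live elsewhere in the repo and are not part of this diff; a plausible minimal equivalent, for readers following along, is:

import pandas as pd

# Assumed stand-ins for the aliases used in the annotations above; the
# repo's own definitions are not shown in this commit.
PandasDataFrame = pd.DataFrame
PandasSeries = pd.Series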
tools/matcher_funcs.py CHANGED
@@ -24,8 +24,10 @@ run_fuzzy_match = True
 run_nnet_match = True
 run_standardise = True
 
-from tools.preparation import prepare_search_address_string, prepare_search_address, extract_street_name
+from tools.constants import *
+from tools.preparation import prepare_search_address_string, prepare_search_address, extract_street_name, prepare_ref_address, remove_non_postal, check_no_number_addresses
 from tools.fuzzy_match import string_match_by_post_code_multiple, _create_fuzzy_match_results_output, join_to_orig_df
+from tools.standardise import standardise_wrapper_func
 
 # Neural network functions
 ### Predict function for imported model
@@ -64,17 +66,17 @@ def read_file(filename:str) -> PandasDataFrame:
     elif file_type == 'parquet':
         return pd.read_parquet(filename)
 
-def get_file_name(in_name:str) -> str:
-    '''
-    Get the name of a file from a string using the re package.
-    '''
-
-    # Corrected regex pattern
-    match = re.search(r'\\(?!.*\\)(.*)', in_name)
+def get_file_name(in_name: str) -> str:
+    """Get the name of a file from a string, handling both Windows and Unix paths."""
+
+    print("in_name: ", in_name)
+    match = re.search(rf'{os.sep}(?!.*{os.sep})(.*)', in_name)
     if match:
         matched_result = match.group(1)
     else:
         matched_result = None
+
+    print("Matched result: ", matched_result)
 
     return matched_result
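This get_file_name hunk is the Linux fix named in the commit message: the old pattern hard-coded a backslash, so on Linux (separator "/") the search never matched and the file name came back as None. Building the pattern from os.sep fixes Linux, though interpolating a separator into a regex is only safe when it is not a regex metacharacter, so a fully portable variant would escape it or sidestep regex entirely. A separator-agnostic sketch (get_file_name_portable is illustrative, not part of the commit):

import re

def get_file_name_portable(in_name: str) -> str:
    """Return the last path component, whichever separator the path uses."""
    # Splitting on both separators avoids building a regex from os.sep,
    # which would insert a bare backslash into the pattern on Windows.
    return re.split(r"[\\/]", in_name)[-1]

assert get_file_name_portable(r"C:\data\addresses.csv") == "addresses.csv"
assert get_file_name_portable("/home/user/addresses.csv") == "addresses.csv"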
 
@@ -492,24 +494,22 @@ def check_match_data_filter(Matcher:MatcherClass, data_state:PandasDataFrame, re
     if not data_state.empty:
 
         Matcher.search_df = data_state
-
         Matcher.search_df['index'] = Matcher.search_df.reset_index().index
 
     else:
         Matcher.search_df = pd.DataFrame()
 
-    # If someone has just entered open text, just load this instead
+    # If a single address entered into the text box, just load this instead
     if in_text:
         Matcher.search_df, Matcher.search_df_key_field, Matcher.search_address_cols, Matcher.search_postcode_col = prepare_search_address_string(in_text)
 
-    # If two matcher files are loaded in, the algorithm will combine them together
+    # If no file loaded yet and a file has been selected
     if Matcher.search_df.empty and in_file:
         output_message, drop1, drop2, Matcher.search_df, results_data_state = initial_data_load(in_file)
 
         file_list = [string.name for string in in_file]
-        data_file_names = [string for string in file_list if "results_on_orig" not in string.lower()]
-
-        #print("Data file names: ", data_file_names)
+        data_file_names = [string for string in file_list if "results_" not in string.lower()]
+
         Matcher.file_name = get_file_name(data_file_names[0])
 
     # search_df makes column to use as index
@@ -524,20 +524,11 @@ def check_match_data_filter(Matcher:MatcherClass, data_state:PandasDataFrame, re
         Matcher.search_df = Matcher.search_df.merge(results_data_state, on = "index", how = "left")
 
     # If no join on column suggested, assume the user wants the UPRN
-    print("in_joincol: ", in_joincol)
-
     if not in_joincol:
         Matcher.new_join_col = ['UPRN']
-        #Matcher.new_join_col = Matcher.new_join_col#[0]
 
     else:
        Matcher.new_join_col = in_joincol
-        #Matcher.new_join_col = Matcher.new_join_col
-
-    # Extract the column names from the input data
-    #print("In colnames: ", in_colnames)
-
-    print("Matcher.in_joincol: ", Matcher.new_join_col)
 
     if len(in_colnames) > 1:
         Matcher.search_postcode_col = [in_colnames[-1]]
@@ -566,7 +557,6 @@ def check_match_data_filter(Matcher:MatcherClass, data_state:PandasDataFrame, re
 
     length_more_than_0 = Matcher.search_df["address_cols_joined"].str.len() > 0
 
-
     ### Filter addresses to match to postcode areas present in both search_df and ref_df_cleaned only (postcode without the last three characters). Only run if API call is false. When the API is called, relevant addresses and postcodes should be brought in by the API.
     if not in_api:
         if Matcher.filter_to_lambeth_pcodes == True:
@@ -621,7 +611,6 @@ def check_match_data_filter(Matcher:MatcherClass, data_state:PandasDataFrame, re
 
     Matcher.search_df_not_matched = Matcher.search_df
 
-
     # If this is for an API call, we need to convert the search_df address columns to one column now. This is so the API call can be made and the reference dataframe created.
     if in_api:
 
@@ -629,10 +618,14 @@ def check_match_data_filter(Matcher:MatcherClass, data_state:PandasDataFrame, re
             output_message, drop1, drop2, df, results_data_state = initial_data_load(in_file)
 
             file_list = [string.name for string in in_file]
-            data_file_names = [string for string in file_list if "results_on_orig" not in string.lower()]
+            data_file_names = [string for string in file_list if "results_" not in string.lower()]
 
             Matcher.file_name = get_file_name(data_file_names[0])
 
+            print("File list in in_api bit: ", file_list)
+            print("data_file_names in in_api bit: ", data_file_names)
+            print("Matcher.file_name in in_api bit: ", Matcher.file_name)
+
         else:
             if in_text:
                 Matcher.file_name = in_text
@@ -654,8 +647,6 @@ def check_match_data_filter(Matcher:MatcherClass, data_state:PandasDataFrame, re
 
     Matcher.search_df['full_address_postcode'] = search_df_cleaned["full_address"]
-    #Matcher.search_df = Matcher.search_df.reset_index(drop=True)
-    #Matcher.search_df.index.name = 'index'
 
     return Matcher
 
@@ -677,15 +668,11 @@ def load_matcher_data(in_text, in_file, in_ref, data_state, results_data_state,
     # If doing API calls, we need to know the search data before querying for specific addresses/postcodes
     Matcher = check_match_data_filter(Matcher, data_state, results_data_state, in_file, in_text, in_colnames, in_joincol, in_existing, in_api)
 
-
     # If an API call, ref_df data is loaded after
     if in_api:
+
         Matcher = check_ref_data_exists(Matcher, ref_data_state, in_ref, in_refcol, in_api, in_api_key, query_type=in_api)
 
-        #print("Resetting index.")
-        # API-called data will often have duplicate indexes in it - drop them to avoid conflicts down the line
-        #Matcher.ref_df = Matcher.ref_df.reset_index(drop = True)
-
     print("Shape of ref_df after filtering is: ", Matcher.ref_df.shape)
     print("Shape of search_df after filtering is: ", Matcher.search_df.shape)
@@ -697,7 +684,328 @@ def load_matcher_data(in_text, in_file, in_ref, data_state, results_data_state,
 
     return Matcher
 
+# Run whole matcher process
+def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame, results_data_state:PandasDataFrame, ref_data_state:PandasDataFrame, in_colnames:List[str], in_refcol:List[str], in_joincol:List[str], in_existing:List[str], in_api:str, in_api_key:str, InitMatch:MatcherClass = InitMatch, progress=gr.Progress()):
+    '''
+    Split search and reference data into batches. Loop and run through the match script for each batch of data.
+    '''
+
+    overall_tic = time.perf_counter()
+
+    # Load in initial data. This will filter to relevant addresses in the search and reference datasets that can potentially be matched, and will pull in API data if asked for.
+    InitMatch = load_matcher_data(in_text, in_file, in_ref, data_state, results_data_state, ref_data_state, in_colnames, in_refcol, in_joincol, in_existing, InitMatch, in_api, in_api_key)
+
+    if InitMatch.search_df.empty or InitMatch.ref_df.empty:
+        out_message = "Nothing to match!"
+        print(out_message)
+        return out_message, [InitMatch.results_orig_df_name, InitMatch.match_outputs_name]
+
+    # Run initial address preparation and standardisation processes
+    # Prepare address format
+
+    # Polars implementation not yet finalised
+    #InitMatch.search_df = pl.from_pandas(InitMatch.search_df)
+    #InitMatch.ref_df = pl.from_pandas(InitMatch.ref_df)
+
+    # Prepare all search addresses
+    if type(InitMatch.search_df) == str:
+        InitMatch.search_df_cleaned, InitMatch.search_df_key_field, InitMatch.search_address_cols = prepare_search_address_string(InitMatch.search_df)
+    else:
+        InitMatch.search_df_cleaned = prepare_search_address(InitMatch.search_df, InitMatch.search_address_cols, InitMatch.search_postcode_col, InitMatch.search_df_key_field)
+
+    # Remove addresses that are not postal addresses
+    InitMatch.search_df_cleaned = remove_non_postal(InitMatch.search_df_cleaned, "full_address")
+
+    # Remove addresses that have no numbers in from consideration
+    InitMatch.search_df_cleaned = check_no_number_addresses(InitMatch.search_df_cleaned, "full_address")
+
+    # Initial preparation of reference addresses
+    InitMatch.ref_df_cleaned = prepare_ref_address(InitMatch.ref_df, InitMatch.ref_address_cols, InitMatch.new_join_col)
+
+    # Sort dataframes by postcode - will allow for more efficient matching process if using multiple batches
+    #InitMatch.search_df_cleaned = InitMatch.search_df_cleaned.sort_values(by="postcode")
+    #InitMatch.ref_df_cleaned = InitMatch.ref_df_cleaned.sort_values(by="Postcode")
+
+    # Polars implementation - not finalised
+    #InitMatch.search_df_cleaned = InitMatch.search_df_cleaned.to_pandas()
+    #InitMatch.ref_df_cleaned = InitMatch.ref_df_cleaned.to_pandas()
+
+    # Standardise addresses
+    # Standardise - minimal
+
+    tic = time.perf_counter()
+    InitMatch.search_df_after_stand, InitMatch.ref_df_after_stand = standardise_wrapper_func(
+        InitMatch.search_df_cleaned.copy(),
+        InitMatch.ref_df_cleaned.copy(),
+        standardise = False,
+        filter_to_lambeth_pcodes=filter_to_lambeth_pcodes,
+        match_task="fuzzy") # InitMatch.search_df_after_stand_series, InitMatch.ref_df_after_stand_series
+
+    toc = time.perf_counter()
+    print(f"Performed the minimal standardisation step in {toc - tic:0.1f} seconds")
+
+    # Standardise - full
+    tic = time.perf_counter()
+    InitMatch.search_df_after_full_stand, InitMatch.ref_df_after_full_stand = standardise_wrapper_func(
+        InitMatch.search_df_cleaned.copy(),
+        InitMatch.ref_df_cleaned.copy(),
+        standardise = True,
+        filter_to_lambeth_pcodes=filter_to_lambeth_pcodes,
+        match_task="fuzzy") # , InitMatch.search_df_after_stand_series_full_stand, InitMatch.ref_df_after_stand_series_full_stand
+
+    toc = time.perf_counter()
+    print(f"Performed the full standardisation step in {toc - tic:0.1f} seconds")
+
+    # Determine length of search df to create batches to send through the functions.
+    #try:
+    range_df = create_batch_ranges(InitMatch.search_df_cleaned.copy(), InitMatch.ref_df_cleaned.copy(), batch_size, ref_batch_size, "postcode", "Postcode")
+    #except:
+    #    range_df = create_simple_batch_ranges(InitMatch.search_df_cleaned, InitMatch.ref_df_cleaned, batch_size, #ref_batch_size)
+
+    print("Batches to run in this session: ", range_df)
+
+    OutputMatch = copy.copy(InitMatch)
+
+    n = 0
+    number_of_batches = range_df.shape[0]
+
+    for row in progress.tqdm(range(0,len(range_df)), desc= "Running through batches", unit="batches", total=number_of_batches):
+        print("Running batch ", str(n+1))
+
+        search_range = range_df.iloc[row]['search_range']
+        ref_range = range_df.iloc[row]['ref_range']
+
+        #print("search_range: ", search_range)
+        #pd.DataFrame(search_range).to_csv("search_range.csv")
+        #print("ref_range: ", ref_range)
+
+        BatchMatch = copy.copy(InitMatch)
+
+        # Subset the search and reference dfs based on current batch ranges
+        # BatchMatch.search_df = BatchMatch.search_df.iloc[search_range[0]:search_range[1] + 1,:].reset_index(drop=True)
+        # BatchMatch.search_df_not_matched = BatchMatch.search_df.copy()
+        # BatchMatch.search_df_cleaned = BatchMatch.search_df_cleaned.iloc[search_range[0]:search_range[1] + 1,:].reset_index(drop=True)
+        # BatchMatch.ref_df = BatchMatch.ref_df.iloc[ref_range[0]:ref_range[1] + 1,:].reset_index(drop=True)
+        # BatchMatch.ref_df_cleaned = BatchMatch.ref_df_cleaned.iloc[ref_range[0]:ref_range[1] + 1,:].reset_index(drop=True)
+
+        # BatchMatch.search_df_after_stand_series = BatchMatch.search_df_after_stand_series.iloc[search_range[0]:search_range[1] + 1]
+        # BatchMatch.ref_df_after_stand_series = BatchMatch.ref_df_after_stand_series.iloc[ref_range[0]:ref_range[1] + 1]
+        # BatchMatch.search_df_after_stand_series_full_stand = BatchMatch.search_df_after_stand_series_full_stand.iloc[search_range[0]:search_range[1] + 1]
+        # BatchMatch.ref_df_after_stand_series_full_stand = BatchMatch.ref_df_after_stand_series_full_stand.iloc[ref_range[0]:ref_range[1] + 1]
+
+        # BatchMatch.search_df_after_stand = BatchMatch.search_df_after_stand.iloc[search_range[0]:search_range[1] + 1,:].reset_index(drop=True)
+        # BatchMatch.ref_df_after_stand = BatchMatch.ref_df_after_stand.iloc[ref_range[0]:ref_range[1] + 1,:].reset_index(drop=True)
+        # BatchMatch.search_df_after_full_stand = BatchMatch.search_df_after_full_stand.iloc[search_range[0]:search_range[1] + 1,:].reset_index(drop=True)
+        # BatchMatch.ref_df_after_full_stand = BatchMatch.ref_df_after_full_stand.iloc[ref_range[0]:ref_range[1] + 1,:].reset_index(drop=True)
+
+        BatchMatch.search_df = BatchMatch.search_df[BatchMatch.search_df.index.isin(search_range)].reset_index(drop=True)
+        BatchMatch.search_df_not_matched = BatchMatch.search_df.copy()
+        BatchMatch.search_df_cleaned = BatchMatch.search_df_cleaned[BatchMatch.search_df_cleaned.index.isin(search_range)].reset_index(drop=True)
+
+        BatchMatch.ref_df = BatchMatch.ref_df[BatchMatch.ref_df.index.isin(ref_range)].reset_index(drop=True)
+        BatchMatch.ref_df_cleaned = BatchMatch.ref_df_cleaned[BatchMatch.ref_df_cleaned.index.isin(ref_range)].reset_index(drop=True)
+
+        # Dataframes after standardisation process
+        BatchMatch.search_df_after_stand = BatchMatch.search_df_after_stand[BatchMatch.search_df_after_stand.index.isin(search_range)].reset_index(drop=True)
+        BatchMatch.search_df_after_full_stand = BatchMatch.search_df_after_full_stand[BatchMatch.search_df_after_full_stand.index.isin(search_range)].reset_index(drop=True)
+
+        ### Create lookup lists for fuzzy matches
+        # BatchMatch.search_df_after_stand_series = BatchMatch.search_df_after_stand.copy().set_index('postcode_search')['search_address_stand']
+        # BatchMatch.search_df_after_stand_series_full_stand = BatchMatch.search_df_after_full_stand.copy().set_index('postcode_search')['search_address_stand']
+        # BatchMatch.search_df_after_stand_series = BatchMatch.search_df_after_stand_series.sort_index()
+        # BatchMatch.search_df_after_stand_series_full_stand = BatchMatch.search_df_after_stand_series_full_stand.sort_index()
+
+        #BatchMatch.search_df_after_stand.reset_index(inplace=True, drop = True)
+        #BatchMatch.search_df_after_full_stand.reset_index(inplace=True, drop = True)
+
+        BatchMatch.ref_df_after_stand = BatchMatch.ref_df_after_stand[BatchMatch.ref_df_after_stand.index.isin(ref_range)].reset_index(drop=True)
+        BatchMatch.ref_df_after_full_stand = BatchMatch.ref_df_after_full_stand[BatchMatch.ref_df_after_full_stand.index.isin(ref_range)].reset_index(drop=True)
+
+        # BatchMatch.ref_df_after_stand_series = BatchMatch.ref_df_after_stand.copy().set_index('postcode_search')['ref_address_stand']
+        # BatchMatch.ref_df_after_stand_series_full_stand = BatchMatch.ref_df_after_full_stand.copy().set_index('postcode_search')['ref_address_stand']
+        # BatchMatch.ref_df_after_stand_series = BatchMatch.ref_df_after_stand_series.sort_index()
+        # BatchMatch.ref_df_after_stand_series_full_stand = BatchMatch.ref_df_after_stand_series_full_stand.sort_index()
+
+        # BatchMatch.ref_df_after_stand.reset_index(inplace=True, drop=True)
+        # BatchMatch.ref_df_after_full_stand.reset_index(inplace=True, drop=True)
+
+        # Match the data, unless the search or reference dataframes are empty
+        if BatchMatch.search_df.empty or BatchMatch.ref_df.empty:
+            out_message = "Nothing to match for batch: " + str(n)
+            print(out_message)
+            BatchMatch_out = BatchMatch
+            BatchMatch_out.results_on_orig_df = pd.DataFrame(data={"index":BatchMatch.search_df.index,
+                                                                   "Excluded from search":False,
+                                                                   "Matched with reference address":False})
+        else:
+            summary_of_summaries, BatchMatch_out = run_match_batch(BatchMatch, n, number_of_batches)
+
+        OutputMatch = combine_two_matches(OutputMatch, BatchMatch_out, "All up to and including batch " + str(n+1))
+
+        n += 1
+
+    if in_api==True:
+        OutputMatch.results_on_orig_df['Matched with reference address'] = OutputMatch.results_on_orig_df['Matched with reference address'].replace({1:True, 0:False})
+        OutputMatch.results_on_orig_df['Excluded from search'] = OutputMatch.results_on_orig_df['Excluded from search'].replace('nan', False).fillna(False)
+
+    # Remove any duplicates from reference df, prioritise successful matches
+    OutputMatch.results_on_orig_df = OutputMatch.results_on_orig_df.sort_values(by=["index", "Matched with reference address"], ascending=[True,False]).drop_duplicates(subset="index")
+
+    overall_toc = time.perf_counter()
+    time_out = f"The overall match (all batches) took {overall_toc - overall_tic:0.1f} seconds"
+
+    print(OutputMatch.output_summary)
+
+    if OutputMatch.output_summary == "":
+        OutputMatch.output_summary = "No matches were found."
+
+    fuzzy_not_std_output = OutputMatch.match_results_output.copy()
+    fuzzy_not_std_output_mask = ~(fuzzy_not_std_output["match_method"].str.contains("Fuzzy match")) | (fuzzy_not_std_output["standardised_address"] == True)
+    fuzzy_not_std_output.loc[fuzzy_not_std_output_mask, "full_match"] = False
+    fuzzy_not_std_summary = create_match_summary(fuzzy_not_std_output, "Fuzzy not standardised")
+
+    fuzzy_std_output = OutputMatch.match_results_output.copy()
+    fuzzy_std_output_mask = fuzzy_std_output["match_method"].str.contains("Fuzzy match")
+    fuzzy_std_output.loc[fuzzy_std_output_mask == False, "full_match"] = False
+    fuzzy_std_summary = create_match_summary(fuzzy_std_output, "Fuzzy standardised")
+
+    nnet_std_output = OutputMatch.match_results_output.copy()
+    nnet_std_summary = create_match_summary(nnet_std_output, "Neural net standardised")
+
+    final_summary = fuzzy_not_std_summary + "\n" + fuzzy_std_summary + "\n" + nnet_std_summary + "\n" + time_out
+
+    return final_summary, [OutputMatch.results_orig_df_name, OutputMatch.match_outputs_name]
+
 # Run a match run for a single batch
+def create_simple_batch_ranges(df:PandasDataFrame, ref_df:PandasDataFrame, batch_size:int, ref_batch_size:int):
+    #print("Search df batch size: ", batch_size)
+    #print("ref_df df batch size: ", ref_batch_size)
+
+    total_rows = df.shape[0]
+    ref_total_rows = ref_df.shape[0]
+
+    # Creating bottom and top limits for search data
+    search_ranges = []
+    for start in range(0, total_rows, batch_size):
+        end = min(start + batch_size - 1, total_rows - 1) # Adjusted to get the top limit
+        search_ranges.append((start, end))
+
+    # Creating bottom and top limits for reference data
+    ref_ranges = []
+    for start in range(0, ref_total_rows, ref_batch_size):
+        end = min(start + ref_batch_size - 1, ref_total_rows - 1) # Adjusted to get the top limit
+        ref_ranges.append((start, end))
+
+    # Create DataFrame with combinations of search_range and ref_range
+    result_data = []
+    for search_range in search_ranges:
+        for ref_range in ref_ranges:
+            result_data.append((search_range, ref_range))
+
+    range_df = pd.DataFrame(result_data, columns=['search_range', 'ref_range'])
+
+    return range_df
+
+def create_batch_ranges(df:PandasDataFrame, ref_df:PandasDataFrame, batch_size:int, ref_batch_size:int, search_postcode_col:str, ref_postcode_col:str):
+    '''
+    Create batches of address indexes for search and reference dataframes based on shortened postcodes.
+    '''
+
+    # If df sizes are smaller than the batch size limits, no need to run through everything
+    if len(df) < batch_size and len(ref_df) < ref_batch_size:
+        print("Dataframe sizes are smaller than maximum batch sizes, no need to split data.")
+        lengths_df = pd.DataFrame(data={'search_range':[df.index.tolist()], 'ref_range':[ref_df.index.tolist()], 'batch_length':len(df), 'ref_length':len(ref_df)})
+        return lengths_df
+
+    #df.index = df[search_postcode_col]
+
+    df['index'] = df.index
+    ref_df['index'] = ref_df.index
+
+    # Remove the last character of postcode
+    df['postcode_minus_last_character'] = df[search_postcode_col].str.lower().str.strip().str.replace("\s+", "", regex=True).str[:-1]
+    ref_df['postcode_minus_last_character'] = ref_df[ref_postcode_col].str.lower().str.strip().str.replace("\s+", "", regex=True).str[:-1]
+
+    unique_postcodes = df['postcode_minus_last_character'][df['postcode_minus_last_character'].str.len()>=4].unique().tolist()
+
+    df = df.set_index('postcode_minus_last_character')
+    ref_df = ref_df.set_index('postcode_minus_last_character')
+
+    df = df.sort_index()
+    ref_df = ref_df.sort_index()
+
+    #df.to_csv("batch_search_df.csv")
+
+    # Overall batch variables
+    batch_indexes = []
+    ref_indexes = []
+    batch_lengths = []
+    ref_lengths = []
+
+    # Current batch variables for loop
+    current_batch = []
+    current_ref_batch = []
+    current_batch_length = []
+    current_ref_length = []
+
+    unique_postcodes_iterator = unique_postcodes.copy()
+
+    while unique_postcodes_iterator:
+
+        unique_postcodes_loop = unique_postcodes_iterator.copy()
+
+        #print("Current loop postcodes: ", unique_postcodes_loop)
+
+        for current_postcode in unique_postcodes_loop:
+
+            if len(current_batch) >= batch_size or len(current_ref_batch) >= ref_batch_size:
+                print("Batch length reached - breaking")
+                break
+
+            try:
+                current_postcode_search_data_add = df.loc[[current_postcode]]#[df['postcode_minus_last_character'].isin(current_postcode)]
+                current_postcode_ref_data_add = ref_df.loc[[current_postcode]]#[ref_df['postcode_minus_last_character'].isin(current_postcode)]
+
+                #print(current_postcode_search_data_add)
+
+                if not current_postcode_search_data_add.empty:
+                    current_batch.extend(current_postcode_search_data_add['index'])
+
+                if not current_postcode_ref_data_add.empty:
+                    current_ref_batch.extend(current_postcode_ref_data_add['index'])
+
+            except:
+                #print("postcode not found: ", current_postcode)
+                pass
+
+            unique_postcodes_iterator.remove(current_postcode)
+
+        # Append the batch data to the master lists and reset lists
+        batch_indexes.append(current_batch)
+        ref_indexes.append(current_ref_batch)
+
+        current_batch_length = len(current_batch)
+        current_ref_length = len(current_ref_batch)
+
+        batch_lengths.append(current_batch_length)
+        ref_lengths.append(current_ref_length)
+
+        current_batch = []
+        current_ref_batch = []
+        current_batch_length = []
+        current_ref_length = []
+
+    # Create df to store lengths
+    lengths_df = pd.DataFrame(data={'search_range':batch_indexes, 'ref_range':ref_indexes, 'batch_length':batch_lengths, 'ref_length':ref_lengths})
+
+    return lengths_df
+
 def run_match_batch(InitialMatch:MatcherClass, batch_n:int, total_batches:int, progress=gr.Progress()):
     '''
     Over-arching function for running a single batch of data through the full matching process. Calls fuzzy matching, then neural network match functions in order. It outputs a summary of the match, and a MatcherClass with the matched data included.
@@ -721,12 +1029,7 @@ def run_match_batch(InitialMatch:MatcherClass, batch_n:int, total_batches:int, p
         print(message)
         return message, InitialMatch
 
-    print("FuzzyNotStdMatch shape before combine two matches: ", FuzzyNotStdMatch.results_on_orig_df.shape)
-
    FuzzyNotStdMatch = combine_two_matches(InitialMatch, FuzzyNotStdMatch, df_name)
-
-    print("InitialMatch shape: ", InitialMatch.results_on_orig_df.shape)
-    print("FuzzyNotStdMatch shape: ", FuzzyNotStdMatch.results_on_orig_df.shape)
 
    if (len(FuzzyNotStdMatch.search_df_not_matched) == 0) | (sum(FuzzyNotStdMatch.match_results_output[FuzzyNotStdMatch.match_results_output['full_match']==False]['fuzzy_score'])==0):
        overall_toc = time.perf_counter()
 
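To see what the relocated batching helpers produce, here is a small worked example against create_simple_batch_ranges (assuming the repo root is on PYTHONPATH; create_batch_ranges does the same job but groups rows by postcode minus its last character instead of by position):

import pandas as pd
from tools.matcher_funcs import create_simple_batch_ranges

df = pd.DataFrame({"address": ["a", "b", "c", "d", "e"]})  # 5 search rows
ref_df = pd.DataFrame({"address": ["w", "x", "y", "z"]})   # 4 reference rows

ranges = create_simple_batch_ranges(df, ref_df, batch_size=2, ref_batch_size=3)
print(ranges)
# Search ranges (0, 1), (2, 3), (4, 4) crossed with reference ranges
# (0, 2), (3, 3) give 3 x 2 = 6 (search_range, ref_range) batch pairs.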