seanpedrickcase committed
Commit 8c90944
1 Parent(s): 86f6252

Allowed for custom output folder. Upgraded Gradio version

AddressMatcher_0.1_f.spec ADDED
@@ -0,0 +1,52 @@
+ # -*- mode: python ; coding: utf-8 -*-
+ from PyInstaller.utils.hooks import collect_data_files
+
+ datas = []
+ datas += collect_data_files('gradio_client')
+ datas += collect_data_files('gradio')
+
+
+ a = Analysis(
+     ['app.py'],
+     pathex=[],
+     binaries=[],
+     datas=datas,
+     hiddenimports=['pyarrow.vendored.version'],
+     hookspath=['build_deps\\'],
+     hooksconfig={},
+     runtime_hooks=[],
+     excludes=[],
+     noarchive=False,
+     optimize=0,
+     module_collection_mode={
+         'gradio': 'py',  # Collect gradio package as source .py files
+     }
+ )
+ pyz = PYZ(a.pure)
+
+ exe = EXE(
+     pyz,
+     a.scripts,
+     [],
+     exclude_binaries=True,
+     name='AddressMatcher_0.1_f',
+     debug=False,
+     bootloader_ignore_signals=False,
+     strip=False,
+     upx=True,
+     console=True,
+     disable_windowed_traceback=False,
+     argv_emulation=False,
+     target_arch=None,
+     codesign_identity=None,
+     entitlements_file=None,
+ )
+ coll = COLLECT(
+     exe,
+     a.binaries,
+     a.datas,
+     strip=False,
+     upx=True,
+     upx_exclude=[],
+     name='AddressMatcher_0.1_f',
+ )
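
For orientation, collect_data_files is part of PyInstaller's public hook API and returns a list of (source, destination) tuples for a package's non-code files; that list is what feeds the datas argument of Analysis above. A minimal sketch for inspecting what will be bundled (illustrative only):

    from PyInstaller.utils.hooks import collect_data_files

    # Each tuple maps an absolute source path to a destination folder inside the bundle
    datas = collect_data_files('gradio')
    for src, dest in datas[:5]:  # peek at the first few entries
        print(src, '->', dest)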
Dockerfile CHANGED
@@ -6,7 +6,7 @@ COPY requirements.txt .
 
 RUN pip install --no-cache-dir -r requirements.txt
 
- RUN pip install --no-cache-dir gradio==4.31.0
+ RUN pip install --no-cache-dir gradio==4.32.2
 
 # Set up a new user named "user" with user ID 1000
 RUN useradd -m -u 1000 user
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🌍
 colorFrom: purple
 colorTo: gray
 sdk: gradio
- sdk_version: 4.31.0
+ sdk_version: 4.32.2
 app_file: app.py
 pinned: false
 license: apache-2.0
app.py CHANGED
@@ -7,6 +7,7 @@ import pandas as pd
 from tools.matcher_funcs import run_matcher
 from tools.gradio import initial_data_load, ensure_output_folder_exists
 from tools.aws_functions import load_data_from_aws
+ from tools.constants import output_folder
 
 import warnings
 # Remove warnings from print statements
@@ -20,7 +21,9 @@ today_rev = datetime.now().strftime("%Y%m%d")
 
 # Base folder is where the code file is stored
 base_folder = Path(os.getcwd())
- output_folder = "output/"
+ # output_folder = "output/" # This is now defined in constants
+
+
 
 ensure_output_folder_exists(output_folder)
 
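
A hedged usage sketch: because output_folder is now resolved inside tools.constants at import time, a custom location can be chosen by setting the environment variable before that module is first imported (the path below is illustrative):

    import os
    os.environ['GRADIO_OUTPUT_FOLDER'] = '/tmp/output/'   # hypothetical custom folder

    from tools.constants import output_folder             # picks up '/tmp/output/'
    from tools.gradio import ensure_output_folder_exists

    ensure_output_folder_exists(output_folder)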
how_to_create_exe_dist.txt CHANGED
@@ -16,6 +16,8 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
 
 a) In command line: pyi-makespec --additional-hooks-dir="build_deps\\" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --onefile --name AddressMatcher_0.1 app.py
 
+ pyi-makespec --additional-hooks-dir="build_deps\\" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --name AddressMatcher_0.1_f app.py
+
 b) Open the created spec file in Notepad. Add the following to the end of the Analysis section then save:
 
 a = Analysis(
@@ -27,6 +29,8 @@ a = Analysis(
 
 c) Back in command line, run this: pyinstaller --clean --noconfirm AddressMatcher_0.1.spec
 
+ pyinstaller --clean --noconfirm AddressMatcher_0.1_f.spec
+
 
 9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\data_text_search').
 
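
Note: the new pyi-makespec command drops the --onefile flag, which matches the COLLECT(...) block in the added AddressMatcher_0.1_f.spec; the result is a one-folder build rather than a single-file executable.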
tools/addressbase_api_funcs.py CHANGED
@@ -156,9 +156,6 @@ def places_api_query(query, api_key, query_type):
         print("No API key provided.")
         return pd.DataFrame() # Return blank dataframe
 
-     #print('RESPONSE:', concat_results)
-
-
     # Convert 'results' to DataFrame
 
     # Check if 'LPI' sub-branch exists in the JSON response
@@ -177,18 +174,7 @@ def places_api_query(query, api_key, query_type):
     if isinstance(df, pd.Series):
         print("This is a series!")
         df = df.to_frame().T # Convert the Series to a DataFrame with a single row
-     # if isinstance(df, pd.DataFrame):
-     #     print("This is a dataframe!")
-     # else:
-     #     print("This is not a dataframe!")
-     #     return pd.DataFrame() # Return blank dataframe
-
 
-     print(df)
-     #print(df.columns)
-     #df.to_csv(query + ".csv")
-
-
 
     overall_toc = time.perf_counter()
     time_out = f"The API call took {overall_toc - overall_tic:0.1f} seconds"
tools/aws_functions.py CHANGED
@@ -13,16 +13,6 @@ except Exception as e:
     bucket_name = ''
     print(e)
 
- # sts = session.client("sts")
- # Create a Session with the IAM role ARN
- # aws_role = os.environ['AWS_ROLE_DATA_TEXT_SEARCH']
- # response = sts.assume_role(
- #     RoleArn=aws_role,
- #     RoleSessionName="ecs-test-session"
- # )
- # print(response)
-
-
 def get_assumed_role_info():
     sts = boto3.client('sts', region_name='eu-west-2', endpoint_url='https://sts.eu-west-2.amazonaws.com')
     response = sts.get_caller_identity()
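
For reference, the surviving helper queries STS for the current caller; the removed block was a commented-out sts.assume_role experiment. A minimal sketch of the call that remains (region and endpoint as in the diff):

    import boto3

    sts = boto3.client('sts', region_name='eu-west-2',
                       endpoint_url='https://sts.eu-west-2.amazonaws.com')
    identity = sts.get_caller_identity()
    print(identity['Arn'])  # ARN of the current user or assumed role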
tools/constants.py CHANGED
@@ -11,6 +11,24 @@ from .pytorch_models import *
 PandasDataFrame = Type[pd.DataFrame]
 PandasSeries = Type[pd.Series]
 
+ def get_or_create_env_var(var_name, default_value):
+     # Get the environment variable if it exists
+     value = os.environ.get(var_name)
+
+     # If it doesn't exist, set it to the default value
+     if value is None:
+         os.environ[var_name] = default_value
+         value = default_value
+
+     return value
+
+ # Retrieving or setting output folder
+ env_var_name = 'GRADIO_OUTPUT_FOLDER'
+ default_value = 'output/'
+
+ output_folder = get_or_create_env_var(env_var_name, default_value)
+ print(f'The value of {env_var_name} is {output_folder}')
+
 # +
 ''' Fuzzywuzzy/Rapidfuzz scorer to use. Options are: ratio, partial_ratio, token_sort_ratio, partial_token_sort_ratio,
 token_set_ratio, partial_token_set_ratio, QRatio, UQRatio, WRatio (default), UWRatio
@@ -18,17 +36,11 @@ details here: https://stackoverflow.com/questions/31806695/when-to-use-which-fuz
 
 fuzzy_scorer_used = "token_set_ratio"
 
- # +
 fuzzy_match_limit = 85
-
 fuzzy_search_addr_limit = 20
-
 filter_to_lambeth_pcodes= True
- # -
-
 standardise = False
 
- # +
 if standardise == True:
     std = "_std"
 if standardise == False:
@@ -40,8 +52,7 @@ suffix_used = dataset_name + "_" + fuzzy_scorer_used
 
 # https://stackoverflow.com/questions/59221557/tensorflow-v2-replacement-for-tf-contrib-predictor-from-saved-model
 
- ROOT_DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), '..'))
- print(ROOT_DIR)
+
 
 # Uncomment these lines for the tensorflow model
 #model_type = "tf"
@@ -66,30 +77,32 @@ device = "cpu"
 global labels_list
 labels_list = []
 
+ ROOT_DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), '..'))
+
+ # If in a non-standard location (e.g. on AWS Lambda Function URL, then save model to tmp drive)
+ if output_folder == "output/":
+     out_model_dir = ROOT_DIR
+     print(out_model_dir)
+ else:
+     out_model_dir = output_folder[:-1]
+     print(out_model_dir)
+
 model_dir_name = os.path.join(ROOT_DIR, "nnet_model" , model_stub , model_version)
- print(model_dir_name)
 
 model_path = os.path.join(model_dir_name, "saved_model.zip")
- print("model path: ")
- print(model_path)
+ print("Model zip path: ", model_path)
 
 if os.path.exists(model_path):
 
     os.environ['CUDA_VISIBLE_DEVICES'] = '-1' # Better to go without GPU to avoid 'out of memory' issues
     device = "cpu"
-
-
-
+
     ## The labels_list object defines the structure of the prediction outputs. It must be the same as what the model was originally trained on
-
-
-
+
     ''' Load pre-trained model '''
 
-
-
     with zipfile.ZipFile(model_path,"r") as zip_ref:
-         zip_ref.extractall(model_dir_name)
+         zip_ref.extractall(out_model_dir)
 
     # if model_stub == "addr_model_out_lon":
 
@@ -143,16 +156,15 @@ if os.path.exists(model_path):
     'Postcode', # 14
     'IGNORE'
     ]
-
-     #labels_list.to_csv("labels_list.csv", index = None)
+
 
     if (model_type == "transformer") | (model_type == "gru") | (model_type == "lstm") :
         # Load vocab and word_to_index
-         with open(model_dir_name + "vocab.txt", "r") as f:
+         with open(out_model_dir + "/vocab.txt", "r") as f:
             vocab = eval(f.read())
-         with open(model_dir_name + "/word_to_index.txt", "r") as f:
+         with open(out_model_dir + "/word_to_index.txt", "r") as f:
             word_to_index = eval(f.read())
-         with open(model_dir_name + "/cat_to_idx.txt", "r") as f:
+         with open(out_model_dir + "/cat_to_idx.txt", "r") as f:
             cat_to_idx = eval(f.read())
 
     VOCAB_SIZE = len(word_to_index)
@@ -180,8 +192,12 @@ if os.path.exists(model_path):
         exported_model = LSTMTextClassifier(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, DROPOUT, PAD_TOKEN)
 
 
-         exported_model.load_state_dict(torch.load(model_dir_name + "output_model_" + str(data_sample_size) +\
-             "_" + str(N_EPOCHS) + "_" + model_type + ".pth", map_location=torch.device('cpu')))
+         out_model_file_name = "output_model_" + str(data_sample_size) +\
+             "_" + str(N_EPOCHS) + "_" + model_type + ".pth"
+
+         out_model_path = os.path.join(out_model_dir, out_model_file_name)
+         print("Model location: ", out_model_path)
+         exported_model.load_state_dict(torch.load(out_model_path, map_location=torch.device('cpu')))
         exported_model.eval()
 
         device='cpu'
@@ -196,13 +212,7 @@ if os.path.exists(model_path):
 
 else: exported_model = []
 
- #if exported_model:
- #    exported_model = exported_model
- #else: exported_model = []
-
-
-
- # +
+ ### ADDRESS MATCHING FUNCTIONS
 # Address matcher will try to match <batch_size> records in one go to avoid exceeding memory limits.
 batch_size = 10000
 ref_batch_size = 150000
@@ -215,7 +225,6 @@ ref_batch_size = 150000
 
 Comparison of some of the Jellyfish string comparison methods: https://manpages.debian.org/testing/python-jellyfish-doc/jellyfish.3.en.html '''
 
-
 fuzzy_method = "jarowinkler"
 
 # Required overall match score for all columns to count as a match
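
A quick sketch of how the new get_or_create_env_var helper behaves (the function body matches the diff above; the folder values are illustrative):

    import os

    def get_or_create_env_var(var_name, default_value):
        value = os.environ.get(var_name)
        if value is None:
            os.environ[var_name] = default_value
            value = default_value
        return value

    os.environ.pop('GRADIO_OUTPUT_FOLDER', None)                      # unset: the default is used and stored
    print(get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/'))   # -> output/

    os.environ['GRADIO_OUTPUT_FOLDER'] = '/tmp/output/'               # set: the existing value wins
    print(get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/'))   # -> /tmp/output/

Note that the out_model_dir branch above strips the trailing slash with output_folder[:-1], so a custom folder is expected to end in '/'.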
tools/gradio.py CHANGED
@@ -60,9 +60,9 @@ def ensure_output_folder_exists(output_folder):
     if not os.path.exists(folder_name):
         # Create the folder if it doesn't exist
         os.makedirs(folder_name)
-         print(f"Created the output folder.")
+         print(f"Created the output folder:", folder_name)
     else:
-         print(f"The output folder already exists.")
+         print(f"The output folder already exists:", folder_name)
 
 def dummy_function(in_colnames):
     """
@@ -72,4 +72,5 @@ def dummy_function(in_colnames):
 
 
 def clear_inputs(in_file, in_ref, in_text):
-     return gr.File.update(value=[]), gr.File.update(value=[]), gr.Textbox.update(value='')
+     return gr.File(value=[]), gr.File(value=[]), gr.Textbox(value='')
+
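
Context for the clear_inputs change: Gradio 4.x removed the per-component .update() methods, so event handlers return component instances (or gr.update(...)) to set new values. A minimal sketch of the pattern, assuming a simple Blocks app:

    import gradio as gr

    def clear_inputs():
        # Returning fresh component instances resets their values in Gradio 4.x
        return gr.File(value=[]), gr.Textbox(value='')

    with gr.Blocks() as demo:
        in_file = gr.File()
        in_text = gr.Textbox()
        clear_btn = gr.Button("Clear")
        clear_btn.click(clear_inputs, inputs=None, outputs=[in_file, in_text])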
tools/matcher_funcs.py CHANGED
@@ -169,7 +169,7 @@ def run_all_api_calls(in_api_key:str, Matcher:MatcherClass, query_type:str, prog
     if (i + 1) % 500 == 0:
         print("Saving api call checkpoint for query:", str(i + 1))
 
-         pd.concat(loop_list).to_parquet(api_ref_save_loc + ".parquet", index=False)
+         pd.concat(loop_list).to_parquet(output_folder + api_ref_save_loc + ".parquet", index=False)
 
     return loop_list
 
@@ -351,8 +351,8 @@ def run_all_api_calls(in_api_key:str, Matcher:MatcherClass, query_type:str, prog
 
     if save_file:
         print("Saving reference file to: " + api_ref_save_loc[:-5] + ".parquet")
-         Matcher.ref_df.to_parquet(api_ref_save_loc + ".parquet", index=False) # Save checkpoint as well
-         Matcher.ref_df.to_parquet(api_ref_save_loc[:-5] + ".parquet", index=False)
+         Matcher.ref_df.to_parquet(output_folder + api_ref_save_loc + ".parquet", index=False) # Save checkpoint as well
+         Matcher.ref_df.to_parquet(output_folder + api_ref_save_loc[:-5] + ".parquet", index=False)
 
     if Matcher.ref_df.empty:
         print ("No reference data found with API")
@@ -676,8 +676,8 @@ def load_matcher_data(in_text, in_file, in_ref, data_state, results_data_state,
     print("Shape of ref_df after filtering is: ", Matcher.ref_df.shape)
     print("Shape of search_df after filtering is: ", Matcher.search_df.shape)
 
-     Matcher.match_outputs_name = "output/diagnostics_initial_" + today_rev + ".csv"
-     Matcher.results_orig_df_name = "output/results_initial_" + today_rev + ".csv"
+     Matcher.match_outputs_name = output_folder + "diagnostics_initial_" + today_rev + ".csv"
+     Matcher.results_orig_df_name = output_folder + "results_initial_" + today_rev + ".csv"
 
     Matcher.match_results_output.to_csv(Matcher.match_outputs_name, index = None)
     Matcher.results_on_orig_df.to_csv(Matcher.results_orig_df_name, index = None)
@@ -724,10 +724,6 @@ def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame
     InitMatch.ref_df_cleaned = prepare_ref_address(InitMatch.ref_df, InitMatch.ref_address_cols, InitMatch.new_join_col)
 
 
-     # Sort dataframes by postcode - will allow for more efficient matching process if using multiple batches
-     #InitMatch.search_df_cleaned = InitMatch.search_df_cleaned.sort_values(by="postcode")
-     #InitMatch.ref_df_cleaned = InitMatch.ref_df_cleaned.sort_values(by="Postcode")
-
     # Polars implementation - not finalised
     #InitMatch.search_df_cleaned = InitMatch.search_df_cleaned.to_pandas()
     #InitMatch.ref_df_cleaned = InitMatch.ref_df_cleaned.to_pandas()
@@ -777,31 +773,10 @@ def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame
 
         search_range = range_df.iloc[row]['search_range']
         ref_range = range_df.iloc[row]['ref_range']
-
-         #print("search_range: ", search_range)
-         #pd.DataFrame(search_range).to_csv("search_range.csv")
-         #print("ref_range: ", ref_range)
 
         BatchMatch = copy.copy(InitMatch)
 
         # Subset the search and reference dfs based on current batch ranges
-         # BatchMatch.search_df = BatchMatch.search_df.iloc[search_range[0]:search_range[1] + 1,:].reset_index(drop=True)
-         # BatchMatch.search_df_not_matched = BatchMatch.search_df.copy()
-         # BatchMatch.search_df_cleaned = BatchMatch.search_df_cleaned.iloc[search_range[0]:search_range[1] + 1,:].reset_index(drop=True)
-         # BatchMatch.ref_df = BatchMatch.ref_df.iloc[ref_range[0]:ref_range[1] + 1,:].reset_index(drop=True)
-         # BatchMatch.ref_df_cleaned = BatchMatch.ref_df_cleaned.iloc[ref_range[0]:ref_range[1] + 1,:].reset_index(drop=True)
-
-
-         # BatchMatch.search_df_after_stand_series = BatchMatch.search_df_after_stand_series.iloc[search_range[0]:search_range[1] + 1]
-         # BatchMatch.ref_df_after_stand_series = BatchMatch.ref_df_after_stand_series.iloc[ref_range[0]:ref_range[1] + 1]
-         # BatchMatch.search_df_after_stand_series_full_stand = BatchMatch.search_df_after_stand_series_full_stand.iloc[search_range[0]:search_range[1] + 1]
-         # BatchMatch.ref_df_after_stand_series_full_stand = BatchMatch.ref_df_after_stand_series_full_stand.iloc[ref_range[0]:ref_range[1] + 1]
-
-         # BatchMatch.search_df_after_stand = BatchMatch.search_df_after_stand.iloc[search_range[0]:search_range[1] + 1,:].reset_index(drop=True)
-         # BatchMatch.ref_df_after_stand = BatchMatch.ref_df_after_stand.iloc[ref_range[0]:ref_range[1] + 1,:].reset_index(drop=True)
-         # BatchMatch.search_df_after_full_stand = BatchMatch.search_df_after_full_stand.iloc[search_range[0]:search_range[1] + 1,:].reset_index(drop=True)
-         # BatchMatch.ref_df_after_full_stand = BatchMatch.ref_df_after_full_stand.iloc[ref_range[0]:ref_range[1] + 1,:].reset_index(drop=True)
-
         BatchMatch.search_df = BatchMatch.search_df[BatchMatch.search_df.index.isin(search_range)].reset_index(drop=True)
         BatchMatch.search_df_not_matched = BatchMatch.search_df.copy()
         BatchMatch.search_df_cleaned = BatchMatch.search_df_cleaned[BatchMatch.search_df_cleaned.index.isin(search_range)].reset_index(drop=True)
@@ -814,25 +789,9 @@ def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame
         BatchMatch.search_df_after_full_stand = BatchMatch.search_df_after_full_stand[BatchMatch.search_df_after_full_stand.index.isin(search_range)].reset_index(drop=True)
 
         ### Create lookup lists for fuzzy matches
-         # BatchMatch.search_df_after_stand_series = BatchMatch.search_df_after_stand.copy().set_index('postcode_search')['search_address_stand']
-         # BatchMatch.search_df_after_stand_series_full_stand = BatchMatch.search_df_after_full_stand.copy().set_index('postcode_search')['search_address_stand']
-         # BatchMatch.search_df_after_stand_series = BatchMatch.search_df_after_stand_series.sort_index()
-         # BatchMatch.search_df_after_stand_series_full_stand = BatchMatch.search_df_after_stand_series_full_stand.sort_index()
-
-         #BatchMatch.search_df_after_stand.reset_index(inplace=True, drop = True)
-         #BatchMatch.search_df_after_full_stand.reset_index(inplace=True, drop = True)
-
         BatchMatch.ref_df_after_stand = BatchMatch.ref_df_after_stand[BatchMatch.ref_df_after_stand.index.isin(ref_range)].reset_index(drop=True)
         BatchMatch.ref_df_after_full_stand = BatchMatch.ref_df_after_full_stand[BatchMatch.ref_df_after_full_stand.index.isin(ref_range)].reset_index(drop=True)
 
-         # BatchMatch.ref_df_after_stand_series = BatchMatch.ref_df_after_stand.copy().set_index('postcode_search')['ref_address_stand']
-         # BatchMatch.ref_df_after_stand_series_full_stand = BatchMatch.ref_df_after_full_stand.copy().set_index('postcode_search')['ref_address_stand']
-         # BatchMatch.ref_df_after_stand_series = BatchMatch.ref_df_after_stand_series.sort_index()
-         # BatchMatch.ref_df_after_stand_series_full_stand = BatchMatch.ref_df_after_stand_series_full_stand.sort_index()
-
-         # BatchMatch.ref_df_after_stand.reset_index(inplace=True, drop=True)
-         # BatchMatch.ref_df_after_full_stand.reset_index(inplace=True, drop=True)
-
         # Match the data, unless the search or reference dataframes are empty
         if BatchMatch.search_df.empty or BatchMatch.ref_df.empty:
             out_message = "Nothing to match for batch: " + str(n)
@@ -938,8 +897,6 @@ def create_batch_ranges(df:PandasDataFrame, ref_df:PandasDataFrame, batch_size:i
     df = df.sort_index()
     ref_df = ref_df.sort_index()
 
-     #df.to_csv("batch_search_df.csv")
-
    # Overall batch variables
     batch_indexes = []
     ref_indexes = []
@@ -1184,8 +1141,8 @@ def orchestrate_match_run(Matcher, standardise = False, nnet = False, file_stub=
 
     Matcher.output_summary = create_match_summary(Matcher.match_results_output, df_name = df_name)
 
-     Matcher.match_outputs_name = "output/diagnostics_" + file_stub + today_rev + ".csv"
-     Matcher.results_orig_df_name = "output/results_" + file_stub + today_rev + ".csv"
+     Matcher.match_outputs_name = output_folder + "diagnostics_" + file_stub + today_rev + ".csv"
+     Matcher.results_orig_df_name = output_folder + "results_" + file_stub + today_rev + ".csv"
 
     Matcher.match_results_output.to_csv(Matcher.match_outputs_name, index = None)
     Matcher.results_on_orig_df.to_csv(Matcher.results_orig_df_name, index = None)
@@ -1233,14 +1190,9 @@ def full_fuzzy_match(search_df:PandasDataFrame,
     # Remove rows from ref search series where postcode is not found in the search_df
     search_df_after_stand_series = search_df_after_stand.copy().set_index('postcode_search')['search_address_stand'].sort_index()
     ref_df_after_stand_series = ref_df_after_stand.copy().set_index('postcode_search')['ref_address_stand'].sort_index()
-
-     #print(search_df_after_stand_series.index.tolist())
-     #print(ref_df_after_stand_series.index.tolist())
-
+
     ref_df_after_stand_series_checked = ref_df_after_stand_series.copy()[ref_df_after_stand_series.index.isin(search_df_after_stand_series.index.tolist())]
 
-     # pd.DataFrame(ref_df_after_stand_series_checked.to_csv("ref_df_after_stand_series_checked.csv"))
-
     if len(ref_df_after_stand_series_checked) == 0:
         print("Nothing relevant in reference data to match!")
         return pd.DataFrame(), pd.DataFrame(), pd.DataFrame(),pd.DataFrame(),"Nothing relevant in reference data to match!",search_address_cols
@@ -1603,8 +1555,8 @@ def combine_two_matches(OrigMatchClass:MatcherClass, NewMatchClass:MatcherClass,
     ### Rejoin the excluded matches onto the output file
     #NewMatchClass.results_on_orig_df = pd.concat([NewMatchClass.results_on_orig_df, NewMatchClass.excluded_df])
 
-     NewMatchClass.match_outputs_name = "output/diagnostics_" + today_rev + ".csv" # + NewMatchClass.file_name + "_"
-     NewMatchClass.results_orig_df_name = "output/results_" + today_rev + ".csv" # + NewMatchClass.file_name + "_"
+     NewMatchClass.match_outputs_name = output_folder + "diagnostics_" + today_rev + ".csv" # + NewMatchClass.file_name + "_"
+     NewMatchClass.results_orig_df_name = output_folder + "results_" + today_rev + ".csv" # + NewMatchClass.file_name + "_"
 
     # Only keep essential columns
     essential_results_cols = [NewMatchClass.search_df_key_field, "Excluded from search", "Matched with reference address", "ref_index", "Reference matched address", "Reference file"]
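
All of the path changes in this file build output paths by plain string concatenation, so they rely on output_folder keeping its trailing slash (the 'output/' default from tools.constants). A small illustrative check (the date stamp is hypothetical):

    output_folder = 'output/'   # as resolved in tools.constants
    today_rev = '20240101'      # hypothetical date stamp
    match_outputs_name = output_folder + "diagnostics_initial_" + today_rev + ".csv"
    print(match_outputs_name)   # -> output/diagnostics_initial_20240101.csv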
tools/model_predict.py CHANGED
@@ -15,10 +15,6 @@ today_rev = datetime.now().strftime("%Y%m%d")
 
 # # Neural net functions
 
-
-
-
-
 def vocab_lookup(characters: str, vocab) -> (int, np.ndarray):
     """
     Taken from the function from the addressnet package by Jason Rigby
@@ -298,21 +294,10 @@ def post_predict_clean(predict_df, orig_search_df, ref_address_cols, search_df_k
 
     predict_df = predict_df.rename(columns={"Postcode":"Postcode_predict"})
 
-     #orig_search_df.to_csv("orig_search_df_pre_predict.csv")
-
     orig_search_df_pc = orig_search_df[[search_df_key_field, "postcode"]].rename(columns={"postcode":"Postcode"}).reset_index(drop=True)
     predict_df = predict_df.merge(orig_search_df_pc, left_index=True, right_index=True, how = "left")
 
-     #predict_df = pd.concat([predict_df, orig_search_df_pc], axis = 1)
-
-     #predict_df[search_df_key_field] = orig_search_df[search_df_key_field]
-
-     #predict_df = predict_df.drop("index", axis=1)
-
-     #predict_df['index'] = predict_df.index
     predict_df[search_df_key_field] = predict_df[search_df_key_field].astype(str)
-
-     #predict_df.to_csv("predict_end_of_clean.csv")
 
     return predict_df
 
tools/recordlinkage_funcs.py CHANGED
@@ -93,7 +93,6 @@ def calc_final_nnet_scores(scoresSBM, weights, matching_variables):
     scoresSBM_r = scoresSBM_r.sort_values(by=["level_0","score_perc"], ascending = False)
 
     # Within each search address, remove anything below the max
-     #scoresSBM_r.to_csv("scoresSBM_r.csv")
     scoresSBM_g = scoresSBM_r.reset_index()
 
     # Get maximum score to join on
@@ -114,8 +113,6 @@ def join_on_pred_ref_details(scoresSBM_search_m, ref_search, predict_df_search):
 
     scoresSBM_search_m_j = scoresSBM_search_m_j.reindex(sorted(scoresSBM_search_m_j.columns), axis=1)
 
-     #scoresSBM_search_m_j.to_csv("scoresSBM_search_m_j.csv")
-
     return scoresSBM_search_m_j
 
 def rearrange_columns(scoresSBM_search_m_j, new_join_col, search_df_key_field, blocker_column, standardise):
@@ -175,14 +172,10 @@
 
     scoresSBM_out = scoresSBM_search_m_j[final_cols]
 
-     #scoresSBM_out.to_csv("scoresSBM_out" + "_" + blocker_column[0] + "_" + str(standardise) + ".csv")
-
    return scoresSBM_out, start_columns
 
 def create_matched_results_nnet(scoresSBM_best, search_df_key_field, orig_search_df, new_join_col, standardise, ref_search, blocker_column, score_cut_off):
 
-     #scoresSBM_best.to_csv("scores_sbm_best_" + str(standardise) + ".csv")
-
     ### Make the final 'matched output' file
     scoresSBM_best_pred_cols = scoresSBM_best.filter(regex='_pred$').iloc[:,1:-1]
     scoresSBM_best["search_orig_address"] = (scoresSBM_best_pred_cols.agg(' '.join, axis=1)).str.strip().str.replace("\s{2,}", " ", regex=True)
@@ -199,22 +192,16 @@ def create_matched_results_nnet(scoresSBM_best, search_df_key_field, orig_search
     'full_match_score_based', 'Reference file']], on = search_df_key_field, how = "left").\
     rename(columns={"full_address":"search_orig_address"})
 
-     #ref_search.to_csv("ref_search.csv")
-
     if 'index' not in ref_search.columns:
         ref_search['ref_index'] = ref_search.index
 
     matched_output_SBM = matched_output_SBM.merge(ref_search.drop_duplicates("fulladdress")[["ref_index", "fulladdress", "Postcode", "property_number", "prop_number", "flat_number", "apart_number", "block_number", 'unit_number', "room_number", "house_court_name", "ref_address_stand"]], left_on = "address_ref", right_on = "fulladdress", how = "left", suffixes=('_search', '_reference')).rename(columns={"fulladdress":"reference_orig_address", "ref_address_stand":"reference_list_address"})
 
-     #matched_output_SBM.to_csv("matched_output_SBM_earlier_" + str(standardise) + ".csv")
-
     # To replace with number check
 
-
     matched_output_SBM = matched_output_SBM.rename(columns={"full_match_score_based":"full_match"})
 
     matched_output_SBM['property_number_match'] = matched_output_SBM['full_match']
-     #
 
     scores_SBM_best_cols = [search_df_key_field, 'full_match_score_based', 'perc_weighted_columns_matched', 'address_pred']#, "reference_mod_address"]
     scores_SBM_best_cols.extend(new_join_col)
@@ -223,20 +210,13 @@ def create_matched_results_nnet(scoresSBM_best, search_df_key_field, orig_search
 
     matched_output_SBM = matched_output_SBM.merge(matched_output_SBM_b.drop_duplicates(search_df_key_field), on = search_df_key_field, how = "left")
 
-     #matched_output_SBM.to_csv("matched_output_SBM_later_" + str(standardise) + ".csv")
-
     from tools.fuzzy_match import create_diag_shortlist
     matched_output_SBM = create_diag_shortlist(matched_output_SBM, "search_orig_address", score_cut_off, blocker_column, fuzzy_col='perc_weighted_columns_matched', search_mod_address="address_pred", resolve_tie_breaks=False)
 
-     #matched_output_SBM.to_csv("matched_output_after.csv")
-
-     #matched_output_SBM["UPRN"] = scoresSBM_best['UPRN']
 
     matched_output_SBM['standardised_address'] = standardise
 
-     matched_output_SBM = matched_output_SBM.rename(columns={"address_pred":"search_mod_address",
-         #"address_ref":"reference_orig_address",
-         #"full_match_score_based":"fuzzy_score_match",
+     matched_output_SBM = matched_output_SBM.rename(columns={"address_pred":"search_mod_address",
         'perc_weighted_columns_matched':"fuzzy_score"})
 
     matched_output_SBM_cols = [search_df_key_field, 'search_orig_address','reference_orig_address',
@@ -257,10 +237,6 @@ def create_matched_results_nnet(scoresSBM_best, search_df_key_field, orig_search
     "unit_number_search","unit_number_reference",
     'house_court_name_search', 'house_court_name_reference',
     "search_mod_address", 'reference_mod_address','Postcode', 'postcode', 'ref_index', 'Reference file']
-
-     #matched_output_SBM_cols = [search_df_key_field, 'search_orig_address', 'reference_orig_address',
-     #'full_match', 'fuzzy_score_match', 'property_number_match', 'full_number_match',
-     #'fuzzy_score', 'search_mod_address', 'reference_mod_address', 'Reference file']
 
     matched_output_SBM_cols.extend(new_join_col)
     matched_output_SBM_cols.extend(['standardised_address'])
@@ -268,8 +244,6 @@ def create_matched_results_nnet(scoresSBM_best, search_df_key_field, orig_search
 
     matched_output_SBM = matched_output_SBM.sort_values(search_df_key_field, ascending=True)
 
-     #matched_output_SBM.to_csv("matched_output_SBM_out.csv")
-
     return matched_output_SBM
 
 def score_based_match(predict_df_search, ref_search, orig_search_df, matching_variables, text_columns, blocker_column, weights, fuzzy_method, score_cut_off, search_df_key_field, standardise, new_join_col, score_cut_off_nnet_street=score_cut_off_nnet_street):
@@ -287,8 +261,6 @@ def score_based_match(predict_df_search, ref_search, orig_search_df, matching_va
 
     scoresSBM_search_m_j = join_on_pred_ref_details(scoresSBM_search_m, ref_search, predict_df_search)
 
-     #scoresSBM_search_m_j.to_csv("scoresSBM_search_m_j.csv")
-
     # When blocking by street, may to have an increased threshold as this is more prone to making mistakes
     if blocker_column[0] == "Street": scoresSBM_search_m_j['full_match_score_based'] = (scoresSBM_search_m_j['score_perc'] >= score_cut_off_nnet_street)
 
@@ -297,15 +269,10 @@ def score_based_match(predict_df_search, ref_search, orig_search_df, matching_va
     ### Reorder some columns
     scoresSBM_out, start_columns = rearrange_columns(scoresSBM_search_m_j, new_join_col, search_df_key_field, blocker_column, standardise)
 
-     #scoresSBM_out.to_csv("scoresSBM_out.csv")
-
     matched_output_SBM = create_matched_results_nnet(scoresSBM_out, search_df_key_field, orig_search_df, new_join_col, standardise, ref_search, blocker_column, score_cut_off)
 
     matched_output_SBM_best = matched_output_SBM.sort_values([search_df_key_field, "full_match"], ascending = [True, False]).drop_duplicates(search_df_key_field)
 
-     #matched_output_SBM.to_csv("matched_output_SBM.csv")
-     #matched_output_SBM_best.to_csv("matched_output_SBM_best.csv")
-
     scoresSBM_best = scoresSBM_out[scoresSBM_out[search_df_key_field].isin(matched_output_SBM_best[search_df_key_field])]
 
     return scoresSBM_best, matched_output_SBM_best