seanpedrickcase committed
Commit 115b61f
1 parent: a3c7fb0

Added AWS auth, logging, allowed for API call saves
app.py CHANGED
@@ -3,11 +3,13 @@ from datetime import datetime
from pathlib import Path
import gradio as gr
import pandas as pd
+ import socket

from tools.matcher_funcs import run_matcher
- from tools.gradio import initial_data_load, ensure_output_folder_exists
- from tools.aws_functions import load_data_from_aws
+ from tools.helper_functions import initial_data_load, ensure_output_folder_exists, get_connection_params, get_or_create_env_var, reveal_feedback_buttons
+ from tools.aws_functions import load_data_from_aws, upload_file_to_s3
from tools.constants import output_folder
+ from tools.auth import authenticate_user

import warnings
# Remove warnings from print statements
@@ -25,6 +27,15 @@ base_folder = Path(os.getcwd())

ensure_output_folder_exists(output_folder)

+ host_name = socket.gethostname()
+
+ feedback_logs_folder = 'feedback/' + today_rev + '/' + host_name + '/'
+ access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
+ usage_logs_folder = 'usage/' + today_rev + '/' + host_name + '/'
+
+ # Retrieve the Addressbase API key from the environment if available
+ ADDRESSBASE_API_KEY = get_or_create_env_var('ADDRESSBASE_API_KEY', '')
+
# Create the gradio interface
block = gr.Blocks(theme = gr.themes.Base())

@@ -35,6 +46,17 @@ with block:
results_data_state = gr.State(pd.DataFrame())
ref_results_data_state =gr.State(pd.DataFrame())

+ session_hash_state = gr.State()
+ s3_output_folder_state = gr.State()
+
+ # Logging state
+ feedback_logs_state = gr.State(feedback_logs_folder + 'log.csv')
+ feedback_s3_logs_loc_state = gr.State(feedback_logs_folder)
+ access_logs_state = gr.State(access_logs_folder + 'log.csv')
+ access_s3_logs_loc_state = gr.State(access_logs_folder)
+ usage_logs_state = gr.State(usage_logs_folder + 'log.csv')
+ usage_s3_logs_loc_state = gr.State(usage_logs_folder)
+
gr.Markdown(
"""
# Address matcher
@@ -66,7 +88,7 @@ with block:

with gr.Accordion("Use Addressbase API (instead of reference file)", open = True):
in_api = gr.Dropdown(label="Choose API type", multiselect=False, value=None, choices=["Postcode"])#["Postcode", "UPRN"]) #choices=["Address", "Postcode", "UPRN"])
- in_api_key = gr.Textbox(label="Addressbase API key", type='password')
+ in_api_key = gr.Textbox(label="Addressbase API key", type='password', value = ADDRESSBASE_API_KEY)

with gr.Accordion("Match against reference file of addresses", open = False):
in_ref = gr.File(label="Input reference addresses from file", file_count= "multiple")
@@ -81,6 +103,18 @@
output_summary = gr.Textbox(label="Output summary")
output_file = gr.File(label="Output file")

+ feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
+ feedback_radio = gr.Radio(choices=["The results were good", "The results were not good"], visible=False)
+ further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
+ submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
+
+ with gr.Row():
+ s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
+ # This keeps track of the time taken to match files for logging purposes.
+ estimated_time_taken_number = gr.Number(value=0.0, precision=1, visible=False)
+ # Invisible text box to hold the session hash/username just for logging purposes
+ session_hash_textbox = gr.Textbox(value="", visible=False)
+
with gr.Tab(label="Advanced options"):
with gr.Accordion(label = "AWS data access", open = False):
aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
@@ -90,35 +124,46 @@

aws_log_box = gr.Textbox(label="AWS data load status")

-
### Loading AWS data ###
load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_ref, aws_log_box])
-

# Updates to components
in_file.change(fn = initial_data_load, inputs=[in_file], outputs=[output_summary, in_colnames, in_existing, data_state, results_data_state])
in_ref.change(fn = initial_data_load, inputs=[in_ref], outputs=[output_summary, in_refcol, in_joincol, ref_data_state, ref_results_data_state])

match_btn.click(fn = run_matcher, inputs=[in_text, in_file, in_ref, data_state, results_data_state, ref_data_state, in_colnames, in_refcol, in_joincol, in_existing, in_api, in_api_key],
- outputs=[output_summary, output_file], api_name="address")
+ outputs=[output_summary, output_file, estimated_time_taken_number], api_name="address").\
+ then(fn = reveal_feedback_buttons, outputs=[feedback_radio, further_details_text, submit_feedback_btn, feedback_title])


- # Run app
- # If GRADIO_OUTPUT_FOLDER exists and is set to /tmp/ it means that the app is running on AWS Lambda and the queue should not be enabled.
-
- if 'GRADIO_OUTPUT_FOLDER' in os.environ:
- if os.environ['GRADIO_OUTPUT_FOLDER'] == '/tmp/':
- block.launch(ssl_verify=False)
+ # Get connection details on app load
+ block.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
+
+ # Log usernames and times of access to file (to know who is using the app when running on AWS)
+ access_callback = gr.CSVLogger()
+ access_callback.setup([session_hash_textbox], access_logs_folder)
+ session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
+ then(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
+
+ # User-submitted feedback on match results
+ feedback_callback = gr.CSVLogger()
+ feedback_callback.setup([feedback_radio, further_details_text, in_file], feedback_logs_folder)
+ submit_feedback_btn.click(lambda *args: feedback_callback.flag(list(args)), [feedback_radio, further_details_text, in_file], None, preprocess=False).\
+ then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[further_details_text])
+
+ # Log processing time when making a query
+ usage_callback = gr.CSVLogger()
+ usage_callback.setup([session_hash_textbox, in_file, estimated_time_taken_number], usage_logs_folder)
+ estimated_time_taken_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, in_file, estimated_time_taken_number], None, preprocess=False).\
+ then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
+
+ # Launch the Gradio app
+ COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
+ print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
+
+ if __name__ == "__main__":
+ if os.environ['COGNITO_AUTH'] == "1":
+ block.queue().launch(show_error=True, auth=authenticate_user, max_file_size='50mb')
else:
- block.queue().launch(ssl_verify=False)
-
- block.queue().launch(ssl_verify=False)
-
- # Download OpenSSL from here:
- # Running on local server with https: https://discuss.huggingface.co/t/how-to-run-gradio-with-0-0-0-0-and-https/38003 or https://dev.to/rajshirolkar/fastapi-over-https-for-development-on-windows-2p7d
- #block.queue().launch(ssl_verify=False, share=False, debug=False, server_name="0.0.0.0",server_port=443,
- # ssl_certfile="cert.pem", ssl_keyfile="key.pem") # port 443 for https. Certificates currently not valid
-
- # Running on local server without https
- #block.queue().launch(server_name="0.0.0.0", server_port=7861, ssl_verify=False)
+ block.queue().launch(show_error=True, inbrowser=True, max_file_size='50mb')

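
Note on the logging wiring above: the access, feedback and usage logs all follow the same pattern, in which a hidden component is flagged to a local CSV via `gr.CSVLogger` and a chained `.then()` step pushes that CSV onwards. Below is a minimal, self-contained sketch of that pattern; `upload_log` is a hypothetical stand-in for `tools.aws_functions.upload_file_to_s3`, and the folder names are illustrative only.

```python
import gradio as gr

access_logs_folder = "logs/"  # illustrative local folder

def upload_log(log_file_path, destination):
    # Stand-in for upload_file_to_s3: just report what would be uploaded.
    return f"Would upload {log_file_path} to {destination}"

access_callback = gr.CSVLogger()

with gr.Blocks() as demo:
    session_hash_textbox = gr.Textbox(value="", visible=False)
    status_box = gr.Textbox(label="Log upload status")

    # Write flagged values to a CSV inside access_logs_folder...
    access_callback.setup([session_hash_textbox], access_logs_folder)

    # ...whenever the hidden textbox changes, then push the CSV onwards.
    session_hash_textbox.change(lambda *args: access_callback.flag(list(args)),
                                [session_hash_textbox], None, preprocess=False).\
        then(fn=upload_log,
             inputs=[gr.State(access_logs_folder + "log.csv"), gr.State(access_logs_folder)],
             outputs=[status_box])

if __name__ == "__main__":
    demo.queue().launch()
```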
 
tools/auth.py ADDED
@@ -0,0 +1,47 @@
+ import boto3
+ from tools.helper_functions import get_or_create_env_var
+
+ client_id = get_or_create_env_var('AWS_CLIENT_ID', '') # This client id is borrowed from async gradio app client
+ print(f'The value of AWS_CLIENT_ID is {client_id}')
+
+ user_pool_id = get_or_create_env_var('AWS_USER_POOL_ID', '')
+ print(f'The value of AWS_USER_POOL_ID is {user_pool_id}')
+
+ def authenticate_user(username, password, user_pool_id=user_pool_id, client_id=client_id):
+ """Authenticates a user against an AWS Cognito user pool.
+
+ Args:
+ user_pool_id (str): The ID of the Cognito user pool.
+ client_id (str): The ID of the Cognito user pool client.
+ username (str): The username of the user.
+ password (str): The password of the user.
+
+ Returns:
+ bool: True if the user is authenticated, False otherwise.
+ """
+
+ client = boto3.client('cognito-idp') # Cognito Identity Provider client
+
+ try:
+ response = client.initiate_auth(
+ AuthFlow='USER_PASSWORD_AUTH',
+ AuthParameters={
+ 'USERNAME': username,
+ 'PASSWORD': password,
+ },
+ ClientId=client_id
+ )
+
+ # If successful, you'll receive an AuthenticationResult in the response
+ if response.get('AuthenticationResult'):
+ return True
+ else:
+ return False
+
+ except client.exceptions.NotAuthorizedException:
+ return False
+ except client.exceptions.UserNotFoundException:
+ return False
+ except Exception as e:
+ print(f"An error occurred: {e}")
+ return False
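
The Cognito check above is wired into `block.queue().launch(auth=authenticate_user, ...)` in app.py whenever COGNITO_AUTH is set to "1". For a quick standalone check of the flow, a sketch like the one below should work, assuming AWS_CLIENT_ID and AWS_USER_POOL_ID are set and the default boto3 credentials can reach the user pool; the region and the test credentials shown here are assumptions, not part of this commit.

```python
import os

# Assumed region; boto3 otherwise falls back to its default configuration.
os.environ.setdefault("AWS_DEFAULT_REGION", "eu-west-2")

from tools.auth import authenticate_user

if __name__ == "__main__":
    # Hypothetical test credentials for a user in the configured pool.
    ok = authenticate_user("test-user", "correct-horse-battery-staple")
    print("Authenticated" if ok else "Not authenticated")
```
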
tools/aws_functions.py CHANGED
@@ -1,37 +1,46 @@
- from typing import Type
+ from typing import Type, List
import pandas as pd
import boto3
import tempfile
import os
+ from tools.helper_functions import get_or_create_env_var

PandasDataFrame = Type[pd.DataFrame]

- try:
- session = boto3.Session()
- bucket_name = os.environ['ADDRESS_MATCHER_BUCKET']
- except Exception as e:
- bucket_name = ''
- print(e)
-
- def get_assumed_role_info():
- sts = boto3.client('sts', region_name='eu-west-2', endpoint_url='https://sts.eu-west-2.amazonaws.com')
- response = sts.get_caller_identity()
-
- # Extract ARN of the assumed role
- assumed_role_arn = response['Arn']
-
- # Extract the name of the assumed role from the ARN
- assumed_role_name = assumed_role_arn.split('/')[-1]
-
- return assumed_role_arn, assumed_role_name
-
- try:
- assumed_role_arn, assumed_role_name = get_assumed_role_info()
-
- print("Assumed Role ARN:", assumed_role_arn)
- print("Assumed Role Name:", assumed_role_name)
- except Exception as e:
- print(e)
+ # Get AWS credentials if required
+ bucket_name=""
+ aws_var = "RUN_AWS_FUNCTIONS"
+ aws_var_default = "0"
+ aws_var_val = get_or_create_env_var(aws_var, aws_var_default)
+ print(f'The value of {aws_var} is {aws_var_val}')
+
+ if aws_var_val == "1":
+ try:
+ session = boto3.Session()
+ bucket_name = os.environ['ADDRESS_MATCHER_BUCKET']
+ except Exception as e:
+ bucket_name = ''
+ print(e)
+
+ def get_assumed_role_info():
+ sts = boto3.client('sts', region_name='eu-west-2', endpoint_url='https://sts.eu-west-2.amazonaws.com')
+ response = sts.get_caller_identity()
+
+ # Extract ARN of the assumed role
+ assumed_role_arn = response['Arn']
+
+ # Extract the name of the assumed role from the ARN
+ assumed_role_name = assumed_role_arn.split('/')[-1]
+
+ return assumed_role_arn, assumed_role_name
+
+ try:
+ assumed_role_arn, assumed_role_name = get_assumed_role_info()
+
+ print("Assumed Role ARN:", assumed_role_arn)
+ print("Assumed Role Name:", assumed_role_name)
+ except Exception as e:
+ print(e)

# Download direct from S3 - requires login credentials
def download_file_from_s3(bucket_name, key, local_file_path):
@@ -101,8 +110,6 @@ def download_files_from_s3(bucket_name, s3_folder, local_folder, filenames):
except Exception as e:
print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)

-
-
def load_data_from_aws(in_aws_keyword_file, aws_password="", bucket_name=bucket_name):

temp_dir = tempfile.mkdtemp()
@@ -154,3 +161,42 @@ def load_data_from_aws(in_aws_keyword_file, aws_password="", bucket_name=bucket_

return files, out_message

+ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=bucket_name):
+ """
+ Uploads a file from local machine to Amazon S3.
+
+ Args:
+ - local_file_path: Local file path(s) of the file(s) to upload.
+ - s3_key: Key (path) to the file in the S3 bucket.
+ - s3_bucket: Name of the S3 bucket.
+
+ Returns:
+ - Message as variable/printed to console
+ """
+ final_out_message = []
+
+ s3_client = boto3.client('s3')
+
+ if isinstance(local_file_paths, str):
+ local_file_paths = [local_file_paths]
+
+ for file in local_file_paths:
+ try:
+ # Get file name off file path
+ file_name = os.path.basename(file)
+
+ s3_key_full = s3_key + file_name
+ print("S3 key: ", s3_key_full)
+
+ s3_client.upload_file(file, s3_bucket, s3_key_full)
+ out_message = "File " + file_name + " uploaded successfully!"
+ print(out_message)
+
+ except Exception as e:
+ out_message = f"Error uploading file(s): {e}"
+ print(out_message)
+
+ final_out_message.append(out_message)
+ final_out_message_str = '\n'.join(final_out_message)
+
+ return final_out_message_str
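
One detail of `upload_file_to_s3` worth noting: `s3_key` is treated as a prefix and the file name is concatenated onto it directly, so it should normally end with a trailing slash (the log folders defined in app.py do). A hedged usage sketch, with hypothetical bucket and file names:

```python
from tools.aws_functions import upload_file_to_s3

# Assumes boto3 can find credentials; the bucket and file names are hypothetical.
message = upload_file_to_s3(
    local_file_paths=["output/results.csv"],
    s3_key="usage/20240101/my-host/",   # prefix; the file name is appended to this
    s3_bucket="my-address-matcher-bucket",
)
print(message)
```
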
tools/{gradio.py → helper_functions.py} RENAMED
@@ -1,6 +1,25 @@
import gradio as gr
import pandas as pd
import os
+ import re
+
+ def get_or_create_env_var(var_name, default_value):
+ # Get the environment variable if it exists
+ value = os.environ.get(var_name)
+
+ # If it doesn't exist, set it to the default value
+ if value is None:
+ os.environ[var_name] = default_value
+ value = default_value
+
+ return value
+
+ # Retrieving or setting output folder
+ env_var_name = 'GRADIO_OUTPUT_FOLDER'
+ default_value = 'output/'
+
+ output_folder = get_or_create_env_var(env_var_name, default_value)
+ print(f'The value of {env_var_name} is {output_folder}')

def detect_file_type(filename):
"""Detect the file type based on its extension."""
@@ -70,7 +89,94 @@ def dummy_function(in_colnames):
"""
return None

+ # Upon running a process, the feedback buttons are revealed
+ def reveal_feedback_buttons():
+ return gr.Radio(visible=True), gr.Textbox(visible=True), gr.Button(visible=True), gr.Markdown(visible=True)

def clear_inputs(in_file, in_ref, in_text):
return gr.File(value=[]), gr.File(value=[]), gr.Textbox(value='')

+ ## Get final processing time for logs:
+ def sum_numbers_before_seconds(string):
+ """Extracts numbers that precede the word 'seconds' from a string and adds them up.
+
+ Args:
+ string: The input string.
+
+ Returns:
+ The sum of all numbers before 'seconds' in the string.
+ """
+
+ # Extract numbers before 'seconds' using a regular expression (non-capturing group so findall returns the whole number)
+ numbers = re.findall(r'(\d+(?:\.\d+)?)\s*seconds', string)
+
+ # Convert the matches to floats
+ numbers = [float(num) for num in numbers]
+
+ # Sum up the extracted numbers
+ sum_of_numbers = sum(numbers)
+
+ return sum_of_numbers
+
+ async def get_connection_params(request: gr.Request):
+ base_folder = ""
+
+ if request:
+ #print("request user:", request.username)
+
+ #request_data = await request.json() # Parse JSON body
+ #print("All request data:", request_data)
+ #context_value = request_data.get('context')
+ #if 'context' in request_data:
+ # print("Request context dictionary:", request_data['context'])
+
+ # print("Request headers dictionary:", request.headers)
+ # print("All host elements", request.client)
+ # print("IP address:", request.client.host)
+ # print("Query parameters:", dict(request.query_params))
+ # To get the underlying FastAPI items you would need to use await and some fancy @ stuff for a live query: https://fastapi.tiangolo.com/vi/reference/request/
+ #print("Request dictionary to object:", request.request.body())
+ print("Session hash:", request.session_hash)
+
+ # Retrieving or setting CUSTOM_CLOUDFRONT_HEADER
+ CUSTOM_CLOUDFRONT_HEADER_var = get_or_create_env_var('CUSTOM_CLOUDFRONT_HEADER', '')
+ #print(f'The value of CUSTOM_CLOUDFRONT_HEADER is {CUSTOM_CLOUDFRONT_HEADER_var}')
+
+ # Retrieving or setting CUSTOM_CLOUDFRONT_HEADER_VALUE
+ CUSTOM_CLOUDFRONT_HEADER_VALUE_var = get_or_create_env_var('CUSTOM_CLOUDFRONT_HEADER_VALUE', '')
+ #print(f'The value of CUSTOM_CLOUDFRONT_HEADER_VALUE_var is {CUSTOM_CLOUDFRONT_HEADER_VALUE_var}')
+
+ if CUSTOM_CLOUDFRONT_HEADER_var and CUSTOM_CLOUDFRONT_HEADER_VALUE_var:
+ if CUSTOM_CLOUDFRONT_HEADER_var in request.headers:
+ supplied_cloudfront_custom_value = request.headers[CUSTOM_CLOUDFRONT_HEADER_var]
+ if supplied_cloudfront_custom_value == CUSTOM_CLOUDFRONT_HEADER_VALUE_var:
+ print("Custom Cloudfront header found:", supplied_cloudfront_custom_value)
+ else:
+ raise ValueError("Custom Cloudfront header value does not match expected value.")
+
+ # Get output save folder from 1 - username passed in from direct Cognito login, 2 - Cognito ID header passed through a Lambda authenticator, 3 - the session hash.
+
+ if request.username:
+ out_session_hash = request.username
+ base_folder = "user-files/"
+ print("Request username found:", out_session_hash)
+
+ elif 'x-cognito-id' in request.headers:
+ out_session_hash = request.headers['x-cognito-id']
+ base_folder = "user-files/"
+ print("Cognito ID found:", out_session_hash)
+
+ else:
+ out_session_hash = request.session_hash
+ base_folder = "temp-files/"
+ # print("Cognito ID not found. Using session hash as save folder:", out_session_hash)
+
+ output_folder = base_folder + out_session_hash + "/"
+ #if bucket_name:
+ # print("S3 output folder is: " + "s3://" + bucket_name + "/" + output_folder)
+
+ return out_session_hash, output_folder, out_session_hash
+ else:
+ print("No session parameters found.")
+ return "", "", ""
+
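
A short sketch of how the two new helpers behave (illustrative values only): `get_or_create_env_var` returns the existing environment value, or sets and returns the default if the variable is unset, while `sum_numbers_before_seconds` adds up every number that appears directly before the word "seconds" in a summary string.

```python
from tools.helper_functions import get_or_create_env_var, sum_numbers_before_seconds

# Returns "0" unless COGNITO_AUTH was already set in the environment.
print(get_or_create_env_var("COGNITO_AUTH", "0"))

# Hypothetical timing summary of the kind produced at the end of a match run.
summary = "Fuzzy match took 12.5 seconds. Neural net match took 3 seconds."
print(sum_numbers_before_seconds(summary))  # 15.5
```
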
tools/matcher_funcs.py CHANGED
@@ -33,7 +33,7 @@ from tools.standardise import standardise_wrapper_func
### Predict function for imported model
from tools.model_predict import full_predict_func, full_predict_torch, post_predict_clean
from tools.recordlinkage_funcs import score_based_match
- from tools.gradio import initial_data_load
+ from tools.helper_functions import initial_data_load, sum_numbers_before_seconds

# API functions
from tools.addressbase_api_funcs import places_api_query
@@ -108,10 +108,13 @@ def filter_not_matched(

return search_df.iloc[np.where(~matched)[0]]

- def run_all_api_calls(in_api_key:str, Matcher:MatcherClass, query_type:str, progress=gr.Progress()):
+ def query_addressbase_api(in_api_key:str, Matcher:MatcherClass, query_type:str, progress=gr.Progress()):
+
+ final_api_output_file_name = ""
+
if in_api_key == "":
print ("No API key provided, please provide one to continue")
- return Matcher
+ return Matcher, final_api_output_file_name
else:
# Call the API
#Matcher.ref_df = pd.DataFrame()
@@ -119,7 +122,7 @@ def run_all_api_calls(in_api_key:str, Matcher:MatcherClass, query_type:str, prog
# Check if the ref_df file already exists
def check_and_create_api_folder():
# Check if the environmental variable is available
- file_path = os.environ.get('ADDRESSBASE_API_OUT') # Replace 'YOUR_ENV_VARIABLE_NAME' with the name of your environmental variable
+ file_path = os.environ.get('ADDRESSBASE_API_OUT')

if file_path is None:
# Environmental variable is not set
@@ -145,11 +148,13 @@ def run_all_api_calls(in_api_key:str, Matcher:MatcherClass, query_type:str, prog
api_ref_save_loc = api_output_folder + search_file_name_without_extension + "_api_" + today_month_rev + "_" + query_type + "_ckpt"
print("API reference save location: ", api_ref_save_loc)

+ final_api_output_file_name = api_ref_save_loc + ".parquet"
+
# Allow for csv, parquet and gzipped csv files
if os.path.isfile(api_ref_save_loc + ".csv"):
print("API reference CSV file found")
Matcher.ref_df = pd.read_csv(api_ref_save_loc + ".csv")
- elif os.path.isfile(api_ref_save_loc + ".parquet"):
+ elif os.path.isfile(final_api_output_file_name):
print("API reference Parquet file found")
Matcher.ref_df = pd.read_parquet(api_ref_save_loc + ".parquet")
elif os.path.isfile(api_ref_save_loc + ".csv.gz"):
@@ -350,21 +355,23 @@ def run_all_api_calls(in_api_key:str, Matcher:MatcherClass, query_type:str, prog
# Matcher.ref_df = Matcher.ref_df.loc[Matcher.ref_df["LOCAL_CUSTODIAN_CODE"] != 7655,:]

if save_file:
+ final_api_output_file_name = output_folder + api_ref_save_loc[:-5] + ".parquet"
print("Saving reference file to: " + api_ref_save_loc[:-5] + ".parquet")
Matcher.ref_df.to_parquet(output_folder + api_ref_save_loc + ".parquet", index=False) # Save checkpoint as well
- Matcher.ref_df.to_parquet(output_folder + api_ref_save_loc[:-5] + ".parquet", index=False)
+ Matcher.ref_df.to_parquet(final_api_output_file_name, index=False)

if Matcher.ref_df.empty:
print ("No reference data found with API")
return Matcher

- return Matcher
+ return Matcher, final_api_output_file_name

- def check_ref_data_exists(Matcher:MatcherClass, ref_data_state:PandasDataFrame, in_ref:List[str], in_refcol:List[str], in_api:List[str], in_api_key:str, query_type:str, progress=gr.Progress()):
+ def load_ref_data(Matcher:MatcherClass, ref_data_state:PandasDataFrame, in_ref:List[str], in_refcol:List[str], in_api:List[str], in_api_key:str, query_type:str, progress=gr.Progress()):
'''
Check for reference address data, do some preprocessing, and load in from the Addressbase API if required.
'''
-
+ final_api_output_file_name = ""
+
# Check if reference data loaded, bring in if already there
if not ref_data_state.empty:
Matcher.ref_df = ref_data_state
@@ -382,10 +389,10 @@ def check_ref_data_exists(Matcher:MatcherClass, ref_data_state:PandasDataFrame,
if not in_ref:
if in_api==False:
print ("No reference file provided, please provide one to continue")
- return Matcher
+ return Matcher, final_api_output_file_name
# Check if api call required and api key is provided
else:
- Matcher = run_all_api_calls(in_api_key, Matcher, query_type)
+ Matcher, final_api_output_file_name = query_addressbase_api(in_api_key, Matcher, query_type)

else:
Matcher.ref_name = get_file_name(in_ref[0].name)
@@ -402,9 +409,7 @@ def check_ref_data_exists(Matcher:MatcherClass, ref_data_state:PandasDataFrame,

Matcher.ref_df = pd.concat([Matcher.ref_df, temp_ref_file])

- # For the neural net model to work, the llpg columns have to be in the LPI format (e.g. with columns SaoText, SaoStartNumber etc. Here we check if we have that format.
-
-
+ # For the neural net model to work, the llpg columns have to be in the LPI format (e.g. with columns SaoText, SaoStartNumber etc. Here we check if we have that format.

if 'Address_LPI' in Matcher.ref_df.columns:
Matcher.ref_df = Matcher.ref_df.rename(columns={
@@ -475,9 +480,9 @@ def check_ref_data_exists(Matcher:MatcherClass, ref_data_state:PandasDataFrame,
Matcher.ref_df = Matcher.ref_df.reset_index() #.drop(["index","level_0"], axis = 1, errors="ignore").reset_index().drop(["index","level_0"], axis = 1, errors="ignore")
Matcher.ref_df.index.name = 'index'

- return Matcher
+ return Matcher, final_api_output_file_name

- def check_match_data_filter(Matcher:MatcherClass, data_state:PandasDataFrame, results_data_state:PandasDataFrame, in_file:List[str], in_text:str, in_colnames:List[str], in_joincol:List[str], in_existing:List[str], in_api:List[str]):
+ def load_match_data_and_filter(Matcher:MatcherClass, data_state:PandasDataFrame, results_data_state:PandasDataFrame, in_file:List[str], in_text:str, in_colnames:List[str], in_joincol:List[str], in_existing:List[str], in_api:List[str]):
'''
Check if data to be matched exists. Filter it according to which records are relevant in the reference dataset
'''
@@ -654,6 +659,8 @@ def load_matcher_data(in_text, in_file, in_ref, data_state, results_data_state,
'''
Load in user inputs from the Gradio interface. Convert all input types (single address, or csv input) into standardised data format that can be used downstream for the fuzzy matching.
'''
+ final_api_output_file_name = ""
+
today_rev = datetime.now().strftime("%Y%m%d")

# Abort flag for if it's not even possible to attempt the first stage of the match for some reason
@@ -662,16 +669,15 @@ def load_matcher_data(in_text, in_file, in_ref, data_state, results_data_state,
### ref_df FILES ###
# If not an API call, run this first
if not in_api:
- Matcher = check_ref_data_exists(Matcher, ref_data_state, in_ref, in_refcol, in_api, in_api_key, query_type=in_api)
+ Matcher, final_api_output_file_name = load_ref_data(Matcher, ref_data_state, in_ref, in_refcol, in_api, in_api_key, query_type=in_api)

### MATCH/SEARCH FILES ###
# If doing API calls, we need to know the search data before querying for specific addresses/postcodes
- Matcher = check_match_data_filter(Matcher, data_state, results_data_state, in_file, in_text, in_colnames, in_joincol, in_existing, in_api)
+ Matcher = load_match_data_and_filter(Matcher, data_state, results_data_state, in_file, in_text, in_colnames, in_joincol, in_existing, in_api)

# If an API call, ref_df data is loaded after
if in_api:
-
- Matcher = check_ref_data_exists(Matcher, ref_data_state, in_ref, in_refcol, in_api, in_api_key, query_type=in_api)
+ Matcher, final_api_output_file_name = load_ref_data(Matcher, ref_data_state, in_ref, in_refcol, in_api, in_api_key, query_type=in_api)

print("Shape of ref_df after filtering is: ", Matcher.ref_df.shape)
print("Shape of search_df after filtering is: ", Matcher.search_df.shape)
@@ -682,23 +688,31 @@ def load_matcher_data(in_text, in_file, in_ref, data_state, results_data_state,
Matcher.match_results_output.to_csv(Matcher.match_outputs_name, index = None)
Matcher.results_on_orig_df.to_csv(Matcher.results_orig_df_name, index = None)

- return Matcher
+ return Matcher, final_api_output_file_name

# Run whole matcher process
def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame, results_data_state:PandasDataFrame, ref_data_state:PandasDataFrame, in_colnames:List[str], in_refcol:List[str], in_joincol:List[str], in_existing:List[str], in_api:str, in_api_key:str, InitMatch:MatcherClass = InitMatch, progress=gr.Progress()):
'''
Split search and reference data into batches. Loop and run through the match script for each batch of data.
'''
+ output_files = []
+
+ estimate_total_processing_time = 0.0

overall_tic = time.perf_counter()

# Load in initial data. This will filter to relevant addresses in the search and reference datasets that can potentially be matched, and will pull in API data if asked for.
- InitMatch = load_matcher_data(in_text, in_file, in_ref, data_state, results_data_state, ref_data_state, in_colnames, in_refcol, in_joincol, in_existing, InitMatch, in_api, in_api_key)
+ InitMatch, final_api_output_file_name = load_matcher_data(in_text, in_file, in_ref, data_state, results_data_state, ref_data_state, in_colnames, in_refcol, in_joincol, in_existing, InitMatch, in_api, in_api_key)
+
+ if final_api_output_file_name:
+ output_files.append(final_api_output_file_name)

if InitMatch.search_df.empty or InitMatch.ref_df.empty:
out_message = "Nothing to match!"
print(out_message)
- return out_message, [InitMatch.results_orig_df_name, InitMatch.match_outputs_name]
+
+ output_files.extend([InitMatch.results_orig_df_name, InitMatch.match_outputs_name])
+ return out_message, output_files, estimate_total_processing_time

# Run initial address preparation and standardisation processes
# Prepare address format
@@ -801,7 +815,7 @@ def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame
"Excluded from search":False,
"Matched with reference address":False})
else:
- summary_of_summaries, BatchMatch_out = run_match_batch(BatchMatch, n, number_of_batches)
+ summary_of_summaries, BatchMatch_out = run_single_match_batch(BatchMatch, n, number_of_batches)

OutputMatch = combine_two_matches(OutputMatch, BatchMatch_out, "All up to and including batch " + str(n+1))

@@ -837,7 +851,13 @@ def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame

final_summary = fuzzy_not_std_summary + "\n" + fuzzy_std_summary + "\n" + nnet_std_summary + "\n" + time_out

- return final_summary, [OutputMatch.results_orig_df_name, OutputMatch.match_outputs_name]
+
+
+ estimate_total_processing_time = sum_numbers_before_seconds(time_out)
+ print("Estimated total processing time:", str(estimate_total_processing_time))
+
+ output_files.extend([OutputMatch.results_orig_df_name, OutputMatch.match_outputs_name])
+ return final_summary, output_files, estimate_total_processing_time

# Run a match run for a single batch
def create_simple_batch_ranges(df:PandasDataFrame, ref_df:PandasDataFrame, batch_size:int, ref_batch_size:int):
@@ -963,7 +983,7 @@ def create_batch_ranges(df:PandasDataFrame, ref_df:PandasDataFrame, batch_size:i

return lengths_df

- def run_match_batch(InitialMatch:MatcherClass, batch_n:int, total_batches:int, progress=gr.Progress()):
+ def run_single_match_batch(InitialMatch:MatcherClass, batch_n:int, total_batches:int, progress=gr.Progress()):
'''
Over-arching function for running a single batch of data through the full matching process. Calls fuzzy matching, then neural network match functions in order. It outputs a summary of the match, and a MatcherClass with the matched data included.
'''
@@ -979,7 +999,7 @@ def run_match_batch(InitialMatch:MatcherClass, batch_n:int, total_batches:int, p

''' Run fuzzy match on non-standardised dataset '''

- FuzzyNotStdMatch = orchestrate_match_run(Matcher = copy.copy(InitialMatch), standardise = False, nnet = False, file_stub= "not_std_", df_name = df_name)
+ FuzzyNotStdMatch = orchestrate_single_match_batch(Matcher = copy.copy(InitialMatch), standardise = False, nnet = False, file_stub= "not_std_", df_name = df_name)

if FuzzyNotStdMatch.abort_flag == True:
message = "Nothing to match! Aborting address check."
@@ -999,7 +1019,7 @@ def run_match_batch(InitialMatch:MatcherClass, batch_n:int, total_batches:int, p
progress(.25, desc="Batch " + str(batch_n+1) + " of " + str(total_batches) + ". Fuzzy match - standardised dataset")
df_name = "Fuzzy standardised"

- FuzzyStdMatch = orchestrate_match_run(Matcher = copy.copy(FuzzyNotStdMatch), standardise = True, nnet = False, file_stub= "std_", df_name = df_name)
+ FuzzyStdMatch = orchestrate_single_match_batch(Matcher = copy.copy(FuzzyNotStdMatch), standardise = True, nnet = False, file_stub= "std_", df_name = df_name)
FuzzyStdMatch = combine_two_matches(FuzzyNotStdMatch, FuzzyStdMatch, df_name)

''' Continue if reference file in correct format, and neural net model exists. Also if data not too long '''
@@ -1022,7 +1042,7 @@ def run_match_batch(InitialMatch:MatcherClass, batch_n:int, total_batches:int, p
progress(.50, desc="Batch " + str(batch_n+1) + " of " + str(total_batches) + ". Neural net - non-standardised dataset")
df_name = "Neural net not standardised"

- FuzzyNNetNotStdMatch = orchestrate_match_run(Matcher = copy.copy(FuzzyStdMatch), standardise = False, nnet = True, file_stub= "nnet_not_std_", df_name = df_name)
+ FuzzyNNetNotStdMatch = orchestrate_single_match_batch(Matcher = copy.copy(FuzzyStdMatch), standardise = False, nnet = True, file_stub= "nnet_not_std_", df_name = df_name)
FuzzyNNetNotStdMatch = combine_two_matches(FuzzyStdMatch, FuzzyNNetNotStdMatch, df_name)

if (len(FuzzyNNetNotStdMatch.search_df_not_matched) == 0):
@@ -1035,7 +1055,7 @@ def run_match_batch(InitialMatch:MatcherClass, batch_n:int, total_batches:int, p
progress(.75, desc="Batch " + str(batch_n+1) + " of " + str(total_batches) + ". Neural net - standardised dataset")
df_name = "Neural net standardised"

- FuzzyNNetStdMatch = orchestrate_match_run(Matcher = copy.copy(FuzzyNNetNotStdMatch), standardise = True, nnet = True, file_stub= "nnet_std_", df_name = df_name)
+ FuzzyNNetStdMatch = orchestrate_single_match_batch(Matcher = copy.copy(FuzzyNNetNotStdMatch), standardise = True, nnet = True, file_stub= "nnet_std_", df_name = df_name)
FuzzyNNetStdMatch = combine_two_matches(FuzzyNNetNotStdMatch, FuzzyNNetStdMatch, df_name)

if run_fuzzy_match == False:
@@ -1052,7 +1072,7 @@ def run_match_batch(InitialMatch:MatcherClass, batch_n:int, total_batches:int, p
return summary_of_summaries, FuzzyNNetStdMatch

# Overarching functions
- def orchestrate_match_run(Matcher, standardise = False, nnet = False, file_stub= "not_std_", df_name = "Fuzzy not standardised"):
+ def orchestrate_single_match_batch(Matcher, standardise = False, nnet = False, file_stub= "not_std_", df_name = "Fuzzy not standardised"):

today_rev = datetime.now().strftime("%Y%m%d")

@@ -1463,7 +1483,6 @@ def full_nn_match(ref_address_cols:List[str],

return match_results_output_final_three, results_on_orig_df, summary_three, predict_df

-
# Combiner/summary functions
def combine_dfs_and_remove_dups(orig_df:PandasDataFrame, new_df:PandasDataFrame, index_col:str = "search_orig_address", match_address_series:str = "full_match", keep_only_duplicated:bool = False) -> PandasDataFrame:

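With these changes `run_matcher` now returns three values: a summary string, a list of output file paths (including the saved API reference Parquet file when one is produced), and an estimated processing time in seconds, matching the three Gradio outputs wired up in app.py. A small illustrative sketch of consuming that contract (the values shown are hypothetical, not produced by this commit):

```python
from typing import List, Tuple

def handle_matcher_outputs(result: Tuple[str, List[str], float]) -> None:
    # Unpack the (summary, output files, estimated seconds) triple.
    summary, output_files, estimated_seconds = result
    print(summary)
    for path in output_files:
        print("Output file:", path)
    print(f"Estimated processing time: {estimated_seconds} seconds")

if __name__ == "__main__":
    handle_matcher_outputs(("Batch 1 of 1 complete.", ["output/results.csv"], 12.5))
```
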
tools/preparation.py CHANGED
@@ -49,49 +49,6 @@ def prepare_search_address_string(

return search_df_out, key_field, address_cols, postcode_col

- # def prepare_search_address(
- # search_df: pd.DataFrame,
- # address_cols: list,
- # postcode_col: list,
- # key_col: str
- # ) -> Tuple[pd.DataFrame, str]:
-
- # # Validate inputs
- # if not isinstance(search_df, pd.DataFrame):
- # raise TypeError("search_df must be a Pandas DataFrame")
-
- # if not isinstance(address_cols, list):
- # raise TypeError("address_cols must be a list")
-
- # if not isinstance(postcode_col, list):
- # raise TypeError("postcode_col must be a list")
-
- # if not isinstance(key_col, str):
- # raise TypeError("key_col must be a string")
-
- # # Clean address columns
- # clean_addresses = _clean_columns(search_df, address_cols)
-
- # # Join address columns into one
- # full_addresses = _join_address(clean_addresses, address_cols)
-
- # # Add postcode column
- # full_df = _add_postcode_column(full_addresses, postcode_col)
-
- # # Remove postcode from main address if there was only one column in the input
- # if postcode_col == "full_address_postcode":
- # # Remove postcode from address
- # address_series = remove_postcode(search_df, "full_address")
- # search_df["full_address"] == address_series
-
- # # Ensure index column
- # final_df = _ensure_index(full_df, key_col)
-
- # #print(final_df)
-
-
- # return final_df, key_col
-
def prepare_search_address(
search_df: pd.DataFrame,
address_cols: list,
@@ -145,25 +102,7 @@ def _clean_columns(df, cols):
df[cols] = df[cols].apply(clean_col)

return df
-
- # def _clean_columns(df, cols):
- # # Cleaning logic
- # #print(df)
-
- # #if isinstance(df, pl.DataFrame):
- # # print("It's a Polars DataFrame")
-
- # def clean_col(col):
- # col = col.str.replace("nan", "")
- # col = col.apply(lambda x: re.sub(r'\s{2,}', ' ', str(x)), skip_nulls=False, return_dtype=str) # replace any spaces greater than one with one
- # return col.str.replace(",", " ").str.strip() # replace commas with a space
-
- # for col in cols:
- # df = df.with_columns(clean_col(df[col]).alias(col))
-
- # return df
-
-
+
def _join_address(df, cols):
# Joining logic
full_address = df[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
@@ -289,43 +228,6 @@ def prepare_ref_address(ref_df, ref_address_cols, new_join_col = ['UPRN'], stand

return ref_df_cleaned

- # def prepare_ref_address(ref_df:pl.DataFrame, ref_address_cols, new_join_col = ['UPRN'], standard_cols = True):
-
- # if ('SaoText' in ref_df.columns) | ("Secondary_Name_LPI" in ref_df.columns):
- # standard_cols = True
- # else:
- # standard_cols = False
-
- # ref_address_cols_uprn = list(ref_address_cols) + new_join_col
- # ref_df_cleaned = ref_df[ref_address_cols_uprn].fill_null("")
-
- # # In on-prem LPI db street has been excluded, so put this back in
- # if ('Street' not in ref_df_cleaned.columns) & ('Address_LPI' in ref_df_cleaned.columns):
- # ref_df_cleaned = ref_df_cleaned.with_column(pl.col('Address_LPI').apply(lambda x: extract_street_name(x)).alias('Street'))
-
- # if ('Organisation' not in ref_df_cleaned.columns) & ('SaoText' in ref_df_cleaned.columns):
- # ref_df_cleaned = ref_df_cleaned.with_column(pl.lit("").alias('Organisation'))
-
- # #ref_df_cleaned['fulladdress'] =
-
- # if standard_cols:
- # pass
- # # I can not write the full address code here as it depends on your extract_street_name and create_full_address function implementations.
- # # However, you might need to convert string types to object type for full address creation which may require more than just a few lines of codes.
- # else:
- # pass
-
- # # I can not write the full address code here as it depends on your extract_street_name and create_full_address function implementations.
-
- # if 'Street' not in ref_df_cleaned.columns:
- # ref_df_cleaned = ref_df_cleaned.with_column(pl.col('fulladdress').apply(extract_street_name).alias("Street"))
-
- # # Add index column
- # ref_df_cleaned = ref_df_cleaned.with_column(pl.lit('').alias('ref_index'))
-
- # return ref_df_cleaned
-
-
def extract_postcode(df, col:str) -> PandasSeries:
'''
Extract a postcode from a string column in a dataframe
@@ -335,7 +237,6 @@ def extract_postcode(df, col:str) -> PandasSeries:

return postcode_series

-
# Remove addresses with no numbers in at all - too high a risk of badly assigning an address
def check_no_number_addresses(df, in_address_series) -> PandasSeries:
'''
@@ -353,7 +254,6 @@ def check_no_number_addresses(df, in_address_series) -> PandasSeries:

return df

-
def remove_postcode(df, col:str) -> PandasSeries:
'''
Remove a postcode from a string column in a dataframe