File size: 4,341 Bytes
588f1c0
 
5a8534e
 
 
 
588f1c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5a8534e
 
 
588f1c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5a8534e
588f1c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5a8534e
588f1c0
 
5a8534e
 
 
588f1c0
 
5a8534e
 
 
588f1c0
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import warnings

import pandas as pd
from geopy.distance import geodesic

# Function to calculate distances while preserving all original columns
# def calculate_distances(
#     df1: pd.DataFrame,
#     df2: pd.DataFrame,
#     code_col1,
#     lat_col1,
#     long_col1,
#     code_col2,
#     lat_col2,
#     long_col2,
#     min_distance: int = 1,
# ):
#     distances = []

#     for _, row1 in df1.iterrows():
#         for _, row2 in df2.iterrows():
#             coord1 = (row1[lat_col1], row1[long_col1])
#             coord2 = (row2[lat_col2], row2[long_col2])
#             distance_km = geodesic(coord1, coord2).kilometers  # Compute distance

#             # Combine all original columns + distance
#             combined_row = {
#                 **row1.to_dict(),  # Keep all columns from Dataset1
#                 **{
#                     f"{col}_Dataset2": row2[col] for col in df2.columns
#                 },  # Keep all columns from Dataset2
#                 "Distance_km": distance_km,
#             }
#             distances.append(combined_row)

#     df_distances = pd.DataFrame(distances)

#     # Find the closest point for each Point1
#     df_closest: pd.DataFrame = df_distances.loc[
#         df_distances.groupby(code_col1)["Distance_km"].idxmin()
#     ]

#     # Find the distnce below min_distance
#     df_closest_min_distance = df_distances[df_distances["Distance_km"] < min_distance]

#     return df_distances, df_closest, df_closest_min_distance


def calculate_distances(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    code_col1: str,
    lat_col1: str,
    long_col1: str,
    code_col2: str,
    lat_col2: str,
    long_col2: str,
    min_distance: float = 1.0,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Calculate distances between points in two datasets and find closest matches.

    Args:
        df1: First DataFrame containing reference points
        df2: Second DataFrame containing points to compare
        code_col1: Column name in df1 containing point identifiers
        lat_col1: Column name in df1 containing latitude
        long_col1: Column name in df1 containing longitude
        code_col2: Column name in df2 containing point identifiers
        lat_col2: Column name in df2 containing latitude
        long_col2: Column name in df2 containing longitude
        min_distance: Minimum distance threshold in kilometers

    Returns:
        tuple: (all_distances, closest_matches, matches_below_threshold)
    """
    # Validate input columns
    required_cols_1 = {code_col1, lat_col1, long_col1}
    required_cols_2 = {code_col2, lat_col2, long_col2}

    if not required_cols_1.issubset(df1.columns):
        raise ValueError(
            f"df1 is missing required columns: {required_cols_1 - set(df1.columns)}"
        )
    if not required_cols_2.issubset(df2.columns):
        raise ValueError(
            f"df2 is missing required columns: {required_cols_2 - set(df2.columns)}"
        )

    # Convert to list of tuples for vectorized operations
    coords1 = df1[[lat_col1, long_col1]].apply(tuple, axis=1).tolist()
    coords2 = df2[[lat_col2, long_col2]].apply(tuple, axis=1).tolist()

    # Calculate all pairwise distances
    distances = []
    for i, coord1 in enumerate(coords1):
        for j, coord2 in enumerate(coords2):
            try:
                distance_km = geodesic(coord1, coord2).kilometers
                distances.append(
                    {
                        **df1.iloc[i].to_dict(),
                        **{f"{col}_Dataset2": df2.iloc[j][col] for col in df2.columns},
                        "Distance_km": distance_km,
                    }
                )
            except ValueError as e:
                warnings.warn(
                    f"Skipping invalid coordinates: {coord1} or {coord2}: {e}"
                )
                continue

    if not distances:
        raise ValueError("No valid coordinate pairs were processed")

    df_distances = pd.DataFrame(distances)

    # Find closest matches
    df_closest = df_distances.loc[
        df_distances.groupby(code_col1)["Distance_km"].idxmin()
    ]

    # Filter by minimum distance
    df_closest_min_distance = df_distances[df_distances["Distance_km"] < min_distance]

    return df_distances, df_closest, df_closest_min_distance