File size: 1,842 Bytes
c5301d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# src: https://gist.github.com/iamaziz/ff570a6826b6d56c32b9d497a73e688c
# src: https://gist.github.com/iamaziz/0786e3de174c79839e42a5926f25acb2
def distance(u, v):
    """
    Calculates Euclidean distance between two point
    distance = square_root( sum(u_i - v_i)^2 )

    u: [float, float], point1
    v: [float, float], point2
    """
    sum_ = sum((u[i] - v[i]) ** 2 for i in range(len(u)))
    return sum_ ** (1 / 2)


def get_closer(target, *args):
    """
    Return the closest point (from points in `args`) to target

    target:  [float], target point
    *args: [[float]], list of points
    """
    min_distance = float("inf")
    closer = target
    for point in args:
        d = distance(point, target)
        if d < min_distance:
            min_distance = d
            closer = point
    return closer


def get_center(cluster):
    """
    Calculates the centroid point for `cluster`

    cluster: [[float]], list of the points in cluster
    """
    center = []
    n = len(cluster)
    for i in range(len(cluster[0])):
        c = sum(p[i] for p in cluster) / n
        center.append(round(c, 1))
    return center


def k_means(data, k=2, *centers):
    """
    Recursive k_means algorithm

    data: [[float]],    data points to consider for clustering
    k: int,             number of clusters
    centers: [[float]], optiona - initial centroids
    """
    centers = list(centers) if centers else [data[i] for i in range(k)]
    clusters = [[] for _ in range(k)]
    for point in data:
        nearest = get_closer(point, *centers)
        nearest_cluster_index = centers.index(nearest)
        clusters[nearest_cluster_index].append(point)

    new_centers = [get_center(cluster) for cluster in clusters]
    if centers == new_centers:
        return clusters, centers
    return k_means(data, k, *new_centers)