File size: 3,317 Bytes
c9ec478
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import re
import numpy as np
import pandas as pd
from numpy import dot 
from numpy.linalg import norm
from body_shape_lookup import body_shape_lookup

# CSV of reference body shape measures; read by select_body_shape().
BODY_SHAPE_MEASURES = "body_shape_measures_normalised_updated.csv"
# CSV of volunteer measures; currently only referenced from commented-out code.
VOLUNTEERS_MEASURES = "volunteers_measures_normalised_updated.csv"

# Columns used as the feature vector when comparing a volunteer to a body shape.
RATIOS_TO_USE = [
    'shoulder_to_hip_distance',
    'hip_to_ankle_distance',
    'thigh_to_torso_ratio_normalised',
    'upper_to_lower_torso_normalised_ratio',
    'shoulder_to_hip_ratio',
    'thigh_to_body_ratio',
    'upper_torso_to_body_ratio',
]

def extract_digits(input_string):
    """Return the first run of digits found in *input_string* as an int.

    For example ``'1A'`` gives 1 and ``'12B'`` gives 12. When the string
    contains no digits at all, -1 is returned as a "not found" marker.
    """
    found = re.search(r'\d+', input_string)
    return -1 if found is None else int(found.group())

def is_match(row):
    """Tell whether the volunteer's own class is among the three ranked shapes.

    The expected class is the number embedded in ``row['Volunteer_ID']``
    (as extracted by ``extract_digits``); it is compared in turn against the
    ``Rank_1_Body_Shape``, ``Rank_2_Body_Shape`` and ``Rank_3_Body_Shape``
    entries of *row*. The result is the first truthy comparison, or the last
    comparison when none of them holds.
    """
    ground_truth = extract_digits(row['Volunteer_ID'])
    matched = False
    for rank_column in ('Rank_1_Body_Shape',
                        'Rank_2_Body_Shape',
                        'Rank_3_Body_Shape'):
        matched = ground_truth == row[rank_column]
        if matched:
            # stop at the first hit so later columns are not even read
            break
    return matched

def _top_three(scored_shapes):
    """Return the three best ``(score, body_shape)`` pairs, best score first.

    Pairs with equal scores keep the order in which they were supplied.
    Scores that are NaN, or not greater than -1000, are ignored; any rank
    left unfilled is reported as the placeholder ``(-1000, 'n/a')``.
    """
    placeholder = (-1000, 'n/a')
    # NaN fails every comparison, so this filter also drops undefined scores.
    usable = [pair for pair in scored_shapes if pair[0] > placeholder[0]]
    # sorted() is stable (also with reverse=True): ties stay in input order.
    ranked = sorted(usable, key=lambda pair: pair[0], reverse=True)
    return (ranked + [placeholder] * 3)[:3]


def select_body_shape(normalised_body_shape_measures):
    """Pick the reference body shape closest to the given measures.

    The reference shapes are read from the ``BODY_SHAPE_MEASURES`` CSV. The
    first row of *normalised_body_shape_measures* is compared with every
    reference row using cosine similarity over the ``RATIOS_TO_USE`` columns,
    and the best-scoring shape is passed to ``body_shape_lookup``.

    Progress and the three best matches are printed to stdout.

    Args:
        normalised_body_shape_measures: DataFrame with an ``id`` column and
            the ``RATIOS_TO_USE`` columns. Only its first row is used; any
            further rows are ignored.

    Returns:
        The result of ``body_shape_lookup`` for the best match, where a body
        shape is identified by its row index in the reference CSV plus one
        (assumes the CSV rows are stored in body-shape order -- TODO confirm).
        ``None`` when the input has no rows. If no similarity could be
        computed, ``body_shape_lookup`` is called with ``'n/a'``.
    """
    # load the body shape measures
    body_shape_df = pd.read_csv(BODY_SHAPE_MEASURES)
    volunteers_df = normalised_body_shape_measures

    if len(volunteers_df.index) == 0:
        # nothing to classify
        return None

    # A single result is returned, so only the first row is ever scored.
    volunteer_row = volunteers_df.iloc[0]
    volunteer_id = volunteer_row['id']
    print(f"\nProcessing volunteer {volunteer_id}")

    # A row taken from a mixed-type frame is object-dtype; force plain floats.
    volunteer_ratios = volunteer_row[RATIOS_TO_USE].to_numpy(dtype=float)
    volunteer_norm = norm(volunteer_ratios)

    # select only the columns corresponding to the ratios
    body_shape_ratios = body_shape_df[RATIOS_TO_USE]
    shape_vectors = body_shape_ratios.to_numpy(dtype=float)

    scored_shapes = []
    for body_index, shape_vector in zip(body_shape_ratios.index, shape_vectors):
        # cosine similarity; undefined (NaN) when either vector has zero length
        denominator = volunteer_norm * norm(shape_vector)
        if denominator == 0:
            similarity = float('nan')
        else:
            similarity = dot(volunteer_ratios, shape_vector) / denominator

        scored_shapes.append((similarity, body_index + 1))
        print(f"Volunteer {volunteer_id} (body shape {body_index + 1}) Similarity:\t{similarity:.3f}")

    top_scores = _top_three(scored_shapes)

    # Print the top 3 best body shapes and scores for the current volunteer
    print(f"Volunteer {volunteer_id} top 3 body shapes and scores are:")
    for rank, (score, body_shape) in enumerate(top_scores, start=1):
        print(f"Rank {rank}: Body Shape {body_shape} with score {score:.3f}")

    body_shape_index = top_scores[0][1]

    return body_shape_lookup(body_shape_index)