File size: 1,889 Bytes
edd4815
 
d654474
 
 
 
 
 
 
8d09ff7
 
 
 
 
 
d654474
 
 
 
0e97d35
d654474
 
 
 
 
0e97d35
d654474
 
 
 
8d09ff7
 
 
 
 
 
0e97d35
 
d654474
0e97d35
 
8d09ff7
0e97d35
 
 
 
f2c67c4
0e97d35
 
f2c67c4
0e97d35
 
 
d654474
8d09ff7
0e97d35
d654474
 
 
 
 
edd4815
d654474
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from itertools import combinations
from typing import List

import pandas as pd
from sentence_transformers.util import cos_sim

from utils.models import SBert


def p0_originality(df: pd.DataFrame, model_name: str) -> pd.DataFrame:
    """
    Score every row's originality as ``1 - cos_sim(prompt, response)``.

    Works row-wise: the prompt and the response of each row are passed through
    ``SBert(model_name)`` and the two results are compared with ``cos_sim``, so
    a lower similarity yields a higher originality score.

    NOTE: the scores are written into a new ``originality`` column of ``df``
    itself (the input frame is modified in place) and that same frame is
    returned.

    :param df: frame holding at least the columns ``prompt`` and ``response``
    :param model_name: model identifier handed to ``SBert``
    :return: ``df`` itself, extended by the float column ``originality``
    :raises ValueError: if ``prompt`` or ``response`` is missing from ``df``
    """
    # Explicit check instead of ``assert``: asserts vanish under ``python -O``.
    missing = [col for col in ('prompt', 'response') if col not in df.columns]
    if missing:
        raise ValueError(f'df is missing required column(s): {missing}')

    model = SBert(model_name)

    def get_cos_sim(prompt: str, response: str) -> float:
        # ``cos_sim`` yields a single-element result here; ``.item()`` unwraps
        # it into a plain Python number.
        return cos_sim(model(prompt), model(response)).item()

    scores = [
        1 - get_cos_sim(prompt, response)
        for prompt, response in zip(df['prompt'], df['response'])
    ]
    # Build the column explicitly (aligned on df.index, float dtype) so that an
    # empty frame simply gets an empty float column; ``df.apply(..., axis=1)``
    # returns a DataFrame rather than a Series when there are no rows.
    df['originality'] = pd.Series(scores, index=df.index, dtype='float64')
    return df


def p1_flexibility(df: pd.DataFrame, model_name: str) -> pd.DataFrame:
    """
    Score the flexibility of the responses within each (id, prompt) group.

    Works group-wise: all responses sharing the same ``id`` and ``prompt`` are
    passed through ``SBert(model_name)``, and the group's flexibility is the
    mean of ``1 - cos_sim`` over every pair of distinct responses.

    :param df: frame holding at least the columns ``id``, ``prompt`` and
        ``response``
    :param model_name: model identifier handed to ``SBert``
    :return: a new frame with one row per (id, prompt) group and the columns
        ``id``, ``prompt`` and ``flexibility``; groups with fewer than two
        responses have no pair to compare and get ``NaN``
    :raises ValueError: if ``id``, ``prompt`` or ``response`` is missing
    """
    # Explicit check instead of ``assert``: asserts vanish under ``python -O``.
    missing = [col for col in ('prompt', 'response', 'id') if col not in df.columns]
    if missing:
        raise ValueError(f'df is missing required column(s): {missing}')

    model = SBert(model_name)

    def get_flexibility(responses: pd.Series) -> float:
        # pandas hands over one group's ``response`` values; they are only
        # iterated here.
        responses_vec = [model(text) for text in responses]
        # Cosine similarity is symmetric, so each unordered pair is visited
        # once; the mean equals the one taken over both (i, j) and (j, i).
        distances = [
            1 - cos_sim(vec_a, vec_b).item()
            for vec_a, vec_b in combinations(responses_vec, 2)
        ]
        if not distances:
            # A single response has no pairs: report "undefined" instead of
            # dividing by zero.
            return float('nan')
        return sum(distances) / len(distances)

    # ``id`` and ``prompt`` are re-aggregated with 'first' so they remain as
    # ordinary columns once the group index is dropped by reset_index.
    df_out = (
        df.groupby(by=['id', 'prompt'])
        .agg({'id': 'first', 'prompt': 'first', 'response': get_flexibility})
        .rename(columns={'response': 'flexibility'})
        .reset_index(drop=True)
    )
    return df_out


if __name__ == '__main__':
    # Ad-hoc demo run: score the example CSV with both metrics using the same
    # sentence-embedding model.
    _model_name = 'paraphrase-multilingual-MiniLM-L12-v2'
    _df_input = pd.read_csv('data/tmp/example_3.csv')
    # p0_originality adds its column to _df_input in place, so the frame passed
    # to p1_flexibility below already carries 'originality'.
    _df_0 = p0_originality(_df_input, _model_name)
    _df_1 = p1_flexibility(_df_input, _model_name)