File size: 1,942 Bytes
ac901c7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#
#  Copyright (c) 2013-present, Anoop Kunchukuttan
#  All rights reserved.
#
#  This source code is licensed under the MIT license found in the
#  LICENSE file in the root directory of this source tree.
#

from indicnlp import langinfo
from indicnlp.script.indic_scripts import *
import numpy as np


def equal(v1, v2):
    """Return 1.0 when the two vectors are judged identical, else 0.0.

    The vectors are combined with ``xor_vectors`` (pulled in by the star
    import from ``indicnlp.script.indic_scripts``) and the result is summed;
    a strictly positive sum means at least one position differs.
    """
    mismatch_count = np.sum(xor_vectors(v1, v2))
    if mismatch_count > 0:
        return 0.0
    return 1.0


def dice(v1, v2):
    """Return a Dice-style similarity between two feature vectors.

    Computes ``2 * (v1 . v2) / (len(v1) + len(v2))``.

    NOTE(review): the denominator uses the vector lengths (``len``), not the
    number of active features in each vector as in the textbook Dice
    coefficient -- confirm this is intended before changing it.
    """
    overlap = float(np.dot(v1, v2.T))
    total_length = float(len(v1) + len(v2))
    return (2 * overlap) / total_length


def jaccard(v1, v2):
    """Return a Jaccard-style similarity between two feature vectors.

    Computes ``(v1 . v2) / (len(v1) + len(v2) - v1 . v2)``.

    NOTE(review): as written, the union size is derived from the vector
    lengths (``len``) rather than from the number of active features --
    confirm this is intended before changing it.
    """
    overlap = float(np.dot(v1, v2.T))
    union_size = float(len(v1) + len(v2) - overlap)
    return overlap / union_size


def cosine(v1, v2):
    """Return the cosine similarity of two vectors.

    A small constant (0.00001) is added to the denominator so that an
    all-zero vector produces 0 instead of a division by zero.
    """
    inner = float(np.dot(v1, v2.T))
    squared_norm_1 = float(np.dot(v1, v1.T))
    squared_norm_2 = float(np.dot(v2, v2.T))
    denominator = np.sqrt(squared_norm_1 * squared_norm_2) + 0.00001
    return inner / denominator


def dotprod(v1, v2):
    """Return the dot product of ``v1`` and ``v2`` as a Python float."""
    product = np.dot(v1, v2.T)
    return float(product)


def sim1(v1, v2, base=5.0):
    """Return ``base`` raised to the dot product of ``v1`` and ``v2``.

    The dot product is obtained from the module-level ``dotprod`` function.
    """
    exponent = dotprod(v1, v2)
    return np.power(base, exponent)


def softmax(v1, v2):
    """Return ``e`` raised to the dot product of ``v1`` and ``v2``.

    This is ``sim1`` with base ``np.e``; no normalisation is applied here.
    """
    return sim1(v1, v2, base=np.e)


def create_similarity_matrix(sim_func, slang, tlang, normalize=True):
    """Build a character-to-character similarity matrix between two languages.

    Every offset in the coordinated range
    ``[langinfo.COORDINATED_RANGE_START_INCLUSIVE,
    langinfo.COORDINATED_RANGE_END_INCLUSIVE]`` is mapped to a character in
    each language via ``offset_to_char``; the phonetic feature vectors of the
    source and target characters are then compared with ``sim_func``.

    Args:
        sim_func: Callable ``sim_func(v1, v2)`` returning a scalar similarity
            for a source vector ``v1`` and a target vector ``v2``.
        slang: Source language, passed to ``offset_to_char`` and
            ``get_phonetic_feature_vector``.
        tlang: Target language, passed to the same helpers.
        normalize: When true, each row is divided by its own sum.

    Returns:
        A square ``numpy.ndarray`` with one row/column per offset in the
        coordinated range. Entry ``[i, j]`` is the similarity between the
        source character at offset ``start + i`` and the target character at
        offset ``start + j``.

    Note:
        With ``normalize=True`` a row whose similarities sum to zero is
        divided by zero, so NumPy produces non-finite values for that row.
    """
    start = langinfo.COORDINATED_RANGE_START_INCLUSIVE
    end = langinfo.COORDINATED_RANGE_END_INCLUSIVE
    offsets = range(start, end + 1)
    dim = end - start + 1

    # The target-side vectors do not depend on the source offset, so compute
    # them once instead of once per source character.
    target_vectors = [
        get_phonetic_feature_vector(offset_to_char(offset, tlang), tlang)
        for offset in offsets
    ]

    sim_mat = np.zeros((dim, dim))
    # Index the matrix relative to the start of the range: it has only
    # ``dim`` rows/columns, so absolute offsets would be out of bounds (or
    # misplaced) whenever the range does not begin at 0.
    for row, source_offset in enumerate(offsets):
        v1 = get_phonetic_feature_vector(
            offset_to_char(source_offset, slang), slang
        )
        for col, v2 in enumerate(target_vectors):
            sim_mat[row, col] = sim_func(v1, v2)

    if normalize:
        row_sums = np.sum(sim_mat, axis=1)
        # Broadcast the per-row sums down the columns (row-wise normalisation).
        sim_mat = sim_mat / row_sums[:, np.newaxis]

    return sim_mat