# Rajendransp133's picture
# Upload 86 files
# ac901c7 verified
#
# Copyright (c) 2013-present, Anoop Kunchukuttan
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
from indicnlp import langinfo
from indicnlp.script.indic_scripts import *
import numpy as np
def equal(v1, v2):
    """Return 1.0 if the vectors are judged identical, otherwise 0.0.

    The decision is based on ``xor_vectors(v1, v2)``: when the sum of its
    result is positive the vectors are treated as different.
    """
    difference_total = np.sum(xor_vectors(v1, v2))
    if difference_total > 0:
        return 0.0
    return 1.0
def dice(v1, v2):
    """Return a Dice-style similarity score for two vectors.

    Computed as ``2 * (v1 . v2) / (len(v1) + len(v2))``.

    NOTE(review): the denominator uses the element counts of the vectors
    (``len``), not the number of active features in each -- confirm this
    is the intended normalization.
    """
    overlap = float(np.dot(v1, v2.T))
    combined_length = float(len(v1) + len(v2))
    return 2 * overlap / combined_length
def jaccard(v1, v2):
    """Return a Jaccard-style similarity score for two vectors.

    Computed as ``(v1 . v2) / (len(v1) + len(v2) - (v1 . v2))``.

    NOTE(review): as written, the "union" term is built from the element
    counts of the vectors (``len``), not from the number of active
    features -- confirm this is the intended normalization.
    """
    overlap = float(np.dot(v1, v2.T))
    union_size = float(len(v1) + len(v2) - overlap)
    return overlap / union_size
def cosine(v1, v2):
    """Return the cosine similarity of two vectors.

    A small constant (0.00001) is added to the denominator so that a
    zero-norm vector yields 0 instead of a division by zero; as a side
    effect, identical vectors score marginally below 1.
    """
    inner = float(np.dot(v1, v2.T))
    squared_norm_1 = float(np.dot(v1, v1.T))
    squared_norm_2 = float(np.dot(v2, v2.T))
    denominator = np.sqrt(squared_norm_1 * squared_norm_2) + 0.00001
    return inner / denominator
def dotprod(v1, v2):
    """Return the dot product of ``v1`` and the transpose of ``v2`` as a float."""
    product = np.dot(v1, v2.T)
    return float(product)
def sim1(v1, v2, base=5.0):
    """Return ``base`` raised to the dot product of the two vectors.

    With the default ``base`` of 5.0, every additional unit of dot product
    multiplies the score by five.
    """
    exponent = dotprod(v1, v2)
    return np.power(base, exponent)
def softmax(v1, v2):
    """Return ``e`` raised to the dot product of the two vectors.

    This is ``sim1`` with Euler's number as the base; the result is an
    unnormalized exponential score, not a probability distribution.
    """
    return sim1(v1, v2, base=np.e)
def create_similarity_matrix(sim_func, slang, tlang, normalize=True):
    """Build a character-to-character similarity matrix between two languages.

    Every offset in the coordinated range
    ``[langinfo.COORDINATED_RANGE_START_INCLUSIVE,
    langinfo.COORDINATED_RANGE_END_INCLUSIVE]`` is mapped to a character in
    ``slang`` (rows) and in ``tlang`` (columns) via ``offset_to_char``. The
    phonetic feature vectors of each character pair are then scored with
    ``sim_func``.

    Args:
        sim_func: Callable ``sim_func(v1, v2)`` returning a scalar similarity
            for two phonetic feature vectors (e.g. ``cosine``, ``sim1``).
        slang: Source language code; determines the row characters.
        tlang: Target language code; determines the column characters.
        normalize: When true, each row is divided by its sum.

    Returns:
        A square ``numpy`` array with one row/column per offset in the
        coordinated range. Entry ``[i, j]`` is the similarity between the
        source character at offset ``START + i`` and the target character
        at offset ``START + j``.

    Note:
        With ``normalize=True``, a row whose sum is zero produces non-finite
        values (NaN/inf), because the division is performed unconditionally.
    """
    range_start = langinfo.COORDINATED_RANGE_START_INCLUSIVE
    range_end = langinfo.COORDINATED_RANGE_END_INCLUSIVE
    offsets = range(range_start, range_end + 1)
    dim = len(offsets)

    # Target vectors do not depend on the source character, so compute them
    # once instead of once per row.
    target_vectors = [
        get_phonetic_feature_vector(offset_to_char(offset, tlang), tlang)
        for offset in offsets
    ]

    sim_mat = np.zeros((dim, dim))
    # Index by position within the range (offset - range_start) so the matrix
    # stays in bounds even if the coordinated range does not start at zero.
    for row, src_offset in enumerate(offsets):
        src_vector = get_phonetic_feature_vector(
            offset_to_char(src_offset, slang), slang
        )
        for col, tgt_vector in enumerate(target_vectors):
            sim_mat[row, col] = sim_func(src_vector, tgt_vector)

    if normalize:
        row_sums = np.sum(sim_mat, axis=1, keepdims=True)
        sim_mat = sim_mat / row_sums
    return sim_mat