Spaces:
Sleeping
Sleeping
#
# Copyright (c) 2013-present, Anoop Kunchukuttan
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
from indicnlp import langinfo | |
from indicnlp.script.indic_scripts import * | |
import numpy as np | |
def equal(v1, v2):
    """Return 1.0 when the two feature vectors match exactly, else 0.0.

    The vectors are compared via ``xor_vectors``; any positive sum of the
    result means at least one position differs.
    """
    mismatch_total = np.sum(xor_vectors(v1, v2))
    if mismatch_total > 0:
        return 0.0
    return 1.0
def dice(v1, v2):
    """Return a Dice-style overlap score for two feature vectors.

    Computed as twice the dot product of the vectors divided by the sum of
    their lengths. Note that the denominator uses ``len()`` (the number of
    entries in each vector), not the number of non-zero features.
    """
    overlap = float(np.dot(v1, v2.T))
    combined_length = float(len(v1) + len(v2))
    return 2 * overlap / combined_length
def jaccard(v1, v2):
    """Return a Jaccard-style overlap score for two feature vectors.

    Computed as the dot product divided by (sum of the vector lengths minus
    the dot product). The lengths are ``len()`` of each vector (number of
    entries), not the number of non-zero features.
    """
    shared = float(np.dot(v1, v2.T))
    union_size = len(v1) + len(v2) - shared
    return shared / float(union_size)
def cosine(v1, v2):
    """Return the cosine similarity of two feature vectors.

    A small constant (0.00001) is added to the denominator so that
    all-zero vectors yield 0 instead of a division by zero.
    """
    numerator = float(np.dot(v1, v2.T))
    sq_norm_1 = float(np.dot(v1, v1.T))
    sq_norm_2 = float(np.dot(v2, v2.T))
    denominator = np.sqrt(sq_norm_1 * sq_norm_2) + 0.00001
    return numerator / denominator
def dotprod(v1, v2):
    """Return the dot product of two feature vectors as a Python float."""
    inner = np.dot(v1, v2.T)
    return float(inner)
def sim1(v1, v2, base=5.0):
    """Return ``base`` raised to the dot product of the two vectors.

    Args:
        v1: first feature vector.
        v2: second feature vector.
        base: base of the exponentiation (defaults to 5.0).
    """
    exponent = dotprod(v1, v2)
    return np.power(base, exponent)
def softmax(v1, v2):
    """Return e raised to the dot product of the two vectors.

    This is ``sim1`` with Euler's number as the base; no normalisation is
    applied here.
    """
    return sim1(v1, v2, base=np.e)
def create_similarity_matrix(sim_func, slang, tlang, normalize=True):
    """Build a character-by-character similarity matrix between two languages.

    Each offset in the coordinated range
    ``[langinfo.COORDINATED_RANGE_START_INCLUSIVE,
    langinfo.COORDINATED_RANGE_END_INCLUSIVE]`` is converted to a character
    of ``slang`` (rows) and ``tlang`` (columns) with ``offset_to_char``; the
    phonetic feature vectors of each character pair are then scored with
    ``sim_func``.

    Args:
        sim_func: callable taking two phonetic feature vectors and returning
            a scalar similarity (e.g. ``cosine``, ``dice``, ``sim1``).
        slang: source language code, used for the matrix rows.
        tlang: target language code, used for the matrix columns.
        normalize: when true, every row is divided by its row sum.

    Returns:
        A ``(dim, dim)`` NumPy array where ``dim`` is the size of the
        coordinated range. Row ``i`` / column ``j`` correspond to the
        offsets ``start + i`` / ``start + j``.

    Note:
        With ``normalize=True`` a row whose sum is zero is divided by zero,
        so NumPy will produce ``nan``/``inf`` entries for that row.
    """
    start = langinfo.COORDINATED_RANGE_START_INCLUSIVE
    end = langinfo.COORDINATED_RANGE_END_INCLUSIVE
    dim = end - start + 1
    offsets = range(start, end + 1)

    # The target-language vectors do not depend on the source row, so look
    # them up once instead of once per row.
    # NOTE(review): assumes get_phonetic_feature_vector is a side-effect-free
    # lookup and that sim_func does not mutate its arguments — confirm.
    target_vectors = [
        get_phonetic_feature_vector(offset_to_char(offset, tlang), tlang)
        for offset in offsets
    ]

    sim_mat = np.zeros((dim, dim))
    # Index by position within the range (offset - start) rather than by the
    # raw offset, so the matrix stays in bounds even if the range does not
    # start at 0. When the range starts at 0 the two are identical.
    for row, source_offset in enumerate(offsets):
        v1 = get_phonetic_feature_vector(
            offset_to_char(source_offset, slang), slang
        )
        for col, v2 in enumerate(target_vectors):
            sim_mat[row, col] = sim_func(v1, v2)

    if normalize:
        row_sums = np.sum(sim_mat, axis=1)
        # Broadcast the per-row sums down the columns.
        sim_mat = sim_mat / row_sums[:, np.newaxis]

    return sim_mat