import json
import numpy
import os
# Load the Thirukural dataset and collect, for each kural, its English
# meaning (lower-cased so queries can be matched case-insensitively) and
# the kural's own lines.  A `with` block guarantees the file handle is
# closed even if json.load raises.
# NOTE(review): utf-8 assumed for the dataset file — TODO confirm; the
# original relied on the platform default encoding.
with open('thirukural_git.json', encoding='utf-8') as f:
    data = json.load(f)

en_translations = [kural['meaning']['en'].lower() for kural in data['kurals']]
kurals = [kural['kural'] for kural in data['kurals']]
from sentence_transformers import SentenceTransformer

# Sentence encoder shared by the corpus embedding below and by query
# encoding inside find_similarities().
model = SentenceTransformer('all-MiniLM-L6-v2')

# Pre-compute embeddings for every English translation once at import
# time, so each incoming query only has to encode itself.
# NOTE(review): a memmap-based on-disk cache of these embeddings was
# previously sketched here in commented-out code; reintroduce one if
# startup encoding time becomes a problem.
sen_embeddings = model.encode(en_translations)
def find_similarities(input: str):
    """Return the 3 kurals whose English meanings best match *input*.

    The query is lower-cased (matching the pre-processing applied to
    ``en_translations``), encoded with the shared ``model``, and ranked
    against the pre-computed ``sen_embeddings`` by cosine similarity.

    Parameters
    ----------
    input : str
        Free-text query.  (The name shadows the builtin ``input``; kept
        for backward compatibility with existing callers.)

    Returns
    -------
    str
        The three best-matching English meanings, each followed by the
        kural's own lines, concatenated best-match-first.
    """
    query_embedding = model.encode([input.lower()])[0]

    # Cosine similarity against ALL corpus embeddings, computed with
    # NumPy (dot product over the product of norms).  The original code
    # compared against sen_embeddings[1:] and compensated with index+1,
    # which silently excluded the first kural from every search — fixed
    # by ranking over the full matrix and indexing directly.
    matrix = numpy.asarray(sen_embeddings)
    scores = (matrix @ query_embedding) / (
        numpy.linalg.norm(matrix, axis=1) * numpy.linalg.norm(query_embedding)
    )

    # argpartition gives the top 3 in arbitrary order; sort them so the
    # response is ranked best-first.
    top3 = numpy.argpartition(scores, -3)[-3:]
    top3 = top3[numpy.argsort(scores[top3])[::-1]]

    response = ''
    for index in top3:
        print(scores[index])
        response += en_translations[index]
        print(en_translations[index])
        response += "\n" + "\n".join(kurals[index]) + "\n"
        print("\n".join(kurals[index]))
    return response
# Example interactive usage (uncomment to run, ideally under an
# `if __name__ == "__main__":` guard):
# while True:
#     text = input('Ask valluvar: ')
#     if text == 'exit':
#         break
#     find_similarities(text)