Thamizh / semanticsearch.py
thiyagab
Changed to gradio
b10d5a0
raw
history blame
1.88 kB
# Sample sentences for trying out the encoder. Sentences 0 and 2 are
# paraphrases of each other (same meaning, different wording), not identical.
# NOTE(review): `sen` is not referenced by the rest of the visible script.
sen = [
    "Three years later, the coffin was still full of Jello.",
    "The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
    "The person box was packed with jelly many dozens of months later.",
    "He found a leprechaun in his walnut shell.",
]
import json
import numpy
import os
# Load the Thirukural corpus. The path is relative, so it is resolved against
# the current working directory. A context manager guarantees the handle is
# closed even if parsing fails, and UTF-8 is requested explicitly so decoding
# does not depend on the platform's default locale encoding.
with open('thirukural_git.json', encoding='utf-8') as f:
    data = json.load(f)

# Parallel lists: en_translations[i] is the lower-cased English meaning of
# kurals[i]; both are filled in the order the entries appear in the file.
en_translations = []
kurals = []
for kural in data['kurals']:
    en_translations.append(kural['meaning']['en'].lower())
    kurals.append(kural['kural'])
from sentence_transformers import SentenceTransformer

# Pretrained sentence-embedding checkpoint used for both corpus and queries.
_MODEL_NAME = 'all-MiniLM-L6-v2'

model = SentenceTransformer(_MODEL_NAME)
# model.tokenizer.add_special_tokens({'pad_token':'[thiyaga]'})

# Embed every English translation once, at import time, so each query only
# has to encode its own text before being compared against this matrix.
sen_embeddings = model.encode(en_translations)

# Earlier experiment with caching the embeddings on disk (kept for reference):
# sen_embeddings= numpy.memmap('trainedmodel',mode="r",dtype=numpy.float32,shape=(1330,768))
# sen_embeddings.tofile('trainedmodel')
def find_similarities(input: str, top_k: int = 3) -> str:
    """Return the kurals whose English meaning is closest to *input*.

    The query is lower-cased (the corpus translations are lower-cased when
    loaded), embedded with the module-level ``model`` and compared by cosine
    similarity against ``sen_embeddings``.

    Args:
        input: Free-text query.
        top_k: Maximum number of matches to return. Defaults to 3 and is
            clamped to the number of available embeddings.

    Returns:
        One section per match, best match first. Each section is the kural
        lines joined by newlines, then the English translation, then a
        blank line. Returns an empty string when there is nothing to rank.
    """
    from sklearn.metrics.pairwise import cosine_similarity

    input_embeddings = model.encode([input.lower()])

    # Score against the whole corpus. The previous code compared against
    # sen_embeddings[1:] and patched indices with +1, which made the first
    # kural unreachable.
    scores = cosine_similarity([input_embeddings[0]], sen_embeddings)[0]

    # Clamp so that a corpus smaller than top_k cannot raise.
    wanted = max(0, min(top_k, len(scores)))
    if wanted == 0:
        return ''

    # Full descending sort so results come back ranked; argpartition alone
    # only guarantees membership in the top group, not its order.
    best_indices = numpy.argsort(scores)[::-1][:wanted]

    sections = []
    for index in best_indices:
        # presumably each kurals entry is a list of text lines — TODO confirm
        kural_text = "\n".join(kurals[index])
        print(scores[index])
        print(en_translations[index])
        sections.append(kural_text + "\n" + en_translations[index] + "\n\n")
        print(kural_text)
    return "".join(sections)
# while True:
# text=input('Ask valluvar: ')
# if( text == 'exit'):
# break
# find_similarities(text)