# TTSNepali / cosine_similarity.py
# Uploaded by aryamanstha ("Upload cosine_similarity.py", commit 4b00955, verified)
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from datasets import load_dataset
def calculate_cosine_similarity(dataset, input_text):
    """Return the hub-dataset row whose transcription best matches *input_text*.

    The transcriptions in ``dataset["transcription"]`` and ``input_text`` are
    vectorized together with TF-IDF, and the transcription with the highest
    cosine similarity to ``input_text`` is selected (first one on ties, as
    per ``np.argmax``). The winning index and score are printed.

    Args:
        dataset: Any object whose ``["transcription"]`` item is an iterable
            of strings.
        input_text: The query text to compare against the transcriptions.

    Returns:
        The row at the winning index of the ``train`` split of
        ``awajai/phase2dataset-tts``, loaded via ``load_dataset``.

    Raises:
        ValueError: If ``dataset`` contains no transcriptions (also raised by
            scikit-learn when the texts yield an empty vocabulary).
    """
    # list() makes the concatenation below work even when the column is not a
    # plain Python list (e.g. a lazy column object or another sequence type).
    transcriptions = list(dataset["transcription"])
    if not transcriptions:
        raise ValueError("dataset contains no transcriptions to compare against")
    texts = transcriptions + [input_text]

    # Vectorize the texts. The result is kept sparse: cosine_similarity
    # accepts sparse input, so densifying a (rows x vocabulary) matrix would
    # only cost memory.
    tfidf_matrix = TfidfVectorizer().fit_transform(texts)

    # Compute cosine similarity. Range slices (rather than integer indexing)
    # keep both operands 2-D, which is what cosine_similarity expects.
    input_vector = tfidf_matrix[-1:]
    dataset_vectors = tfidf_matrix[:-1]
    similarities = cosine_similarity(input_vector, dataset_vectors).flatten()

    highest_similarity_index = int(np.argmax(similarities))
    highest_similarity_score = similarities[highest_similarity_index]
    print('Highest Similarity Score Index:',highest_similarity_index)
    print('Highest Similarity Score:',highest_similarity_score)

    # NOTE(review): the index above is relative to `dataset`, but the row is
    # read from a freshly loaded hub split. This presumably assumes `dataset`
    # has the same rows in the same order as that split - confirm with the
    # caller, and consider caching the load, which is repeated on every call.
    hf_dataset = load_dataset('awajai/phase2dataset-tts', split='train')
    most_similar_row = hf_dataset[highest_similarity_index]
    return most_similar_row