# Spaces: Running
import numpy as np
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def calculate_cosine_similarity(dataset, input_text):
    """Find the dataset row whose transcription is most similar to *input_text*.

    The transcriptions and the input text are vectorized together with
    TF-IDF, the cosine similarity between the input and every transcription
    is computed, and the index of the best match is used to look up a row.

    Args:
        dataset: Object supporting ``dataset["transcription"]`` and yielding
            an iterable of transcription strings.
        input_text: The text to compare against the transcriptions.

    Returns:
        The row at the best-matching index of the ``train`` split of
        ``awajai/phase2dataset-tts``.

    Raises:
        ValueError: If ``dataset`` has no transcriptions to compare against.
    """
    # list() makes the concatenation below work whether the column comes back
    # as a plain list or as a lazy column object (this depends on the
    # installed `datasets` version).
    transcriptions = list(dataset["transcription"])
    if not transcriptions:
        raise ValueError("dataset contains no transcriptions to compare against")

    texts = transcriptions + [input_text]

    # Vectorize the texts. The TF-IDF matrix is kept sparse: densifying it
    # costs (number of texts x vocabulary size) floats, and cosine_similarity
    # accepts sparse input directly.
    tfidf_matrix = TfidfVectorizer().fit_transform(texts)

    # Compute cosine similarity. Slices (rather than integer indexing) keep
    # both operands two-dimensional.
    input_vector = tfidf_matrix[-1:]
    dataset_vectors = tfidf_matrix[:-1]
    similarities = cosine_similarity(input_vector, dataset_vectors).flatten()

    # int() converts the NumPy integer into a plain Python int before it is
    # used as a row index.
    highest_similarity_index = int(np.argmax(similarities))
    highest_similarity_score = similarities[highest_similarity_index]
    print('Highest Similarity Score Index:', highest_similarity_index)
    print('Highest Similarity Score:', highest_similarity_score)

    # NOTE(review): the row is read from a freshly loaded copy of the hub
    # dataset, not from `dataset`. This presumably relies on `dataset` having
    # the same row order as this split - confirm against the callers.
    hf_dataset = load_dataset('awajai/phase2dataset-tts', split='train')
    most_similar_row = hf_dataset[highest_similarity_index]
    return most_similar_row