import spacy
import numpy as np

# Requires the medium English model, which ships with word vectors:
#   python -m spacy download en_core_web_md
nlp = spacy.load("en_core_web_md")


def cosine_similarity(a, b):
    # Guard against zero vectors (e.g. all tokens out of vocabulary) to avoid NaN.
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return np.dot(a, b) / denom if denom else 0.0


def find_representative_sub_topic(sub_topics):
    # Embed each sub-topic as the mean of its (lower-cased) word vectors.
    sub_topic_vectors = [
        np.mean([nlp(word.lower()).vector for word in topic.split()], axis=0)
        for topic in sub_topics
    ]

    # For each sub-topic, average its cosine similarity to every other sub-topic.
    avg_similarities = {}
    for i, vec1 in enumerate(sub_topic_vectors):
        total_sim = 0
        for j, vec2 in enumerate(sub_topic_vectors):
            if i != j:
                total_sim += cosine_similarity(vec1, vec2)
        avg_similarities[sub_topics[i]] = total_sim / (len(sub_topic_vectors) - 1)

    # The sub-topic closest to all the others, on average, is the most representative.
    most_representative_sub_topic = max(avg_similarities, key=avg_similarities.get)

    return most_representative_sub_topic


sub_topics1 = ['Machine Learning', 'Deep Learning', 'Supervised Learning']
sub_topics2 = ['Web Development', 'Web Design', 'Website Building']
sub_topics3 = ['Healthy Eating', 'Nutrition', 'Balanced Diet', 'Wellness']
sub_topics4 = ['Hours Support', 'Working Hours', 'Morning Brief Calls']

print(f"Representative sub-topic for {sub_topics1}: {find_representative_sub_topic(sub_topics1)}")
print(f"Representative sub-topic for {sub_topics2}: {find_representative_sub_topic(sub_topics2)}")
print(f"Representative sub-topic for {sub_topics3}: {find_representative_sub_topic(sub_topics3)}")
print(f"Representative sub-topic for {sub_topics4}: {find_representative_sub_topic(sub_topics4)}")