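"""Cluster articles with SentenceTransformer embeddings and label the resulting
clusters using TF-IDF keywords and per-cluster LDA topic modeling.

`cluster_and_label_articles` is the main entry point; the other functions are
standalone building blocks that can also be used individually.
"""
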
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import silhouette_score
from collections import defaultdict
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


def generate_embeddings(df, content_column):
    """
    Generate embeddings for the content using SentenceTransformer.
    """
    print("πŸ”’ Generating embeddings for clustering...")
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(df[content_column].tolist(), show_progress_bar=True)
    return embeddings


def determine_optimum_clusters(embeddings, min_clusters=2, max_clusters=10):
    """
    Determine the optimum number of clusters using silhouette analysis.
    """
    print("πŸ” Determining the optimum number of clusters using silhouette analysis...")
    n_samples = len(embeddings)
    if n_samples < 2:
        raise ValueError("Not enough samples to perform clustering. At least 2 samples are required.")

    # silhouette_score requires 2 <= n_clusters <= n_samples - 1, so cap max_clusters
    max_clusters = min(max_clusters, n_samples - 1)

    best_num_clusters = min_clusters
    best_score = -1

    for n_clusters in range(min_clusters, max_clusters + 1):
        try:
            # n_init=10 keeps results stable across scikit-learn versions
            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
            cluster_labels = kmeans.fit_predict(embeddings)
            score = silhouette_score(embeddings, cluster_labels)
            print(f"Number of clusters: {n_clusters}, Silhouette Score: {score:.4f}")

            if score > best_score:
                best_score = score
                best_num_clusters = n_clusters
        except ValueError as e:
            print(f"Skipping {n_clusters} clusters due to error: {e}")

    print(f"βœ… Optimum number of clusters determined: {best_num_clusters}")
    return best_num_clusters


def cluster_embeddings(embeddings, num_clusters):
    """
    Perform KMeans clustering on the embeddings.
    """
    print(f"πŸ“Š Clustering articles into {num_clusters} clusters using KMeans...")
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(embeddings)
    return kmeans.labels_, kmeans


def extract_tfidf_labels(df, content_column, cluster_labels):
    """
    Extract top TF-IDF keywords for each cluster.
    """
    print("πŸ”  Extracting TF-IDF-based keywords for cluster labels...")
    grouped = defaultdict(list)
    for idx, label in enumerate(cluster_labels):
        grouped[label].append(df.iloc[idx][content_column])

    tfidf_labels = {}
    for cluster_id, texts in grouped.items():
        vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english", max_features=50)
        tfidf_matrix = vectorizer.fit_transform(texts)
        avg_tfidf = tfidf_matrix.mean(axis=0).A1
        top_indices = np.argsort(avg_tfidf)[::-1][:3]
        top_terms = [vectorizer.get_feature_names_out()[i] for i in top_indices]
        tfidf_labels[cluster_id] = ", ".join(top_terms)

    return tfidf_labels


def apply_topic_modeling(df, content_column, cluster_labels, num_topics=2):
    """
    Apply topic modeling (LDA) within each cluster to refine and describe topics.
    """
    print("πŸ” Applying topic modeling within each cluster...")
    grouped = defaultdict(list)
    for idx, label in enumerate(cluster_labels):
        grouped[label].append(df.iloc[idx][content_column])

    topic_labels = {}
    for cluster_id, texts in grouped.items():
        vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english", max_features=5000)
        tfidf_matrix = vectorizer.fit_transform(texts)

        # Note: LDA is formulated for raw term counts; fitting it on TF-IDF
        # weights works in scikit-learn but is a pragmatic shortcut.
        lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
        lda.fit(tfidf_matrix)

        # Extract top words for each topic
        feature_names = vectorizer.get_feature_names_out()
        topics = []
        for topic in lda.components_:
            top_indices = topic.argsort()[:-4:-1]  # 3 highest-weight terms, descending
            topics.append(", ".join([feature_names[i] for i in top_indices]))
        topic_labels[cluster_id] = " | ".join(topics)

    return topic_labels


def filter_similar_topics(topic_keywords_list, threshold=0.75):
    """
    Filter out similar topics based on cosine similarity of their embeddings.
    """
    print("πŸ”„ Filtering similar topics...")
    model = SentenceTransformer('all-MiniLM-L6-v2')
    topic_sentences = [", ".join(kw) for kw in topic_keywords_list]
    embeddings = model.encode(topic_sentences)
    # Compute the pairwise similarity matrix once instead of calling
    # cosine_similarity for every candidate pair inside the loop.
    sim = cosine_similarity(embeddings)
    unique_indices = []
    for i in range(len(topic_keywords_list)):
        if all(sim[i, j] < threshold for j in unique_indices):
            unique_indices.append(i)
    return [topic_keywords_list[i] for i in unique_indices]
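
# Usage sketch (hypothetical keyword lists): with the default threshold,
# near-duplicate topics collapse to their first occurrence, e.g.
#   filter_similar_topics([["ai", "machine learning"],
#                          ["machine learning", "ai models"],
#                          ["league title", "penalty shootout"]])
# would be expected to keep only the first and third lists.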


def get_representative_summaries(df, summary_column, embeddings, cluster_labels, kmeans):
    """
    Get the most representative summary for each cluster based on proximity to the cluster centroid.
    """
    print("πŸ”„ Refining cluster labels using representative summaries...")
    representatives = {}
    for i in range(kmeans.n_clusters):
        indices = [j for j, label in enumerate(cluster_labels) if label == i]
        if not indices:
            continue
        member_embeddings = embeddings[indices]  # renamed to avoid shadowing the cluster_embeddings function
        centroid = kmeans.cluster_centers_[i]
        distances = np.linalg.norm(member_embeddings - centroid, axis=1)
        closest_idx = indices[np.argmin(distances)]
        representatives[i] = df.iloc[closest_idx][summary_column]

    return representatives


def cluster_and_label_articles(df, content_column="content", summary_column="summary", min_clusters=2, max_clusters=10, max_topics=3):
    """
    Cluster articles using SentenceTransformer embeddings and label clusters using TF-IDF and Topic Modeling.
    Display detected topics for each cluster with Primary focus and Related topics.
    """
    if df.empty:
        print("No articles to cluster.")
        return None

    # Step 1: Generate embeddings
    embeddings = generate_embeddings(df, content_column)

    # Step 2: Determine the optimum number of clusters
    num_clusters = determine_optimum_clusters(embeddings, min_clusters, max_clusters)

    # Step 3: Perform clustering
    cluster_labels, kmeans = cluster_embeddings(embeddings, num_clusters)
    df['cluster_label'] = cluster_labels  # numeric ids; replaced with keyword labels in the final step below

    # Step 4: Extract TF-IDF matrix
    print("πŸ”  Extracting TF-IDF matrix for clusters...")
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english", max_features=5000)
    tfidf_matrix = vectorizer.fit_transform(df[content_column].tolist())
    feature_names = vectorizer.get_feature_names_out()

    # Step 5: Process each cluster
    print("πŸ” Processing clusters for TF-IDF and topic modeling...")
    grouped = defaultdict(list)
    for idx, label in enumerate(cluster_labels):
        grouped[label].append(idx)

    refined_labels = [""] * num_clusters  # one keyword-based label per cluster, filled in below
    detected_topics = {}
    for cluster_id, indices in grouped.items():
        cluster_texts = tfidf_matrix[indices]

        # Extract TF-IDF keywords
        avg_tfidf = cluster_texts.mean(axis=0).A1
        top_indices = np.argsort(avg_tfidf)[::-1][:3]
        tfidf_keywords = [feature_names[i] for i in top_indices]

        # Generate a cluster label using the top TF-IDF keywords
        cluster_label_tfidf = ", ".join(tfidf_keywords)

        # Apply topic modeling (LDA on TF-IDF weights is a pragmatic shortcut;
        # LDA is formulated for raw term counts)
        lda = LatentDirichletAllocation(n_components=min(max_topics, len(indices)), random_state=42)
        lda.fit(cluster_texts)
        topics = []
        topic_weights = []
        for topic in lda.components_:
            top_topic_indices = topic.argsort()[:-4:-1]  # 3 highest-weight terms, descending
            topics.append(", ".join([feature_names[i] for i in top_topic_indices]))
            topic_weights.append(topic.sum())  # total term weight, used to rank topics

        # Rank topics by total weight (the key avoids comparing strings on ties)
        ranked_topics = [t for _, t in sorted(zip(topic_weights, topics), key=lambda pair: pair[0], reverse=True)]

        # Generate Primary focus and Related topics
        primary_focus = ranked_topics[0] if ranked_topics else "N/A"
        related_topics = ranked_topics[1:] if len(ranked_topics) > 1 else []

        # Store detected topics for user display
        detected_topics[cluster_label_tfidf] = {
            "primary_focus": primary_focus,
            "related_topics": related_topics,
        }

        # Assign the TF-IDF keywords as the cluster label
        refined_labels[cluster_id] = cluster_label_tfidf

    # Assign refined labels to clusters
    df['cluster_label'] = [refined_labels[label] for label in cluster_labels]

    print("βœ… Clustering and labeling complete!")
    return {
        "dataframe": df,
        "detected_topics": detected_topics,
        "number_of_clusters": num_clusters,
    }
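

if __name__ == "__main__":
    # Minimal smoke test on synthetic data; a sketch, not part of the pipeline.
    # The column names "content" and "summary" match the function defaults, and
    # the first run downloads the all-MiniLM-L6-v2 model, so network access is assumed.
    sample = pd.DataFrame({
        "content": [
            "The central bank raised interest rates again to curb stubborn inflation.",
            "Bond yields climbed this quarter as inflation data came in hot.",
            "A new transformer model tops the language-understanding benchmark leaderboard.",
            "Researchers released open weights for a large multilingual language model.",
            "The championship final was decided by a dramatic penalty shootout.",
            "A second-half hat-trick sealed the league title on the final matchday.",
        ],
        "summary": [
            "Rate hike announced.",
            "Yields rise on inflation.",
            "New model tops benchmark.",
            "Open model weights released.",
            "Final decided on penalties.",
            "Hat-trick clinches title.",
        ],
    })

    result = cluster_and_label_articles(sample, min_clusters=2, max_clusters=3)
    if result is not None:
        print(result["dataframe"][["content", "cluster_label"]])
        for label, topics in result["detected_topics"].items():
            print(f"{label} -> primary: {topics['primary_focus']}; related: {topics['related_topics']}")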