"""Streamlit app: cluster uploaded texts with sentence-transformer embeddings,
visualize the clusters in 2D with t-SNE, and label each cluster's theme via
the OpenAI chat API."""
import os
import re
import string

import matplotlib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import openai
import pandas as pd
import streamlit as st
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.manifold import TSNE

openai.api_key = os.getenv("OPENAI_API_KEY")

# NLTK data needed for stopword removal
nltk.download("stopwords")
# Text preprocessing: lowercase first (so capitalized stopwords are caught),
# then strip stopwords, bracketed chunks, ellipses, dashes, and punctuation.
def clean_text_1(text):
    stop_words = set(stopwords.words("english"))
    text = str(text).lower()  # lowercase words
    text = " ".join(word for word in text.split() if word not in stop_words)
    text = re.sub(r"\[(.*?)\]", " ", text)  # remove [+XYZ chars] in content
    text = re.sub(r"\s+", " ", text)  # collapse repeated whitespace
    text = re.sub(r"\w+…|…", " ", text)  # remove ellipsis (and the word before it)
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text)  # split dash-joined words
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # remove punctuation
    return text
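# Illustrative example (assuming NLTK's English stopword list):
#   clean_text_1("The quick, brown fox!")  ->  "quick brown fox"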
# Load the Hugging Face sentence-transformer model used for embeddings
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Embed a single text with the sentence-transformer model
def get_embedding(text):
    # text = clean_text_1(text)  # optional: clean the text before embedding
    return model.encode(text)
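# Note: all-MiniLM-L6-v2 returns a 384-dimensional NumPy vector per text.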

# Streamlit UI configuration
st.set_page_config(
    page_title="Text Clustering",
    page_icon="👋",
)

# Upload file
uploaded_file = st.file_uploader("Choose a file")
if uploaded_file:
    # Read data from file (expects a CSV with a 'text' column)
    df = pd.read_csv(uploaded_file)

    # Drop rows with missing text
    df = df[df['text'].notna()].reset_index(drop=True)
    
    # Get embeddings
    df['embedding'] = df['text'].apply(get_embedding)
    matrix = np.vstack(df['embedding'].values)
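    # matrix has shape (n_texts, embedding_dim); scikit-learn expects this 2-D array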

    # Distance threshold slider
    distance_threshold = st.slider("Select Distance Threshold", min_value=0.1, max_value=2.0, value=1.1, step=0.1)

    # Perform clustering
    agg_clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=distance_threshold, linkage='ward')
    cluster_labels = agg_clustering.fit_predict(matrix)
    df['Cluster'] = cluster_labels
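    # With n_clusters=None, the hierarchy is cut wherever the ward merge
    # distance exceeds distance_threshold, so the cluster count is data-driven.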
    
    # Visualize clusters with t-SNE
    tsne = TSNE(n_components=2, perplexity=15, random_state=42, init="random", learning_rate=200)
    vis_dims2 = tsne.fit_transform(matrix)
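    # t-SNE here is for 2-D visualization only (clustering happened in the full
    # embedding space); note that perplexity must be below the number of samples.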

    x, y = vis_dims2[:, 0], vis_dims2[:, 1]

    unique_clusters, cluster_counts = np.unique(cluster_labels, return_counts=True)

    # Create a discrete colormap with one color per cluster
    # (plt.cm.get_cmap(name, lut) was removed in Matplotlib 3.9)
    colormap = matplotlib.colormaps["viridis"].resampled(len(unique_clusters))

    # Plot each cluster in t-SNE space and mark its centroid with an "x"
    fig, ax = plt.subplots()
    for category, (color, size) in enumerate(zip(colormap.colors, cluster_counts)):
        xs = np.array(x)[cluster_labels == category]
        ys = np.array(y)[cluster_labels == category]

        ax.scatter(xs, ys, color=color, alpha=0.3, label=f'Cluster {category} (Size: {size})')

        avg_x = xs.mean()
        avg_y = ys.mean()

        ax.scatter(avg_x, avg_y, marker="x", color=color, s=100)

    ax.set_title("Clusters in the embedding space, visualized in 2D with t-SNE")
    ax.legend()

    # Display the plot in Streamlit
    st.pyplot(fig)
    st.text_area("Number of clusters", value=str(len(np.unique(cluster_labels))))


    # Sample one review from each cluster and ask the model for its theme
    rev_per_cluster = 1
    n_clusters = len(np.unique(cluster_labels))

    for i in range(n_clusters):
        print(f"Cluster {i} Theme:", end=" ")

        reviews = "\n".join(
            df[df.Cluster == i]
            .text.str.replace("Title: ", "")
            .str.replace("\n\nContent: ", ":  ")
            .sample(rev_per_cluster, random_state=42)
            .values
        )
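        # The "Title:"/"Content:" replacements mirror the OpenAI cookbook review
        # format; str.replace treats them as literal strings here, so they are
        # harmless no-ops when those prefixes are absent.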

        messages = [
            {"role": "user", "content": f'What do the following  have in common?\n\nValues:\n"""\n{reviews}\n"""\n\nTheme:'}
        ]

        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages,
            temperature=0,
            max_tokens=64,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0)
        print(response.choices[0].message.content.replace("\n", ""))
        st.text_area(f"Cluster {i} Theme", value=response.choices[0].message.content.replace("\n", ""))

#         sample_cluster_rows = df[df.Cluster == i].sample(rev_per_cluster, random_state=42)
#         for j in range(rev_per_cluster):
#             print(sample_cluster_rows.Score.values[j], end=", ")
#             print(sample_cluster_rows.Summary.values[j], end=":   ")
#             print(sample_cluster_rows.Text.str[:70].values[j])

#         print("-" * 100)
#