import re
import string

import numpy as np
import pandas as pd
import streamlit as st
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.manifold import TSNE

# Download the NLTK resources used by the preprocessing step
nltk.download("stopwords")
nltk.download("punkt")
# Text preprocessing: drop stopwords, lowercase, and strip noise and punctuation
def clean_text_1(text):
    stop_words = set(stopwords.words("english"))

    def remove_stopwords(text):
        return " ".join([word for word in str(text).split() if word not in stop_words])

    text = remove_stopwords(text)
    text = str(text).lower()                     # Lowercase words
    text = re.sub(r"\[(.*?)\]", " ", text)       # Remove [+XYZ chars] spans in content
    text = re.sub(r"\s+", " ", text)             # Collapse multiple spaces
    text = re.sub(r"\w+…|…", " ", text)          # Remove ellipsis (and the word attached to it)
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text)  # Replace dashes between words with spaces
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # Remove punctuation
    return text
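# Illustrative note (not in the original script): stopwords are removed *before*
# lowercasing and dash-splitting, so capitalized or hyphen-embedded stopwords
# can survive, e.g.:
#   clean_text_1("The state-of-the-art results… [+42 chars]")
#   -> roughly "the state of the art" (modulo extra spaces)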
# Hugging Face sentence-embedding model used for clustering
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# OpenAI client for the theme-labelling step (reads OPENAI_API_KEY from the environment)
client = OpenAI()
# Encode a single text into a dense embedding vector
def get_embedding(text):
    # Optionally pre-clean the text first:
    # text = clean_text_1(text)
    return model.encode(text)
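# Note (optimization sketch, not in the original script): SentenceTransformer.encode
# also accepts a list of strings and batches internally, so for larger files it is
# usually much faster to embed the whole column at once, e.g.:
#   matrix = model.encode(df['text'].tolist())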
# Streamlit UI configuration
st.set_page_config(
    page_title="text_clustering.py",
    page_icon="👋",
)
# Upload file
uploaded_file = st.file_uploader("Choose a file")

if uploaded_file:
    # Read data from file; the CSV is expected to have a 'text' column
    df = pd.read_csv(uploaded_file)

    # Drop rows with missing text
    df = df[df['text'].notna()].reset_index(drop=True)

    # Get embeddings: matrix has shape (n_rows, 384) for all-MiniLM-L6-v2
    df['embedding'] = df['text'].apply(get_embedding)
    matrix = np.vstack(df['embedding'].values)
    # Distance threshold slider: a larger threshold merges more, yielding fewer clusters
    distance_threshold = st.slider("Select Distance Threshold", min_value=0.1, max_value=2.0, value=1.1, step=0.1)

    # Perform clustering; with n_clusters=None, the distance threshold decides the cut
    agg_clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=distance_threshold, linkage='ward')
    cluster_labels = agg_clustering.fit_predict(matrix)
    df['Cluster'] = cluster_labels
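    # Optional sanity check (sketch, not in the original script): report how many
    # clusters the chosen threshold produced before plotting.
    #   st.write(f"{agg_clustering.n_clusters_} clusters at threshold {distance_threshold}")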
    # Visualize clusters with t-SNE
    tsne = TSNE(n_components=2, perplexity=15, random_state=42, init="random", learning_rate=200)
    vis_dims2 = tsne.fit_transform(matrix)
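    # Note (sketch, not in the original script): scikit-learn requires
    # perplexity < n_samples, so very small uploads would need a lower value, e.g.:
    #   perplexity = min(15, max(2, len(df) - 1))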
    x = vis_dims2[:, 0]
    y = vis_dims2[:, 1]

    unique_clusters, cluster_counts = np.unique(cluster_labels, return_counts=True)

    # Create a colormap with one color per cluster (on matplotlib >= 3.9,
    # use matplotlib.colormaps["viridis"].resampled(...) instead)
    colormap = plt.cm.get_cmap("viridis", len(unique_clusters))
    # Scatter-plot each cluster, marking its centroid with an 'x'
    fig, ax = plt.subplots()
    for category, (color, size) in enumerate(zip(colormap.colors, cluster_counts)):
        xs = x[cluster_labels == category]
        ys = y[cluster_labels == category]
        ax.scatter(xs, ys, color=color, alpha=0.3, label=f'Cluster {category} (Size: {size})')
        ax.scatter(xs.mean(), ys.mean(), marker="x", color=color, s=100)

    ax.set_title("Clusters identified, visualized in 2D using t-SNE")
    ax.legend()

    # Display the plot in Streamlit
    st.pyplot(fig)
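    # Note (sketch, not in the original script): every slider move reruns the whole
    # script, which re-embeds the uploaded file. A cached helper avoids that, since
    # st.cache_data memoizes on the function's arguments:
    #   @st.cache_data
    #   def embed_texts(texts: list[str]) -> np.ndarray:
    #       return model.encode(texts)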
st.text_area("Number of Cluster Labels", value=len(np.unique(cluster_labels.tolist())))
# Reading a review which belong to each group.
rev_per_cluster = 3
n_clusters = len(np.unique(cluster_labels.tolist()))
    for i in range(n_clusters):
        cluster_texts = df[df.Cluster == i].text
        # Guard against clusters smaller than rev_per_cluster
        n_sample = min(rev_per_cluster, len(cluster_texts))
        # The replacements assume OpenAI-cookbook-style "Title: ...\n\nContent: ..."
        # rows; they are no-ops for plain text.
        reviews = "\n".join(
            cluster_texts
            .str.replace("Title: ", "")
            .str.replace("\n\nContent: ", ": ")
            .sample(n_sample, random_state=42)
            .values
        )
        messages = [
            {"role": "user", "content": f'What do the following have in common?\n\nValues:\n"""\n{reviews}\n"""\n\nTheme:'}
        ]
        response = client.chat.completions.create(
            model="gpt-4",
            messages=messages,
            temperature=0,
            max_tokens=64,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
        )
        theme = response.choices[0].message.content.replace("\n", "")
        print(f"Cluster {i} Theme: {theme}")
        st.text_area(f"Cluster {i} Theme", value=theme)
        # Optionally show the sampled rows for each cluster alongside its theme:
        # sample_rows = df[df.Cluster == i].sample(n_sample, random_state=42)
        # for snippet in sample_rows.text.str[:70]:
        #     st.write(snippet)
        # st.write("-" * 100)