import os
import re
import string
import json
from io import StringIO

import streamlit as st
import openai
import numpy as np
import pandas as pd
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.cluster import MiniBatchKMeans, AgglomerativeClustering
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

# Read the API key from the environment rather than hard-coding it
openai.api_key = os.getenv("OPENAI_API_KEY")

# Download the NLTK resources used for stopword removal and tokenization
nltk.download("stopwords")
nltk.download("punkt")
# Text preprocessing: lowercase, then strip stopwords, bracketed tags,
# ellipses, in-word dashes, punctuation, and extra whitespace
def clean_text_1(text):
    stop_words = set(stopwords.words("english"))
    text = str(text).lower()  # Lowercase first so capitalized stopwords match
    text = " ".join(word for word in text.split() if word not in stop_words)  # Remove stopwords
    text = re.sub(r"\[(.*?)\]", " ", text)  # Remove [+XYZ chars] in content
    text = re.sub(r"\w+…|…", " ", text)  # Remove ellipsis (and the word before it)
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text)  # Replace dash between words
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # Remove punctuation
    text = re.sub(r"\s+", " ", text)  # Collapse multiple spaces
    return text
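# Quick illustrative check of the cleaner (hypothetical input):
#   clean_text_1("The Movie was GREAT!")  ->  "movie great"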
# Load the Hugging Face sentence-transformer model used for embeddings
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
def get_embedding(text):
    # Optionally preprocess first: text = clean_text_1(text)
    return model.encode(text)
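# Note: SentenceTransformer.encode also accepts a list of strings, so the
# embeddings below could be computed in one batch, e.g.:
#   matrix = model.encode(df['text'].tolist())
# which is typically much faster than a row-by-row .apply.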
# Streamlit UI configuration
st.set_page_config(
    page_title="text_clustering.py",
    page_icon="👋",
)

# Upload file
uploaded_file = st.file_uploader("Choose a file")
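# Expected input (inferred from the code below): a CSV with a 'text' column,
# for example:
#   text
#   "The movie was great!"
#   "Terrible service, would not return."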
if uploaded_file:
    # Read data from file
    df = pd.read_csv(uploaded_file)
    # Drop rows with missing text
    df = df[df['text'].notna()].reset_index(drop=True)
    # Get embeddings
    df['embedding'] = df['text'].apply(get_embedding)
    matrix = np.vstack(df['embedding'].values)
    # Distance threshold slider
    distance_threshold = st.slider("Select Distance Threshold", min_value=0.1, max_value=2.0, value=1.1, step=0.1)
    # Perform hierarchical clustering; n_clusters is inferred from the threshold
    agg_clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=distance_threshold, linkage='ward')
    cluster_labels = agg_clustering.fit_predict(matrix)
    df['Cluster'] = cluster_labels
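    # Optional quality check (a sketch using the metrics imported above;
    # silhouette_score is only defined when there are at least 2 clusters):
    # if len(np.unique(cluster_labels)) > 1:
    #     st.write("Silhouette score:", silhouette_score(matrix, cluster_labels))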
    # Visualize clusters with t-SNE
    tsne = TSNE(n_components=2, perplexity=15, random_state=42, init="random", learning_rate=200)
    vis_dims2 = tsne.fit_transform(matrix)
    x = vis_dims2[:, 0]
    y = vis_dims2[:, 1]
    unique_clusters, cluster_counts = np.unique(cluster_labels, return_counts=True)
    # Create a colormap with one color per cluster
    colormap = plt.cm.get_cmap("viridis", len(unique_clusters))
    fig, ax = plt.subplots()
    for category, (color, size) in enumerate(zip(colormap.colors, cluster_counts)):
        xs = np.array(x)[cluster_labels == category]
        ys = np.array(y)[cluster_labels == category]
        ax.scatter(xs, ys, color=color, alpha=0.3, label=f'Cluster {category} (Size: {size})')
        # Mark each cluster's centroid with an "x"
        ax.scatter(xs.mean(), ys.mean(), marker="x", color=color, s=100)
    ax.set_title("Clusters visualized in 2D using t-SNE")
    ax.legend()
    # Display the plot in Streamlit
    st.pyplot(fig)
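    # Optional: a dendrogram built with the scipy imports above can help pick a
    # sensible distance threshold (a sketch, not wired into the slider):
    # Z = linkage(matrix, method="ward")
    # fig_d, ax_d = plt.subplots()
    # dendrogram(Z, ax=ax_d, no_labels=True)
    # st.pyplot(fig_d)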
    st.text_area("Number of Cluster Labels", value=str(len(np.unique(cluster_labels))))
    # Label each cluster by asking the model for the theme of a sample review
    rev_per_cluster = 1
    n_clusters = len(np.unique(cluster_labels))
    for i in range(n_clusters):
        reviews = "\n".join(
            df[df.Cluster == i]
            .text.str.replace("Title: ", "")
            .str.replace("\n\nContent: ", ": ")
            .sample(rev_per_cluster, random_state=42)
            .values
        )
        prompt = f'What do the following have in common?\n\nValues:\n"""\n{reviews}\n"""\n\nTheme:'
        # gpt-3.5-turbo-instruct is a completions model: it takes a prompt
        # (not chat messages) and returns its answer in choices[0].text
        response = openai.completions.create(
            model="gpt-3.5-turbo-instruct",
            prompt=prompt,
            temperature=0,
            max_tokens=64,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0)
        theme = response.choices[0].text.replace("\n", "")
        print(f"Cluster {i} Theme: {theme}")
        st.text_area(f"Cluster {i} Theme", value=theme)
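        # Equivalent chat-based call, if a chat model is preferred (a sketch):
        # response = openai.chat.completions.create(
        #     model="gpt-3.5-turbo",
        #     messages=[{"role": "user", "content": prompt}],
        #     temperature=0, max_tokens=64)
        # theme = response.choices[0].message.content.replace("\n", "")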
        # Optional sample inspection, commented out (expects Score/Summary/Text
        # columns that this app's CSV may not have):
        # sample_cluster_rows = df[df.Cluster == i].sample(rev_per_cluster, random_state=42)
        # for j in range(rev_per_cluster):
        #     print(sample_cluster_rows.Score.values[j], end=", ")
        #     print(sample_cluster_rows.Summary.values[j], end=": ")
        #     print(sample_cluster_rows.Text.str[:70].values[j])
        # print("-" * 100)