Spaces:

rjadr
/

ditaduranuncamais_explorer

Runtime error

App Files Files Community

rjadr committed on Jul 21, 2023

Commit

3ffc79c

•

1 Parent(s): 72bddeb

Update app.py

Browse files

Files changed (1) hide show

app.py +152 -4

app.py CHANGED Viewed

@@ -18,8 +18,15 @@ import networkx as nx
 import plotly.graph_objects as go
 import colorcet as cc
 from matplotlib.colors import rgb2hex
-st.set_page_config(layout="wide")
 model_dir = "./models/sbert.net_models_sentence-transformers_clip-ViT-B-32-multilingual-v1"
@@ -461,6 +468,49 @@ def plot_graph(_G: nx.Graph, layout: str = "fdp"):
     fig=go.Figure(data=data, layout=layout)
     return fig
 st.title("#ditaduranuncamais Data Explorer")
 def check_password():
@@ -503,7 +553,8 @@ df = load_dataframe(dataset)
 image_model = load_img_model()
 text_model = load_txt_model()
-menu_options = ["Data exploration", "Semantic search", "Hashtags", "Stats"]
 st.sidebar.markdown('# Menu')
 selected_menu_option = st.sidebar.radio("Select a page", menu_options)
@@ -634,7 +685,6 @@ elif selected_menu_option == "Hashtags":
     if col2.button("Reset"):
         st.session_state.dfx = df.copy()  # Reset dfx to the original DataFrame
-   # df2['Hashtags'] = df2['Hashtags'].apply(lambda x: [item for item in x if not item == 'ditaduranuncamais'])
     # Count the number of unique hashtags
     hashtags = [item for sublist in st.session_state.dfx['Hashtags'].tolist() for item in sublist]
     # Count the number of posts per hashtag
@@ -689,7 +739,6 @@ elif selected_menu_option == "Hashtags":
         for node in community:
             G_backbone.nodes[node]['community'] = i
     # Sort community hashtags based on their weighted degree in the network
     sorted_community_hashtags = [
         [
@@ -716,6 +765,105 @@ elif selected_menu_option == "Hashtags":
     st.markdown("### Hashtag Network Graph")
     st.plotly_chart(plot_graph(G_backbone, layout="fdp")) # fdp is relatively slow, use 'sfdp' or 'neato' for faster but denser layouts
 elif selected_menu_option == "Stats":
     st.markdown("### Time Series Analysis")

 import plotly.graph_objects as go
 import colorcet as cc
 from matplotlib.colors import rgb2hex
+from sklearn.cluster import KMeans
+from sklearn.decomposition import PCA
+import hdbscan
+import umap
+import numpy as np
+from bokeh.plotting import figure
+from bokeh.models import ColumnDataSource
+#st.set_page_config(layout="wide")
 model_dir = "./models/sbert.net_models_sentence-transformers_clip-ViT-B-32-multilingual-v1"
     fig=go.Figure(data=data, layout=layout)
     return fig
+@st.cache_data(show_spinner=True)
+def cluster_embeddings(embeddings, clustering_algo='KMeans', dim_reduction='PCA', n_clusters=5, min_cluster_size=5, n_components=2, n_neighbors=15, min_dist=0.0, random_state=42, min_samples=5):
+    """
+    A function to cluster embeddings.
+    Args:
+    embeddings (pd.Series): A series of numpy vectors.
+    clustering_algo (str): The clustering algorithm to use. Either 'KMeans' or 'HDBSCAN'.
+    dim_reduction (str): The dimensionality reduction method to use. Either 'PCA' or 'UMAP'.
+    n_clusters (int): The number of clusters for KMeans.
+    min_cluster_size (int): The minimum cluster size for HDBSCAN.
+    n_components (int): The number of components for the dimensionality reduction method.
+    n_neighbors (int): The number of neighbors for UMAP.
+    min_dist (float): The minimum distance for UMAP.
+    random_state (int): The seed used by the random number generator.
+    min_samples (int): The minimum number of samples for HDBSCAN.
+    Returns:
+    pd.Series: A series of cluster labels.
+    """
+    # Dimensionality reduction
+    if dim_reduction == 'PCA':
+        reducer = PCA(n_components=n_components, random_state=random_state)
+    elif dim_reduction == 'UMAP':
+        reducer = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, random_state=random_state)
+    else:
+        raise ValueError('Invalid dimensionality reduction method')
+    reduced_embeddings = reducer.fit_transform(np.stack(embeddings))
+    # Clustering
+    if clustering_algo == 'KMeans':
+        clusterer = KMeans(n_clusters=n_clusters, random_state=random_state)
+    elif clustering_algo == 'HDBSCAN':
+        clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples)
+    else:
+        raise ValueError('Invalid clustering algorithm')
+    labels = clusterer.fit_predict(reduced_embeddings)
+    return labels, reduced_embeddings
 st.title("#ditaduranuncamais Data Explorer")
 def check_password():
 image_model = load_img_model()
 text_model = load_txt_model()
+menu_options = ["Data exploration", "Semantic search", "Hashtags", "Clustering", "Stats"]
 st.sidebar.markdown('# Menu')
 selected_menu_option = st.sidebar.radio("Select a page", menu_options)
     if col2.button("Reset"):
         st.session_state.dfx = df.copy()  # Reset dfx to the original DataFrame
     # Count the number of unique hashtags
     hashtags = [item for sublist in st.session_state.dfx['Hashtags'].tolist() for item in sublist]
     # Count the number of posts per hashtag
         for node in community:
             G_backbone.nodes[node]['community'] = i
     # Sort community hashtags based on their weighted degree in the network
     sorted_community_hashtags = [
         [
     st.markdown("### Hashtag Network Graph")
     st.plotly_chart(plot_graph(G_backbone, layout="fdp")) # fdp is relatively slow, use 'sfdp' or 'neato' for faster but denser layouts
+elif selected_menu_option == "Clustering":
+    st.markdown("## Clustering")
+    st.markdown("Select the type of embeddings to cluster and the clustering algorithm and dimensionality reduction method to use in the sidebar. Then click run clustering. Clustering may take some time.")
+    st.sidebar.markdown("# Clustering Options")
+    type_embeddings = st.sidebar.selectbox("Type of embeddings to cluster", ["Text", "Image"])
+    clustering_algo = st.sidebar.selectbox("Clustering algorithm", ["HDBSCAN", "KMeans"])
+    dim_reduction = st.sidebar.selectbox("Dimensionality reduction method", ["UMAP", "PCA"])
+    if clustering_algo == "KMeans":
+        st.sidebar.markdown("### KMeans Options")
+        n_clusters = st.sidebar.slider("Number of clusters", 2, 20, 5)
+        min_cluster_size = None
+        min_samples = None
+    elif clustering_algo == "HDBSCAN":
+        st.sidebar.markdown("### HDBSCAN Options")
+        min_cluster_size = st.sidebar.slider("[Minimum cluster size](https://github.com/scikit-learn-contrib/hdbscan/blob/master/docs/parameter_selection.rst)", 2, 200, 5)
+        min_samples = st.sidebar.slider("Minimum samples", 2, 50, 5)
+        n_clusters = None
+    if dim_reduction == "UMAP":
+        st.sidebar.markdown("### UMAP Options")
+        n_components = st.sidebar.slider("Number of dimensions", 2, 80, 50)
+        n_neighbors = st.sidebar.slider("Number of neighbors", 2, 20, 15)
+        min_dist = st.sidebar.slider("Minimum distance", 0.0, 1.0, 0.0)
+    else:
+        st.sidebar.markdown("### PCA Options")
+        n_components = st.sidebar.slider("Number of dimensions", 2, 80, 2)
+        n_neighbors = None
+        min_dist = None
+    if st.sidebar.button('Run clustering'):
+        st.markdown("### Clustering Results")
+        if type_embeddings == "Text":
+            embeddings = dataset['txt_embs']
+        elif type_embeddings == "Image":
+            embeddings = dataset['img_embs']
+        # Cluster embeddings
+        labels, reduced_embeddings = cluster_embeddings(embeddings, clustering_algo=clustering_algo, dim_reduction=dim_reduction, n_clusters=n_clusters, min_cluster_size=min_cluster_size, n_components=n_components, n_neighbors=n_neighbors, min_dist=min_dist)
+        st.markdown(f"Clustering {type_embeddings} embeddings using {clustering_algo} with {dim_reduction} dimensionality reduction method resulting in **{len(set(labels))}** clusters.")
+        df_clustered = df.copy()
+        df_clustered['cluster'] = labels
+        df_clustered = df_clustered.set_index('cluster').reset_index()
+        st.dataframe(
+            data=filter_dataframe(df_clustered),
+        # use_container_width=True,
+            column_config={
+                "image": st.column_config.ImageColumn(
+                    "Image", help="Instagram image"
+                ),
+                "URL": st.column_config.LinkColumn(
+                    "Link", help="Instagram link", width="small"
+                )
+            },
+            hide_index=True,
+        )
+        st.markdown("### Cluster Plot")
+        # Plot the scatter plot in plotly with the cluster labels as colors reduce further to 2 dimensions if n_components > 2
+        if n_components > 2:
+            reducer = umap.UMAP(n_components=2, random_state=42)
+            reduced_embeddings = reducer.fit_transform(reduced_embeddings)
+            # set the labels to be the cluster labels dynamically
+        # visualise with bokeh showing df_clustered['Description'] and df_clustered['image'] on hover
+        descriptions = df_clustered['Description'].tolist()
+        images = df_clustered['image'].tolist()
+        glasbey_colors = cc.glasbey_hv
+        color_dict = {n: rgb2hex(glasbey_colors[i % len(glasbey_colors)]) for i, n in enumerate(set(labels))}
+        colors = [color_dict[label] for label in labels]
+        source = ColumnDataSource(data=dict(
+            x=reduced_embeddings[:, 0],
+            y=reduced_embeddings[:, 1],
+            desc=descriptions,
+            imgs=images,
+            colors=colors
+        ))
+        TOOLTIPS = """
+            <div>
+                <div>
+                    <img
+                        src="@imgs" height="100" alt="@imgs" width="100"
+                        style="float: left; margin: 0px 15px 15px 0px;"
+                        border="2"
+                    ></img>
+                </div>
+                <div>
+                    <span style="font-size: 12px; font-weight: bold;">@desc</span>
+                </div>
+            </div>
+        """
+        p = figure(width=800, height=800, tooltips=TOOLTIPS,
+                title="Mouse over the dots")
+        p.circle('x', 'y', size=10, source=source, color='colors', line_color=None)
+        st.bokeh_chart(p)
 elif selected_menu_option == "Stats":
     st.markdown("### Time Series Analysis")