rjadr committed
Commit
d8cce8d
1 Parent(s): 92d6900

Update app.py

Files changed (1)
  app.py +340 -3
app.py CHANGED
@@ -13,6 +13,11 @@ from pandas.api.types import (
)
import subprocess
from tempfile import NamedTemporaryFile
+ from itertools import combinations
+ import networkx as nx
+ import plotly.graph_objects as go
+ import colorcet as cc
+ from matplotlib.colors import rgb2hex

st.set_page_config(layout="wide")

 
@@ -38,6 +43,11 @@ def load_dataset():
@st.cache_data(show_spinner=False)
def load_dataframe(_dataset):
    dataframe = _dataset.remove_columns(['txt_embs', 'img_embs']).to_pandas()
+     # Extract hashtags with regex and convert to a set per post
+     dataframe['Hashtags'] = dataframe.apply(lambda row: f"{row['Description']} {row['Image Text']}", axis=1)
+     dataframe['Hashtags'] = dataframe['Hashtags'].str.lower().str.findall(r'#(\w+)').apply(set)
+     # Optionally drop specific hashtags (e.g. 'ditaduranuncamais') from the per-post hashtag sets
+     # dataframe['Hashtags'] = dataframe['Hashtags'].apply(lambda x: [item for item in x if not item.startswith('ditaduranuncamais')])
    # dataframe['Post Created'] = dataframe['Post Created'].dt.tz_convert('UTC')
    dataframe = dataframe[['Post Created', 'image', 'Description', 'Image Text', 'Account', 'User Name'] + [col for col in dataframe.columns if col not in ['Post Created', 'image', 'Description', 'Image Text', 'Account', 'User Name']]]
    return dataframe
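
A quick sketch of what the new extraction step yields, using an invented two-row frame (the column names follow the app's dataframe; the post texts are made up):

```python
import pandas as pd

# Invented sample posts with the two text columns the app combines
df = pd.DataFrame({
    'Description': ['Memoria e verdade #DitaduraNuncaMais #Memoria', 'Ato no centro #memoria'],
    'Image Text': ['#verdade', ''],
})

# Same steps as in load_dataframe above
df['Hashtags'] = df.apply(lambda row: f"{row['Description']} {row['Image Text']}", axis=1)
df['Hashtags'] = df['Hashtags'].str.lower().str.findall(r'#(\w+)').apply(set)

print(df['Hashtags'].tolist())
# e.g. [{'ditaduranuncamais', 'memoria', 'verdade'}, {'memoria'}] (set order may vary)
```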
 
@@ -226,6 +236,231 @@ def image_to_image(image, k=5):
    scores, samples = dataset.get_nearest_examples('img_embs', img_emb, k=k)
    return postprocess_results(scores, samples)

+ def disparity_filter(g: nx.Graph, weight: str = 'weight', alpha: float = 0.05) -> nx.Graph:
+     """
+     Computes the backbone of the input graph using the disparity filter algorithm.
+
+     The algorithm is proposed in:
+     M. A. Serrano, M. Boguna, and A. Vespignani,
+     "Extracting the Multiscale Backbone of Complex Weighted Networks",
+     PNAS, 106(16), pp 6483--6488 (2009).
+     DOI: 10.1073/pnas.0808904106
+
+     Implementation taken from https://groups.google.com/g/networkx-discuss/c/bCuHZ3qQ2po/m/QvUUJqOYDbIJ
+
+     Parameters
+     ----------
+     g : NetworkX graph
+         The input graph.
+     weight : str, optional (default='weight')
+         The name of the edge attribute to use as weight.
+     alpha : float, optional (default=0.05)
+         The statistical significance level for the disparity filter (p-value).
+
+     Returns
+     -------
+     backbone_graph : NetworkX graph
+         The backbone graph.
+     """
+     # Create an empty graph for the backbone
+     backbone_graph = nx.Graph()
+
+     # Iterate over all nodes in the input graph
+     for node in g:
+         # Get the degree of the node (number of edges connected to the node)
+         k_n = len(g[node])
+
+         # Only proceed if the node has more than one connection
+         if k_n > 1:
+             # Calculate the sum of weights of edges connected to the node
+             sum_w = sum(g[node][neighbor][weight] for neighbor in g[node])
+
+             # Iterate over all neighbors of the node
+             for neighbor in g[node]:
+                 # Get the weight of the edge between the node and its neighbor
+                 edge_weight = g[node][neighbor][weight]
+
+                 # Calculate the proportion of the total weight that this edge represents
+                 pij = float(edge_weight) / sum_w
+
+                 # Perform the disparity filter test. If it passes, the edge is considered significant and is added to the backbone
+                 if (1 - pij) ** (k_n - 1) < alpha:
+                     backbone_graph.add_edge(node, neighbor, weight=edge_weight)
+
+     # Return the backbone graph
+     return backbone_graph
+
+ st.cache_data(show_spinner=True)
+ def assign_community_colors(G: nx.Graph, attr: str = 'community') -> dict:
+     """
+     Assigns a unique color to each community in the input graph.
+
+     Parameters
+     ----------
+     G : nx.Graph
+         The input graph.
+     attr : str, optional
+         The node attribute of the community names or indexes (default is 'community').
+
+     Returns
+     -------
+     dict
+         A dictionary mapping each community to a unique color.
+     """
+     glasbey_colors = cc.glasbey_hv
+     communities_ = set(nx.get_node_attributes(G, attr).values())
+     return {community: rgb2hex(glasbey_colors[i % len(glasbey_colors)]) for i, community in enumerate(communities_)}
+
+ st.cache_data(show_spinner=True)
+ def generate_hover_text(G: nx.Graph, attr: str = 'community') -> list:
+     """
+     Generates hover text for each node in the input graph.
+
+     Parameters
+     ----------
+     G : nx.Graph
+         The input graph.
+     attr : str, optional
+         The node attribute of the community names or indexes (default is 'community').
+
+     Returns
+     -------
+     list
+         A list of strings containing the hover text for each node.
+     """
+     return [f"Node: {str(node)}<br>Community: {G.nodes[node][attr] + 1}<br># of connections: {len(adjacencies)}" for node, adjacencies in G.adjacency()]
+
+ st.cache_data(show_spinner=True)
+ def calculate_node_sizes(G: nx.Graph) -> list:
+     """
+     Calculates the size of each node in the input graph based on its degree.
+
+     Parameters
+     ----------
+     G : nx.Graph
+         The input graph.
+
+     Returns
+     -------
+     list
+         A list of node sizes.
+     """
+     degrees = dict(G.degree())
+     max_degree = max(deg for node, deg in degrees.items())
+     return [10 + 20 * (degrees[node] / max_degree) for node in G.nodes()]
+
+ @st.cache_data(show_spinner=True)
+ def plot_graph(_G: nx.Graph, layout: str = "fdp"):
+     """
+     Plots a network graph with communities.
+
+     Parameters
+     ----------
+     G : nx.Graph
+         The input graph.
+     layout : str, optional
+         The layout algorithm to use (default is "fdp").
+     """
+     pos = nx.spring_layout(G_backbone, dim=3, seed=779)
+     community_colors = assign_community_colors(_G)
+     node_colors = [community_colors[_G.nodes[n]['community']] for n in _G.nodes]
+
+     edge_trace = go.Scatter(x=[item for sublist in [[pos[edge[0]][0], pos[edge[1]][0], None] for edge in _G.edges()] for item in sublist],
+                             y=[item for sublist in [[pos[edge[0]][1], pos[edge[1]][1], None] for edge in _G.edges()] for item in sublist],
+                             line=dict(width=0.5, color='#888'),
+                             hoverinfo='none',
+                             mode='lines')
+
+     node_trace = go.Scatter(x=[pos[n][0] for n in _G.nodes()],
+                             y=[pos[n][1] for n in _G.nodes()],
+                             mode='markers',
+                             hoverinfo='text',
+                             marker=dict(color=node_colors, size=10, line_width=2))
+
+     node_trace.text = generate_hover_text(_G)
+     node_trace.marker.size = calculate_node_sizes(_G)
+
+     fig = go.Figure(data=[edge_trace, node_trace],
+                     layout=go.Layout(title='Network graph with communities',
+                                      titlefont=dict(size=16),
+                                      showlegend=False,
+                                      hovermode='closest',
+                                      margin=dict(b=20, l=5, r=5, t=40),
+                                      xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
+                                      yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
+                                      height=800))
+
+     # Extract node positions
+     Xn = [pos[k][0] for k in G_backbone.nodes()]  # x-coordinates of nodes
+     Yn = [pos[k][1] for k in G_backbone.nodes()]  # y-coordinates
+     Zn = [pos[k][2] for k in G_backbone.nodes()]  # z-coordinates
+
+     # Extract edge positions
+     Xe = []
+     Ye = []
+     Ze = []
+     for e in G_backbone.edges():
+         Xe += [pos[e[0]][0], pos[e[1]][0], None]  # x-coordinates of edge ends
+         Ye += [pos[e[0]][1], pos[e[1]][1], None]
+         Ze += [pos[e[0]][2], pos[e[1]][2], None]
+
+     # Define traces for plotly
+     trace1 = go.Scatter3d(x=Xe,
+                           y=Ye,
+                           z=Ze,
+                           mode='lines',
+                           line=dict(color='rgb(125,125,125)', width=1),
+                           hoverinfo='none'
+                           )
+
+     # Map community numbers to names
+     community_names = {i: f"Community {i+1}" for i in range(len(communities))}
+
+     # Create hover text
+     hover_text = [f"{node} ({community_names[G_backbone.nodes[node]['community']]})" for node in G_backbone.nodes()]
+
+     trace2 = go.Scatter3d(x=Xn,
+                           y=Yn,
+                           z=Zn,
+                           mode='markers',
+                           name='actors',
+                           marker=dict(symbol='circle',
+                                       size=7,
+                                       color=node_colors,  # pass hex colors
+                                       line=dict(color='rgb(50,50,50)', width=0.2)
+                                       ),
+                           text=hover_text,  # Use community names as hover text
+                           hoverinfo='text'
+                           )
+
+     axis = dict(showbackground=False,
+                 showline=False,
+                 zeroline=False,
+                 showgrid=False,
+                 showticklabels=False,
+                 title=''
+                 )
+
+     layout = go.Layout(
+         title="3D Network Graph",
+         width=1000,
+         height=1000,
+         showlegend=False,
+         scene=dict(
+             xaxis=dict(axis),
+             yaxis=dict(axis),
+             zaxis=dict(axis),
+         ),
+         margin=dict(
+             t=100
+         ),
+         hovermode='closest',
+     )
+
+     data = [trace1, trace2]
+     fig = go.Figure(data=data, layout=layout)
+     return fig
+
st.title("#ditaduranuncamais Data Explorer")

def check_password():
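
As a sanity check on the backbone extraction added here, a toy run (this assumes the disparity_filter function from this hunk is in scope; the graph and its weights are invented):

```python
import networkx as nx

# Invented weighted graph: one dominant edge plus several weak ones around node 'a'
G = nx.Graph()
G.add_edge('a', 'b', weight=50)          # strong tie
for other in ['c', 'd', 'e', 'f', 'g']:
    G.add_edge('a', other, weight=1)     # weak ties

backbone = disparity_filter(G, weight='weight', alpha=0.05)
print(list(backbone.edges(data=True)))
# Only ('a', 'b') should survive: seen from 'a' (degree 6, total weight 55),
# the test gives (1 - 50/55) ** 5 ≈ 6e-06 < 0.05 for the strong edge,
# while each weak edge gives (1 - 1/55) ** 5 ≈ 0.91, well above alpha.
# Degree-1 neighbours are skipped entirely (the filter requires k_n > 1).
```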
 
@@ -268,7 +503,7 @@ df = load_dataframe(dataset)
image_model = load_img_model()
text_model = load_txt_model()

- menu_options = ["Data exploration", "Semantic search", "Stats"]
+ menu_options = ["Data exploration", "Semantic search", "Hashtags", "Stats"]
st.sidebar.markdown('# Menu')
selected_menu_option = st.sidebar.radio("Select a page", menu_options)

 
@@ -379,7 +614,109 @@ elif selected_menu_option == "Semantic search":
        },
        hide_index=True,
    )
-
+ elif selected_menu_option == "Hashtags":
+     if 'dfx' not in st.session_state:
+         st.session_state.dfx = df.copy()  # Keep a working copy of df in session state
+     # Get a list of all unique hashtags in the DataFrame
+     all_hashtags = list(set([item for sublist in st.session_state.dfx['Hashtags'].tolist() for item in sublist]))
+
+     st.sidebar.markdown('# Hashtag co-occurrence analysis options')
+     # Let users select hashtags to remove
+     hashtags_to_remove = st.sidebar.multiselect("Hashtags to remove", all_hashtags)
+
+     col1, col2 = st.sidebar.columns(2)
+     # Add a button to trigger the removal operation
+     if col1.button("Remove hashtags"):
+         # Remove the selected hashtags from every post's hashtag set
+         st.session_state.dfx['Hashtags'] = st.session_state.dfx['Hashtags'].apply(lambda x: [item for item in x if item not in hashtags_to_remove])
+
+     # Add a reset button
+     if col2.button("Reset"):
+         st.session_state.dfx = df.copy()  # Reset dfx to the original DataFrame
+
+     # df2['Hashtags'] = df2['Hashtags'].apply(lambda x: [item for item in x if not item == 'ditaduranuncamais'])
+     # Flatten all hashtags into a single list
+     hashtags = [item for sublist in st.session_state.dfx['Hashtags'].tolist() for item in sublist]
+     # Count the number of posts per hashtag
+     hashtag_freq = st.session_state.dfx.explode('Hashtags').groupby('Hashtags').size().reset_index(name='counts')
+     # Sort the hashtags by frequency
+     hashtag_freq = hashtag_freq.sort_values(by='counts', ascending=False)
+
+     # Make the scatter plot
+     hashtags_fig = px.scatter(hashtag_freq, x='Hashtags', y='counts', log_y=True,  # Set log_y to True to make the plot more readable on a log scale
+                               labels={'Hashtags': 'Hashtags', 'counts': 'Frequency'},
+                               title='Frequency of hashtags in #throwbackthursday posts on Instagram',
+                               height=600)  # Set the height to 600 pixels
+     st.markdown("### Hashtag Frequency Distribution")
+     st.markdown('Here we apply hashtag co-occurrence analysis for mnemonic community detection. This detects communities by creating a network of hashtag pairs (which hashtags are used together in which posts) and then applying community detection algorithms to this network.')
+     st.plotly_chart(hashtags_fig)
+
+     weight_option = st.sidebar.radio(
+         'Select weight definition',
+         ('Number of users that use the hashtag pairs', 'Total number of occurrences')
+     )
+
+     hashtag_user_pairs = [(tuple(sorted(combination)), userid) for hashtags, userid in zip(st.session_state.dfx['Hashtags'], st.session_state.dfx['User Name']) for combination in combinations(hashtags, r=2)]
+     # Create a DataFrame with columns 'hashtag_pair' and 'userid'
+     hashtag_user_df = pd.DataFrame(hashtag_user_pairs, columns=['hashtag_pair', 'User Name'])
+     if weight_option == 'Number of users that use the hashtag pairs':
+         # Group by 'hashtag_pair' and count the number of unique 'userid's
+         hashtag_user_df = hashtag_user_df.groupby('hashtag_pair').agg({'User Name': 'nunique'}).reset_index()
+     elif weight_option == 'Total number of occurrences':
+         # Group by 'hashtag_pair' and count the total number of occurrences
+         hashtag_user_df = hashtag_user_df.groupby('hashtag_pair').size().reset_index(name='User Name')
+     # Make edge_list from hashtag_user_df with columns 'hashtag1', 'hashtag2', and 'weight'
+     edge_list = hashtag_user_df.rename(columns={'hashtag_pair': 'hashtag1', 'User Name': 'weight'})
+     edge_list[['hashtag1', 'hashtag2']] = pd.DataFrame(edge_list['hashtag1'].tolist(), index=edge_list.index)
+     edge_list = edge_list[['hashtag1', 'hashtag2', 'weight']]
+
+     st.markdown("### Edge List of Hashtag Pairs")
+     # Create the graph using the pair weights as edge attributes
+     G = nx.from_pandas_edgelist(edge_list, 'hashtag1', 'hashtag2', 'weight')
+     G_backbone = disparity_filter(G, weight='weight', alpha=0.05)
+     st.markdown(f'Number of nodes {len(G_backbone.nodes)}')
+     st.markdown(f'Number of edges {len(G_backbone.edges)}')
+     st.dataframe(edge_list.sort_values(by='weight', ascending=False).head(10).style.set_caption("Edge list of hashtag pairs with the highest weight"))
+
+     # Create louvain communities
+     communities = nx.community.louvain_communities(G_backbone, weight='weight', seed=1234)
+     communities = list(communities)
+
+     # Sort communities by size
+     communities.sort(key=len, reverse=True)
+
+     for i, community in enumerate(communities):
+         for node in community:
+             G_backbone.nodes[node]['community'] = i
+
+     # Sort community hashtags based on their weighted degree in the network
+     sorted_community_hashtags = [
+         [
+             hashtag
+             for hashtag, degree in sorted(
+                 ((h, G.degree(h, weight='weight')) for h in community),
+                 key=lambda x: x[1],
+                 reverse=True
+             )
+         ]
+         for community in communities
+     ]
+
+     # Convert the sorted_community_hashtags list into a DataFrame and transpose it
+     sorted_community_hashtags = pd.DataFrame(sorted_community_hashtags).T
+
+     # Rename the columns of sorted_community_hashtags DataFrame
+     sorted_community_hashtags.columns = [f'Community {i+1}' for i in range(len(sorted_community_hashtags.columns))]
+
+     st.markdown("### Hashtag Communities")
+     st.markdown(f'There are {len(communities)} communities in the graph.')
+     st.data_editor(sorted_community_hashtags)
+
+     st.markdown("### Hashtag Network Graph")
+     st.plotly_chart(plot_graph(G_backbone, layout="fdp"))  # fdp is relatively slow; use 'sfdp' or 'neato' for faster but denser layouts
+
elif selected_menu_option == "Stats":
    st.markdown("### Time Series Analysis")
    # Dropdown to select variables
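
The pair expansion plus the two weight definitions boil down to a combinations-and-groupby step; a minimal sketch with invented posts and user names, using the "number of users" weighting:

```python
from itertools import combinations
import pandas as pd

posts = pd.DataFrame({
    'Hashtags': [{'memoria', 'verdade'}, {'memoria', 'verdade', 'justica'}, {'memoria', 'verdade'}],
    'User Name': ['ana', 'ana', 'bruno'],
})

# One row per (sorted hashtag pair, user) occurrence, as in the app code
pairs = [(tuple(sorted(pair)), user)
         for tags, user in zip(posts['Hashtags'], posts['User Name'])
         for pair in combinations(tags, r=2)]

# Weight = number of distinct users posting the pair (the first radio option)
edges = (pd.DataFrame(pairs, columns=['hashtag_pair', 'User Name'])
         .groupby('hashtag_pair').agg({'User Name': 'nunique'})
         .reset_index()
         .rename(columns={'User Name': 'weight'}))
print(edges)
# roughly:
#          hashtag_pair  weight
# 0  (justica, memoria)       1
# 1  (justica, verdade)       1
# 2  (memoria, verdade)       2
```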
 
@@ -439,4 +776,4 @@ elif selected_menu_option == "Stats":
    elif corr > -0.7:
        st.write(f"The correlation coefficient is {corr}, indicating a moderate negative relationship between {scatter_variable_1} and {scatter_variable_2}.")
    else:
-         st.write(f"The correlation coefficient is {corr}, indicating a strong negative relationship between {scatter_variable_1} and {scatter_variable_2}.")
+         st.write(f"The correlation coefficient is {corr}, indicating a strong negative relationship between {scatter_variable_1} and {scatter_variable_2}.")
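
Downstream of the backbone, the community step is plain networkx Louvain; a small sketch of the same pattern on an invented two-cluster graph (louvain_communities requires networkx >= 2.8):

```python
import networkx as nx

# Invented graph: two tight triangles joined by a weak bridge
G = nx.Graph()
G.add_weighted_edges_from([('a', 'b', 3), ('b', 'c', 3), ('a', 'c', 3),
                           ('x', 'y', 3), ('y', 'z', 3), ('x', 'z', 3),
                           ('c', 'x', 1)])

# Same pattern as the Hashtags page: detect, sort by size, tag nodes
communities = sorted(nx.community.louvain_communities(G, weight='weight', seed=1234),
                     key=len, reverse=True)
for i, community in enumerate(communities):
    for node in community:
        G.nodes[node]['community'] = i

print(communities)                # e.g. [{'a', 'b', 'c'}, {'x', 'y', 'z'}]
print(G.nodes['a']['community'])  # index of the community containing 'a'
```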