File size: 4,716 Bytes
3503538
 
101b319
1154940
8f37445
3503538
 
1db9e9a
 
 
 
8f37445
c491d37
0037ea6
4b87d72
2df8c8c
1db9e9a
9884e92
1bce5cd
090671b
 
 
 
 
 
6517c43
da97d65
1fdb11f
b6c7b40
 
090671b
6517c43
b4b1fd9
090671b
 
6517c43
090671b
f0ca479
755eb55
f0ca479
090671b
6517c43
 
e0b660a
665a998
 
090671b
f0ca479
1bce5cd
d1cba40
1db9e9a
 
090671b
 
 
2df8c8c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f0ca479
2df8c8c
 
 
 
 
 
 
 
 
 
 
02a8d5f
2df8c8c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d0f7403
 
1db9e9a
9cba540
be053e4
60e7380
 
88cd79e
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import streamlit as st
from datasets import load_dataset
import networkx as nx
import numpy as np
import pandas as pd
import html

# Load the TinyStories dataset; only a handful of records from the 'train'
# split are used by this app.
dataset = load_dataset("roneneldan/TinyStories")

# ---- Page introduction ----
st.markdown('# Short Stories, networks and connections')
st.markdown('In this example we consider the semantic similarity between short stories generatited by GenAI.')
st.markdown('We study the relationshis between the stories using a network. The laplacian connectivity provides inights about the closeness of the graph')

# ---- Sample story ----
st.markdown('## Short Stories')
# The link target must be an absolute URL: a bare "roneneldan/TinyStories"
# is resolved relative to the app's own address and leads nowhere.
st.markdown('We are using a sample fo the [TinyStories](https://huggingface.co/datasets/roneneldan/TinyStories) dataset from roneneldan work')
sample_story = dataset['train'][10]['text']
# The story is injected into raw HTML (unsafe_allow_html=True), so escape
# &, < and > first; otherwise markup inside the dataset text would be
# interpreted by the browser instead of being displayed.
sample_story_html = html.escape(sample_story.replace('\n', ' '), quote=False)
st.markdown("<span style='color:red'>" + sample_story_html + "</span>", unsafe_allow_html=True)

# ---- Similarity threshold ----
st.markdown('The threshold changes the level of connectivity in the network. The reange is from 0 (less similar) to 1 (completely similar)')
# Read further down when the similarity matrix is turned into graph edges.
threshhold = st.slider('Threshhold', 0.0, 1.0, step=0.1)

#-------------------------------------------------------------
#-------------------------------------------------------------

# standard library dependencies
from operator import itemgetter

# external dependencies
import networkx as nx
from sentence_transformers import SentenceTransformer, util
from streamlit_agraph import agraph, Node, Edge, Config

#-------------------------------------------------------------
# Similarity graph between the stories
#-------------------------------------------------------------
# NOTE(review): Streamlit re-runs the whole script on every slider change, so
# the model load and the embedding step below are repeated on each
# interaction. Consider caching them if the app feels slow.
model = SentenceTransformer('all-MiniLM-L6-v2')

# First ten records of the training split. Each record is a dict holding the
# story under the 'text' key; the records are kept as they are because the
# node tooltips below read sentences[i]['text'].
sentences = [dataset['train'][ii] for ii in range(10)]

# Encode the plain story strings rather than the record dicts.
embeddings = model.encode([record['text'] for record in sentences], convert_to_tensor=True)

# Pairwise cosine similarity between all stories.
cosine_scores = util.cos_sim(embeddings, embeddings)

# .cpu() is a no-op for CPU tensors and makes the conversion to NumPy work
# when the embeddings were computed on a GPU.
similarity = cosine_scores.cpu().numpy()

# Adjacency matrix: mirror the strict lower triangle so the matrix is exactly
# symmetric and the diagonal stays 0. Using the raw similarity matrix instead
# would connect every story to itself, because self-similarity is ~1.
lower_triangle = np.tril(similarity, k=-1).astype(float)
A = lower_triangle + lower_triangle.T

# Two stories are linked when their similarity exceeds the slider value.
G = nx.from_numpy_array(A > threshhold)

st.markdown('We can visualize the similarity between the shorts stories as a network. It the similarity is greater than the threshold, the two nodes are conencted')

#-------------------------------------------------------------
# Ego network of the most connected story
#-------------------------------------------------------------
# A node's ego network is the subgraph made of that node, every node adjacent
# to it, and all edges among those nodes.

# Focal node: the first node with the largest degree.
hub_node, _ = max(G.degree, key=itemgetter(1))

# Create egonet for the focal node
hub_ego = nx.ego_graph(G, hub_node)

# Equivalent Node and Edge lists for streamlit-agraph. The full story text is
# attached to each node as its title.
nodes = [Node(title=str(sentences[i]['text']), id=i, label='node_'+str(i), size=20) for i in hub_ego.nodes]
edges = [Edge(source=i, target=j, type="CURVE_SMOOTH") for (i, j) in hub_ego.edges]

config = Config(width=500,
                height=500,
                # Similarity is symmetric and G is an undirected graph.
                directed=False,
                nodeHighlightBehavior=False,
                highlightColor="#F7A7A6", # or "blue"
                collapsible=False,
                node={'labelProperty':'label'},
                # **kwargs e.g. node_size=1000 or node_color="blue"
                )

return_value = agraph(nodes=nodes,
                      edges=edges,
                      config=config)

#-------------------------------------------------------------
# Laplacian centrality
#-------------------------------------------------------------
st.markdown('The Laplacian centrality is a measure of closeness')
if G.number_of_edges() == 0:
    # The normalised Laplacian centrality divides by the total Laplacian
    # energy of the graph, which is zero when there are no edges; report 0
    # for every node instead of failing at high thresholds.
    d_lc = {node: 0.0 for node in G}
else:
    # Computed once and reused for both the text dump and the chart; values
    # are converted to plain floats so the printed dict stays readable.
    d_lc = {node: float(value) for node, value in nx.laplacian_centrality(G).items()}
st.write(str(d_lc))

# Build the frame column by column so node ids stay integers.
df_lc = pd.DataFrame({'node': list(d_lc.keys()),
                      'laplacian_centrality': list(d_lc.values())})
st.bar_chart(df_lc, x='node', y='laplacian_centrality')