# MLSG_01 / app.py
# amirhnikzad's picture
# Update app.py
# 70ed2cc
#Import Libraries
import spacy
nlp = spacy.load("en_core_web_sm")
from transformer_srl import dataset_readers, models, predictors
predictor = predictors.SrlTransformersPredictor.from_path("srl_bert_base_conll2012.tar.gz","transformer_srl")
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
import numpy as np
import pandas as pd
import networkx as nx
from IPython.core.display import display, HTML
from pyvis.network import Network
from somajo import SoMaJo
import requests
import json
import gradio as gr
import matplotlib
import matplotlib.pyplot as plt
from statistics import mean
from statistics import stdev
import math
#Semantic Graph Function

# Pronoun surface forms mapped to their root form. Lookups are done on the
# lower-cased token; pronouns that are not listed here are dropped.
_PRONOUN_ROOTS = {'i': 'i', 'I': 'i', 'he': 'he', 'she': 'she', 'you': 'you', 'we': 'we', 'they': 'they', 'me': 'i',
                  'my': 'i', 'mine': 'i', 'your': 'you', 'yours': 'you', 'him': 'he', 'his': 'he', 'her': 'she',
                  'hers': 'she', 'us': 'we', 'ours': 'we', 'our': 'we', 'their': 'they', 'theirs': 'they',
                  'them': 'they', 'its': 'it', 'it': 'it', "'em": 'they', 'myself': 'i', 'that': 'that',
                  'this': 'this', 'those': 'those', 'these': 'these'}

# Semantic role groups; each group feeds one layer of the graph.
_ARGUMENTS = ['ARG0', 'ARG1', 'ARG2', 'ARG3', 'ARG4', 'ARG5']
_SPATIOTEMPORAL = ['ARGM-LOC', 'ARGM-DIR', 'ARGM-GOL', 'ARGM-TMP']
_CAUSAL = ['ARGM-PRP', 'ARGM-CAU']
_HOW = ['ARGM-MNR', 'ARGM-COM', 'ARGM-EXT', 'ARGM-ADV', 'ARGM-ADJ', 'ARGM-PRD']
_META = ['ARGM-DIS', 'ARGM-MOD', 'ARGM-REC']
_NEG = ['ARGM-NEG']

# Columns that identify a predicate rather than carry a role.
_ID_COLUMNS = ['sentence_id', 'pred_id', 'frame', 'lemma']

# (roles, layer name, color) for the modifier layers, in the order they are
# added to the graph. Modifier edges point from the modifier to the predicate.
_MODIFIER_LAYERS = (
    (_SPATIOTEMPORAL, 'Setting', 'lightgreen'),
    (_CAUSAL, 'Cause', 'gold'),
    (_HOW, 'Manner', 'grey'),
    (_META, 'Orientation', 'aqua'),
    (_NEG, 'Negation', 'black'),
)

# Layer names, in the same order as the checkbox parameters of semantic_graph.
_LAYER_NAMES = ['Flow', 'Predication', 'Argument', 'Setting', 'Cause', 'Manner', 'Negation', 'Orientation']

# Tokens tagged with one of these POS tags are removed from role text.
_DROPPED_POS = ('ADP', 'PART', 'PUNCT', 'DET', 'SPACE')

_VERBATLAS_URL = "https://verbatlas.org/api/model"
# Upper bound (seconds) on one VerbAtlas request so a stalled server cannot
# hang the app forever.
_VERBATLAS_TIMEOUT = 60
# Number of random graphs used as the baseline for the z-scored metrics.
_N_RANDOM_GRAPHS = 1000


def _split_sentences(text_input):
    """Split text into sentences with SoMaJo; each sentence is its tokens joined by single spaces."""
    tokenizer = SoMaJo("en_PTB", split_camel_case=True)
    return [' '.join(token.text for token in sentence)
            for sentence in tokenizer.tokenize_text([text_input])]


def _srl_rows_transformer(sentences):
    """Run the transformer_srl predictor on each sentence and return one dict per predicate."""
    rows = []
    pred_id = 1
    for sentence_id, content in enumerate(sentences, start=1):
        srl = predictor.predict(content)
        for each_verb in srl['verbs']:
            instance = {
                'sentence_id': sentence_id,
                'frame': each_verb['frame'],
                'lemma': each_verb['lemma'],
                'pred_id': pred_id,
            }
            any_element = False
            for element in re.findall(r'\[.*?\]', each_verb['description']):
                # Split on the FIRST colon only, so role text that itself
                # contains a colon (e.g. a clock time) is kept whole.
                label, separator, content_text = element[1:-1].partition(':')
                if not separator:
                    # Bracketed text from the input itself, not an SRL tag.
                    continue
                # The element tagged with the frame name is the predicate itself.
                if label != each_verb['frame']:
                    any_element = True
                    instance[label.strip()] = content_text.strip()
            # Predicates without any role are not recorded.
            if any_element:
                rows.append(instance)
                pred_id += 1
    return rows


def _srl_rows_verbatlas(sentences):
    """Query the VerbAtlas web API for each sentence and return one dict per predicate.

    This is best effort: a sentence whose request or response cannot be
    processed is logged and skipped; rows collected before the failure are kept.
    """
    import logging  # local import: the module-level import block is left untouched

    rows = []
    pred_id = 1
    for sentence_id, content in enumerate(sentences, start=1):
        try:
            r = requests.post(url=_VERBATLAS_URL, json=[{"text": content, "lang": "EN"}],
                              timeout=_VERBATLAS_TIMEOUT)
            all_srl = json.loads(r.text)
            for item in all_srl:
                token_register = {token['index']: token['rawText'] for token in item['tokens']}
                for head in item['annotations']:
                    english_prop = head['englishPropbank']
                    instance = {
                        'sentence_id': sentence_id,
                        'frame': english_prop['frameName'],
                        'pred_id': pred_id,
                    }
                    any_element = False
                    for role in english_prop['roles']:
                        span_0 = role['span'][0]
                        span_1 = role['span'][1]
                        # The span end index is exclusive.
                        instance[role['role']] = ' '.join(token_register[i] for i in range(span_0, span_1))
                        any_element = True
                    if any_element:
                        rows.append(instance)
                        pred_id += 1
        except (requests.RequestException, ValueError, KeyError, IndexError, TypeError) as err:
            logging.getLogger(__name__).warning(
                "VerbAtlas SRL failed for sentence %d: %s", sentence_id, err)
    return rows


def _rows_to_frame(rows):
    """Build the SRL table: the id and known role columns first, then any extra role labels found."""
    base_columns = _ID_COLUMNS + _ARGUMENTS + _SPATIOTEMPORAL + _CAUSAL + _HOW + _META + _NEG
    srl_df = pd.DataFrame(rows)
    extra_columns = [col for col in srl_df.columns if col not in base_columns]
    # reindex creates the known columns even when no predicate carried them.
    return srl_df.reindex(columns=base_columns + extra_columns)


def _clean_role_text(role_text, pos_cache):
    """Normalise pronouns to their root and drop function words from one role string.

    Each token is POS-tagged on its own with the module-level spaCy pipeline;
    *pos_cache* memoises the tag per token so repeated tokens are tagged once.
    """
    filtered_content = []
    for token in role_text.split(' '):
        pos = pos_cache.get(token)
        if pos is None:
            doc = nlp(token)
            # An empty token yields an empty doc; treat it like whitespace.
            pos = doc[0].pos_ if len(doc) else 'SPACE'
            pos_cache[token] = pos
        if pos == 'PRON':
            root = _PRONOUN_ROOTS.get(token.lower())
            if root is not None:
                filtered_content.append(root)
        elif pos not in _DROPPED_POS:
            filtered_content.append(token)
    return ' '.join(filtered_content)


def _has_text(value):
    """Return True when a table cell holds a non-empty string (missing cells are NaN)."""
    return isinstance(value, str) and value != ''


def _build_semantic_graph(records):
    """Turn the cleaned SRL rows (list of dicts) into the full multi-layer graph."""
    G_sem = nx.MultiDiGraph()
    n_rows = len(records)
    for index, row in enumerate(records):
        a = row['Pred'].lower()
        # Flow layer: chain each predicate to the next one in the text.
        if index < n_rows - 1:
            b = records[index + 1]['Pred'].lower()
            if a not in G_sem:
                G_sem.add_node(a, category='Predication', color='salmon', label=a)
            if b not in G_sem:
                G_sem.add_node(b, category='Predication', color='salmon', label=b)
            G_sem.add_edge(a, b, turn=index + 0.5, color='brown', category='Flow')
        elif n_rows == 1:
            G_sem.add_node(a, category='Predication', color='salmon', label=a)
        # Predication layer: predicate -> each of its core arguments.
        for arg in _ARGUMENTS:
            if _has_text(row[arg]):
                c = row[arg].lower()
                G_sem.add_node(c, category='Argument', color='lightblue', label=c)
                G_sem.add_edge(a, c, turn=index, category='Predication', color='salmon')
        # Argument layer: ARG0 -> ARG1 and ARG0 -> ARG2. ARG0 must be non-empty,
        # otherwise an attribute-less '' node would be created implicitly.
        if _has_text(row['ARG0']):
            c = row['ARG0'].lower()
            for target in ('ARG1', 'ARG2'):
                if _has_text(row[target]):
                    G_sem.add_edge(c, row[target].lower(), turn=index, category='Argument', color='lightblue')
        # Modifier layers: modifier -> predicate.
        for layer_roles, layer_name, layer_color in _MODIFIER_LAYERS:
            for argm in layer_roles:
                if _has_text(row[argm]):
                    d = row[argm].lower()
                    G_sem.add_node(d, category=layer_name, color=layer_color, label=d)
                    G_sem.add_edge(d, a, turn=index, category=layer_name, color=layer_color)
    return G_sem


def _filter_graph(G_sem, network_oi):
    """Keep only edges whose layer is in *network_oi*, plus the nodes those edges touch."""
    G_filtered_cat = nx.MultiDiGraph(
        [(u, v, {'turn': e['turn'], 'category': e['category'], 'color': e['color']})
         for u, v, e in G_sem.edges(data=True) if e['category'] in network_oi])
    for v in G_filtered_cat.nodes:
        G_filtered_cat.nodes[v]['category'] = G_sem.nodes[v]['category']
        G_filtered_cat.nodes[v]['title'] = G_sem.nodes[v]['category']
        G_filtered_cat.nodes[v]['color'] = G_sem.nodes[v]['color']
    return G_filtered_cat


def _draw_static_graph(G_filtered_cat):
    """Draw the filtered graph with matplotlib and return the figure."""
    color_map = [attrs['color'] for _, attrs in G_filtered_cat.nodes(data=True)]
    color_map2 = [attrs['color'] for _, _, attrs in G_filtered_cat.edges(data=True)]
    # Unique (category, color) pairs in first-seen order, for the legend.
    legend_entries = dict.fromkeys(
        (attrs['category'], attrs['color']) for _, attrs in G_filtered_cat.nodes(data=True))
    fig, ax = plt.subplots(figsize=(8, 8))
    for category, color in legend_entries:
        # NaN coordinates: nothing is plotted, the call only registers a legend entry.
        plt.scatter(np.nan, np.nan, c=color, label=category)
    plt.legend(bbox_to_anchor=(1.05, 1.0), loc=1, title='Nodes')
    n_nodes = G_filtered_cat.order()
    # An empty graph (no layer selected, or no matching edges) has nothing to
    # draw, and the spacing below would divide by zero.
    if n_nodes:
        pos = nx.spring_layout(G_filtered_cat, k=5 / math.sqrt(n_nodes))
        nx.draw(G_filtered_cat, pos=pos, node_color=color_map, edge_color=color_map2, with_labels=True)
    plt.tight_layout()
    plt.show()
    return fig


def _z_score(value, samples):
    """Return (value - mean) / stdev of *samples*, or NaN when that is undefined."""
    try:
        return (value - mean(samples)) / stdev(samples)
    except (ValueError, ZeroDivisionError):
        # statistics.StatisticsError (too few samples) is a ValueError;
        # a zero standard deviation raises ZeroDivisionError.
        return np.nan


def _graph_metrics(G_filtered_cat):
    """Compute the one-row metrics table for the filtered graph.

    Parallel edges are collapsed first (nx.DiGraph). The Z-suffixed columns
    compare NCC, LCC and LSCC with _N_RANDOM_GRAPHS directed G(n, m) random
    graphs that have the same number of nodes and edges (seeds 0..N-1).
    """
    metric_errors = (nx.NetworkXException, ValueError, ZeroDivisionError)
    G_stat = nx.DiGraph(G_filtered_cat)
    undirected = nx.Graph(G_stat)
    nn = G_stat.number_of_nodes()
    ne = G_stat.number_of_edges()
    try:
        diameter = nx.diameter(undirected)
    except metric_errors:
        # Undefined for disconnected or empty graphs.
        diameter = np.nan
    try:
        aspl = nx.average_shortest_path_length(undirected)
    except metric_errors:
        aspl = np.nan
    if nn:
        ad = sum(d for _, d in nx.degree(G_stat)) / float(nn)
        awd = sum(d for _, d in nx.degree(G_stat, weight='weight')) / float(nn)
    else:
        ad = 0
        awd = 0
    gd = nx.density(G_stat)
    try:
        cc = nx.average_clustering(undirected)
    except metric_errors:
        cc = np.nan
    component_sizes = [len(component) for component in nx.connected_components(undirected)]
    ncc = len(component_sizes)
    lcc = max(component_sizes, default=0)
    lscc = max((len(component) for component in nx.strongly_connected_components(G_stat)), default=0)

    ncc_list = []
    lcc_list = []
    lscc_list = []
    for seed in range(0, _N_RANDOM_GRAPHS):
        G_rand = nx.gnm_random_graph(nn, ne, seed=seed, directed=True)
        rand_sizes = [len(component) for component in nx.connected_components(nx.Graph(G_rand))]
        ncc_list.append(len(rand_sizes))
        if rand_sizes:
            lcc_list.append(max(rand_sizes))
        rand_strong = [len(component) for component in nx.strongly_connected_components(G_rand)]
        if rand_strong:
            lscc_list.append(max(rand_strong))
    nccz = _z_score(ncc, ncc_list)
    lccz = _z_score(lcc, lcc_list)
    lsccz = _z_score(lscc, lscc_list)

    cols = ['#Nodes', '#Edges', 'Diameter', 'ASPL', 'AD', 'AWD', 'Density', 'CC', 'NCC', 'LCC', 'LSCC',
            'NCCZ', 'LCCZ', 'LSCCZ']
    return pd.DataFrame(data=[[nn, ne, diameter, aspl, ad, awd, gd, cc, ncc, lcc, lscc, nccz, lccz, lsccz]],
                        columns=cols)


def semantic_graph(text_input, srl_model, flow, predication, action, setting, cause, manner, negation,
                   orientation):
    """Build, draw and measure a multi-layer semantic graph of *text_input*.

    The text is split into sentences, each sentence is semantic-role labelled,
    role text is cleaned (pronouns mapped to a root form, function words
    removed), and predicates and roles become nodes of a directed multigraph
    whose edges are tagged with a layer name. Only the selected layers are
    drawn, exported and measured.

    Args:
        text_input: The text to analyse.
        srl_model: 'Transformer_SRL' (module-level ``predictor``; predicate
            nodes are lemmas) or 'VerbAtlas' (web API; predicate nodes are
            frame names).
        flow, predication, action, setting, cause, manner, negation,
            orientation: Truthy to include the 'Flow', 'Predication',
            'Argument', 'Setting', 'Cause', 'Manner', 'Negation' and
            'Orientation' layer respectively.

    Returns:
        A tuple ``(fig, df_stats, 'network.html')``: the matplotlib figure, a
        one-row DataFrame of graph metrics, and the path of the interactive
        pyvis page written to the working directory.

    Raises:
        ValueError: If *srl_model* is not one of the two supported names.
    """
    if srl_model not in ('Transformer_SRL', 'VerbAtlas'):
        raise ValueError(
            f"Unsupported srl_model {srl_model!r}; expected 'Transformer_SRL' or 'VerbAtlas'")

    # Parse text into sentences and label semantic roles:
    sentences = _split_sentences(text_input)
    if srl_model == 'Transformer_SRL':
        semantic_df = _rows_to_frame(_srl_rows_transformer(sentences))
        semantic_df['Pred'] = semantic_df['lemma']
    else:
        semantic_df = _rows_to_frame(_srl_rows_verbatlas(sentences))
        semantic_df['Pred'] = semantic_df['frame']

    # Clean every role column (everything except the id columns and 'Pred'):
    roles = [col for col in semantic_df.columns if col not in _ID_COLUMNS + ['Pred']]
    pos_cache = {}
    for role in roles:
        semantic_df[role] = [_clean_role_text(value, pos_cache) if isinstance(value, str) else value
                             for value in semantic_df[role]]

    # Build the full graph, then keep only the selected layers:
    G_sem = _build_semantic_graph(semantic_df.to_dict('records'))
    categories = [flow, predication, action, setting, cause, manner, negation, orientation]
    network_oi = [name for name, enabled in zip(_LAYER_NAMES, categories) if enabled]
    G_filtered_cat = _filter_graph(G_sem, network_oi)

    # Visualize static graph:
    fig = _draw_static_graph(G_filtered_cat)

    # Visualize interactive graph:
    net = Network("600px", "600px", notebook=True, directed=True, cdn_resources="remote", filter_menu=True,
                  select_menu=True)
    net.from_nx(G_filtered_cat, show_edge_weights=False)
    net.show_buttons(filter_=['physics'])
    net.show('network.html')

    # Compute graph metrics:
    df_stats = _graph_metrics(G_filtered_cat)
    return fig, df_stats, 'network.html' #, net.generate_html('network.html')
#Build gradio interface
# Input widgets are created in the positional order of semantic_graph's
# parameters: text, SRL model, then one checkbox per semantic layer.
_layer_checkbox_labels = ('Flow of Predicates', 'Predication', 'Action', 'Setting', 'Cause', 'Manner',
                          'Negation', 'Orientation')
_demo_inputs = [
    gr.Textbox(label='Insert speech sample:', placeholder= "Type or paste..."),
    gr.Radio(["Transformer_SRL", "VerbAtlas"], label='Select Semantic Role Labeling model and semantic layers:'),
]
_demo_inputs.extend(gr.Checkbox(label=checkbox_label) for checkbox_label in _layer_checkbox_labels)

# One pre-filled example row; values follow the same order as the inputs.
_demo_examples = [['The dog is chasing the cat.', 'VerbAtlas', False, True, True, False, False, False, False, False]]
# Alternative example kept from the original source, currently disabled:
#['A large hall, numerous guests, whom we were receiving. Among them was Irma. I at once took her to one side, as though to answer her letter and to reproach her for not having accepted my solution yet.','Transformer_SRL', False, True, True, True, False, False, False, False]],

# Output widgets match semantic_graph's return tuple: figure, metrics table, HTML file path.
_demo_outputs = [
    gr.Plot(label='Graph Representation', scroll_to_output=True),
    gr.Dataframe(label='Graph Metrics'),
    gr.File(label='Interactive Graph'),
]

demo = gr.Interface(
    fn=semantic_graph,
    title='Multi-Layered Semantic Speech Graph - Tang Lab',
    inputs=_demo_inputs,
    examples=_demo_examples,
    outputs=_demo_outputs,
)
demo.launch() #share=True