Spaces:

amirhnikzad
/

MLSG_01

Running

App Files Files Community

MLSG_01 / app.py

amirhnikzad

Update app.py

70ed2cc 7 months ago

raw history blame contribute delete

No virus

16.1 kB


	#Import Libraries
	import spacy
	nlp = spacy.load("en_core_web_sm")
	from transformer_srl import dataset_readers, models, predictors
	predictor = predictors.SrlTransformersPredictor.from_path("srl_bert_base_conll2012.tar.gz","transformer_srl")
	import re
	import nltk
	nltk.download('stopwords')
	from nltk.corpus import stopwords
	stop_words = set(stopwords.words('english'))
	import numpy as np
	import pandas as pd
	import networkx as nx
	from IPython.core.display import display, HTML
	from pyvis.network import Network
	from somajo import SoMaJo
	import requests
	import json
	import gradio as gr
	import matplotlib
	import matplotlib.pyplot as plt
	from statistics import mean
	from statistics import stdev

	import math


	#Semantic Graph Function
	def semantic_graph(text_input, srl_model, flow, predication, action, setting, cause, manner, negation,
	                   orientation):
	    """Build a multi-layered semantic graph from free text and summarise it.

	    The text is split into sentences, each sentence is semantic-role labelled
	    with the selected backend, the role texts are cleaned, and a directed
	    multigraph is assembled from predicates, arguments and modifiers. Only
	    edges whose layer is enabled are kept for plotting and metrics.

	    Args:
	        text_input: Raw text to analyse. Empty or whitespace-only text yields
	            an empty graph.
	        srl_model: 'Transformer_SRL' (module-level ``predictor``) or
	            'VerbAtlas' (HTTP API).
	        flow: Keep 'Flow' edges (each predicate to the next one).
	        predication: Keep 'Predication' edges (predicate to argument).
	        action: Keep 'Argument' edges (ARG0 to ARG1 / ARG2).
	        setting: Keep 'Setting' edges (LOC/DIR/GOL/TMP modifiers).
	        cause: Keep 'Cause' edges (PRP/CAU modifiers).
	        manner: Keep 'Manner' edges (MNR/COM/EXT/ADV/ADJ/PRD modifiers).
	        negation: Keep 'Negation' edges (NEG modifier).
	        orientation: Keep 'Orientation' edges (DIS/MOD/REC modifiers).

	    Returns:
	        A tuple ``(fig, df_stats, 'network.html')``: the matplotlib figure, a
	        one-row DataFrame of graph metrics, and the path of the interactive
	        pyvis page written to the working directory.

	    Raises:
	        ValueError: If ``srl_model`` is not one of the two supported names.
	    """
	    if srl_model == 'Transformer_SRL':
	        label_sentences, pred_column = _srl_with_transformer, 'lemma'
	    elif srl_model == 'VerbAtlas':
	        label_sentences, pred_column = _srl_with_verbatlas, 'frame'
	    else:
	        # Previously this fell through and failed later with a KeyError on 'Pred'.
	        raise ValueError(
	            f"Unknown srl_model {srl_model!r}; expected 'Transformer_SRL' or 'VerbAtlas'.")

	    sentences = _split_sentences(text_input)
	    srl_df = _records_to_frame(label_sentences(sentences))
	    srl_df['Pred'] = srl_df[pred_column]
	    semantic_df = _clean_role_text(srl_df)

	    G_sem = _build_semantic_graph(semantic_df)
	    layer_flags = (flow, predication, action, setting, cause, manner, negation, orientation)
	    network_oi = [name for name, enabled in zip(_LAYER_NAMES, layer_flags) if enabled]
	    G_filtered_cat = _filter_layers(G_sem, network_oi)

	    # Visualize static graph:
	    fig = _draw_static_graph(G_filtered_cat)

	    # Visualize interactive graph:
	    net = Network("600px", "600px", notebook=True, directed=True, cdn_resources="remote", filter_menu=True, select_menu=True)
	    net.from_nx(G_filtered_cat, show_edge_weights=False)
	    net.show_buttons(filter_=['physics'])
	    net.show('network.html')

	    # Compute graph metrics:
	    df_stats = _graph_metrics(G_filtered_cat)

	    return fig, df_stats, 'network.html'


	# Maps pronoun surface forms (looked up lower-cased) to a root form.
	_PRONOUN_ROOTS = {'i': 'i', 'I': 'i', 'he': 'he', 'she': 'she', 'you': 'you', 'we': 'we', 'they': 'they', 'me': 'i',
	                  'my': 'i', 'mine': 'i', 'your': 'you', 'yours': 'you', 'him': 'he', 'his': 'he', 'her': 'she',
	                  'hers': 'she', 'us': 'we', 'ours': 'we', 'our': 'we', 'their': 'they', 'theirs': 'they',
	                  'them': 'they', 'its': 'it', 'it': 'it', "'em": 'they', 'myself': 'i', 'that': 'that',
	                  'this': 'this', 'those': 'those', 'these': 'these'}
	# spaCy coarse POS tags whose tokens are removed from role text.
	_DROPPED_POS = ('ADP', 'PART', 'PUNCT', 'DET', 'SPACE')

	# Semantic role groups.
	_ARGUMENT_ROLES = ('ARG0', 'ARG1', 'ARG2', 'ARG3', 'ARG4', 'ARG5')
	_SPATIOTEMPORAL_ROLES = ('ARGM-LOC', 'ARGM-DIR', 'ARGM-GOL', 'ARGM-TMP')
	_CAUSAL_ROLES = ('ARGM-PRP', 'ARGM-CAU')
	_HOW_ROLES = ('ARGM-MNR', 'ARGM-COM', 'ARGM-EXT', 'ARGM-ADV', 'ARGM-ADJ', 'ARGM-PRD')
	_META_ROLES = ('ARGM-DIS', 'ARGM-MOD', 'ARGM-REC')
	_NEG_ROLES = ('ARGM-NEG',)
	_BASE_COLUMNS = (('sentence_id', 'pred_id', 'frame', 'lemma') + _ARGUMENT_ROLES + _SPATIOTEMPORAL_ROLES
	                 + _CAUSAL_ROLES + _HOW_ROLES + _META_ROLES + _NEG_ROLES)
	# (roles, layer name, colour) for every modifier layer, in drawing order.
	_MODIFIER_LAYERS = ((_SPATIOTEMPORAL_ROLES, 'Setting', 'lightgreen'),
	                    (_CAUSAL_ROLES, 'Cause', 'gold'),
	                    (_HOW_ROLES, 'Manner', 'grey'),
	                    (_META_ROLES, 'Orientation', 'aqua'),
	                    (_NEG_ROLES, 'Negation', 'black'))
	# Layer names, in the same order as the boolean layer arguments of semantic_graph.
	_LAYER_NAMES = ('Flow', 'Predication', 'Argument', 'Setting', 'Cause', 'Manner', 'Negation', 'Orientation')

	_VERBATLAS_URL = "https://verbatlas.org/api/model"
	_VERBATLAS_TIMEOUT_S = 60  # seconds; without a timeout a stalled request blocks forever
	_N_RANDOM_GRAPHS = 1000    # size of the random-graph baseline behind the z-scores
	# Failures that mean "this metric is undefined for this graph", not a programming error.
	# statistics.StatisticsError is a ValueError subclass, so it is covered too.
	_METRIC_ERRORS = (nx.NetworkXException, ValueError, ZeroDivisionError)


	def _split_sentences(text_input):
	    """Return ``[(sentence_id, text), ...]``; ids start at 1, tokens are space-joined."""
	    if not text_input or not text_input.strip():
	        return []
	    tokenizer = SoMaJo("en_PTB", split_camel_case=True)
	    return [(sentence_id, ' '.join(token.text for token in sentence))
	            for sentence_id, sentence in enumerate(tokenizer.tokenize_text([text_input]), start=1)]


	def _srl_with_transformer(sentences):
	    """Label sentences with the module-level ``predictor``; return one record per predicate."""
	    records = []
	    pred_id = 1
	    for sentence_id, content in sentences:
	        srl = predictor.predict(content)
	        for each_verb in srl['verbs']:
	            instance = {'sentence_id': sentence_id, 'frame': each_verb['frame'],
	                        'lemma': each_verb['lemma'], 'pred_id': pred_id}
	            any_element = False
	            # The description carries bracketed "[LABEL: text]" spans.
	            for element in re.findall(r'\[.*?\]', each_verb['description']):
	                # Split on the first colon only so role text containing ':' stays intact.
	                label, colon, span_text = element[1:-1].partition(':')
	                if not colon:
	                    continue  # literal bracketed text from the input, not a role span
	                if label == each_verb['frame']:
	                    continue  # span labelled with the frame (presumably the predicate itself)
	                instance[label.strip()] = span_text.strip()
	                any_element = True
	            if any_element:
	                records.append(instance)
	                pred_id += 1
	    return records


	def _srl_with_verbatlas(sentences):
	    """Label sentences via the VerbAtlas HTTP API; return one record per predicate.

	    Best effort: a sentence whose request or response handling fails is skipped.
	    """
	    records = []
	    pred_id = 1
	    for sentence_id, content in sentences:
	        try:
	            response = requests.post(url=_VERBATLAS_URL, json=[{"text": content, "lang": "EN"}],
	                                     timeout=_VERBATLAS_TIMEOUT_S)
	            all_srl = json.loads(response.text)
	            for item in all_srl:
	                token_register = {token['index']: token['rawText'] for token in item['tokens']}
	                for head in item['annotations']:
	                    english_prop = head['englishPropbank']
	                    prop_roles = english_prop['roles']
	                    instance = {'sentence_id': sentence_id, 'frame': english_prop['frameName'],
	                                'pred_id': pred_id}
	                    for role in prop_roles:
	                        span_0 = role['span'][0]
	                        span_1 = role['span'][1]
	                        instance[role['role']] = ' '.join(token_register[i] for i in range(span_0, span_1))
	                    if prop_roles:
	                        records.append(instance)
	                        pred_id += 1
	        except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
	            # Network failure, non-JSON body, or unexpected payload shape: skip this sentence.
	            continue
	    return records


	def _records_to_frame(records):
	    """Build the SRL DataFrame: the fixed columns first, then any extra role labels seen."""
	    extra_columns = []
	    for record in records:
	        for key in record:
	            if key not in _BASE_COLUMNS and key not in extra_columns:
	                extra_columns.append(key)
	    # Built in one go; DataFrame.append no longer exists in pandas 2.x.
	    return pd.DataFrame(records, columns=list(_BASE_COLUMNS) + extra_columns)


	def _clean_role_text(srl_df):
	    """Return a copy of ``srl_df`` with pronouns mapped to roots and function words removed."""
	    semantic_df = srl_df.copy()
	    roles = semantic_df.drop(['sentence_id', 'pred_id', 'frame', 'lemma', 'Pred'], axis=1).columns.to_list()
	    pos_cache = {}

	    def pos_of(token):
	        # One spaCy call per distinct token; uses the module-level ``nlp`` pipeline.
	        if token not in pos_cache:
	            doc = nlp(token)
	            pos_cache[token] = doc[0].pos_ if len(doc) else 'SPACE'
	        return pos_cache[token]

	    for index, row in semantic_df.iterrows():
	        for role in roles:
	            content = row[role]
	            if not isinstance(content, str):
	                continue  # NaN: this predicate has no such role
	            filtered_content = []
	            for token in content.split(' '):
	                pos = pos_of(token)
	                if pos == 'PRON':
	                    root = _PRONOUN_ROOTS.get(token.lower())
	                    if root is not None:  # pronouns without a known root are dropped
	                        filtered_content.append(root)
	                elif pos not in _DROPPED_POS:
	                    filtered_content.append(token)
	            semantic_df.at[index, role] = ' '.join(filtered_content)
	    return semantic_df


	def _role_text(row, role):
	    """Return the lower-cased text of ``role`` in ``row``, or '' when missing or empty."""
	    value = row[role]
	    return value.lower() if isinstance(value, str) else ''


	def _build_semantic_graph(semantic_df):
	    """Assemble the full multigraph (all layers) from the cleaned SRL table."""
	    G_sem = nx.MultiDiGraph()
	    n_rows = len(semantic_df)
	    for index, row in semantic_df.iterrows():
	        a = row['Pred'].lower()
	        if index < n_rows - 1:
	            # Flow layer: link this predicate to the next one.
	            b = semantic_df.at[index + 1, 'Pred'].lower()
	            if a not in G_sem:
	                G_sem.add_node(a, category='Predication', color='salmon', label=a)
	            if b not in G_sem:
	                G_sem.add_node(b, category='Predication', color='salmon', label=b)
	            G_sem.add_edge(a, b, turn=index + 0.5, color='brown', category='Flow')
	        elif n_rows == 1:
	            G_sem.add_node(a, category='Predication', color='salmon', label=a)

	        # Predication layer: predicate -> each core argument.
	        for arg in _ARGUMENT_ROLES:
	            c = _role_text(row, arg)
	            if c:
	                G_sem.add_node(c, category='Argument', color='lightblue', label=c)
	                G_sem.add_edge(a, c, turn=index, category='Predication', color='salmon')

	        # Argument layer: ARG0 -> ARG1 / ARG2. An empty ARG0 is skipped so that no
	        # attribute-less '' node is created.
	        c = _role_text(row, 'ARG0')
	        if c:
	            for arg in ('ARG1', 'ARG2'):
	                d = _role_text(row, arg)
	                if d:
	                    G_sem.add_edge(c, d, turn=index, category='Argument', color='lightblue')

	        # Modifier layers: modifier -> predicate. Emptiness is checked on the modifier
	        # itself (the old code tested the leftover ``arg`` loop variable instead).
	        for layer_roles, category, color in _MODIFIER_LAYERS:
	            for argm in layer_roles:
	                d = _role_text(row, argm)
	                if d:
	                    G_sem.add_node(d, category=category, color=color, label=d)
	                    G_sem.add_edge(d, a, turn=index, category=category, color=color)
	    return G_sem


	def _filter_layers(G_sem, network_oi):
	    """Return a new multigraph holding only edges whose category is in ``network_oi``."""
	    G_filtered_cat = nx.MultiDiGraph()
	    for u, v, e in G_sem.edges(data=True):
	        if e['category'] in network_oi:
	            G_filtered_cat.add_edge(u, v, turn=e['turn'], category=e['category'], color=e['color'])
	    for v in G_filtered_cat.nodes:
	        G_filtered_cat.nodes[v]['category'] = G_sem.nodes[v]['category']
	        G_filtered_cat.nodes[v]['title'] = G_sem.nodes[v]['category']
	        G_filtered_cat.nodes[v]['color'] = G_sem.nodes[v]['color']
	    return G_filtered_cat


	def _draw_static_graph(G_filtered_cat):
	    """Draw the graph with a node-category legend on a new 8x8 figure and return the figure."""
	    color_map = [attrs['color'] for _, attrs in G_filtered_cat.nodes(data=True)]
	    color_map2 = [attrs['color'] for _, _, attrs in G_filtered_cat.edges(data=True)]
	    # Unique (category, colour) pairs in first-seen order.
	    legend_entries = dict.fromkeys(
	        (attrs['category'], attrs['color']) for _, attrs in G_filtered_cat.nodes(data=True))

	    fig, ax = plt.subplots(figsize=(8, 8))
	    for category, color in legend_entries:
	        # Empty scatter: contributes a legend handle without plotting a point.
	        ax.scatter([], [], color=color, label=category)
	    if legend_entries:
	        ax.legend(bbox_to_anchor=(1.05, 1.0), loc=1, title='Nodes')
	    n_nodes = G_filtered_cat.order()
	    if n_nodes:
	        # An empty graph would otherwise divide by zero in the spacing term.
	        pos = nx.spring_layout(G_filtered_cat, k=5 / math.sqrt(n_nodes))
	        nx.draw(G_filtered_cat, pos=pos, ax=ax, node_color=color_map, edge_color=color_map2, with_labels=True)
	    plt.tight_layout()
	    plt.show()
	    return fig


	def _metric_or(compute, default):
	    """Return ``compute()``, or ``default`` when the metric is undefined for the graph."""
	    try:
	        return compute()
	    except _METRIC_ERRORS:
	        return default


	def _z_score(value, baseline):
	    """Return the z-score of ``value`` against ``baseline``; NaN if it cannot be computed."""
	    return _metric_or(lambda: (value - mean(baseline)) / stdev(baseline), np.nan)


	def _graph_metrics(G_filtered_cat):
	    """Compute size, distance, degree, clustering and component metrics as a one-row DataFrame.

	    Component counts are z-scored against ``_N_RANDOM_GRAPHS`` seeded G(n, m)
	    directed random graphs with the same number of nodes and edges.
	    """
	    G_stat = nx.DiGraph(G_filtered_cat)  # parallel edges collapse here
	    G_undirected = nx.Graph(G_stat)
	    nn = G_stat.number_of_nodes()
	    ne = G_stat.number_of_edges()

	    diameter = _metric_or(lambda: nx.diameter(G_undirected), np.nan)
	    aspl = _metric_or(lambda: nx.average_shortest_path_length(G_undirected), np.nan)
	    ad = sum(d for _, d in G_stat.degree()) / float(nn) if nn else 0
	    awd = sum(d for _, d in G_stat.degree(weight='weight')) / float(nn) if nn else 0
	    gd = nx.density(G_stat)
	    cc = _metric_or(lambda: nx.average_clustering(G_undirected), np.nan)

	    components = list(nx.connected_components(G_undirected))
	    ncc = len(components)
	    lcc = max(map(len, components), default=0)
	    lscc = max(map(len, nx.strongly_connected_components(G_stat)), default=0)

	    ncc_list = []
	    lcc_list = []
	    lscc_list = []
	    for seed in range(_N_RANDOM_GRAPHS):
	        G_rand = nx.gnm_random_graph(nn, ne, seed=seed, directed=True)
	        rand_components = list(nx.connected_components(nx.Graph(G_rand)))
	        ncc_list.append(len(rand_components))
	        if rand_components:
	            lcc_list.append(max(map(len, rand_components)))
	        rand_strong = list(nx.strongly_connected_components(G_rand))
	        if rand_strong:
	            lscc_list.append(max(map(len, rand_strong)))

	    nccz = _z_score(ncc, ncc_list)
	    lccz = _z_score(lcc, lcc_list)
	    lsccz = _z_score(lscc, lscc_list)

	    cols = ['#Nodes', '#Edges', 'Diameter', 'ASPL', 'AD', 'AWD', 'Density', 'CC', 'NCC', 'LCC', 'LSCC', 'NCCZ', 'LCCZ', 'LSCCZ']
	    return pd.DataFrame(data=[[nn, ne, diameter, aspl, ad, awd, gd, cc, ncc, lcc, lscc, nccz, lccz, lsccz]], columns=cols)

	#Build gradio interface
	# Input widgets, listed in the positional order of semantic_graph's parameters:
	# the text, the SRL backend, then one checkbox per semantic layer.
	speech_box = gr.Textbox(label='Insert speech sample:', placeholder= "Type or paste...")
	model_choice = gr.Radio(["Transformer_SRL", "VerbAtlas"], label='Select Semantic Role Labeling model and semantic layers:')
	layer_checkboxes = [
	    gr.Checkbox(label=layer_label)
	    for layer_label in ('Flow of Predicates', 'Predication', 'Action', 'Setting',
	                        'Cause', 'Manner', 'Negation', 'Orientation')
	]

	# One pre-filled example row (text, model, then the eight layer flags).
	# A second, longer example is kept here but disabled:
	# ['A large hall, numerous guests, whom we were receiving. Among them was Irma. I at once took her to one side, as though to answer her letter and to reproach her for not having accepted my solution yet.','Transformer_SRL', False, True, True, True, False, False, False, False]
	example_rows = [['The dog is chasing the cat.', 'VerbAtlas', False, True, True, False, False, False, False, False]]

	# Output widgets, matching the (figure, metrics table, html path) tuple returned by semantic_graph.
	result_widgets = [
	    gr.Plot(label='Graph Representation', scroll_to_output=True),
	    gr.Dataframe(label='Graph Metrics'),
	    gr.File(label='Interactive Graph'),
	]

	demo = gr.Interface(
	    fn=semantic_graph,
	    title = 'Multi-Layered Semantic Speech Graph - Tang Lab',
	    inputs=[speech_box, model_choice] + layer_checkboxes,
	    examples=example_rows,
	    outputs=result_widgets,
	)
	demo.launch() #share=True