Spaces:

egumasa
/

engagement-analyzer-demo5

Sleeping

App Files Files Community

engagement-analyzer-demo5 / utils /visualize.py

egumasa

push

0146ef9 2 months ago

raw

history blame

6.3 kB

	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-

	#
	# This code is adapted from spacy-streamlit package by explosion
	# https://github.com/explosion/spacy-streamlit/blob/master/spacy_streamlit/__init__.py
	#

	from typing import List, Sequence, Tuple, Optional, Dict, Union, Callable
	import streamlit as st
	import spacy
	from spacy.language import Language
	from spacy import displacy
	import pandas as pd

	import streamlit as st
	from spacy_streamlit import visualize_spans
	from spacy_streamlit.util import load_model, process_text, get_svg, get_html, LOGO

	from pipeline.post_processors import (
	simple_table,
	const_table,
	ngrammar,
	diversity_values,
	)
	from skbio import diversity as dv

	# Numeric components of the installed spaCy version, e.g. (3, 3, 0).
	# Non-numeric components (pre-release / dev suffixes such as "dev0" or
	# "0rc1") are skipped so that importing this module does not fail with a
	# ValueError from int() on such builds.
	SPACY_VERSION = tuple(
	    int(part) for part in spacy.__version__.split(".") if part.isdigit()
	)

	# fmt: off
	# Default span attributes used as the leading columns of the span table.
	# SPAN_ATTRS = ["text", "label_", "start", "end", "start_char", "end_char"]
	SPAN_ATTRS = [
	    "text",
	    "label_",
	    "start",
	    "end",
	]

	# Label set used to reindex per-label summaries so that every category is
	# present (zero-filled) even when it does not occur in the document.
	CATEGORIES = ['ATTRIBUTION', "CITATION", "COUNTER", "DENY", "ENDOPHORIC", "ENTERTAIN", "JUSTIFYING", "MONOGLOSS", "PROCLAIM", "SOURCES"]

	def visualize_spans(
	    doc: Union[spacy.tokens.Doc, Dict[str, str]],
	    *,
	    spans_key: str = "sc",
	    attrs: List[str] = SPAN_ATTRS,
	    show_table: bool = True,
	    title: Optional[str] = "Spans",
	    manual: bool = False,
	    displacy_options: Optional[Dict] = None,
	    simple: bool = True,
	    show_confidence: bool = False,
	    show_diversity: bool = False,
	    show_ngrams: bool = False,
	) -> None:
	    """
	    Visualizer for spans.

	    Renders the displaCy span visualization in Streamlit and, optionally, a
	    table of span attributes followed by summary tables derived from it.
	    NOTE: this definition shadows the `visualize_spans` imported from
	    `spacy_streamlit` at the top of the file.

	    doc (Doc, Dict): The document to visualize.
	    spans_key (str): Which spans key to render spans from. Default is "sc".
	        The same key is used for the rendering and for the span table.
	    attrs (list): The attributes on the entity Span to be labeled. Attributes are displayed only when the show_table
	        argument is True.
	    show_table (bool): Flag signifying whether to show a table with accompanying span attributes.
	    title (str): The title displayed at the top of the Spans visualization.
	    manual (bool): Flag signifying whether the doc argument is a Doc object or a List of Dicts containing span information.
	    displacy_options (Dict): Dictionary of options to be passed to the displacy render method for generating the HTML to be rendered.
	        See https://spacy.io/api/top-level#displacy_options-span
	        The dictionary is copied; the caller's object is not modified.
	    simple (bool): If True the table is built with `simple_table`, otherwise with `const_table`.
	    show_confidence (bool): Flag signifying whether to show, per label, the span count and the
	        median / min / max of the "Conf. score" column.
	    show_diversity (bool): Flag signifying whether to show the values returned by `diversity_values`
	        for the label counts, together with the counts and the document length.
	    show_ngrams (bool): Flag signifying whether to show bigrams and trigrams of the label sequence
	        (labels ordered by span start).

	    RAISES (ValueError): If the installed spaCy version is older than 3.3.0.
	    """
	    if SPACY_VERSION < (3, 3, 0):
	        raise ValueError(
	            f"'visualize_spans' requires spacy>=3.3.0. You have spacy=={spacy.__version__}"
	        )

	    # Work on a copy so the caller's options dict is never mutated.
	    displacy_options = dict(displacy_options) if displacy_options else {}
	    displacy_options["spans_key"] = spans_key

	    if title:
	        st.header(title)

	    if manual:
	        if show_table:
	            st.warning(
	                "When the parameter 'manual' is set to True, the parameter 'show_table' must be set to False."
	            )
	        if not isinstance(doc, dict):
	            st.warning(
	                "When the parameter 'manual' is set to True, the parameter 'doc' must be of type 'Dict', not 'spacy.tokens.Doc'."
	            )

	    html = displacy.render(
	        doc,
	        style="span",
	        options=displacy_options,
	        manual=manual,
	    )
	    st.write(f"{get_html(html)}", unsafe_allow_html=True)

	    if not show_table:
	        return

	    # Build the table from the same span group that was rendered above
	    # (previously the key was hard-coded to 'sc' here).
	    build_table = simple_table if simple else const_table
	    data, cols = build_table(doc, spans_key=spans_key, attrs=attrs)

	    if not data:
	        return

	    df = pd.DataFrame(data, columns=cols)
	    df = df.astype({"start": int, "end": int})
	    df = df.sort_values(by=['start'])

	    st.subheader("Engagement span information")
	    st.dataframe(
	        df.style.highlight_between(subset='Conf. score', right=.7))

	    # Per-label span counts, zero-filled for categories that do not occur.
	    counts = df['label_'].value_counts().reindex(CATEGORIES, fill_value=0)

	    if show_confidence:
	        st.subheader("Label counts & Diagnostic confidence score summary")
	        label_counts = df.groupby('label_').agg({
	            "label_": 'count',
	            "Conf. score": ['median', 'min', 'max'],
	        }).round(4).reindex(CATEGORIES, fill_value=0)
	        st.dataframe(label_counts)

	    if show_ngrams:
	        # Label sequence in order of span start (df is sorted by 'start').
	        sequences = list(df['label_'])

	        # Engagement ngrams
	        span_bigrams = ngrammar(seq=sequences, n=2, concat=True)
	        span_trigrams = ngrammar(seq=sequences, n=3, concat=True)

	        st.dataframe(pd.DataFrame(span_bigrams))
	        st.code(span_trigrams)

	    # NOTE(review): the original indentation of this section was lost; it is
	    # placed at the "table has rows" level here -- confirm it was not meant
	    # to be nested under `show_ngrams`.
	    # Skip the crosstab instead of raising KeyError when the table built
	    # above has no 'grammatical realization' column.
	    if 'grammatical realization' in df.columns:
	        st.subheader("Engagement label by grammatical function")
	        label_dep = pd.crosstab(df['grammatical realization'], df['label_'])
	        st.dataframe(label_dep)

	    if show_diversity:
	        st.subheader('Diversity of rhetorical features')
	        st.markdown("##### Entropy based diversity measures")

	        div = diversity_values(list(counts))
	        div_data = pd.DataFrame.from_dict(div, orient='index')

	        # One wide row: diversity values followed by the per-label counts.
	        doc_data = pd.concat([div_data, counts], axis=0).T
	        # No file is associated with text analysed here, hence the
	        # placeholder filename.
	        filename = "NA"
	        doc_data.insert(0, "filename", filename, True)
	        doc_data.insert(1, "nwords", len(doc), True)
	        st.dataframe(doc_data)