#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# This code is adapted from spacy-streamlit package by explosion
# https://github.com/explosion/spacy-streamlit/blob/master/spacy_streamlit/__init__.py
#

from typing import List, Sequence, Tuple, Optional, Dict, Union, Callable
import streamlit as st
import spacy
from spacy.language import Language
from spacy import displacy
import pandas as pd

import streamlit as st
# NOTE: this imported name is rebound by the local ``visualize_spans`` below.
from spacy_streamlit import visualize_spans
from spacy_streamlit.util import load_model, process_text, get_svg, get_html, LOGO

from pipeline.post_processors import (
    simple_table,
    const_table,
    ngrammar,
    diversity_values,
)
from skbio import diversity as dv

SPACY_VERSION = tuple(map(int, spacy.__version__.split(".")))

# fmt: off
# SPAN_ATTRS = ["text", "label_", "start", "end", "start_char", "end_char"]
SPAN_ATTRS = [
    "text",
    "label_",
    "start",
    "end",
]

CATEGORIES = ['ATTRIBUTION', "CITATION", "COUNTER", "DENY", "ENDOPHORIC", "ENTERTAIN", "JUSTIFYING", "MONOGLOSS", "PROCLAIM", "SOURCES"]
# fmt: on


def visualize_spans(
    doc: Union[spacy.tokens.Doc, Dict[str, str]],
    *,
    spans_key: str = "sc",
    attrs: List[str] = SPAN_ATTRS,
    show_table: bool = True,
    title: Optional[str] = "Spans",
    manual: bool = False,
    displacy_options: Optional[Dict] = None,
    simple: bool = True,
    show_confidence: bool = False,
    show_diversity: bool = False,
    show_ngrams: bool = False,
):
    """
    Visualizer for spans.

    Renders the spans of ``doc`` with displaCy in the Streamlit app and,
    optionally, a span table plus summary tables derived from it.

    doc (Doc, Dict): The document to visualize.
    spans_key (str): Which spans key to render spans from. Default is "sc".
        The same key is used for the rendering and for the span table.
    attrs (list): The attributes on the entity Span to be labeled. Attributes are displayed only when the show_table
        argument is True.
    show_table (bool): Flag signifying whether to show a table with accompanying span attributes.
    title (str): The title displayed at the top of the Spans visualization.
    manual (bool): Flag signifying whether the doc argument is a Doc object or a List of Dicts containing span information.
    displacy_options (Dict): Dictionary of options to be passed to the displacy render method for generating the HTML to be rendered.
        See https://spacy.io/api/top-level#displacy_options-span
        The dictionary passed in is not modified.
    simple (bool): If True the table is built with ``simple_table``, otherwise with ``const_table``.
    show_confidence (bool): Flag signifying whether to show per-label counts with median/min/max of "Conf. score".
    show_diversity (bool): Flag signifying whether to show the values returned by ``diversity_values`` for the label counts.
    show_ngrams (bool): Flag signifying whether to show bigrams and trigrams of the label sequence.

    RAISES (ValueError): If the installed spaCy version is older than 3.3.0.
    """
    if SPACY_VERSION < (3, 3, 0):
        raise ValueError(
            f"'visualize_spans' requires spacy>=3.3.0. You have spacy=={spacy.__version__}"
        )

    # Work on a copy so the caller's options dict is never mutated.
    displacy_options = dict(displacy_options) if displacy_options else {}
    displacy_options["spans_key"] = spans_key

    if title:
        st.header(title)

    if manual:
        # NOTE(review): these are warnings only; a table is still attempted
        # below when show_table is True. Whether the table helpers accept a
        # dict is not visible from this file.
        if show_table:
            st.warning(
                "When the parameter 'manual' is set to True, the parameter 'show_table' must be set to False."
            )
        if not isinstance(doc, dict):
            st.warning(
                "When the parameter 'manual' is set to True, the parameter 'doc' must be of type 'Dict', not 'spacy.tokens.Doc'."
            )

    html = displacy.render(
        doc,
        style="span",
        options=displacy_options,
        manual=manual,
    )
    st.write(f"{get_html(html)}", unsafe_allow_html=True)

    if not show_table:
        return

    # Use the requested spans key so the table matches the rendering above.
    build_table = simple_table if simple else const_table
    data, cols = build_table(doc, spans_key=spans_key, attrs=attrs)
    if not data:
        return

    df = pd.DataFrame(data, columns=cols)
    df = df.astype({"start": int, "end": int})
    df = df.sort_values(by=["start"])

    st.subheader("Engagement span information")
    # No lower bound is given, so every confidence score <= 0.7 is highlighted.
    st.dataframe(df.style.highlight_between(subset='Conf. score', right=.7))

    # One count per known category; categories absent from the doc get 0 and
    # labels outside CATEGORIES are dropped by the reindex.
    counts = df['label_'].value_counts().reindex(CATEGORIES, fill_value=0)

    if show_confidence:
        st.subheader("Label counts & Diagnostic confidence score summary")
        label_counts = (
            df.groupby('label_')
            .agg({
                "label_": 'count',
                "Conf. score": ['median', 'min', 'max'],
            })
            .round(4)
            .reindex(CATEGORIES, fill_value=0)
        )
        st.dataframe(label_counts)

    if show_ngrams:
        # Engagement ngrams over the labels in document order (df is sorted by start).
        sequences = list(df['label_'])
        span_bigrams = ngrammar(seq=sequences, n=2, concat=True)
        span_trigrams = ngrammar(seq=sequences, n=3, concat=True)

        st.dataframe(pd.DataFrame(span_bigrams))
        st.code(span_trigrams)

    # NOTE(review): the original indentation of this file was lost; this
    # section is assumed to sit at the table level rather than under
    # show_ngrams -- confirm against the intended layout.
    st.subheader("Engagement label by grammatical function")
    label_dep = pd.crosstab(df['grammatical realization'], df['label_'])
    st.dataframe(label_dep)

    if show_diversity:
        st.subheader('Diversity of rhetorical features')
        st.markdown("##### Entropy based diversity measures")

        div = diversity_values(list(counts))
        div_data = pd.DataFrame.from_dict(div, orient='index')

        # Stack the diversity values and the label counts, then transpose so
        # the document is described by a single row.
        doc_data = pd.concat([div_data, counts], axis=0).T

        filename = "NA"
        doc_data.insert(0, "filename", filename, True)
        doc_data.insert(1, "nwords", len(doc), True)
        st.dataframe(doc_data)