"""Plotly and displaCy visualisations for per-sentence NER tag data."""

import collections
import functools
import itertools
import operator

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import iplot
from spacy import displacy


def no_of_tags(data):
    """Count how often each non-empty tag occurs in ``data["entity"]``.

    Args:
        data: Object indexable by ``"entity"`` (presumably a per-sentence
            DataFrame -- confirm against callers) yielding one tag per word.
            The empty string marks an untagged word and is skipped.

    Returns:
        A plain dict mapping each tag to its count, in first-seen order.
    """
    return dict(collections.Counter(tag for tag in data["entity"] if tag != ""))


def list_ents(data_frame):
    """Build displaCy manual-mode entity spans for one sentence.

    Offsets refer to the text produced by ``' '.join(data_frame["words"])``,
    i.e. every word is followed by exactly one separator character.

    Args:
        data_frame: Object whose ``"words"`` and ``"entity"`` entries iterate
            in lockstep; an empty-string entity means "no entity".

    Returns:
        A list of ``{"start", "end", "label"}`` dicts with upper-cased labels.
    """
    index = 0
    ents_list = []
    for word, ent in zip(data_frame["words"], data_frame["entity"]):
        if ent != "":
            # "end" is exclusive and must stop at the last character of the
            # word; adding the separator would highlight the trailing space
            # and overrun the text on the final word.
            ents_list.append(
                {"start": index, "end": index + len(word), "label": ent.upper()}
            )
        # Advance past the word and the single space that joins it to the next.
        index += len(word) + 1
    return ents_list


def color_creator(color_data):
    """Turn a flat ``[tag, colour, tag, colour, ...]`` sequence into a dict.

    Args:
        color_data: Indexable sequence alternating tag names and colours.

    Returns:
        A dict mapping each tag (even positions) to the colour that follows it.

    Raises:
        IndexError: If a list-like ``color_data`` has an odd number of items.
    """
    return {
        color_data[pos]: color_data[pos + 1]
        for pos in range(0, len(color_data), 2)
    }


def options(data_frame):
    """Build the displaCy ``options`` dict from a flat tag/colour sequence.

    Args:
        data_frame: Flat tag/colour sequence accepted by ``color_creator``
            (despite the parameter name, it is not used as a DataFrame here).

    Returns:
        A dict with ``"ents"`` (list of tag names) and ``"colors"`` (tag to
        colour mapping).
    """
    palette = color_creator(data_frame)
    return {"ents": list(palette), "colors": palette}


def tag_display(sent, tag_colors):
    """Render one sentence with its entities highlighted via displaCy.

    Args:
        sent: Sentence data providing ``"words"`` and ``"entity"``.
        tag_colors: Mapping whose ``"NER"`` entry is a flat tag/colour sequence.

    Returns:
        Whatever ``displacy.render`` returns, so callers outside a notebook
        can use the generated markup instead of losing it.
    """
    ex = {"text": " ".join(sent["words"]), "ents": list_ents(sent), "title": None}
    return displacy.render(
        ex, style="ent", manual=True, options=options(tag_colors["NER"])
    )


def _tag_bubble_figure(counts, tag_colors, size_factor, title):
    """Build a bubble chart of tag counts (shared by the bubble_* functions)."""
    frame = pd.DataFrame(list(counts.items()), columns=["Entities", "Counts"])
    # NOTE(review): colours are applied positionally in palette order, not
    # looked up per entity, so they only line up with the plotted tags when
    # both sequences share the same order -- confirm this is intended.
    colors = list(color_creator(tag_colors["NER"]).values())
    trace = go.Scatter(
        x=frame["Entities"],
        y=frame["Counts"],
        mode="markers",
        marker=dict(color=colors, size=frame["Counts"] * size_factor),
    )
    layout = go.Layout(
        title=title,
        xaxis=dict(title="Tags"),
        yaxis=dict(title="Count"),
        hovermode="closest",
    )
    return go.Figure(data=[trace], layout=layout)


def bubble_sentence(ls, tag_colors):
    """Show a bubble chart of the tag counts of a single sentence.

    Args:
        ls: Sentence data; must support ``count(axis=0)`` with
            ``"clean_words"`` and ``"clean_entity"`` entries and provide an
            ``"entity"`` column (presumably a DataFrame -- confirm).
        tag_colors: Mapping whose ``"NER"`` entry is a flat tag/colour sequence.
    """
    totals = ls.count(axis=0)  # computed once; used for both title numbers
    title = f'Words :{totals["clean_words"]} Tags :{totals["clean_entity"]}'
    iplot(_tag_bubble_figure(no_of_tags(ls), tag_colors, 40, title))


def bubble_document(data_frame, tag_colors):
    """Show a bubble chart of tag counts summed over every sentence.

    Args:
        data_frame: Iterable of sentence objects accepted by ``no_of_tags``.
        tag_colors: Mapping whose ``"NER"`` entry is a flat tag/colour sequence.
    """
    per_sentence = (collections.Counter(no_of_tags(ls)) for ls in data_frame)
    # The empty Counter seed keeps an empty document from raising TypeError.
    doc_tags = dict(
        functools.reduce(operator.add, per_sentence, collections.Counter())
    )
    iplot(
        _tag_bubble_figure(
            doc_tags, tag_colors, 15, "Distribution of Tags in the Document"
        )
    )


def line_document(data_frame):
    """Plot the words-per-tag ratio of every sentence as a line chart.

    Args:
        data_frame: Iterable of sentence objects supporting ``count(axis=0)``
            with ``"clean_words"`` and ``"clean_entity"`` entries.
    """
    rows = []
    total_words, total_tags = 0, 0
    for number, ls in enumerate(data_frame, start=1):
        totals = ls.count(axis=0)  # one pass per sentence instead of six
        n_words = totals["clean_words"]
        n_tags = totals["clean_entity"]
        rows.append([number, n_words, n_tags, f"{n_words} : {n_tags}"])
        total_words += n_words
        total_tags += n_tags

    # Build the frame in one go; growing it row by row via .loc is quadratic.
    line_list = pd.DataFrame(
        rows, columns=["sentence", "words", "tags", "words_to_tag"]
    )
    line_list["words_tag"] = line_list["words"] / line_list["tags"]

    fig = px.line(
        line_list,
        x="sentence",
        y="words_tag",
        markers=True,
        text="words_to_tag",
        template="plotly_dark",
        title=f'Total Words : {total_words} '
              f'Total Entities : {total_tags}',
        line_shape="vh",
    )
    iplot(fig)


def scatter_document(data_frame, tags_data):
    """Build one figure overlaying a tag-count scatter trace per sentence.

    Args:
        data_frame: Iterable of sentence objects accepted by ``no_of_tags``.
        tags_data: Mapping with ``"NER"`` (flat tag/colour sequence) and
            ``"_ntags_"`` (the tags to show on the x axis).

    Returns:
        A ``plotly.graph_objects.Figure`` with one marker trace per sentence.
    """
    # Colours are assigned one per sentence. Cycling keeps sentences beyond
    # the palette length from being silently dropped by zip(); an empty
    # palette falls back to plotly's default colour.
    palette = list(color_creator(tags_data["NER"]).values()) or [None]
    all_tags = pd.DataFrame(tags_data["_ntags_"], columns=["entity"])

    traces = []
    for sent, color in zip(data_frame, itertools.cycle(palette)):
        sent_counts = pd.DataFrame(
            list(no_of_tags(sent).items()), columns=["entity", "count"]
        )
        # Left merge keeps every known tag; tags absent from the sentence
        # come back as NaN and are counted as zero.
        tags_df = all_tags.merge(sent_counts, how="left", on="entity")
        counts = tags_df["count"].fillna(0).astype(int)
        traces.append(
            go.Scatter(
                x=tags_df["entity"],
                y=counts,
                mode="markers",
                marker=dict(color=color, size=counts * 25),
            )
        )

    layout = go.Layout(
        title="Document Tag Distribution",
        xaxis=dict(title="Tags"),
        yaxis=dict(title="Count"),
        hovermode="closest",
        template="plotly_dark",
    )
    return go.Figure(data=traces, layout=layout)