NER_Tagger / process_tags.py
kandysh's picture
Update process_tags.py
b6974ae
raw
history blame
No virus
5.44 kB
import collections
import functools
import operator
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import iplot
from spacy import displacy
def no_of_tags(data):
    """Count how often each non-empty tag occurs in ``data["entity"]``.

    Args:
        data: Any object whose ``"entity"`` item is an iterable of tag
            labels (the other functions in this file pass a sentence
            table). An empty string means "no tag" and is skipped.

    Returns:
        dict: Tag label -> number of occurrences, keyed in the order the
        labels are first seen.
    """
    tagged_only = (label for label in data["entity"] if label != "")
    # Counter keeps first-seen order; convert back so callers get a plain dict.
    return dict(collections.Counter(tagged_only))
def list_ents(data_frame):
    """Build displaCy "manual" entity spans for one sentence.

    Offsets are character positions in the text obtained by joining the
    words with single spaces, which is how ``tag_display`` builds the text
    it renders.

    Args:
        data_frame: Object whose ``"words"`` and ``"entity"`` items are
            parallel iterables of word strings and tag labels. An empty
            label means the word is not an entity.

    Returns:
        list[dict]: One ``{"start", "end", "label"}`` dict per tagged word.
        ``end`` is exclusive and stops at the last character of the word,
        so the separating space is not part of the span. ``label`` is the
        upper-cased tag.
    """
    ents_list = []
    offset = 0
    for word, ent in zip(data_frame["words"], data_frame["entity"]):
        if ent != "":
            ents_list.append(
                {"start": offset, "end": offset + len(word), "label": ent.upper()}
            )
        # Advance for every word, tagged or not: the word plus one joining space.
        offset += len(word) + 1
    return ents_list
def color_creator(color_data):
    """Turn a flat ``[key, value, key, value, ...]`` sequence into a dict.

    Args:
        color_data: Indexable sequence in which every even position holds a
            key and the following odd position holds its value (the callers
            in this file pass a label/colour list).

    Returns:
        dict: Mapping from each even-position item to the item after it.

    Raises:
        IndexError: If a plain list/tuple has odd length, because the last
            key has no value after it.
    """
    return {
        color_data[pos]: color_data[pos + 1]
        for pos in range(0, len(color_data), 2)
    }
def options(data_frame):
    """Build the options dict handed to ``displacy.render``.

    Args:
        data_frame: Flat label/colour sequence accepted by ``color_creator``.

    Returns:
        dict: ``{"ents": <labels>, "colors": <label -> colour mapping>}``,
        where ``ents`` is the key view of the colour mapping.
    """
    label_colors = color_creator(data_frame)
    return {"ents": label_colors.keys(), "colors": label_colors}
def tag_display(sent, tag_colors):
    """Render one tagged sentence with displaCy's entity visualiser.

    The sentence text is the words joined by single spaces; entity spans
    come from ``list_ents`` and colours from ``options``.

    Args:
        sent: Sentence data whose ``"words"`` item is an iterable of word
            strings, and which ``list_ents`` can read.
        tag_colors: Mapping whose ``"NER"`` item is the flat label/colour
            sequence understood by ``options``.

    Returns:
        Whatever ``displacy.render`` returns. Per the spaCy documentation
        this is the HTML markup outside a notebook, and ``None`` when the
        markup is displayed inline in Jupyter - confirm against the
        installed spaCy version.
    """
    doc = {"text": " ".join(sent["words"]), "ents": list_ents(sent), "title": None}
    # Hand the result back instead of dropping it, so callers outside a
    # notebook can actually use the generated markup.
    return displacy.render(doc, style="ent", manual=True, options=options(tag_colors["NER"]))
def bubble_sentence(ls, tag_colors):
    """Show a bubble chart of tag counts for a single sentence.

    Each distinct tag becomes one marker: x is the tag, y is its count and
    the marker size is the count times 40. The figure is passed to
    ``iplot``; nothing is returned.

    Args:
        ls: Sentence table. Must work with ``no_of_tags`` and provide
            ``count(axis=0)`` with ``"clean_words"`` and ``"clean_entity"``
            entries (presumably a pandas DataFrame - confirm with caller).
        tag_colors: Mapping whose ``"NER"`` item is the flat label/colour
            sequence understood by ``color_creator``.
    """
    counts_by_tag = pd.DataFrame(no_of_tags(ls).items(), columns=["Entities", "Counts"])

    # NOTE(review): the colours are every palette value in palette order,
    # not a per-entity lookup - confirm that is the intended pairing.
    palette = list(color_creator(tag_colors["NER"]).values())

    bubbles = go.Scatter(
        x=counts_by_tag["Entities"],
        y=counts_by_tag["Counts"],
        mode='markers',
        marker=dict(color=palette, size=counts_by_tag["Counts"] * 40),
    )

    # Per-column counts, used for the word/tag totals in the title.
    column_counts = ls.count(axis=0)
    layout = go.Layout(
        title=f'Words :{column_counts["clean_words"]} Tags :{column_counts["clean_entity"]}',
        xaxis=dict(title='Tags'),
        yaxis=dict(title='Count'),
        hovermode="closest",
    )
    iplot(go.Figure(data=[bubbles], layout=layout))
def bubble_document(data_frame, tag_colors):
    """Show a bubble chart of tag counts summed over a whole document.

    Per-sentence counts from ``no_of_tags`` are added together. Each
    distinct tag becomes one marker: x is the tag, y is its total count and
    the marker size is the count times 15. The figure is passed to
    ``iplot``; nothing is returned.

    Args:
        data_frame: Iterable of sentence tables accepted by ``no_of_tags``.
            An empty iterable produces an empty chart instead of raising.
        tag_colors: Mapping whose ``"NER"`` item is the flat label/colour
            sequence understood by ``color_creator``.
    """
    # Accumulate into one Counter; unlike reduce() without an initial
    # value, this also works when the document has no sentences.
    doc_tags = collections.Counter()
    for ls in data_frame:
        doc_tags.update(no_of_tags(ls))

    doc_data = pd.DataFrame(dict(doc_tags).items(), columns=["Entities", "Counts"])

    # NOTE(review): the colours are every palette value in palette order,
    # not a per-entity lookup - confirm that is the intended pairing.
    palette = list(color_creator(tag_colors["NER"]).values())

    data = [
        go.Scatter(
            x=doc_data["Entities"],
            y=doc_data["Counts"],
            mode='markers',
            marker=dict(color=palette, size=doc_data["Counts"] * 15),
        )
    ]
    layout = go.Layout(
        title="Distribution of Tags in the Document",
        xaxis=dict(title='Tags'),
        yaxis=dict(title='Count'),
        hovermode="closest",
    )
    iplot(go.Figure(data=data, layout=layout))
def line_document(data_frame):
    """Show a line chart of the words-per-tag ratio for each sentence.

    One row is collected per sentence: its 1-based position, the counts of
    the ``"clean_words"`` and ``"clean_entity"`` columns, and a
    ``"<words> : <tags>"`` label drawn next to the point. The y value is
    words divided by tags. The figure is passed to ``iplot``; nothing is
    returned.

    Args:
        data_frame: Iterable of sentence tables providing ``count(axis=0)``
            with ``"clean_words"`` and ``"clean_entity"`` entries
            (presumably pandas DataFrames - confirm with caller).
    """
    line_list = pd.DataFrame(columns=["sentence", "words", "tags", "words_to_tag"])
    total_words = 0
    total_tags = 0

    for ls in data_frame:
        column_counts = ls.count(axis=0)
        n_words = column_counts["clean_words"]
        n_tags = column_counts["clean_entity"]
        # The next 1-based position serves as both row label and sentence number.
        row_no = len(line_list) + 1
        line_list.loc[row_no] = [row_no, n_words, n_tags, f'{n_words} : {n_tags}']
        total_tags += n_tags
        total_words += n_words

    line_list["words_tag"] = line_list["words"] / line_list["tags"]

    chart_title = f'Total Words : {total_words} ' f'Total Entities : {total_tags}'
    fig = px.line(
        line_list,
        x="sentence",
        y="words_tag",
        markers=True,
        text="words_to_tag",
        template="plotly_dark",
        title=chart_title,
        line_shape='vh',
    )
    iplot(fig)
def scatter_document(data_frame, tags_data):
    """Build one figure with a marker trace of tag counts per sentence.

    Every trace uses the full label list ``tags_data["_ntags_"]`` on the x
    axis; y is how often each label occurs in that sentence (0 when it is
    absent) and the marker size is the count times 25.

    Each sentence gets a single colour from the ``"NER"`` palette. Colours
    are reused cyclically, so sentences beyond the palette length are still
    plotted. With an empty palette no traces are produced.

    Args:
        data_frame: Iterable of sentence tables accepted by ``no_of_tags``.
        tags_data: Mapping with ``"NER"`` (flat label/colour sequence
            understood by ``color_creator``) and ``"_ntags_"`` (the label
            list for the x axis).

    Returns:
        plotly.graph_objects.Figure: The combined figure. It is returned,
        not displayed.
    """
    palette = list(color_creator(tags_data["NER"]).values())
    # Same constructor as before, so "_ntags_" is interpreted identically.
    entity_labels = pd.DataFrame(tags_data["_ntags_"], columns=["entity"])["entity"].tolist()

    traces = []
    if palette:
        for position, sent in enumerate(data_frame):
            found = no_of_tags(sent)
            # Direct lookup with a 0 default replaces the left-merge plus
            # NaN replacement, which relied on the removed ``np.NAN`` alias.
            counts = [found.get(label, 0) for label in entity_labels]
            traces.append(
                go.Scatter(
                    x=entity_labels,
                    y=counts,
                    mode='markers',
                    marker=dict(
                        color=palette[position % len(palette)],
                        size=[count * 25 for count in counts],
                    ),
                )
            )

    layout = go.Layout(
        title="Document Tag Distribution",
        xaxis=dict(title='Tags'),
        yaxis=dict(title='Count'),
        hovermode="closest",
        template="plotly_dark",
    )
    return go.Figure(data=traces, layout=layout)