# text_dissection / app.py
import matplotlib.pyplot as plt
import pandas as pd
plt.rcParams["figure.figsize"] = (30,20)
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')
nltk.download('averaged_perceptron_tagger')
import spacy
# import en_core_web_sm
nlp = spacy.load("en_core_web_sm")
from spacy import displacy
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image
import streamlit as st
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
import seaborn as sns
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS
from textstat import flesch_reading_ease
# import SessionState
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
def create_wordcloud(text):
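    """Render a word cloud of the input text; the canvas size follows the sidebar width/height sliders."""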
st.header("Here is wordcloud..")
wc = WordCloud(width=width*100 , height=height*100 , background_color='white', colormap='prism', collocations = False).generate_from_text(text)
fig, ax = plt.subplots()
# fig, ax = plt.subplots(figsize=(width , height))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
st.pyplot(fig)
# @st.cache(suppress_st_warning=True, allow_output_mutation=True)
def get_input():
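    """Collect the text to analyse: two bundled example speeches, a Clear button,
    and a text area whose value and widget key live in st.session_state ('x' and 'k')."""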
    text_dream = 'ex_dream.txt'
    text_tryst = 'ex_tryst.txt'
    with open(text_dream) as f:
        dream = f.readlines()
    with open(text_tryst) as f:
        tryst = f.readlines()
    if 'x' not in st.session_state:
        st.session_state['x'] = ' '
    if 'k' not in st.session_state:
        st.session_state['k'] = 0
    if st.button('Example: I have a dream - M. King'):
        st.session_state['x'] = ' '.join(dream)
    if st.button('Example: Tryst with destiny - J. Nehru'):
        st.session_state['x'] = ' '.join(tryst)
    em = st.empty()
    if st.button('Clear'):
        st.session_state['k'] += 1
        st.session_state['x'] = ' '
    text = em.text_area("Paste your text or Click Example", value=st.session_state['x'],
                        key=st.session_state['k'], height=200, placeholder="Add here..")
    return text
def create_ngram(text):
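    """Plot the 10 most frequent unigrams, bigrams, and trigrams (stopwords removed) as horizontal bar charts."""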
st.header("N-Gram Anaysis is >>")
def plot_top_ngrams_barchart(text, n=2):
stop=set(stopwords.words('english'))
new= text.str.split()
new=new.values.tolist()
corpus=[word for i in new for word in i]
def _get_top_ngram(corpus, n=None):
vec = CountVectorizer(ngram_range=(n, n), stop_words=stop).fit(corpus)
bag_of_words = vec.transform(corpus)
sum_words = bag_of_words.sum(axis=0)
words_freq = [(word, sum_words[0, idx])
for word, idx in vec.vocabulary_.items()]
words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
return words_freq[:10]
top_n_bigrams=_get_top_ngram(text,n)[:10]
x,y=map(list,zip(*top_n_bigrams))
fig = px.bar(x=y,y=x, color=y)
fig.update_layout( yaxis=dict(autorange='reversed'))
fig.update_layout(autosize=False,width=width*100,height=height*100)
st.plotly_chart(fig)
st.subheader(f"Unigram:")
plot_top_ngrams_barchart(pd.Series([text]), 1)
st.subheader(f"Bigram:")
plot_top_ngrams_barchart(pd.Series([text]), 2)
st.subheader(f"Trigram:")
plot_top_ngrams_barchart(pd.Series([text]), 3)
# Overall Sentiment
def create_sentiment(text, tokenized_sent):
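    """Show the overall VADER compound score plus per-sentence ("temporal") sentiment bars."""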
    sentiment_dict = sid.polarity_scores(text)
    st.header("Sentiment Analysis >>")
    st.subheader(f"Overall Sentiment score is = {sentiment_dict['compound']}")
    # decide sentiment as positive, negative, or neutral (standard VADER thresholds)
    if sentiment_dict['compound'] >= 0.05:
        st.subheader("Sentence Overall Rated As Positive")
    elif sentiment_dict['compound'] <= -0.05:
        st.subheader("Sentence Overall Rated As Negative")
    else:
        st.subheader("Sentence Overall Rated As Neutral")
    # Temporal sentiment: one compound score per sentence
    st.subheader("Temporal Sentiment")
    rows = []
    for sent in tokenized_sent:
        sentiment_dict = sid.polarity_scores(sent)
        rows.append({'sentence': sent,
                     'sentiment': sentiment_dict['compound'],
                     'len_sent': len(sent.split())})
    temporal_sentiment = pd.DataFrame(rows, columns=['sentence', 'sentiment', 'len_sent'])
    # "stretch" weights each sentence's score by its length in words
    temporal_sentiment['sentiment_stretch'] = (temporal_sentiment['sentiment'] * temporal_sentiment['len_sent']).astype(float)
    fig = px.bar(temporal_sentiment, x=temporal_sentiment.index, y='sentiment',
                 hover_data=['sentence', 'sentiment', 'sentiment_stretch'],
                 color=(temporal_sentiment['sentiment'] > 0),
                 color_discrete_map={True: 'green', False: 'red'})
    fig.update_layout(autosize=False, width=width * 100, height=height * 100)
    st.plotly_chart(fig)
    st.subheader("Temporal Sentiment Stretch")
    fig = px.bar(temporal_sentiment, x=temporal_sentiment.index, y='sentiment_stretch',
                 hover_data=['sentence', 'sentiment', 'sentiment_stretch'],
                 color=(temporal_sentiment['sentiment'] > 0),
                 color_discrete_map={True: 'green', False: 'red'})
    fig.update_layout(autosize=False, width=width * 100, height=height * 100)
    st.plotly_chart(fig)
# NER
def nested_state(state):
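    """Widget on_change callback: record in session_state that the entity selectbox was used."""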
    # st.session_state['state_ner'] = state
    st.session_state['nested_session'] = state
def create_ner(text):
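    """Chart spaCy entity-label counts, drill into one label, and optionally render displacy markup."""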
    # st.session_state['state_ner'] = True
    st.header("Named Entity Recognition >>")
    st.subheader("Top Entities .. ")
    doc = nlp(text)
    ent = [X.label_ for X in doc.ents]
    counter = Counter(ent)
    count = counter.most_common()
    x, y = map(list, zip(*count))
    fig = px.bar(x=y, y=x, color=y)
    fig.update_layout(yaxis=dict(autorange='reversed'))
    fig.update_layout(autosize=False, width=width * 100, height=height * 100)
    st.plotly_chart(fig)
    st.subheader("What Are Those Entities .. ")
    # pass the callback itself (with args) instead of calling it immediately
    ent_type = st.selectbox("Select Named Entity :", x, on_change=nested_state, args=(True,))
    ent_single = [X.text for X in doc.ents if X.label_ == ent_type]
    counter = Counter(ent_single)
    count = counter.most_common()
    x, y = map(list, zip(*count))
    fig = px.bar(x=y, y=x, color=y)
    fig.update_layout(yaxis=dict(autorange='reversed'))
    fig.update_layout(autosize=False, width=width * 100, height=height * 100)
    st.plotly_chart(fig)
    if st.button("Render NER"):
        st.markdown(displacy.render(doc, style='ent'), unsafe_allow_html=True)
# POS tags
def create_pos(text):
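    """Chart NLTK part-of-speech tag counts for the tokenized text, then chart the words carrying a selected tag."""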
    st.session_state['state_ner'] = True
    st.header("Part of Speech >>")
    st.subheader("Top POS ..")
    # st.markdown(displacy.render(doc, style='dep'), unsafe_allow_html=True)
    # tokenized_word is the word-tokenized input built in the main block
    tagged = nltk.pos_tag(tokenized_word)
    pos = [tag for word, tag in tagged]
    counter = Counter(pos)
    count = counter.most_common()
    x, y = map(list, zip(*count))
    fig = px.bar(x=y, y=x, color=y)
    fig.update_layout(yaxis=dict(autorange='reversed'))
    fig.update_layout(autosize=False, width=width * 100, height=height * 100)
    st.plotly_chart(fig)
    st.subheader("What Are Those POS .. ")
    pos_type = st.selectbox("Select POS :", x)
    pos_single = [word for word, tag in tagged if tag == pos_type]
    counter = Counter(pos_single)
    count = counter.most_common()
    x, y = map(list, zip(*count))
    fig = px.bar(x=y, y=x, color=y)
    fig.update_layout(yaxis=dict(autorange='reversed'))
    fig.update_layout(autosize=False, width=width * 100, height=height * 100)
    st.plotly_chart(fig)
# Text Complexity
def create_complexity(text, tokenized_sent):
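    """Show the overall Flesch Reading Ease score plus a per-sentence ("temporal") complexity chart."""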
st.header(f"Text Complexity >>")
st.caption(f"Higher scores indicate material that is easier to read,lower numbers mark harder-to-read passages:\
– 0-30 College\
– 50-60 High school\
– 60+ Fourth grade")
st.subheader(f"Flesch Reading Ease score is = {flesch_reading_ease(text)}")
# Temporal sentiment
st.subheader(f"Temporal Complexity")
temporal_complexity= pd.DataFrame(columns =['sentence', 'complexity', 'len_sent'])
for sent in tokenized_sent:
complexity = flesch_reading_ease(sent)
temporal_complexity = temporal_complexity.append({'sentence' : sent,
'complexity' :complexity,'len_sent' : len(sent.split())}, ignore_index=True)
temporal_complexity['complexity_stretch'] = (temporal_complexity['complexity'] * temporal_complexity['len_sent']).astype(float)
fig = px.bar(temporal_complexity, x=temporal_complexity.index , y='complexity',
hover_data=['sentence','complexity','complexity_stretch'], color= (temporal_complexity['complexity'] > 30),
color_discrete_map={True: 'green',False: 'red'})
fig.update_layout(autosize=False,width=width*100,height=height*100)
st.plotly_chart(fig)
if __name__ == '__main__':
    m = st.markdown("""<style>div.stButton > button:first-child
        {background-color: #dbe6c4;}
        </style>""", unsafe_allow_html=True)
    st.title("Text Dissection : Analyze your text")
    # width / height are read as globals by every plotting helper above
    st.sidebar.header("Adjust Plot Dimensions")
    width = st.sidebar.slider("Plot Width", 1, 25, 10)
    height = st.sidebar.slider("Plot Height", 1, 25, 7)
    # Input
    st.header("Your Text please..")
    text = get_input()
    tokenized_sent = sent_tokenize(text)
    tokenized_word = word_tokenize(text)
    st.markdown(f"###### Total Sentences in the text = {len(tokenized_sent)}")
    st.markdown(f"###### Total words in the text = {len(tokenized_word)}")
    st.sidebar.title("Analysis Type")
    analysis = st.sidebar.radio("Select Analysis",
                                options=['Wordcloud', 'N-Gram Analysis', 'Sentiment Analysis',
                                         'Named Entity Recognition Analysis', 'Part Of Speech Analysis',
                                         'Text Complexity Analysis', 'Keep Calm!'], index=6)
    if st.button("Complete Analysis"):
        create_wordcloud(text)
        create_ngram(text)
        create_sentiment(text, tokenized_sent)
        create_ner(text)
        create_pos(text)
        create_complexity(text, tokenized_sent)
        analysis = 'Keep Calm!'
    if analysis == 'Wordcloud':
        create_wordcloud(text)
    if analysis == 'N-Gram Analysis':
        create_ngram(text)
    if analysis == 'Sentiment Analysis':
        create_sentiment(text, tokenized_sent)
    if analysis == 'Named Entity Recognition Analysis':
        create_ner(text)
    if analysis == 'Part Of Speech Analysis':
        create_pos(text)
    if analysis == 'Text Complexity Analysis':
        create_complexity(text, tokenized_sent)
    if analysis == 'Keep Calm!':
        st.image('nlp_meme.jpg')