# Page-scrape residue (not program code): UjjwalVIT's picture / github / 8d2c511 / raw / history blame / 7.17 kB
import streamlit as st
import pandas as pd
import streamlit.components.v1 as stc
import nltk
# NLP Package-used for text analysis
import nltk
# Import-time side effect: asks NLTK to download its complete 'all' data
# collection, so this runs every time the module is executed.
nltk.download('all')
from sumy.parsers.plaintext import PlaintextParser
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from sumy.nlp.tokenizers import Tokenizer
from rouge import Rouge
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import T5ForConditionalGeneration, T5Tokenizer
# from nltk import ne_chunk
from nltk.tag import StanfordNERTagger
from collections import Counter
from textblob import TextBlob
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import base64
import time
# Location of the Stanford NER jar; passed as the second argument to
# StanfordNERTagger in find_entities().
# NOTE(review): both NER locations are URLs; StanfordNERTagger presumably
# expects local file paths -- confirm these resolve at runtime.
stanford_ner_jar = 'https://github.com/UjjwalBansal19/stanford_model/raw/main/stanford-ner.jar'
# Location of the pre-trained 3-class English NER model; passed as the first
# argument to StanfordNERTagger in find_entities().
stanford_ner_model ='https://huggingface.co/spaces/UjjwalVIT/Text_analysis_and_metadata_app/raw/main/english.all.3class.distsim.crf.ser.gz'
# Timestamp (YYYYmmdd-HHMMSS) captured once, when this module is executed.
timestr = time.strftime("%Y%m%d-%H%M%S")
# from spacy import displacy
#Text cleaning packages
# removing stopwords, removing special characters, removing URLs, normalizing text, removing HTML tags, correcting common spelling mistakes,
import neattext as nt
import neattext.functions as nfx
# HTML template for a bordered, horizontally scrollable box; the single `{}`
# placeholder receives the content to display.
# The `;` now sits inside the style attribute's quotes (it used to trail the
# closing quote, leaving a stray token in the tag).
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid red; border-radius: 0.25rem; padding: 1rem;">{}
</div>
"""
def evaluate_summary(summary, reference):
    """Score a generated summary against a reference text with ROUGE.

    Calls ``Rouge().get_scores(summary, reference)`` and returns a DataFrame
    built from the first entry of the returned score list.
    """
    all_scores = Rouge().get_scores(summary, reference)
    first_scores = all_scores[0]
    return pd.DataFrame(first_scores)
def bart_summary(docx):
    """Summarise *docx* with the ``facebook/bart-large-cnn`` checkpoint.

    The model and tokenizer are loaded on every call. The input is truncated
    to at most 1024 tokens, and the summary is generated with 6 beams and a
    maximum length of 100 tokens. Returns the decoded summary string.
    """
    checkpoint = 'facebook/bart-large-cnn'
    model = BartForConditionalGeneration.from_pretrained(checkpoint)
    tokenizer = BartTokenizer.from_pretrained(checkpoint)

    encoded = tokenizer.batch_encode_plus(
        [docx],
        truncation=True,
        padding='longest',
        max_length=1024,
        return_tensors='pt',
    )
    generated = model.generate(
        encoded['input_ids'],
        num_beams=6,
        max_length=100,
        early_stopping=True,
    )
    # Only one document was encoded, so only the first sequence is decoded.
    return tokenizer.decode(generated[0], skip_special_tokens=True)
def T5_summary(docx):
    """Summarise *docx* with the ``t5-base`` checkpoint.

    The model and tokenizer are loaded on every call. The text is prefixed
    with ``"summarize: "`` and encoded without truncation; the summary is
    generated with 4 beams and a maximum length of 100 tokens. Returns the
    decoded summary string.
    """
    checkpoint = 't5-base'
    model = T5ForConditionalGeneration.from_pretrained(checkpoint)
    tokenizer = T5Tokenizer.from_pretrained(checkpoint)

    prompt_ids = tokenizer.encode("summarize: " + docx, return_tensors='pt')
    generated = model.generate(
        prompt_ids,
        max_length=100,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)
def sumy_summarizer(docx, num=5):
    """Return an extractive LexRank summary of *docx*.

    The text is parsed with sumy's English tokenizer, ``num`` sentences are
    requested from ``LexRankSummarizer``, and the selected sentences are
    joined with single spaces.
    """
    document = PlaintextParser.from_string(docx, Tokenizer("english")).document
    lex_rank = LexRankSummarizer()
    chosen_sentences = lex_rank(document, sentences_count=num)
    return ' '.join(map(str, chosen_sentences))
def sumy_text_summarizer(docx, num=5):
    """Return an extractive TextRank summary of *docx*.

    The text is parsed with sumy's English tokenizer, ``num`` sentences are
    requested from ``TextRankSummarizer``, and the selected sentences are
    joined with single spaces.
    """
    parsed = PlaintextParser.from_string(docx, Tokenizer("english"))
    summarize = TextRankSummarizer()
    pieces = []
    for sentence in summarize(parsed.document, sentences_count=num):
        pieces.append(str(sentence))
    return ' '.join(pieces)
def nlp_analysis(text):
    """Build a per-token analysis table for *text*.

    The text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``.
    Returns a DataFrame with one row per token and these columns:

    * ``Token`` -- the token text.
    * ``Shape`` -- always ``None`` (no shape is computed).
    * ``Position`` -- the part-of-speech tag returned by ``pos_tag``.
    * ``lemma`` -- the WordNet lemma of the token.
    * ``Contains_Alphabets`` -- ``token.isalpha()``.
    * ``Contains_Stop_words`` -- whether the lower-cased token is an English
      stop word.
    """
    # First letter of a Penn Treebank tag -> WordNet POS code. The lemmatizer
    # treats every word as a noun unless told otherwise, which leaves verbs
    # and adjectives un-lemmatized, so the tag we already have is passed on.
    wordnet_pos_by_prefix = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    token_data = []
    for token_text, token_pos in pos_tag(word_tokenize(text)):
        # Unmapped tags fall back to 'n', the lemmatizer's own default.
        wordnet_pos = wordnet_pos_by_prefix.get(token_pos[:1], 'n')
        token_data.append([
            token_text,
            None,
            token_pos,
            lemmatizer.lemmatize(token_text, pos=wordnet_pos),
            token_text.isalpha(),
            token_text.lower() in stop_words,
        ])

    return pd.DataFrame(
        token_data,
        columns=['Token', 'Shape', 'Position', 'lemma', 'Contains_Alphabets', 'Contains_Stop_words'],
    )
def find_entities(text):
    """Return the named entities found in *text*, wrapped in ``HTML_WRAPPER``.

    The text is tokenized with ``nltk.word_tokenize`` and tagged with a
    ``StanfordNERTagger`` built from the module-level model and jar
    locations. Tokens tagged ``'O'`` are dropped. The remaining
    ``(token, tag)`` pairs are rendered as the string form of a Python list,
    HTML-escaped, and inserted into ``HTML_WRAPPER``.
    """
    import html  # local import: keeps this block self-contained

    tagger = StanfordNERTagger(stanford_ner_model, stanford_ner_jar)
    tokens = nltk.word_tokenize(text.replace("\n\n", "\n"))
    entities = [(token, tag) for token, tag in tagger.tag(tokens) if tag != 'O']
    # Tokens come from user-supplied text, so escape markup characters before
    # the result is embedded in HTML. Quotes are left alone so the list
    # representation stays readable.
    return HTML_WRAPPER.format(html.escape(str(entities), quote=False))
def file_download(data):
    """Render a Streamlit download link for *data* as a CSV file.

    *data* must provide ``to_csv()`` (e.g. a pandas DataFrame). The CSV text
    is base64-encoded into a ``data:`` URI and written to the page as an
    ``<a download=...>`` link under a markdown heading. Returns ``None``.
    """
    csv_text = data.to_csv()
    b64 = base64.b64encode(csv_text.encode()).decode()
    # Stamp the filename at call time, so each download is named for when it
    # was generated rather than for when the module was first loaded.
    new_filename = "result_{}.csv".format(time.strftime("%Y%m%d-%H%M%S"))
    st.markdown('### 🗃️ Download csv file ')
    # text/csv is the registered media type for CSV content.
    href = f'<a href="data:text/csv;base64,{b64}" download="{new_filename}"> Click Here! </a>'
    st.markdown(href, unsafe_allow_html=True)
def get_most_common_tokens(text):
    """Return a dict mapping each whitespace-separated token to its count.

    Entries are ordered from most to least frequent; tokens with equal counts
    keep their first-seen order. Empty or whitespace-only text gives ``{}``.
    """
    # most_common() with no argument returns every token. The previous limit
    # was the character count of the text, which never truncated anything.
    return dict(Counter(text.split()).most_common())
def get_semantics(text):
    """Return the ``sentiment`` attribute of a ``TextBlob`` built from *text*."""
    return TextBlob(text).sentiment
def plot_wordcloud(text):
    """Generate a word cloud from *text* and render it in the Streamlit page.

    The cloud is drawn on a new matplotlib figure with the axes hidden and
    passed to ``st.pyplot``. Returns ``None``.
    """
    cloud = WordCloud().generate(text)
    fig, ax = plt.subplots()
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')
    st.pyplot(fig)
    # pyplot keeps every figure it creates alive until it is closed, so
    # release this one once rendered to avoid accumulating figures.
    plt.close(fig)
def pos_tags(text):
    """Return the ``TextBlob(text).tags`` pairs as a two-column DataFrame.

    The columns are named ``tokens`` and ``tags``.
    """
    return pd.DataFrame(TextBlob(text).tags, columns=['tokens', 'tags'])
# Part-of-speech tag -> CSS colour used by tag_visualize() to colour tokens.
# Tags absent from this table are skipped by tag_visualize().
TAGS = {
    # Nouns
    'NN': 'green',
    'NNS': 'green',
    'NNP': 'green',
    'NNPS': 'green',
    # Verbs
    'VB': 'blue',
    'VBD': 'blue',
    'VBG': 'blue',
    'VBN': 'blue',
    'VBP': 'blue',
    'VBZ': 'blue',
    # Adjectives
    'JJ': 'red',
    'JJR': 'red',
    'JJS': 'red',
    # Adverbs
    'RB': 'cyan',
    'RBR': 'cyan',
    'RBS': 'cyan',
    # 'darkwhite' and 'darkyellow' are not CSS colour names, so they are
    # replaced with valid named colours.
    'IN': 'gray',
    'POS': 'goldenrod',
    # Pronouns: 'PRP$' used to be listed twice, which left 'PRP' unmapped.
    'PRP': 'magenta',
    'PRP$': 'magenta',
    # Determiners: 'DT' is the Penn Treebank tag; 'DET' is kept for
    # backward compatibility.
    'DT': 'black',
    'DET': 'black',
    'CC': 'black',
    'CD': 'black',
    'WDT': 'black',
    'WP': 'black',
    'WP$': 'black',
    'WRB': 'black',
    'EX': 'yellow',
    'FW': 'yellow',
    'LS': 'yellow',
    'MD': 'yellow',
    'PDT': 'yellow',
    'RP': 'yellow',
    'SYM': 'yellow',
    'TO': 'yellow',
    # Kept unchanged from the original table.
    'None': 'off',
}
def tag_visualize(tagged_df):
    """Render tagged tokens as colour-coded HTML ``<span>`` elements.

    *tagged_df* may be a DataFrame whose first two columns are token and tag
    (as returned by ``pos_tags``), or any iterable of ``(token, tag)`` pairs.
    Tokens whose tag is not a key of ``TAGS`` are skipped. Returns the spans
    joined by single spaces, or ``''`` when nothing matches.
    """
    import html  # local import: keeps this block self-contained

    if isinstance(tagged_df, pd.DataFrame):
        # Iterating a DataFrame directly yields column names, not rows.
        pairs = tagged_df.itertuples(index=False, name=None)
    else:
        pairs = tagged_df

    colored_text = [
        # Escape the token so markup in user text is not injected into the page.
        '<span style=color:{}>{}</span>'.format(TAGS[pair[1]], html.escape(str(pair[0])))
        for pair in pairs
        if pair[1] in TAGS
    ]
    # Join with spaces so adjacent words do not run together when rendered.
    return ' '.join(colored_text)