File size: 7,257 Bytes
575adcc 22f6792 575adcc 4628306 575adcc 81c0292 a597086 5c5bce4 a597086 575adcc 4628306 575adcc 4628306 575adcc 4628306 575adcc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 |
import streamlit as st
import pandas as pd
import streamlit.components.v1 as stc
import nltk
# NLP Package-used for text analysis
import nltk
nltk.download('all')
from sumy.parsers.plaintext import PlaintextParser
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from sumy.nlp.tokenizers import Tokenizer
from rouge import Rouge
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import AutoTokenizer, AutoModelForTokenClassification
# from nltk import ne_chunk
from nltk.tag import StanfordNERTagger
from collections import Counter
from textblob import TextBlob
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import base64
import time
# Relative path to the Stanford NER jar file.
stanford_ner_jar_path = 'stanford_model/stanford-ner.jar'
# Path to the pre-trained NER model file
stanford_ner_model_path ='stanford_model/english.all.3class.distsim.crf.ser.gz'
# Timestamp string (YYYYMMDD-HHMMSS) evaluated once, when this statement runs
# at module execution time; it is not refreshed afterwards.
timestr = time.strftime("%Y%m%d-%H%M%S")
# from spacy import displacy
#Text cleaning packages
# removing stopwords, removing special characters, removing URLs, normalizing text, removing HTML tags, correcting common spelling mistakes,
import neattext as nt
import neattext.functions as nfx
# HTML container for rendered results: a horizontally scrollable, red-bordered
# box. ``{}`` is the single ``str.format`` placeholder for the inner content.
# The closing quote of the ``style`` attribute comes after the final semicolon
# so that no stray ``;`` attribute is emitted in the tag.
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid red; border-radius: 0.25rem; padding: 1rem;">{}
</div>
"""
def evaluate_summary(summary, reference):
    """Score a generated summary against a reference text with ROUGE.

    Args:
        summary: The candidate (generated) summary text.
        reference: The reference text the summary is compared with.

    Returns:
        A ``pandas.DataFrame`` built from the first entry of the list
        returned by ``Rouge.get_scores``.
    """
    scorer = Rouge()
    all_scores = scorer.get_scores(summary, reference)
    first_result = all_scores[0]
    return pd.DataFrame(first_result)
def bart_summary(docx):
    """Summarize text with the ``facebook/bart-large-cnn`` checkpoint.

    The model and tokenizer are loaded on every call. The input is encoded as
    a batch of one, truncated to at most 1024 tokens, and decoded with beam
    search (6 beams, ``max_length=100``, early stopping).

    Args:
        docx: The text to summarize.

    Returns:
        The decoded summary string with special tokens removed.
    """
    checkpoint = 'facebook/bart-large-cnn'
    bart_model = BartForConditionalGeneration.from_pretrained(checkpoint)
    bart_tokenizer = BartTokenizer.from_pretrained(checkpoint)

    encoded = bart_tokenizer.batch_encode_plus(
        [docx],
        truncation=True,
        padding='longest',
        max_length=1024,
        return_tensors='pt',
    )
    generated = bart_model.generate(
        encoded['input_ids'],
        num_beams=6,
        max_length=100,
        early_stopping=True,
    )
    # Only one document was encoded, so the first sequence is the summary.
    return bart_tokenizer.decode(generated[0], skip_special_tokens=True)
def T5_summary(docx):
    """Summarize text with the ``t5-base`` checkpoint.

    The model and tokenizer are loaded on every call. The text is prefixed
    with ``"summarize: "`` before encoding, and the summary is generated with
    beam search (4 beams, ``max_length=100``, early stopping).

    Args:
        docx: The text to summarize.

    Returns:
        The decoded summary string with special tokens removed.
    """
    checkpoint = 't5-base'
    t5_model = T5ForConditionalGeneration.from_pretrained(checkpoint)
    t5_tokenizer = T5Tokenizer.from_pretrained(checkpoint)

    # NOTE(review): unlike bart_summary, the input is not truncated here —
    # confirm whether very long documents are expected.
    prompt_ids = t5_tokenizer.encode("summarize: " + docx, return_tensors='pt')
    generated = t5_model.generate(
        prompt_ids,
        max_length=100,
        num_beams=4,
        early_stopping=True,
    )
    return t5_tokenizer.decode(generated[0], skip_special_tokens=True)
def sumy_summarizer(docx, num=5):
    """Build an extractive summary of ``docx`` with sumy's LexRank summarizer.

    Args:
        docx: The plain text to summarize.
        num: Value passed as ``sentences_count`` to the summarizer
            (defaults to 5).

    Returns:
        The selected sentences converted to strings and joined by single
        spaces.
    """
    plaintext_parser = PlaintextParser.from_string(docx, Tokenizer("english"))
    lex_rank = LexRankSummarizer()
    chosen_sentences = lex_rank(plaintext_parser.document, sentences_count=num)
    return ' '.join(str(sentence) for sentence in chosen_sentences)
def sumy_text_summarizer(docx, num=5):
    """Build an extractive summary of ``docx`` with sumy's TextRank summarizer.

    Args:
        docx: The plain text to summarize.
        num: Value passed as ``sentences_count`` to the summarizer
            (defaults to 5).

    Returns:
        The selected sentences converted to strings and joined by single
        spaces.
    """
    plaintext_parser = PlaintextParser.from_string(docx, Tokenizer("english"))
    text_rank = TextRankSummarizer()
    chosen_sentences = text_rank(plaintext_parser.document, sentences_count=num)
    return ' '.join(str(sentence) for sentence in chosen_sentences)
# First letter of a Penn Treebank tag -> WordNet part-of-speech code accepted
# by ``WordNetLemmatizer.lemmatize``. Any other tag falls back to noun ('n').
_WORDNET_POS_BY_TAG_PREFIX = {'J': 'a', 'V': 'v', 'R': 'r'}


def nlp_analysis(text):
    """Tokenize ``text`` and return per-token attributes as a DataFrame.

    Each token is POS-tagged with ``nltk.pos_tag`` and lemmatized with
    ``WordNetLemmatizer``. The POS tag is forwarded to the lemmatizer;
    without it every token is lemmatized as a noun, which leaves verbs
    unchanged and mangles words such as "was".

    Args:
        text: The raw text to analyse.

    Returns:
        A ``pandas.DataFrame`` with one row per token and the columns
        ``Token``, ``Shape``, ``Position``, ``lemma``, ``Contains_Alphabets``
        and ``Contains_Stop_words``. ``Position`` holds the POS tag,
        ``Contains_Alphabets`` is ``str.isalpha()`` of the token, and
        ``Contains_Stop_words`` tells whether the lower-cased token is in
        NLTK's English stop-word list.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    token_data = []
    for token_text, token_pos in tagged_tokens:
        wordnet_pos = _WORDNET_POS_BY_TAG_PREFIX.get(token_pos[:1], 'n')
        token_data.append([
            token_text,
            # NOTE(review): the Shape column is always None — confirm whether
            # a real token shape was meant to be computed here.
            None,
            token_pos,
            lemmatizer.lemmatize(token_text, pos=wordnet_pos),
            token_text.isalpha(),
            token_text.lower() in stop_words,
        ])

    return pd.DataFrame(
        token_data,
        columns=['Token', 'Shape', 'Position', 'lemma', 'Contains_Alphabets', 'Contains_Stop_words'],
    )
def find_entities(text):
    """Run named-entity recognition on ``text`` and return it wrapped in HTML.

    Uses the ``dbmdz/bert-large-cased-finetuned-conll03-english`` checkpoint
    through a transformers ``"ner"`` pipeline. The tokenizer and model are
    loaded on every call.

    Args:
        text: The text to scan for entities.

    Returns:
        ``HTML_WRAPPER`` formatted with the string form of a list of
        ``(word, entity_label)`` tuples, with ``&``, ``<`` and ``>`` escaped.
    """
    # ``pipeline`` is not among the names imported at the top of the file, so
    # it is imported here; without this the call below raises NameError.
    from transformers import pipeline
    import html

    model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    nlp = pipeline("ner", model=model, tokenizer=tokenizer)

    entities = [(entity["word"], entity["entity"]) for entity in nlp(text)]
    # The words come from user-supplied text and end up inside markup, so
    # escape HTML-significant characters (quotes are left as they were).
    return HTML_WRAPPER.format(html.escape(str(entities), quote=False))
def file_download(data):
    """Render a Streamlit download link for ``data`` serialised as CSV.

    The CSV text is base64-encoded and embedded in a ``data:`` URI, so no file
    is written to disk. The suggested filename carries a timestamp taken at
    call time, so downloads made at different times get different names
    instead of all sharing the timestamp of when the module was loaded.

    Args:
        data: Object exposing ``to_csv()`` that returns the CSV text
            (e.g. a ``pandas.DataFrame``).

    Returns:
        None. The heading and link are emitted through ``st.markdown``.
    """
    csv_file = data.to_csv()
    b64 = base64.b64encode(csv_file.encode()).decode()
    new_filename = "result_{}.csv".format(time.strftime("%Y%m%d-%H%M%S"))
    st.markdown('### 🗃️ Download csv file ')
    href = f'<a href="data:file/csv;base64,{b64}" download="{new_filename}"> Click Here! </a>'
    st.markdown(href, unsafe_allow_html=True)
def get_most_common_tokens(text):
    """Count whitespace-separated tokens in ``text``.

    Args:
        text: The text to analyse; it is split with ``str.split()``.

    Returns:
        A dict mapping every distinct token to its count, ordered from most
        to least frequent (ties keep first-occurrence order).
    """
    # most_common() with no argument returns every token. The previous bound,
    # len(text), was a character count that merely happened to be large enough.
    return dict(Counter(text.split()).most_common())
def get_semantics(text):
    """Return TextBlob's sentiment for ``text``.

    Args:
        text: The text to analyse.

    Returns:
        The ``sentiment`` attribute of ``TextBlob(text)``.
    """
    return TextBlob(text).sentiment
def plot_wordcloud(text):
    """Render a word cloud of ``text`` in the Streamlit app.

    Args:
        text: The text from which the word cloud is generated.

    Returns:
        None. The figure is displayed through ``st.pyplot``.
    """
    cloud = WordCloud().generate(text)  # word size indicates its frequency
    fig = plt.figure()
    try:
        plt.imshow(cloud, interpolation='bilinear')
        plt.axis('off')
        st.pyplot(fig)
    finally:
        # pyplot keeps every figure it creates registered until it is closed;
        # close it so repeated calls do not accumulate figures in memory.
        plt.close(fig)
def pos_tags(text):
    """Tag ``text`` with TextBlob and return the tags as a DataFrame.

    Args:
        text: The text to tag.

    Returns:
        A ``pandas.DataFrame`` with the columns ``tokens`` and ``tags``,
        built from ``TextBlob(text).tags``.
    """
    return pd.DataFrame(TextBlob(text).tags, columns=['tokens', 'tags'])
# Maps a part-of-speech tag to the colour name used when highlighting a token.
# NOTE(review): 'darkwhite' and 'darkyellow' are not standard CSS colour
# names, so browsers will likely ignore them — confirm the intended colours.
TAGS = {
    'NN': 'green',
    'NNS': 'green',
    'NNP': 'green',
    'NNPS': 'green',
    'VB': 'blue',
    'VBD': 'blue',
    'VBG': 'blue',
    'VBN': 'blue',
    'VBP': 'blue',
    'VBZ': 'blue',
    'JJ': 'red',
    'JJR': 'red',
    'JJS': 'red',
    'RB': 'cyan',
    'RBR': 'cyan',
    'RBS': 'cyan',
    'IN': 'darkwhite',
    'POS': 'darkyellow',
    # The original listed 'PRP$' twice; the personal-pronoun tag 'PRP' was
    # evidently intended for one of the two entries.
    'PRP': 'magenta',
    'PRP$': 'magenta',
    # 'DET' is kept for backward compatibility; the Penn Treebank determiner
    # tag is 'DT'.
    'DET': 'black',
    'DT': 'black',
    'CC': 'black',
    'CD': 'black',
    'WDT': 'black',
    'WP': 'black',
    'WP$': 'black',
    'WRB': 'black',
    'EX': 'yellow',
    'FW': 'yellow',
    'LS': 'yellow',
    'MD': 'yellow',
    'PDT': 'yellow',
    'RP': 'yellow',
    'SYM': 'yellow',
    'TO': 'yellow',
    'None': 'off',
}


def tag_visualize(tagged_df):
    """Render tagged tokens as colour-coded HTML ``<span>`` elements.

    Args:
        tagged_df: Either an iterable of ``(token, tag)`` pairs or a
            ``pandas.DataFrame`` whose first two columns are token and tag.
            Iterating a DataFrame directly yields its column labels rather
            than its rows, so frames are walked row by row instead.

    Returns:
        A string of concatenated ``<span style=color:...>`` elements, one per
        token whose tag is a key of ``TAGS``. Tokens with other tags are
        omitted. ``&``, ``<`` and ``>`` in tokens are HTML-escaped.
    """
    import html

    if hasattr(tagged_df, 'itertuples'):
        rows = tagged_df.itertuples(index=False, name=None)
    else:
        rows = tagged_df

    colored_text = []
    for row in rows:
        token, tag = row[0], row[1]
        color_of_text = TAGS.get(tag)
        if color_of_text is None:
            continue  # tag has no colour assigned: token is left out
        safe_token = html.escape(str(token), quote=False)
        colored_text.append(
            '<span style=color:{}>{}</span>'.format(color_of_text, safe_token)
        )
    # NOTE(review): spans are joined without a separator, so adjacent words
    # render touching each other — confirm whether ' '.join was intended.
    return ''.join(colored_text)