import streamlit as st
import pandas as pd
import streamlit.components.v1 as stc

import nltk
# Fetch the NLTK data used below (tokenizers, taggers, wordnet, stopwords);
# 'all' pulls every available package.
nltk.download('all')

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from rouge import Rouge
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import T5ForConditionalGeneration, T5Tokenizer

from nltk.tag import StanfordNERTagger
from collections import Counter
from textblob import TextBlob
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

import base64
import time

# NOTE: StanfordNERTagger expects local paths to the NER .jar and the serialized
# model; these URLs point at hosted copies and must be downloaded (or swapped
# for local paths) before find_entities() can run.
stanford_ner_jar = 'https://github.com/UjjwalBansal19/stanford_model/raw/main/stanford-ner.jar'
stanford_ner_model = 'https://huggingface.co/spaces/UjjwalVIT/Text_analysis_and_metadata_app/raw/main/english.all.3class.distsim.crf.ser.gz'

# Timestamp used to name downloadable result files.
timestr = time.strftime("%Y%m%d-%H%M%S")
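
# A minimal sketch (assumption, not part of the original app) of how the remote
# jar/model could be fetched to local files before building the tagger:
#
#   import urllib.request
#   local_jar = 'stanford-ner.jar'
#   local_model = 'english.all.3class.distsim.crf.ser.gz'
#   urllib.request.urlretrieve(stanford_ner_jar, local_jar)
#   urllib.request.urlretrieve(stanford_ner_model, local_model)
#   stan = StanfordNERTagger(local_model, local_jar)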

import neattext as nt
import neattext.functions as nfx


# HTML wrapper used to display generated markup inside the Streamlit app.
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid red; border-radius: 0.25rem; padding: 1rem;">{}</div>"""


def evaluate_summary(summary, reference):
    # Score a generated summary against the reference text with ROUGE and
    # return the scores as a DataFrame.
    r = Rouge()
    eval_score = r.get_scores(summary, reference)
    eval_score_df = pd.DataFrame(eval_score[0])
    return eval_score_df
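
# Illustrative call (texts are placeholders): the resulting DataFrame has one
# column per ROUGE metric and rows 'r', 'p' and 'f' (recall, precision, F1):
#   scores_df = evaluate_summary("the cat sat", "the cat sat on the mat")
#   st.dataframe(scores_df)   # columns: rouge-1, rouge-2, rouge-l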


def bart_summary(docx):
    # Abstractive summarization with the pretrained BART CNN/DailyMail model.
    model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
    inputs = tokenizer.batch_encode_plus([docx], truncation=True, padding='longest', max_length=1024, return_tensors='pt')
    summary_ids = model.generate(inputs['input_ids'], num_beams=6, max_length=100, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary
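
# Note (suggestion, not in the original code): both transformer summarizers
# reload their weights on every call; in Streamlit they could be wrapped in a
# helper decorated with @st.cache_resource so the models are loaded only once.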


def T5_summary(docx):
    # Abstractive summarization with the pretrained T5-base model; T5 expects a
    # "summarize:" prefix on the input text.
    model = T5ForConditionalGeneration.from_pretrained('t5-base')
    tokenizer = T5Tokenizer.from_pretrained('t5-base')
    input_text = "summarize: " + docx
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    summary_ids = model.generate(input_ids, max_length=100, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


def sumy_summarizer(docx, num=5):
    # Extractive summarization with sumy's LexRank algorithm.
    parser = PlaintextParser.from_string(docx, Tokenizer("english"))
    lex_summ = LexRankSummarizer()
    summary = lex_summ(parser.document, sentences_count=num)
    summary_list = [str(sentence) for sentence in summary]
    result = ' '.join(summary_list)
    return result


def sumy_text_summarizer(docx, num=5):
    # Extractive summarization with sumy's TextRank algorithm.
    parser = PlaintextParser.from_string(docx, Tokenizer("english"))
    text_rank_summarizer = TextRankSummarizer()
    summary = text_rank_summarizer(parser.document, sentences_count=num)
    summary_list = [str(sentence) for sentence in summary]
    result = ' '.join(summary_list)
    return result
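
# Both extractive summarizers share the same signature; illustrative calls
# (raw_text is a placeholder variable):
#   lexrank_summary = sumy_summarizer(raw_text, num=3)
#   textrank_summary = sumy_text_summarizer(raw_text, num=3)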


def nlp_analysis(text):
    # Token-level analysis: POS tag, lemma, alphabetic check and stop-word
    # check for every token in the text.
    token_data = []
    tokens = word_tokenize(text)
    tagged_tokens = pos_tag(tokens)
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    for token_text, token_pos in tagged_tokens:
        token_shape = None  # shape is not computed; column kept as a placeholder
        token_lemma = lemmatizer.lemmatize(token_text)
        token_is_alpha = token_text.isalpha()
        token_is_stop = token_text.lower() in stop_words
        token_data.append([token_text, token_shape, token_pos, token_lemma, token_is_alpha, token_is_stop])
    # The 'Position' column holds the POS tag assigned by pos_tag().
    df = pd.DataFrame(token_data, columns=['Token', 'Shape', 'Position', 'lemma', 'Contains_Alphabets', 'Contains_Stop_words'])
    return df


def find_entities(text):
    # Named-entity recognition with the Stanford NER tagger (3-class model);
    # returns the non-'O' (token, tag) pairs wrapped in HTML_WRAPPER.
    stan = StanfordNERTagger(stanford_ner_model, stanford_ner_jar)
    text = text.replace("\n\n", "\n")
    tokens = nltk.word_tokenize(text)
    tagged_tokens = stan.tag(tokens)
    entities = [(token, tag) for token, tag in tagged_tokens if tag != 'O']
    entities = HTML_WRAPPER.format(entities)
    return entities
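
# The returned string is raw HTML; it is meant to be rendered with something
# like stc.html(...) or st.markdown(..., unsafe_allow_html=True) in the app
# (assumption based on the HTML_WRAPPER template, not shown in this section).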


def file_download(data):
    # Offer a DataFrame as a downloadable CSV via a base64-encoded data URI link.
    csv_file = data.to_csv()
    b64 = base64.b64encode(csv_file.encode()).decode()
    new_filename = "result_{}.csv".format(timestr)
    st.markdown('### 🗃️ Download csv file ')
    href = f'<a href="data:file/csv;base64,{b64}" download="{new_filename}"> Click Here! </a>'
    st.markdown(href, unsafe_allow_html=True)
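
# Note (suggestion, not in the original code): recent Streamlit versions offer
# st.download_button, which could replace the hand-built base64 link, e.g.
#   st.download_button("Download CSV", data.to_csv(), file_name=new_filename)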


def get_most_common_tokens(text):
    # Frequency count of whitespace-separated tokens, most frequent first.
    word_tokens = Counter(text.split())
    # len(text) is at least the number of unique tokens, so this keeps them all.
    most_common = dict(word_tokens.most_common(len(text)))
    return most_common
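
# Illustrative use with the plotting imports above (raw_text is a placeholder):
#   common = get_most_common_tokens(raw_text)
#   top = dict(Counter(common).most_common(20))
#   fig = plt.figure()
#   sns.barplot(x=list(top.keys()), y=list(top.values()))
#   plt.xticks(rotation=45)
#   st.pyplot(fig)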


def get_semantics(text):
    # Sentiment analysis with TextBlob; returns (polarity, subjectivity).
    blob = TextBlob(text)
    sentiment = blob.sentiment
    return sentiment


def plot_wordcloud(text):
    # Render a word cloud of the text inside the Streamlit app.
    text_wordcloud = WordCloud().generate(text)
    fig = plt.figure()
    plt.imshow(text_wordcloud, interpolation='bilinear')
    plt.axis('off')
    st.pyplot(fig)


def pos_tags(text):
    # POS tagging via TextBlob; returns a DataFrame of (token, tag) pairs.
    blob = TextBlob(text)
    tagged_text = blob.tags
    tagged_df = pd.DataFrame(tagged_text, columns=['tokens', 'tags'])
    return tagged_df


# Map Penn Treebank POS tags to display colors for tag_visualize().
TAGS = {
    'NN': 'green',
    'NNS': 'green',
    'NNP': 'green',
    'NNPS': 'green',
    'VB': 'blue',
    'VBD': 'blue',
    'VBG': 'blue',
    'VBN': 'blue',
    'VBP': 'blue',
    'VBZ': 'blue',
    'JJ': 'red',
    'JJR': 'red',
    'JJS': 'red',
    'RB': 'cyan',
    'RBR': 'cyan',
    'RBS': 'cyan',
    'IN': 'gray',
    'POS': 'goldenrod',
    'PRP': 'magenta',
    'PRP$': 'magenta',
    'DT': 'black',
    'CC': 'black',
    'CD': 'black',
    'WDT': 'black',
    'WP': 'black',
    'WP$': 'black',
    'WRB': 'black',
    'EX': 'yellow',
    'FW': 'yellow',
    'LS': 'yellow',
    'MD': 'yellow',
    'PDT': 'yellow',
    'RP': 'yellow',
    'SYM': 'yellow',
    'TO': 'yellow',
    'None': 'off',
}


def tag_visualize(tagged_df):
    # Wrap each token in a colored <span> according to its POS tag and return
    # the tokens joined into a single HTML string.
    colored_text = []
    for token, tag in tagged_df.itertuples(index=False):
        if tag in TAGS:
            color_of_text = TAGS.get(tag)
            changed_text = '<span style="color:{}">{}</span>'.format(color_of_text, token)
            colored_text.append(changed_text)
    result = ' '.join(colored_text)
    return result
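
# Illustrative rendering in the app (assumption, not shown in this section):
#   tagged_df = pos_tags(raw_text)
#   stc.html(HTML_WRAPPER.format(tag_visualize(tagged_df)))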