import streamlit as st
import pandas as pd
import streamlit.components.v1 as stc

import nltk  # NLP package used for text analysis
nltk.download('all')  # downloads every NLTK resource; punkt, stopwords, wordnet and the PoS tagger are the ones used below

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag, StanfordNERTagger
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# from nltk import ne_chunk

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer

from rouge import Rouge
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import T5ForConditionalGeneration, T5Tokenizer

from collections import Counter
from textblob import TextBlob
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import base64
import time

# Text cleaning package: removing stopwords, special characters, URLs and HTML tags,
# normalizing text, correcting common spelling mistakes
import neattext as nt
import neattext.functions as nfx

# from spacy import displacy

# Path to the Stanford NER jar and the pre-trained 3-class NER model
stanford_ner_jar = 'https://huggingface.co/spaces/UjjwalVIT/Text_analysis_and_metadata_app/raw/main/stanford-ner.jar'
stanford_ner_model = 'https://huggingface.co/spaces/UjjwalVIT/Text_analysis_and_metadata_app/raw/main/english.all.3class.distsim.crf.ser.gz'

timestr = time.strftime("%Y%m%d-%H%M%S")

# HTML wrapper used to render tagged/entity output inside the app
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem">{}</div>"""
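# Note: nltk's StanfordNERTagger expects local filesystem paths rather than URLs,
# so the jar and model referenced above typically need to be fetched first.
# A minimal sketch of such a download step follows; the helper name and the
# "cache next to the script" scheme are illustrative assumptions, not part of
# the original app.
import os
import urllib.request

def ensure_local_copy(url):
    """Download the file at `url` once and return its local path (illustrative helper)."""
    local_path = os.path.basename(url)
    if not os.path.exists(local_path):
        urllib.request.urlretrieve(url, local_path)
    return local_path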
""" def evaluate_summary(summary,reference): r=Rouge() eval_score=r.get_scores(summary,reference) eval_score_df=pd.DataFrame(eval_score[0]) return eval_score_df def bart_summary(docx): model=BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn') tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn') inputs = tokenizer.batch_encode_plus([docx], truncation=True, padding='longest', max_length=1024, return_tensors='pt') summary_ids = model.generate(inputs['input_ids'], num_beams=6, max_length=100, early_stopping=True) summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True) return summary def T5_summary(docx): model = T5ForConditionalGeneration.from_pretrained('t5-base') tokenizer = T5Tokenizer.from_pretrained('t5-base') input_text = "summarize: " + docx input_ids = tokenizer.encode(input_text, return_tensors='pt') summary_ids = model.generate(input_ids, max_length=100, num_beams=4, early_stopping=True) summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True) return summary def sumy_summarizer(docx,num=5): parser=PlaintextParser.from_string(docx,Tokenizer("english")) lex_summ=LexRankSummarizer() summary=lex_summ(parser.document,sentences_count= num) summary_list=[str(sentence) for sentence in summary] result=' '.join(summary_list) return result def sumy_text_summarizer(docx, num=5): parser = PlaintextParser.from_string(docx, Tokenizer("english")) text_rank_summarizer = TextRankSummarizer() summary = text_rank_summarizer(parser.document, sentences_count=num) summary_list = [str(sentence) for sentence in summary] result = ' '.join(summary_list) return result def nlp_analysis(text): token_data = [] tokens=word_tokenize(text) tagged_tokens = pos_tag(tokens) #categorize into nouns, verbs, adjectives, adverbs, pronouns etc stop_words = set(stopwords.words('english')) #check for words like a", "an", "the", "is", "in" lemmatizer = WordNetLemmatizer() #preprocessing for token in tagged_tokens: token_text=token[0] token_shape = None token_pos = token[1] # "," - Comma CC - Coordinating conjunction DT - Determiner NN - Noun VBD - Past tense verb PRP - Personal pronoun VBD - Past tense verb token_lemma = lemmatizer.lemmatize(token_text) token_is_alpha = token_text.isalpha() token_is_stop = token_text.lower() in stop_words token_data.append([token_text,token_shape,token_pos,token_lemma,token_is_alpha,token_is_stop]) df=pd.DataFrame(token_data,columns=['Token','Shape','Position','lemma','Contains_Alphabets','Contains_Stop_words']) return df def find_entities(text): stan = StanfordNERTagger(stanford_ner_model, stanford_ner_jar) text=text.replace("\n\n","\n") tokens = nltk.word_tokenize(text) tagged_tokens = stan.tag(tokens) entities = [(token, tag) for token, tag in tagged_tokens if tag != 'O'] entities=HTML_WRAPPER.format(entities) return entities def file_download(data): csv_file= data.to_csv() b64=base64.b64encode(csv_file.encode()).decode() new_filename="result_{}.csv".format(timestr) st.markdown('### 🗃️ Download csv file ') href=f' Click Here! 
    st.markdown(href, unsafe_allow_html=True)

def get_most_common_tokens(text):
    """Return a dict mapping each token to its frequency."""
    word_tokens = Counter(text.split())
    most_common = dict(word_tokens.most_common(len(text)))
    return most_common

def get_semantics(text):
    """Return the TextBlob sentiment (polarity and subjectivity) of the text."""
    blob = TextBlob(text)
    sentiment = blob.sentiment
    return sentiment

def plot_wordcloud(text):
    """Render a word cloud; the size of each word indicates its frequency."""
    text_wordcloud = WordCloud().generate(text)
    fig = plt.figure()
    plt.imshow(text_wordcloud, interpolation='bilinear')
    plt.axis('off')
    st.pyplot(fig)

def pos_tags(text):
    """Part-of-speech tag the text and return the result as a DataFrame."""
    blob = TextBlob(text)
    tagged_text = blob.tags
    tagged_df = pd.DataFrame(tagged_text, columns=['tokens', 'tags'])
    return tagged_df

# Colour assigned to each Penn Treebank tag for the highlighted view
TAGS = {
    'NN': 'green',
    'NNS': 'green',
    'NNP': 'green',
    'NNPS': 'green',
    'VB': 'blue',
    'VBD': 'blue',
    'VBG': 'blue',
    'VBN': 'blue',
    'VBP': 'blue',
    'VBZ': 'blue',
    'JJ': 'red',
    'JJR': 'red',
    'JJS': 'red',
    'RB': 'cyan',
    'RBR': 'cyan',
    'RBS': 'cyan',
    'IN': 'darkwhite',
    'POS': 'darkyellow',
    'PRP': 'magenta',
    'PRP$': 'magenta',
    'DET': 'black',
    'CC': 'black',
    'CD': 'black',
    'WDT': 'black',
    'WP': 'black',
    'WP$': 'black',
    'WRB': 'black',
    'EX': 'yellow',
    'FW': 'yellow',
    'LS': 'yellow',
    'MD': 'yellow',
    'PDT': 'yellow',
    'RP': 'yellow',
    'SYM': 'yellow',
    'TO': 'yellow',
    'None': 'off',
}

def tag_visualize(tagged_df):
    """Wrap each token in a coloured <span> according to its PoS tag."""
    colored_text = []
    for token, tag in tagged_df.itertuples(index=False):
        if tag in TAGS:
            color_of_text = TAGS.get(tag)
            changed_text = '<span style="color:{}">{}</span>'.format(color_of_text, token)
            colored_text.append(changed_text)
    result = ' '.join(colored_text)
    return result
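
# A minimal usage sketch (hypothetical wiring; the actual Streamlit page layout
# lives elsewhere in the app). It assumes a simple text_area-driven flow and only
# exercises the extractive summarizers, the ROUGE helper and the word cloud plot.
def _demo_app():
    st.title("Text Analysis and Summarization (demo)")
    raw_text = st.text_area("Enter text to analyse")
    if st.button("Summarize") and raw_text:
        lex_summary = sumy_summarizer(raw_text)        # LexRank extractive summary
        rank_summary = sumy_text_summarizer(raw_text)  # TextRank extractive summary
        st.write(lex_summary)
        st.write(rank_summary)
        st.dataframe(evaluate_summary(lex_summary, raw_text))  # ROUGE vs. the original text
        plot_wordcloud(raw_text)

if __name__ == "__main__":
    _demo_app()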