import streamlit as st
import pandas as pd
import streamlit.components.v1 as stc

# NLP packages used for text analysis
import nltk
nltk.download('all')  # heavyweight: fetches every NLTK resource (punkt, stopwords, wordnet, ...)
from sumy.parsers.plaintext import PlaintextParser
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from sumy.nlp.tokenizers import Tokenizer
from rouge import Rouge
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import T5ForConditionalGeneration, T5Tokenizer

# from nltk import ne_chunk
from nltk.tag import StanfordNERTagger

from collections import Counter

from textblob import TextBlob
import seaborn as sns
import matplotlib.pyplot as plt

from wordcloud import WordCloud

import base64
import time

# NLTK's StanfordNERTagger expects local file paths, so the jar and model below
# should be downloaded from these URLs before the tagger is used.
stanford_ner_jar = 'https://huggingface.co/spaces/UjjwalVIT/Text_analysis_and_metadata_app/raw/main/stanford-ner.jar'
# Path to the pre-trained NER model file
stanford_ner_model = 'https://huggingface.co/spaces/UjjwalVIT/Text_analysis_and_metadata_app/raw/main/english.all.3class.distsim.crf.ser.gz'

timestr = time.strftime("%Y%m%d-%H%M%S")


# from spacy import displacy


# Text cleaning packages: remove stopwords, special characters, URLs and HTML tags,
# normalize text, and correct common spelling mistakes
import neattext as nt
import neattext.functions as nfx


HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid red; border-radius: 0.25rem; padding: 1rem;">{}
</div>
"""

def evaluate_summary(summary, reference):
    # Score the generated summary against the reference text with ROUGE
    r = Rouge()
    eval_score = r.get_scores(summary, reference)
    eval_score_df = pd.DataFrame(eval_score[0])
    return eval_score_df
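
# Illustrative usage (the variable names here are assumptions, not part of the app):
#   scores_df = evaluate_summary(model_summary, reference_summary)
# The resulting DataFrame has one column per metric (rouge-1, rouge-2, rouge-l)
# and rows f, p and r for F1, precision and recall.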


def bart_summary(docx):
    # Abstractive summarization with BART; the model is re-loaded on every call,
    # which is simple but slow for repeated use
    model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
    inputs = tokenizer.batch_encode_plus([docx], truncation=True, padding='longest', max_length=1024, return_tensors='pt')
    summary_ids = model.generate(inputs['input_ids'], num_beams=6, max_length=100, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

def T5_summary(docx):
    # T5 is a text-to-text model, so the task is selected with the "summarize: " prefix
    model = T5ForConditionalGeneration.from_pretrained('t5-base')
    tokenizer = T5Tokenizer.from_pretrained('t5-base')
    input_text = "summarize: " + docx
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    summary_ids = model.generate(input_ids, max_length=100, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary
    
def sumy_summarizer(docx, num=5):
    # Extractive summarization: LexRank selects the `num` most central sentences
    parser = PlaintextParser.from_string(docx, Tokenizer("english"))
    lex_summ = LexRankSummarizer()
    summary = lex_summ(parser.document, sentences_count=num)
    summary_list = [str(sentence) for sentence in summary]
    result = ' '.join(summary_list)
    return result

def sumy_text_summarizer(docx, num=5):
    # Extractive summarization with TextRank (graph-based sentence ranking)
    parser = PlaintextParser.from_string(docx, Tokenizer("english"))
    text_rank_summarizer = TextRankSummarizer()
    summary = text_rank_summarizer(parser.document, sentences_count=num)
    summary_list = [str(sentence) for sentence in summary]
    result = ' '.join(summary_list)
    return result
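
# Unlike the BART/T5 helpers above, these two are extractive: every sentence in
# the output appears verbatim in the input. Illustrative comparison (reusing the
# hypothetical `article` from the earlier sketch):
#   sumy_summarizer(article, num=3)       # top 3 sentences by LexRank centrality
#   sumy_text_summarizer(article, num=3)  # top 3 sentences by TextRank score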


def nlp_analysis(text):
    # Build a per-token table: POS tag, lemma, alphabetic flag and stopword flag
    token_data = []
    tokens = word_tokenize(text)
    tagged_tokens = pos_tag(tokens)  # Penn Treebank tags, e.g. NN noun, VBD past-tense verb, DT determiner, CC conjunction, PRP pronoun
    stop_words = set(stopwords.words('english'))  # function words such as "a", "an", "the", "is", "in"
    lemmatizer = WordNetLemmatizer()
    for token_text, token_pos in tagged_tokens:
        token_shape = None  # placeholder column; token shape is not computed here
        token_lemma = lemmatizer.lemmatize(token_text)
        token_is_alpha = token_text.isalpha()
        token_is_stop = token_text.lower() in stop_words
        token_data.append([token_text, token_shape, token_pos, token_lemma, token_is_alpha, token_is_stop])
    df = pd.DataFrame(token_data, columns=['Token', 'Shape', 'PoS', 'Lemma', 'Is_Alpha', 'Is_Stopword'])
    return df
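
# Illustrative output (the tags shown are what NLTK's tagger typically assigns):
#   nlp_analysis("dogs barked loudly")
#   Token   Shape  PoS  Lemma   Is_Alpha  Is_Stopword
#   dogs    None   NNS  dog     True      False
#   barked  None   VBD  barked  True      False
#   loudly  None   RB   loudly  True      False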


def find_entities(text):
    # Tag tokens with the Stanford NER model and keep the named entities
    # ('O' marks tokens that fall outside any entity)
    stan = StanfordNERTagger(stanford_ner_model, stanford_ner_jar)
    text = text.replace("\n\n", "\n")
    tokens = nltk.word_tokenize(text)
    tagged_tokens = stan.tag(tokens)
    entities = [(token, tag) for token, tag in tagged_tokens if tag != 'O']
    entities = HTML_WRAPPER.format(entities)
    return entities


def file_download(data):
    # Serialize the DataFrame to CSV and expose it as a base64 data-URI download link
    csv_file = data.to_csv()
    b64 = base64.b64encode(csv_file.encode()).decode()
    new_filename = "result_{}.csv".format(timestr)
    st.markdown('### 🗃️ Download csv file ')
    href = f'<a href="data:file/csv;base64,{b64}" download="{new_filename}"> Click Here! </a>'
    st.markdown(href, unsafe_allow_html=True)

def get_most_common_tokens(text):
    # Count whitespace-separated tokens; most_common() with no argument
    # returns every token sorted by descending frequency
    word_tokens = Counter(text.split())
    most_common = dict(word_tokens.most_common())
    return most_common
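
# Illustrative: get_most_common_tokens("to be or not to be")
#   -> {'to': 2, 'be': 2, 'or': 1, 'not': 1}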


def get_semantics(text):
    # TextBlob sentiment: polarity in [-1, 1], subjectivity in [0, 1]
    blob = TextBlob(text)
    sentiment = blob.sentiment
    return sentiment
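
# Illustrative: get_semantics("This app is great") returns a namedtuple along
# the lines of Sentiment(polarity=0.8, subjectivity=0.75); polarity > 0 reads
# as positive tone, and higher subjectivity means more opinionated text.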

def plot_wordcloud(text):
    # Word size in the cloud reflects term frequency in the text
    text_wordcloud = WordCloud().generate(text)
    fig = plt.figure()
    plt.imshow(text_wordcloud, interpolation='bilinear')
    plt.axis('off')
    st.pyplot(fig)

def pos_tags(text):
    # TextBlob POS tagging; returns one (token, tag) row per token
    blob = TextBlob(text)
    tagged_text = blob.tags
    tagged_df = pd.DataFrame(tagged_text, columns=['tokens', 'tags'])
    return tagged_df

# Map Penn Treebank POS tags to CSS colours for highlighting:
# nouns green, verbs blue, adjectives red, adverbs cyan, pronouns magenta
TAGS = {
    'NN'   : 'green',
    'NNS'  : 'green',
    'NNP'  : 'green',
    'NNPS' : 'green',
    'VB'   : 'blue',
    'VBD'  : 'blue',
    'VBG'  : 'blue',
    'VBN'  : 'blue',
    'VBP'  : 'blue',
    'VBZ'  : 'blue',
    'JJ'   : 'red',
    'JJR'  : 'red',
    'JJS'  : 'red',
    'RB'   : 'cyan',
    'RBR'  : 'cyan',
    'RBS'  : 'cyan',
    'IN'   : 'gray',       # 'darkwhite' is not a valid CSS colour
    'POS'  : 'goldenrod',  # 'darkyellow' is not a valid CSS colour
    'PRP'  : 'magenta',    # personal pronoun (was a duplicate 'PRP$' entry)
    'PRP$' : 'magenta',    # possessive pronoun
    'DT'   : 'black',      # Penn Treebank uses 'DT', not 'DET'
    'CC'   : 'black',
    'CD'   : 'black',
    'WDT'  : 'black',
    'WP'   : 'black',
    'WP$'  : 'black',
    'WRB'  : 'black',
    'EX'   : 'yellow',
    'FW'   : 'yellow',
    'LS'   : 'yellow',
    'MD'   : 'yellow',
    'PDT'  : 'yellow',
    'RP'   : 'yellow',
    'SYM'  : 'yellow',
    'TO'   : 'yellow',
}

def tag_visualize(tagged_df):
    # Wrap each token in a coloured <span> according to its POS tag.
    # Iterate over rows (iterating a DataFrame directly yields column names, not rows).
    colored_text = []
    for token, tag in tagged_df.itertuples(index=False):
        if tag in TAGS:
            color_of_text = TAGS[tag]
            changed_text = '<span style="color:{}">{}</span>'.format(color_of_text, token)
            colored_text.append(changed_text)
    result = ' '.join(colored_text)
    return result
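
# Illustrative wiring in the Streamlit UI (assumed, not original code):
#   tagged_df = pos_tags(raw_text)
#   html = tag_visualize(tagged_df)
#   stc.html(HTML_WRAPPER.format(html), scrolling=True)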