UjjwalVIT committed
Commit 575adcc
1 Parent(s): be562eb

Upload 4 files

Files changed (4)
  1. Metadata.py +317 -0
  2. Text_analysis.py +179 -0
  3. app.py +125 -0
  4. app_utils.py +202 -0
Metadata.py ADDED
@@ -0,0 +1,317 @@
import streamlit as st
import streamlit.components.v1 as stc
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from PIL import Image
import exifread  # extracts metadata from images
import os
from datetime import datetime

import mutagen  # extracts metadata from audio
from PIL.ExifTags import TAGS, GPSTAGS
import base64
import time
import sqlite3
from PyPDF2 import PdfReader

timestr = time.strftime("%Y%m%d-%H%M%S")

details = """
Metadata is data that provides information about one or more aspects of other data.
It summarizes basic information about data, which makes tracking and working with
specific data easier.
"""

HTML_BANNER = """
<div style="background-color:violet;padding:10px;border-radius:10px">
<h1 style="color:white;text-align:center;">MetaData Extractor App</h1>
</div>
"""


def file_download(data):
    # Serve a DataFrame as a base64-encoded CSV download link
    csv_file = data.to_csv()
    b64 = base64.b64encode(csv_file.encode()).decode()
    new_filename = "result_{}.csv".format(timestr)
    st.markdown('### 🗃️ Download CSV file')
    href = f'<a href="data:file/csv;base64,{b64}" download="{new_filename}">Click Here!</a>'
    st.markdown(href, unsafe_allow_html=True)


conn = sqlite3.connect('data.db')
c = conn.cursor()


def create_filestable():
    c.execute('CREATE TABLE IF NOT EXISTS filestable(filename TEXT, filetype TEXT, filesize TEXT, uploadDate TIMESTAMP)')


def add_file_details(filename, filetype, filesize, uploadDate):
    c.execute('INSERT INTO filestable(filename, filetype, filesize, uploadDate) VALUES (?, ?, ?, ?)',
              (filename, filetype, filesize, uploadDate))
    conn.commit()


def view_all_data():
    c.execute('SELECT * FROM filestable')
    return c.fetchall()


def load_image(file):
    return Image.open(file)


def get_readable_time(timestamp):
    return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d-%H:%M')


def get_exif(filename):
    # Map numeric EXIF tag IDs to readable names; build new dicts instead of
    # mutating the mapping while iterating over it.
    raw_exif = Image.open(filename).getexif()
    exif = {TAGS.get(key, key): value for key, value in raw_exif.items()}
    gps_ifd = raw_exif.get_ifd(0x8825)  # the GPSInfo IFD
    if gps_ifd:
        exif['GPSInfo'] = {GPSTAGS.get(key, key): value for key, value in gps_ifd.items()}
    return exif


def metadata():
    stc.html(HTML_BANNER)
    menu = ['Home', 'Image', 'Audio', 'Document_Files', 'Analytics']
    choice = st.sidebar.selectbox('Menu', menu)
    create_filestable()
    if choice == 'Home':
        st.image(load_image('extraction_process.png'))
        st.write(details)
        col1, col2, col3 = st.columns(3)
        with col1:
            with st.expander("Get Image Metadata 📷"):
                st.info("Image Metadata")
                st.markdown("📷")
                st.text("Upload JPEG, JPG, PNG images")

        with col2:
            with st.expander("Get Audio Metadata 🔉"):
                st.info("Audio Metadata")
                st.markdown("🔉")
                st.text("Upload MP3, OGG")

        with col3:
            with st.expander("Get Document Metadata 📄📁"):
                st.info("Document Files Metadata")
                st.markdown("📄📁")
                st.text("Upload PDF, DOCX")

    elif choice == 'Image':
        st.subheader('Image MetaData Extractor')
        image_file = st.file_uploader("Upload Image", type=["png", "jpg", "jpeg"])
        if image_file is not None:
            with st.expander('File Stats'):
                # NOTE: readable() returns a bool, so os.stat() here stats
                # file descriptor 1; Streamlit uploads are held in memory and
                # have no filesystem path of their own.
                statinfo = os.stat(image_file.readable())
                full_details = {
                    'Filename': image_file.name,
                    'Filesize': image_file.size,
                    'Filetype': image_file.type,
                    'Accessed Time': get_readable_time(statinfo.st_atime),
                    'Creation Time': get_readable_time(statinfo.st_ctime),
                    'Modified Time': get_readable_time(statinfo.st_mtime),
                }
                file_details_df = pd.DataFrame(
                    list(full_details.items()), columns=["Meta Tags", "Value"]
                )
                st.dataframe(file_details_df)
            c1, c2 = st.columns(2)
            with c1:
                with st.expander("View Image"):
                    img = load_image(image_file)
                    st.image(img, width=250)
            with c2:
                with st.expander("Default (JPEG)"):
                    st.info("Using Pillow")
                    img = load_image(image_file)
                    img_details = {
                        "format": img.format,
                        "format_desc": img.format_description,
                        "filename": image_file.name,
                        "size": img.size,
                        "height": img.height,
                        "width": img.width,
                        "info": img.info,
                    }
                    df_img_details = pd.DataFrame(
                        list(img_details.items()), columns=["Meta Tags", "Value"]
                    )
                    st.dataframe(df_img_details)

            c3, c4 = st.columns(2)
            with c3:
                with st.expander('Using ExifRead Tool'):
                    image_file.seek(0)  # rewind: Pillow has already consumed the stream
                    meta_data = exifread.process_file(image_file)
                    meta_data_df = pd.DataFrame(
                        list(meta_data.items()), columns=['Meta Data', 'Values'])
                    st.dataframe(meta_data_df)
            with c4:
                with st.expander('Image Geo-Coordinates'):
                    image_file.seek(0)
                    img_gps_details = get_exif(image_file)
                    gps_info = img_gps_details.get('GPSInfo', 'None Found')
                    st.write(gps_info)
                    if isinstance(gps_info, dict):
                        st.write(gps_info.get('GPSLatitude'))
                        st.write(gps_info.get('GPSLongitude'))

            add_file_details(image_file.name, image_file.type, image_file.size, datetime.now())
            with st.expander('Download Results'):
                final_df = pd.concat([file_details_df, df_img_details, meta_data_df])
                st.dataframe(final_df)
                file_download(final_df)

    elif choice == 'Audio':
        st.subheader('Audio MetaData Extractor')
        audio_file = st.file_uploader("Upload Audio", type=["mp3", "ogg"])

        if audio_file is not None:
            col1, col2 = st.columns(2)

            with col1:
                st.audio(audio_file.read())
                audio_file.seek(0)  # rewind so mutagen can parse the stream below

            with col2:
                with st.expander("File Stats"):
                    file_details = {
                        "FileName": audio_file.name,
                        "FileSize": audio_file.size,
                        "FileType": audio_file.type,
                    }
                    add_file_details(audio_file.name, audio_file.type, audio_file.size, datetime.now())
                    st.write(file_details)

                    statinfo = os.stat(audio_file.readable())  # stats fd 1; see the note above
                    stats_details = {
                        "Accessed_Time": get_readable_time(statinfo.st_atime),
                        "Creation_Time": get_readable_time(statinfo.st_ctime),
                        "Modified_Time": get_readable_time(statinfo.st_mtime),
                    }
                    st.write(stats_details)

                    file_details_combined = {**file_details, **stats_details}
                    df_file_details = pd.DataFrame(
                        list(file_details_combined.items()),
                        columns=["Meta Tags", "Value"],
                    )
                    st.dataframe(df_file_details)

            with st.expander('Metadata using Mutagen'):
                meta_data = mutagen.File(audio_file)
                meta_data_dict = {str(key): str(value) for key, value in meta_data.items()}
                meta_data_audio_df = pd.DataFrame(
                    list(meta_data_dict.items()), columns=['Tag', 'Values'])
                st.dataframe(meta_data_audio_df)
            with st.expander("Download Results"):
                combined_df = pd.concat([df_file_details, meta_data_audio_df])
                st.dataframe(combined_df)
                file_download(combined_df)

    elif choice == 'Document_Files':
        st.subheader('Document MetaData Extractor')
        text_file = st.file_uploader("Upload File", type=["pdf"])
        if text_file is not None:
            col1, col2 = st.columns([1, 2])

            with col1:
                with st.expander("File Stats"):
                    file_details = {
                        "FileName": text_file.name,
                        "FileSize": text_file.size,
                        "FileType": text_file.type,
                    }
                    add_file_details(text_file.name, text_file.type, text_file.size, datetime.now())
                    st.write(file_details)

                    statinfo = os.stat(text_file.readable())  # stats fd 1; see the note above
                    stats_details = {
                        "Accessed_Time": get_readable_time(statinfo.st_atime),
                        "Creation_Time": get_readable_time(statinfo.st_ctime),
                        "Modified_Time": get_readable_time(statinfo.st_mtime),
                    }
                    st.write(stats_details)

                    # Combine all details and convert to a DataFrame
                    file_details_combined = {**file_details, **stats_details}
                    df_file_details = pd.DataFrame(
                        list(file_details_combined.items()),
                        columns=["Meta Tags", "Value"],
                    )
            with col2:
                with st.expander("Metadata"):
                    pdf_file = PdfReader(text_file)
                    pdf_info = pdf_file.metadata
                    df_file_details_with_pdf = pd.DataFrame(
                        list(pdf_info.items()), columns=["Meta Tags", "Value"]
                    )
                    st.dataframe(df_file_details_with_pdf)

            with st.expander("Download Results"):
                pdf_combined_df = pd.concat([df_file_details, df_file_details_with_pdf])
                st.dataframe(pdf_combined_df)
                file_download(pdf_combined_df)

    elif choice == 'Analytics':
        st.subheader('Analytics')
        uploaded_files = view_all_data()
        df = pd.DataFrame(uploaded_files, columns=['Filename', 'Filetype', 'Filesize', 'UploadDate'])
        with st.expander('Monitor'):
            st.success('View all uploaded files')
            st.dataframe(df)
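
The GPS values that get_exif() returns are raw EXIF rationals (degrees, minutes, seconds) rather than decimal coordinates. A minimal conversion sketch, illustrative and not part of the committed files (to_decimal_degrees is a hypothetical helper):

def to_decimal_degrees(dms, ref):
    # EXIF stores latitude/longitude as (degrees, minutes, seconds) rationals,
    # with a separate 'N'/'S' or 'E'/'W' reference tag.
    degrees, minutes, seconds = (float(x) for x in dms)
    decimal = degrees + minutes / 60 + seconds / 3600
    return -decimal if ref in ('S', 'W') else decimal

# Example usage, assuming the image actually carries GPS tags:
# gps = get_exif('photo.jpg').get('GPSInfo', {})
# lat = to_decimal_degrees(gps['GPSLatitude'], gps['GPSLatitudeRef'])
# lon = to_decimal_degrees(gps['GPSLongitude'], gps['GPSLongitudeRef'])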
Text_analysis.py ADDED
@@ -0,0 +1,179 @@
import streamlit as st
import pandas as pd
import streamlit.components.v1 as stc
import docx2txt

# NLP packages used for text analysis
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tag import StanfordNERTagger

from collections import Counter

from textblob import TextBlob
import seaborn as sns
import matplotlib.pyplot as plt

from wordcloud import WordCloud

import base64
import time
from app_utils import *

HTML_BANNER = """
<div style="background-color:green;padding:10px;border-radius:10px">
<h1 style="color:white;text-align:center;">Text Analysis App</h1>
</div>
"""


def text_analysis():
    stc.html(HTML_BANNER)
    menu = ['Text-analysis', 'Upload_Files']

    choice = st.sidebar.selectbox('Menu', menu)
    if choice == 'Text-analysis':
        st.subheader('Analyse Text')
        text = st.text_area("Enter the text to analyze")
        if st.button("Analyze"):
            st.success("Success")
            with st.expander('Original Text'):
                st.write(text)
            with st.expander('Text Analysis'):
                token_analysis = nlp_analysis(text)
                st.dataframe(token_analysis)
            with st.expander('Entities'):
                entity_result = find_entities(text)
                stc.html(entity_result, height=100, scrolling=True)

            col1, col2 = st.columns(2)

            with col1:
                with st.expander("Word Stats"):
                    st.info("Word Statistics")
                    docx = nt.TextFrame(text)
                    st.write(docx.word_stats())

                with st.expander("Top Keywords"):
                    keywords = get_most_common_tokens(text)
                    st.write(keywords)

                with st.expander('Tagged Keywords'):
                    # pos_tag expects a token list, not a raw string
                    data = pos_tag(word_tokenize(text))
                    st.dataframe(data)
                    visualize_tags = tag_visualize(data)
                    stc.html(visualize_tags, scrolling=True)

                with st.expander("Sentiment"):
                    sent_result = get_semantics(text)
                    st.write(sent_result)

            with col2:
                with st.expander("Plot Word Freq"):
                    try:
                        fig, ax = plt.subplots()
                        most_common_tokens = dict(token_analysis["Token"].value_counts())
                        sns.countplot(data=token_analysis[token_analysis["Token"].isin(most_common_tokens)], x="Token", ax=ax)
                        ax.set_xlabel('Token')
                        ax.set_ylabel('Frequency')
                        ax.tick_params(axis='x', rotation=45)
                        st.pyplot(fig)
                    except Exception:
                        st.warning('Insufficient data')

                with st.expander("Plot Part of Speech"):
                    try:
                        fig, ax = plt.subplots()
                        most_common_tokens = dict(token_analysis["Position"].value_counts())
                        sns.countplot(data=token_analysis[token_analysis["Position"].isin(most_common_tokens)], x="Position", ax=ax)
                        ax.set_xlabel('PoS')
                        ax.set_ylabel('Frequency')
                        ax.tick_params(axis='x', rotation=45)
                        st.pyplot(fig)
                    except Exception:
                        st.warning('Insufficient data')

                with st.expander("Plot Word Cloud"):
                    try:
                        plot_wordcloud(text)
                    except Exception:
                        st.warning('Insufficient data')

            with st.expander('Download Results'):
                file_download(token_analysis)

    elif choice == 'Upload_Files':
        text_file = st.file_uploader('Upload Files', type=['docx', 'txt'])
        if text_file is not None:
            if text_file.type == 'text/plain':
                text = str(text_file.read(), "utf-8")
            else:
                text = docx2txt.process(text_file)

            if st.button("Analyze"):
                with st.expander('Original Text'):
                    st.write(text)
                with st.expander('Text Analysis'):
                    token_analysis = nlp_analysis(text)
                    st.dataframe(token_analysis)
                with st.expander('Entities'):
                    entity_result = find_entities(text)
                    stc.html(entity_result, height=100, scrolling=True)

                col1, col2 = st.columns(2)

                with col1:
                    with st.expander("Word Stats"):
                        st.info("Word Statistics")
                        docx = nt.TextFrame(text)
                        st.write(docx.word_stats())

                    with st.expander("Top Keywords"):
                        keywords = get_most_common_tokens(text)
                        st.write(keywords)

                    with st.expander("Sentiment"):
                        sent_result = get_semantics(text)
                        st.write(sent_result)

                with col2:
                    with st.expander("Plot Word Freq"):
                        fig, ax = plt.subplots()
                        num_tokens = 10  # adjust the number of tokens to display as desired
                        most_common_tokens = dict(token_analysis["Token"].value_counts().head(num_tokens))
                        sns.countplot(data=token_analysis[token_analysis["Token"].isin(most_common_tokens)], x="Token", ax=ax)
                        ax.set_xlabel('Token')
                        ax.set_ylabel('Frequency')
                        ax.tick_params(axis='x', rotation=45)
                        st.pyplot(fig)

                    with st.expander("Plot Part of Speech"):
                        fig, ax = plt.subplots()
                        most_common_tokens = dict(token_analysis["Position"].value_counts())
                        sns.countplot(data=token_analysis[token_analysis["Position"].isin(most_common_tokens)], x="Position", ax=ax)
                        ax.set_xlabel('PoS')
                        ax.set_ylabel('Frequency')
                        ax.tick_params(axis='x', rotation=45)
                        st.pyplot(fig)

                    with st.expander("Plot Word Cloud"):
                        plot_wordcloud(text)

                with st.expander('Download Results'):
                    file_download(token_analysis)
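
nlp_analysis() (defined in app_utils.py) returns one row per token with its POS tag, lemma, and stop-word flag, which is what the plots above group on. A small usage sketch, assuming the required NLTK corpora are installed:

from app_utils import nlp_analysis

df = nlp_analysis("Cats are sitting on the mat")
print(df[["Token", "Position", "lemma"]].head())
# Note: WordNetLemmatizer defaults to treating words as nouns, so without a
# POS hint "sitting" lemmatizes to "sitting" rather than "sit".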
app.py ADDED
@@ -0,0 +1,125 @@
import streamlit as st
import streamlit.components.v1 as stc

# sumy provides the extractive summarizers
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer

import pandas as pd
import matplotlib.pyplot as plt
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import T5ForConditionalGeneration, T5Tokenizer
from rouge import Rouge
import altair as at
import torch
from Text_analysis import *
from Metadata import *
from app_utils import *


HTML_BANNER = """
<div style="background-color:lightgreen;padding:10px;border-radius:10px">
<h1 style="color:white;text-align:center;">Summary App</h1>
</div>
"""


def main():
    menu = ['Summarization', 'Text-Analysis', 'Meta-Data']
    choice = st.sidebar.selectbox("Menu", menu)

    if choice == 'Summarization':
        stc.html(HTML_BANNER)
        st.subheader('Summarization')
        raw_text = st.text_area("Enter the text you want to summarize")
        if st.button("Summarize"):
            with st.expander("Original Text"):
                st.write(raw_text)
            c1, c2 = st.columns(2)

            with c1:
                with st.expander("LexRank Summary"):
                    summary = sumy_summarizer(raw_text)
                    document_len = {"Original": len(raw_text),
                                    "Summary": len(summary)}
                    st.write(document_len)
                    st.write(summary)
                    st.info("Rouge Score")
                    score = evaluate_summary(summary, raw_text)
                    st.write(score.T)
                    st.subheader(" ")
                    score['metrics'] = score.index
                    c = at.Chart(score).mark_bar().encode(
                        x='metrics', y='rouge-1'
                    )
                    st.altair_chart(c)

            with c2:
                with st.expander("TextRank Summary"):
                    text_summary = sumy_text_summarizer(raw_text)
                    document_len = {"Original": len(raw_text),
                                    "Summary": len(text_summary)}
                    st.write(document_len)
                    st.write(text_summary)

                    st.info("Rouge Score")
                    score = evaluate_summary(text_summary, raw_text)
                    st.write(score.T)
                    st.subheader(" ")
                    score['metrics'] = score.index
                    c = at.Chart(score).mark_bar().encode(
                        x='metrics', y='rouge-1'
                    )
                    st.altair_chart(c)

            st.subheader("Bart Summary")
            with st.expander("Bart Summary"):
                bart_summ = bart_summary(raw_text)
                document_len = {"Original": len(raw_text),
                                "Summary": len(bart_summ)}
                st.write(document_len)
                st.write(bart_summ)
                st.info("Rouge Score")
                score = evaluate_summary(bart_summ, raw_text)
                st.write(score.T)
                st.subheader(" ")
                score['metrics'] = score.index
                c = at.Chart(score).mark_bar().encode(
                    x='metrics', y='rouge-1'
                )
                st.altair_chart(c)

            st.subheader("T5 Summarization")
            with st.expander("T5 Summary"):
                T5_sum = T5_summary(raw_text)
                document_len = {"Original": len(raw_text),
                                "Summary": len(T5_sum)}
                st.write(document_len)
                st.write(T5_sum)
                st.info("Rouge Score")
                score = evaluate_summary(T5_sum, raw_text)
                st.write(score.T)
                st.subheader(" ")
                score['metrics'] = score.index
                c = at.Chart(score).mark_bar().encode(
                    x='metrics', y='rouge-1'
                )
                st.altair_chart(c)

    elif choice == 'Text-Analysis':
        text_analysis()
    else:
        metadata()


if __name__ == '__main__':
    main()
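
The Rouge charts above rely on the shape of evaluate_summary()'s output: Rouge.get_scores() returns one dict per candidate/reference pair, keyed by rouge-1, rouge-2, and rouge-l, each holding recall, precision, and F1. A quick illustrative sketch, assuming the rouge package is installed:

from rouge import Rouge

scores = Rouge().get_scores("the cat sat", "the cat sat on the mat")
print(scores[0]["rouge-1"])  # {'r': ..., 'p': ..., 'f': ...}
# evaluate_summary() turns scores[0] into a DataFrame whose columns are
# rouge-1 / rouge-2 / rouge-l and whose rows are r / p / f.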
app_utils.py ADDED
@@ -0,0 +1,202 @@
import streamlit as st
import pandas as pd
import streamlit.components.v1 as stc
import nltk

# NLP packages used for text analysis and summarization
from sumy.parsers.plaintext import PlaintextParser
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from nltk.corpus import stopwords
from sumy.nlp.tokenizers import Tokenizer
from rouge import Rouge
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import T5ForConditionalGeneration, T5Tokenizer

from nltk.tag import StanfordNERTagger

from collections import Counter

from textblob import TextBlob
import seaborn as sns
import matplotlib.pyplot as plt

from wordcloud import WordCloud

import base64
import time

# NOTE: the NLTK helpers below need the punkt, averaged_perceptron_tagger,
# stopwords, and wordnet resources (fetch them via nltk.download()).

# Paths to the Stanford NER jar and its pre-trained model file
stanford_ner_jar = '/Users/ujjwalbansal/Desktop/Summary-app/stanford-ner-2020-11-17/stanford-ner.jar'
stanford_ner_model = '/Users/ujjwalbansal/Desktop/Summary-app/stanford-ner-2020-11-17/classifiers/english.all.3class.distsim.crf.ser.gz'

timestr = time.strftime("%Y%m%d-%H%M%S")

# Text cleaning packages: remove stopwords, special characters, URLs, and
# HTML tags; normalize text; correct common spelling mistakes
import neattext as nt
import neattext.functions as nfx


HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid red; border-radius: 0.25rem; padding: 1rem;">{}
</div>
"""


def evaluate_summary(summary, reference):
    r = Rouge()
    eval_score = r.get_scores(summary, reference)
    eval_score_df = pd.DataFrame(eval_score[0])
    return eval_score_df


def bart_summary(docx):
    model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
    inputs = tokenizer.batch_encode_plus([docx], truncation=True, padding='longest', max_length=1024, return_tensors='pt')
    summary_ids = model.generate(inputs['input_ids'], num_beams=6, max_length=100, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


def T5_summary(docx):
    model = T5ForConditionalGeneration.from_pretrained('t5-base')
    tokenizer = T5Tokenizer.from_pretrained('t5-base')
    input_text = "summarize: " + docx
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    summary_ids = model.generate(input_ids, max_length=100, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


def sumy_summarizer(docx, num=5):
    parser = PlaintextParser.from_string(docx, Tokenizer("english"))
    lex_summ = LexRankSummarizer()
    summary = lex_summ(parser.document, sentences_count=num)
    summary_list = [str(sentence) for sentence in summary]
    result = ' '.join(summary_list)
    return result


def sumy_text_summarizer(docx, num=5):
    parser = PlaintextParser.from_string(docx, Tokenizer("english"))
    text_rank_summarizer = TextRankSummarizer()
    summary = text_rank_summarizer(parser.document, sentences_count=num)
    summary_list = [str(sentence) for sentence in summary]
    result = ' '.join(summary_list)
    return result


def nlp_analysis(text):
    token_data = []
    tokens = word_tokenize(text)
    tagged_tokens = pos_tag(tokens)  # categorizes tokens as nouns, verbs, adjectives, adverbs, pronouns, etc.
    stop_words = set(stopwords.words('english'))  # common words such as "a", "an", "the", "is", "in"
    lemmatizer = WordNetLemmatizer()
    for token_text, token_pos in tagged_tokens:
        token_shape = None  # placeholder; no shape feature is computed
        token_lemma = lemmatizer.lemmatize(token_text)
        token_is_alpha = token_text.isalpha()
        token_is_stop = token_text.lower() in stop_words
        token_data.append([token_text, token_shape, token_pos, token_lemma, token_is_alpha, token_is_stop])
    df = pd.DataFrame(token_data, columns=['Token', 'Shape', 'Position', 'lemma', 'Contains_Alphabets', 'Contains_Stop_words'])
    return df


def find_entities(text):
    stan = StanfordNERTagger(stanford_ner_model, stanford_ner_jar)
    text = text.replace("\n\n", "\n")
    tokens = nltk.word_tokenize(text)
    tagged_tokens = stan.tag(tokens)
    entities = [(token, tag) for token, tag in tagged_tokens if tag != 'O']  # 'O' marks non-entities
    entities = HTML_WRAPPER.format(entities)
    return entities


def file_download(data):
    csv_file = data.to_csv()
    b64 = base64.b64encode(csv_file.encode()).decode()
    new_filename = "result_{}.csv".format(timestr)
    st.markdown('### 🗃️ Download CSV file')
    href = f'<a href="data:file/csv;base64,{b64}" download="{new_filename}">Click Here!</a>'
    st.markdown(href, unsafe_allow_html=True)


def get_most_common_tokens(text):
    word_tokens = Counter(text.split())
    most_common = dict(word_tokens.most_common(len(text)))
    return most_common


def get_semantics(text):
    blob = TextBlob(text)
    return blob.sentiment


def plot_wordcloud(text):
    text_wordcloud = WordCloud().generate(text)  # word size indicates frequency
    fig = plt.figure()
    plt.imshow(text_wordcloud, interpolation='bilinear')
    plt.axis('off')
    st.pyplot(fig)


def pos_tags(text):
    blob = TextBlob(text)
    tagged_text = blob.tags
    tagged_df = pd.DataFrame(tagged_text, columns=['tokens', 'tags'])
    return tagged_df


# Penn Treebank tag -> CSS color for tag_visualize()
TAGS = {
    'NN': 'green', 'NNS': 'green', 'NNP': 'green', 'NNPS': 'green',
    'VB': 'blue', 'VBD': 'blue', 'VBG': 'blue', 'VBN': 'blue',
    'VBP': 'blue', 'VBZ': 'blue',
    'JJ': 'red', 'JJR': 'red', 'JJS': 'red',
    'RB': 'cyan', 'RBR': 'cyan', 'RBS': 'cyan',
    'IN': 'gray', 'POS': 'goldenrod',  # valid CSS color names
    'PRP': 'magenta', 'PRP$': 'magenta',
    'DET': 'black', 'CC': 'black', 'CD': 'black',
    'WDT': 'black', 'WP': 'black', 'WP$': 'black', 'WRB': 'black',
    'EX': 'yellow', 'FW': 'yellow', 'LS': 'yellow', 'MD': 'yellow',
    'PDT': 'yellow', 'RP': 'yellow', 'SYM': 'yellow', 'TO': 'yellow',
}


def tag_visualize(tagged_df):
    colored_text = []
    for token, tag in tagged_df:
        if tag in TAGS:
            color_of_text = TAGS[tag]
            changed_text = '<span style="color:{}">{}</span>'.format(color_of_text, token)
            colored_text.append(changed_text)
    result = ' '.join(colored_text)
    return result
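
The NLTK helpers in app_utils.py depend on corpora that are not bundled with the package. A one-time setup sketch, assuming a standard NLTK install; afterwards the app starts with the usual Streamlit entry point, streamlit run app.py:

import nltk

# Fetch the resources used by word_tokenize, pos_tag, stopwords, and
# WordNetLemmatizer into the default NLTK data directory.
for resource in ("punkt", "averaged_perceptron_tagger", "stopwords", "wordnet"):
    nltk.download(resource)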