Upload 4 files
- Metadata.py +317 -0
- Text_analysis.py +179 -0
- app.py +125 -0
- app_utils.py +202 -0
Metadata.py
ADDED
@@ -0,0 +1,317 @@
import streamlit as st
import streamlit.components.v1 as stc
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from PIL import Image
import exifread  # extracts metadata from images
import os
from datetime import datetime

import mutagen  # extracts metadata from audio
from PIL.ExifTags import TAGS, GPSTAGS
import base64
import time
from PyPDF2 import PdfReader

timestr = time.strftime("%Y%m%d-%H%M%S")

import sqlite3

details = """
Metadata is data that provides information about one or more aspects of other data; it summarizes basic information about data, which can make tracking and working with specific data easier.
"""

HTML_BANNER = """
<div style="background-color:violet;padding:10px;border-radius:10px">
<h1 style="color:white;text-align:center;">MetaData Extractor App</h1>
</div>
"""


def file_download(data):
    csv_file = data.to_csv()
    b64 = base64.b64encode(csv_file.encode()).decode()
    new_filename = "result_{}.csv".format(timestr)
    st.markdown('### 🗃️ Download CSV file')
    href = f'<a href="data:file/csv;base64,{b64}" download="{new_filename}">Click Here!</a>'
    st.markdown(href, unsafe_allow_html=True)


conn = sqlite3.connect('data.db')
c = conn.cursor()


def create_filestable():
    c.execute('CREATE TABLE IF NOT EXISTS filestable(filename TEXT, filetype TEXT, filesize TEXT, uploadDate TIMESTAMP)')


def add_file_details(filename, filetype, filesize, uploadDate):
    # The original definition took no parameters even though every call
    # site passes four arguments; the signature now matches the callers.
    c.execute('INSERT INTO filestable(filename, filetype, filesize, uploadDate) VALUES (?, ?, ?, ?)',
              (filename, filetype, filesize, uploadDate))
    conn.commit()


def view_all_data():
    c.execute('SELECT * FROM filestable')
    data = c.fetchall()
    return data


def load_image(file):
    img = Image.open(file)
    return img


def get_readable_time(timestamp):
    # parameter renamed so it no longer shadows the time module
    return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d-%H:%M')


def get_exif(filename):
    # getexif() returns an Exif mapping keyed by numeric tag IDs, not a plain
    # dict, so the original isinstance(exif, dict) check was always False and
    # the tag names were never translated. Build a fresh dict with readable
    # names instead of mutating the mapping while iterating it.
    raw = Image.open(filename).getexif()
    exif = {TAGS.get(key, key): value for key, value in raw.items()}

    gps = exif.get('GPSInfo')
    if gps is not None and hasattr(gps, 'items'):
        exif['GPSInfo'] = {GPSTAGS.get(key, key): value for key, value in gps.items()}
    return exif


def metadata():
    # st.title('Meta-Data Extractor App')
    stc.html(HTML_BANNER)
    menu = ['Home', 'Image', 'Audio', 'Document_Files', 'Analytics']
    choice = st.sidebar.selectbox('Menu', menu)
    create_filestable()
    if choice == 'Home':
        st.image(load_image('extraction_process.png'))
        st.write(details)
        col1, col2, col3 = st.columns(3)
        with col1:
            with st.expander("Get Image Metadata 📷"):
                st.info("Image Metadata")
                st.markdown("📷")
                st.text("Upload JPEG, JPG, PNG images")

        with col2:
            with st.expander("Get Audio Metadata 🔉"):
                st.info("Audio Metadata")
                st.markdown("🔉")
                st.text("Upload MP3, OGG")

        with col3:
            with st.expander("Get Document Metadata 📄📁"):
                st.info("Document Files Metadata")
                st.markdown("📄📁")
                st.text("Upload PDF, DOCX")

    elif choice == 'Image':
        st.subheader('Image MetaData Extractor')
        image_file = st.file_uploader("Upload Image", type=["png", "jpg", "jpeg"])
        if image_file is not None:
            with st.expander('File Stats'):
                file_details = {'Filename': image_file.name,
                                'Filesize': image_file.size,
                                'Filetype': image_file.type}

                # Caveat: an UploadedFile lives in memory and has no path, and
                # readable() returns a bool, so os.stat() here describes a file
                # descriptor of the running process rather than the upload. A
                # proper fix would stat a temporary copy written to disk.
                statinfo = os.stat(image_file.readable())
                statdetails = {
                    'Accessed Time': get_readable_time(statinfo.st_atime),
                    'Creation Time': get_readable_time(statinfo.st_ctime),
                    'Modified Time': get_readable_time(statinfo.st_mtime)}
                full_details = {
                    'Filename': image_file.name,
                    'Filesize': image_file.size,
                    'Filetype': image_file.type,
                    'Accessed Time': get_readable_time(statinfo.st_atime),
                    'Creation Time': get_readable_time(statinfo.st_ctime),
                    'Modified Time': get_readable_time(statinfo.st_mtime)
                }
                # st.write(full_details)
                file_details_df = pd.DataFrame(
                    list(full_details.items()), columns=["Meta Tags", "Value"]
                )
                st.dataframe(file_details_df)

            c1, c2 = st.columns(2)
            with c1:
                with st.expander("View Image"):
                    img = load_image(image_file)
                    st.image(img, width=250)
            with c2:
                with st.expander("Default(JPEG)"):
                    st.info("Using PILLOW")
                    img = load_image(image_file)
                    img_details = {
                        "format": img.format,
                        "format_desc": img.format_description,
                        "filename": img.filename,
                        "size": img.size,
                        "height": img.height,
                        "width": img.width,
                        "info": img.info,
                    }
                    df_img_details = pd.DataFrame(
                        list(img_details.items()), columns=["Meta Tags", "Value"]
                    )
                    st.dataframe(df_img_details)

            c3, c4 = st.columns(2)
            with c3:
                with st.expander('Using ExifRead Tool'):
                    image_file.seek(0)  # rewind: PIL consumed the stream above
                    meta_data = exifread.process_file(image_file)
                    # st.write(meta_data)
                    meta_data_df = pd.DataFrame(
                        list(meta_data.items()), columns=['Meta Data', 'Values'])
                    st.dataframe(meta_data_df)
            with c4:
                with st.expander('Image geo Coordinates'):
                    image_file.seek(0)
                    img_gps_details = get_exif(image_file)
                    # GPSLatitude/GPSLongitude live inside the GPSInfo block,
                    # not at the top level of the EXIF dict, and the original
                    # try/except could leave lat/long unbound.
                    gps_info = img_gps_details.get('GPSInfo', 'None Found')
                    lat = gps_info.get('GPSLatitude') if isinstance(gps_info, dict) else None
                    long = gps_info.get('GPSLongitude') if isinstance(gps_info, dict) else None
                    st.write(gps_info)
                    st.write(lat)
                    st.write(long)

            # log the upload with the name/type/size fields the table expects
            # (the original passed img.size, which is the pixel dimensions)
            add_file_details(image_file.name, image_file.type, image_file.size, datetime.now())
            with st.expander('Download Results'):
                final_df = pd.concat([file_details_df, df_img_details, meta_data_df])
                st.dataframe(final_df)
                file_download(final_df)

    elif choice == 'Audio':
        st.subheader('Audio MetaData Extractor')
        audio_file = st.file_uploader("Upload Audio", type=["mp3", "ogg"])

        if audio_file is not None:

            col1, col2 = st.columns(2)

            with col1:
                st.audio(audio_file.read())

            with col2:
                with st.expander("File Stats"):
                    file_details = {
                        "FileName": audio_file.name,
                        "FileSize": audio_file.size,
                        "FileType": audio_file.type,
                    }
                    add_file_details(audio_file.name, audio_file.type, audio_file.size, datetime.now())

                    st.write(file_details)

                    # same caveat as the Image branch: this does not stat the upload
                    statinfo = os.stat(audio_file.readable())
                    stats_details = {
                        "Accessed_Time": get_readable_time(statinfo.st_atime),
                        "Creation_Time": get_readable_time(statinfo.st_ctime),
                        "Modified_Time": get_readable_time(statinfo.st_mtime),
                    }
                    st.write(stats_details)

                    file_details_combined = {
                        "FileName": audio_file.name,
                        "FileSize": audio_file.size,
                        "FileType": audio_file.type,
                        "Accessed_Time": get_readable_time(statinfo.st_atime),
                        "Creation_Time": get_readable_time(statinfo.st_ctime),
                        "Modified_Time": get_readable_time(statinfo.st_mtime),
                    }

                    df_file_details = pd.DataFrame(
                        list(file_details_combined.items()),
                        columns=["Meta Tags", "Value"],
                    )
                    st.dataframe(df_file_details)

            with st.expander('Metadata using Mutagen'):
                audio_file.seek(0)  # rewind: st.audio() consumed the stream above
                meta_data = mutagen.File(audio_file)
                meta_data_dict = {str(key): str(value) for key, value in meta_data.items()}
                meta_data_audio_df = pd.DataFrame(
                    list(meta_data_dict.items()), columns=['Tag', 'Values'])
                st.dataframe(meta_data_audio_df)
            with st.expander("Download Results"):
                combined_df = pd.concat([df_file_details, meta_data_audio_df])
                st.dataframe(combined_df)
                file_download(combined_df)

    elif choice == 'Document_Files':
        st.subheader('Document MetaData Extractor')
        text_file = st.file_uploader("Upload File", type=["pdf"])
        if text_file is not None:
            col1, col2 = st.columns([1, 2])

            with col1:
                with st.expander("File Stats"):
                    file_details = {
                        "FileName": text_file.name,
                        "FileSize": text_file.size,
                        "FileType": text_file.type,
                    }
                    add_file_details(text_file.name, text_file.type, text_file.size, datetime.now())

                    st.write(file_details)

                    # same caveat as the Image branch: this does not stat the upload
                    statinfo = os.stat(text_file.readable())

                    stats_details = {
                        "Accessed_Time": get_readable_time(statinfo.st_atime),
                        "Creation_Time": get_readable_time(statinfo.st_ctime),
                        "Modified_Time": get_readable_time(statinfo.st_mtime),
                    }
                    st.write(stats_details)

                    # Combine all details
                    file_details_combined = {
                        "FileName": text_file.name,
                        "FileSize": text_file.size,
                        "FileType": text_file.type,
                        "Accessed_Time": get_readable_time(statinfo.st_atime),
                        "Creation_Time": get_readable_time(statinfo.st_ctime),
                        "Modified_Time": get_readable_time(statinfo.st_mtime),
                    }

                    # Convert to DataFrame
                    df_file_details = pd.DataFrame(
                        list(file_details_combined.items()),
                        columns=["Meta Tags", "Value"],
                    )
            with col2:
                with st.expander("Metadata"):
                    pdf_file = PdfReader(text_file)
                    pdf_info = pdf_file.metadata
                    df_file_details_with_pdf = pd.DataFrame(
                        list(pdf_info.items()), columns=["Meta Tags", "Value"]
                    )

                    st.dataframe(df_file_details_with_pdf)

            with st.expander("Download Results"):
                pdf_combined_df = pd.concat([df_file_details, df_file_details_with_pdf])
                st.dataframe(pdf_combined_df)
                file_download(pdf_combined_df)

    elif choice == 'Analytics':
        st.subheader('Analytics')
        uploaded_files = view_all_data()
        df = pd.DataFrame(uploaded_files, columns=['Filename', 'Filetype', 'Filesize', 'UploadDate'])
        with st.expander('Monitor'):
            st.success('View all uploaded files')
            st.dataframe(df)

        # Monitor uploads
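Note on the 'Image geo Coordinates' expander: EXIF stores latitude and longitude as (degrees, minutes, seconds) rationals plus N/S and E/W reference tags, so the raw values shown by st.write() are not directly plottable. A minimal conversion sketch; the helper name gps_to_decimal and the commented call sites are illustrative, not part of this upload:

def gps_to_decimal(coords, ref):
    # coords is the EXIF triple (degrees, minutes, seconds)
    degrees = float(coords[0]) + float(coords[1]) / 60 + float(coords[2]) / 3600
    # southern and western hemispheres are negative
    return -degrees if ref in ('S', 'W') else degrees

# e.g. lat = gps_to_decimal(gps_info['GPSLatitude'], gps_info['GPSLatitudeRef'])
#      lon = gps_to_decimal(gps_info['GPSLongitude'], gps_info['GPSLongitudeRef'])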
Text_analysis.py
ADDED
@@ -0,0 +1,179 @@
import streamlit as st
import pandas as pd
import streamlit.components.v1 as stc
import docx2txt

# NLP packages used for text analysis
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# from nltk import ne_chunk
from nltk.tag import StanfordNERTagger

from collections import Counter

from textblob import TextBlob
import seaborn as sns
import matplotlib.pyplot as plt

from wordcloud import WordCloud

import base64
import time
from app_utils import *  # also re-exports neattext as nt

HTML_BANNER = """
<div style="background-color:green;padding:10px;border-radius:10px">
<h1 style="color:white;text-align:center;">Text Analysis App</h1>
</div>
"""


def text_analysis():
    stc.html(HTML_BANNER)
    menu = ['Text-analysis', 'Upload_Files']

    choice = st.sidebar.selectbox('Menu', menu)
    if choice == 'Text-analysis':
        st.subheader('Analyse Text')
        text = st.text_area("Enter the text to analyze")
        if st.button("Analyze"):
            st.success("Success")
            with st.expander('Original Text'):
                st.write(text)
            with st.expander('Text Analysis'):
                token_analysis = nlp_analysis(text)
                st.dataframe(token_analysis)
            with st.expander('Entities'):
                entity_result = find_entities(text)
                stc.html(entity_result, height=100, scrolling=True)

            col1, col2 = st.columns(2)

            with col1:

                with st.expander("Word Stats"):
                    st.info("Word Statistics")
                    docx = nt.TextFrame(text)
                    st.write(docx.word_stats())

                with st.expander("Top keywords"):
                    keywords = get_most_common_tokens(text)
                    st.write(keywords)

                with st.expander('Tagged Keywords'):
                    # pos_tag expects a token list; tagging the raw string, as
                    # the original did, tags individual characters.
                    data = pos_tag(word_tokenize(text))
                    st.dataframe(data)
                    visualize_tags = tag_visualize(data)
                    stc.html(visualize_tags, scrolling=True)

                with st.expander("Sentiment"):
                    sent_result = get_semantics(text)
                    st.write(sent_result)

            with col2:

                with st.expander("Plot word freq"):
                    try:
                        fig, ax = plt.subplots()
                        most_common_tokens = dict(token_analysis["Token"].value_counts())
                        sns.countplot(data=token_analysis[token_analysis["Token"].isin(most_common_tokens)], x="Token", ax=ax)
                        ax.set_xlabel('Token')
                        ax.set_ylabel('Frequency')
                        ax.tick_params(axis='x', rotation=45)
                        st.pyplot(fig)
                    except Exception:
                        st.warning('Insufficient data')

                with st.expander("Plot part of speech"):
                    try:
                        fig, ax = plt.subplots()
                        most_common_tokens = dict(token_analysis["Position"].value_counts())
                        sns.countplot(data=token_analysis[token_analysis["Position"].isin(most_common_tokens)], x="Position", ax=ax)
                        ax.set_xlabel('PoS')
                        ax.set_ylabel('Frequency')
                        ax.tick_params(axis='x', rotation=45)
                        st.pyplot(fig)
                    except Exception:
                        st.warning('Insufficient data')

                with st.expander("Plot word cloud"):
                    try:
                        plot_wordcloud(text)
                    except Exception:
                        st.warning('Insufficient data')

            with st.expander('Download Results'):
                file_download(token_analysis)

    elif choice == 'Upload_Files':
        # 'txt' is accepted as well; the text/plain branch below handles it
        text_file = st.file_uploader('Upload Files', type=['docx', 'txt'])
        if text_file is not None:
            if text_file.type == 'text/plain':
                text = str(text_file.read(), "utf-8")
            else:
                text = docx2txt.process(text_file)

            if st.button("Analyze"):
                with st.expander('Original Text'):
                    st.write(text)
                with st.expander('Text Analysis'):
                    token_analysis = nlp_analysis(text)
                    st.dataframe(token_analysis)
                with st.expander('Entities'):
                    entity_result = find_entities(text)
                    stc.html(entity_result, height=100, scrolling=True)

                col1, col2 = st.columns(2)

                with col1:
                    with st.expander("Word Stats"):
                        st.info("Word Statistics")
                        docx = nt.TextFrame(text)
                        st.write(docx.word_stats())

                    with st.expander("Top keywords"):
                        keywords = get_most_common_tokens(text)
                        st.write(keywords)

                    with st.expander("Sentiment"):
                        sent_result = get_semantics(text)
                        st.write(sent_result)

                with col2:
                    with st.expander("Plot word freq"):
                        fig, ax = plt.subplots()
                        num_tokens = 10  # adjust the number of tokens to display as desired
                        most_common_tokens = dict(token_analysis["Token"].value_counts().head(num_tokens))
                        sns.countplot(data=token_analysis[token_analysis["Token"].isin(most_common_tokens)], x="Token", ax=ax)
                        ax.set_xlabel('Token')
                        ax.set_ylabel('Frequency')
                        ax.tick_params(axis='x', rotation=45)
                        st.pyplot(fig)

                    with st.expander("Plot part of speech"):
                        fig, ax = plt.subplots()
                        most_common_tokens = dict(token_analysis["Position"].value_counts())
                        sns.countplot(data=token_analysis[token_analysis["Position"].isin(most_common_tokens)], x="Position", ax=ax)
                        ax.set_xlabel('PoS')
                        ax.set_ylabel('Frequency')
                        ax.tick_params(axis='x', rotation=45)
                        st.pyplot(fig)

                    with st.expander("Plot word cloud"):
                        plot_wordcloud(text)

                with st.expander('Download Results'):
                    file_download(token_analysis)
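Text_analysis.py leans on NLTK tokenization, tagging, lemmatization, and stopwords (via app_utils), all of which need corpora that nltk does not bundle. A one-time setup sketch, assuming a default NLTK install; these are the standard resource names backing word_tokenize, pos_tag, WordNetLemmatizer, and stopwords:

import nltk

# run once per environment; each call is a no-op if already downloaded
for resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet', 'stopwords'):
    nltk.download(resource)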
app.py
ADDED
@@ -0,0 +1,125 @@
import streamlit as st
import streamlit.components.v1 as stc
import sumy

# using the sumy library for summarization
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import T5ForConditionalGeneration, T5Tokenizer
from rouge import Rouge
import altair as at
import torch
from Text_analysis import *
from Metadata import *
from app_utils import *


HTML_BANNER = """
<div style="background-color:lightgreen;padding:10px;border-radius:10px">
<h1 style="color:white;text-align:center;">Summary App</h1>
</div>
"""


def main():
    menu = ['Summarization', 'Text-Analysis', 'Meta-Data']
    choice = st.sidebar.selectbox("Menu", menu)

    if choice == 'Summarization':
        stc.html(HTML_BANNER)
        st.subheader('Summarization')
        raw_text = st.text_area("Enter the text you want to summarize")
        if st.button("Summarize"):
            with st.expander("Original Text"):
                st.write(raw_text)
            c1, c2 = st.columns(2)

            with c1:
                with st.expander("LexRank Summary"):
                    summary = sumy_summarizer(raw_text)
                    document_len = {"Original": len(raw_text),
                                    "Summary": len(summary)}
                    st.write(document_len)
                    st.write(summary)
                    st.info("Rouge Score")
                    score = evaluate_summary(summary, raw_text)
                    st.write(score.T)
                    st.subheader(" ")
                    score['metrics'] = score.index
                    c = at.Chart(score).mark_bar().encode(
                        x='metrics', y='rouge-1'
                    )
                    st.altair_chart(c)

            with c2:
                with st.expander("TextRank Summary"):
                    text_summary = sumy_text_summarizer(raw_text)
                    # measure this branch's own summary (the original reused
                    # len(summary) from the LexRank block here and below)
                    document_len = {"Original": len(raw_text),
                                    "Summary": len(text_summary)}
                    st.write(document_len)
                    st.write(text_summary)

                    st.info("Rouge Score")
                    score = evaluate_summary(text_summary, raw_text)
                    st.write(score.T)
                    st.subheader(" ")
                    score['metrics'] = score.index
                    c = at.Chart(score).mark_bar().encode(
                        x='metrics', y='rouge-1'
                    )
                    st.altair_chart(c)

            st.subheader("Bart Summary")
            with st.expander("Bart Summary"):
                bart_summ = bart_summary(raw_text)
                document_len = {"Original": len(raw_text),
                                "Summary": len(bart_summ)}
                st.write(document_len)
                st.write(bart_summ)
                st.info("Rouge Score")
                score = evaluate_summary(bart_summ, raw_text)
                st.write(score.T)
                st.subheader(" ")
                score['metrics'] = score.index
                c = at.Chart(score).mark_bar().encode(
                    x='metrics', y='rouge-1'
                )
                st.altair_chart(c)

            st.subheader("T5 Summarization")
            with st.expander("T5 Summary"):
                T5_sum = T5_summary(raw_text)
                document_len = {"Original": len(raw_text),
                                "Summary": len(T5_sum)}
                st.write(document_len)
                st.write(T5_sum)
                st.info("Rouge Score")
                score = evaluate_summary(T5_sum, raw_text)
                st.write(score.T)
                st.subheader(" ")
                score['metrics'] = score.index
                c = at.Chart(score).mark_bar().encode(
                    x='metrics', y='rouge-1'
                )
                st.altair_chart(c)

    elif choice == 'Text-Analysis':
        text_analysis()
    else:
        metadata()


if __name__ == '__main__':
    main()
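Every summary block in app.py charts the same ROUGE DataFrame, so its shape matters: rouge's get_scores() returns a list with one score dict per hypothesis/reference pair, and evaluate_summary() frames the first entry. A quick sketch of what the chart code receives (the example strings are illustrative):

from rouge import Rouge
import pandas as pd

scores = Rouge().get_scores("the cat sat", "the cat sat on the mat")
score_df = pd.DataFrame(scores[0])
# columns are rouge-1 / rouge-2 / rouge-l, rows are the f/p/r scores
# (F1, precision, recall); that is why the Altair chart can encode
# y='rouge-1' after the code copies the index into a 'metrics' column.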
app_utils.py
ADDED
@@ -0,0 +1,202 @@
import streamlit as st
import pandas as pd
import streamlit.components.v1 as stc
import nltk

# NLP packages used for text analysis
from sumy.parsers.plaintext import PlaintextParser
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from sumy.nlp.tokenizers import Tokenizer
from rouge import Rouge
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import T5ForConditionalGeneration, T5Tokenizer

# from nltk import ne_chunk
from nltk.tag import StanfordNERTagger

from collections import Counter

from textblob import TextBlob
import seaborn as sns
import matplotlib.pyplot as plt

from wordcloud import WordCloud

import base64
import time

# Path to the Stanford NER jar
stanford_ner_jar = '/Users/ujjwalbansal/Desktop/Summary-app/stanford-ner-2020-11-17/stanford-ner.jar'
# Path to the pre-trained NER model file
stanford_ner_model = '/Users/ujjwalbansal/Desktop/Summary-app/stanford-ner-2020-11-17/classifiers/english.all.3class.distsim.crf.ser.gz'

timestr = time.strftime("%Y%m%d-%H%M%S")


# from spacy import displacy


# Text cleaning packages: removing stopwords, special characters, and URLs,
# normalizing text, removing HTML tags, correcting common spelling mistakes
import neattext as nt
import neattext.functions as nfx


HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid red; border-radius: 0.25rem; padding: 1rem;">{}
</div>
"""


def evaluate_summary(summary, reference):
    r = Rouge()
    eval_score = r.get_scores(summary, reference)
    eval_score_df = pd.DataFrame(eval_score[0])
    return eval_score_df


def bart_summary(docx):
    model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
    inputs = tokenizer.batch_encode_plus([docx], truncation=True, padding='longest', max_length=1024, return_tensors='pt')
    summary_ids = model.generate(inputs['input_ids'], num_beams=6, max_length=100, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


def T5_summary(docx):
    model = T5ForConditionalGeneration.from_pretrained('t5-base')
    tokenizer = T5Tokenizer.from_pretrained('t5-base')
    input_text = "summarize: " + docx
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    summary_ids = model.generate(input_ids, max_length=100, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


def sumy_summarizer(docx, num=5):
    parser = PlaintextParser.from_string(docx, Tokenizer("english"))
    lex_summ = LexRankSummarizer()
    summary = lex_summ(parser.document, sentences_count=num)
    summary_list = [str(sentence) for sentence in summary]
    result = ' '.join(summary_list)
    return result


def sumy_text_summarizer(docx, num=5):
    parser = PlaintextParser.from_string(docx, Tokenizer("english"))
    text_rank_summarizer = TextRankSummarizer()
    summary = text_rank_summarizer(parser.document, sentences_count=num)
    summary_list = [str(sentence) for sentence in summary]
    result = ' '.join(summary_list)
    return result


def nlp_analysis(text):
    token_data = []
    tokens = word_tokenize(text)
    tagged_tokens = pos_tag(tokens)  # categorize into nouns, verbs, adjectives, adverbs, pronouns, etc.
    stop_words = set(stopwords.words('english'))  # function words such as "a", "an", "the", "is", "in"
    lemmatizer = WordNetLemmatizer()  # preprocessing
    for token in tagged_tokens:
        token_text = token[0]
        token_shape = None
        token_pos = token[1]  # Penn Treebank tag, e.g. CC - coordinating conjunction, DT - determiner, NN - noun, VBD - past-tense verb, PRP - personal pronoun
        token_lemma = lemmatizer.lemmatize(token_text)
        token_is_alpha = token_text.isalpha()
        token_is_stop = token_text.lower() in stop_words
        token_data.append([token_text, token_shape, token_pos, token_lemma, token_is_alpha, token_is_stop])
    df = pd.DataFrame(token_data, columns=['Token', 'Shape', 'Position', 'lemma', 'Contains_Alphabets', 'Contains_Stop_words'])
    return df


def find_entities(text):
    stan = StanfordNERTagger(stanford_ner_model, stanford_ner_jar)
    text = text.replace("\n\n", "\n")
    tokens = nltk.word_tokenize(text)
    tagged_tokens = stan.tag(tokens)
    entities = [(token, tag) for token, tag in tagged_tokens if tag != 'O']
    entities = HTML_WRAPPER.format(entities)
    return entities


def file_download(data):
    csv_file = data.to_csv()
    b64 = base64.b64encode(csv_file.encode()).decode()
    new_filename = "result_{}.csv".format(timestr)
    st.markdown('### 🗃️ Download CSV file')
    href = f'<a href="data:file/csv;base64,{b64}" download="{new_filename}">Click Here!</a>'
    st.markdown(href, unsafe_allow_html=True)


def get_most_common_tokens(text):
    word_tokens = Counter(text.split())
    most_common = dict(word_tokens.most_common(len(text)))
    return most_common


def get_semantics(text):
    blob = TextBlob(text)
    sentiment = blob.sentiment
    return sentiment


def plot_wordcloud(text):
    text_wordcloud = WordCloud().generate(text)  # word size reflects frequency
    fig = plt.figure()
    plt.imshow(text_wordcloud, interpolation='bilinear')
    plt.axis('off')
    st.pyplot(fig)


def pos_tags(text):
    blob = TextBlob(text)
    tagged_text = blob.tags
    tagged_df = pd.DataFrame(tagged_text, columns=['tokens', 'tags'])
    return tagged_df


# Penn Treebank tag -> display colour for tag_visualize(). The original dict
# listed 'PRP$' twice (the first entry was presumably meant to be 'PRP') and
# used 'darkwhite'/'darkyellow', which are not valid CSS colour names.
TAGS = {
    'NN': 'green',
    'NNS': 'green',
    'NNP': 'green',
    'NNPS': 'green',
    'VB': 'blue',
    'VBD': 'blue',
    'VBG': 'blue',
    'VBN': 'blue',
    'VBP': 'blue',
    'VBZ': 'blue',
    'JJ': 'red',
    'JJR': 'red',
    'JJS': 'red',
    'RB': 'cyan',
    'RBR': 'cyan',
    'RBS': 'cyan',
    'IN': 'grey',
    'POS': 'goldenrod',
    'PRP': 'magenta',
    'PRP$': 'magenta',
    'DET': 'black',
    'CC': 'black',
    'CD': 'black',
    'WDT': 'black',
    'WP': 'black',
    'WP$': 'black',
    'WRB': 'black',
    'EX': 'yellow',
    'FW': 'yellow',
    'LS': 'yellow',
    'MD': 'yellow',
    'PDT': 'yellow',
    'RP': 'yellow',
    'SYM': 'yellow',
    'TO': 'yellow',
    'None': 'off'
}


def tag_visualize(tagged_tokens):
    # expects a sequence of (token, tag) pairs, e.g. the output of nltk.pos_tag()
    colored_text = []
    for token, tag in tagged_tokens:
        if tag in TAGS:
            color_of_text = TAGS.get(tag)
            changed_text = '<span style="color:{}">{}</span>'.format(color_of_text, token)
            colored_text.append(changed_text)
    result = ' '.join(colored_text)  # join with spaces so tokens stay separated
    return result
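One portability caveat in app_utils.py: the Stanford NER jar and model paths are hard-coded to a single developer's machine, so find_entities() fails anywhere else. A sketch of one way around it, using hypothetical environment-variable names STANFORD_NER_JAR and STANFORD_NER_MODEL (falling back to the original locations when unset):

import os

stanford_ner_jar = os.environ.get(
    'STANFORD_NER_JAR',
    '/Users/ujjwalbansal/Desktop/Summary-app/stanford-ner-2020-11-17/stanford-ner.jar')
stanford_ner_model = os.environ.get(
    'STANFORD_NER_MODEL',
    '/Users/ujjwalbansal/Desktop/Summary-app/stanford-ner-2020-11-17/classifiers/english.all.3class.distsim.crf.ser.gz')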