darthPanda committed on
Commit
583664a
1 Parent(s): 6a71d54

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +292 -0
app.py ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os.path
3
+ import pathlib
4
+
5
+ import pandas as pd
6
+ import numpy as np
7
+ import PyPDF2
8
+ from PyPDF2 import PdfReader
9
+ from os import walk
10
+ import nltk
11
+ import glob
12
+
13
+ import plotly.express as px
14
+ from wordcloud import WordCloud
15
+ import plotly.io as pio
16
+ from plotly.subplots import make_subplots
17
+ import plotly.graph_objs as go
18
+ import pandas as pd
19
+ import plotly.offline as pyo
20
+
21
@st.cache_resource()
def get_nl():
    """Download the NLTK ``punkt`` tokenizer data.

    Decorated with ``st.cache_resource`` so the result of the download call
    is stored by Streamlit instead of being recomputed on each script run.

    Returns:
        Whatever ``nltk.download('punkt')`` returns.
    """
    punkt_status = nltk.download('punkt')
    return punkt_status


# Make sure the tokenizer data is present before sent_tokenize is used.
get_nl()
25
+
26
+ from nltk.tokenize import sent_tokenize
27
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
28
+ from transformers import pipeline
29
+
30
+ # if os.path.exists("report.html"):
31
+ # os.remove("report.html")
32
+
33
+
34
@st.cache_resource()
def get_model():
    """Load the FinBERT tokenizer and sequence-classification model.

    Decorated with ``st.cache_resource`` so Streamlit stores the loaded
    objects instead of loading them again on each script run.

    Returns:
        A ``(tokenizer, model)`` tuple, both loaded from the
        ``ProsusAI/finbert`` checkpoint.
    """
    checkpoint = "ProsusAI/finbert"
    finbert_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    finbert_model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    return finbert_tokenizer, finbert_model


tokenizer, model = get_model()
41
+
42
def extract_text_from_pdf(path):
    """Return the concatenated text of every page of a PDF.

    Args:
        path: The PDF source, passed unchanged to ``PdfReader``.

    Returns:
        The extracted text of all pages, joined in page order with no
        separator between pages.
    """
    reader = PdfReader(path)
    pages = reader.pages
    # Debug output kept from the original implementation: the page count.
    print(len(pages))
    # Iterate the pages directly and join once, instead of an index loop with
    # repeated string concatenation. `or ""` is a defensive guard so a page
    # that yields no text cannot break the join.
    return "".join(page.extract_text() or "" for page in pages)
51
+
52
+ # Create a button to download the HTML file
53
def download_html():
    """Offer ``report.html`` as a download, then stop the current script run.

    Reads ``report.html`` from the working directory, renders a Streamlit
    download button for it and finally calls ``st.stop()``.

    Raises:
        FileNotFoundError: If ``report.html`` does not exist yet.
    """
    with st.spinner('Downloading HTML file...'):
        # The context manager closes the file, so no explicit close() is
        # needed. NOTE(review): UTF-8 is assumed because the report is
        # written by plotly; confirm against the plotly version in use.
        with open('report.html', "r", encoding="utf-8") as f:
            html = f.read()
        # File name and content type presented to the browser.
        file_name = "report.html"
        mime_type = "text/html"
        print('download button')
        st.download_button(label="Download Report", data=html, file_name=file_name, mime=mime_type)
        st.stop()
66
+
67
import base64
import re
import textwrap
import time

# Largest page count accepted for an uploaded PDF.
MAX_PAGES = 20

# Sentiment labels in the fixed order used for the pie chart.
SENTIMENT_LABELS = ['neutral', 'positive', 'negative']

# Words left out of the word cloud. The previous list also held fragments such
# as 'quetion', 'wa' or 'firt': those only existed because removing "s" by
# plain substring replacement deleted every letter "s" from the text. With
# whole-word matching the real words are listed instead.
WORDCLOUD_EXCLUDED_WORDS = [
    "s", "quarter", "thank", "million", "question", "was", "rate", "first",
    "customer", "business", "last year", "year", "last", "well", "just",
    "this", "will", "think", "higher", "going",
]


def _label_table(df, label, sentence_column):
    """Return ``(table, mean_score)`` for the rows of *df* carrying *label*.

    The table holds the ``score`` (rounded to 4 decimals) and the sentence
    text under the column name *sentence_column*, sorted by descending score.
    ``mean_score`` is the mean of the unrounded scores (NaN when no row has
    the label).
    """
    subset = df.loc[df['label'] == label, ['score', 'Sentence']]
    subset = subset.sort_values('score', ascending=False)
    mean_score = subset['score'].mean()
    table = subset.assign(score=subset['score'].round(4)).rename(
        columns={'Sentence': sentence_column}
    )
    return table, mean_score


def _sentence_table_trace(table_df):
    """Build the plotly table trace showing every column of *table_df*."""
    return go.Table(
        header=dict(values=list(table_df.columns), fill_color='lightgray', align='left'),
        cells=dict(values=[table_df[name] for name in table_df.columns], fill_color='white', align='left'),
        columnwidth=[1, 4],
    )


def _sentiment_title(mean_score):
    """Map the signed mean score to the gauge title (cut-offs at +/-0.29)."""
    if mean_score < -0.29:
        return "Cummulative Sentiment Negative"
    if mean_score < 0.29:
        return "Cummulative Sentiment Neutral"
    return "Cummulative Sentiment Positive"


st.write("""
# Sentiment Analysis Tool
""")

# accept_multiple_files=True makes the uploader return a list, which is empty
# until something is uploaded; more than one file is rejected below.
uploaded_file = st.file_uploader("Choose a PDF file", accept_multiple_files=True, type=['pdf'])

if uploaded_file:
    # NOTE(review): fixed 5 second delay kept from the original script; its
    # purpose is not evident from this file - confirm whether it is needed.
    time.sleep(5)

    # Validate the upload before parsing anything.
    if len(uploaded_file) > 1:
        st.error("Please upload only one PDF file at a time.")
        st.stop()

    pdf_file = uploaded_file[0]
    if pdf_file.type != 'application/pdf':
        st.error("Please upload a PDF file.")
        st.stop()

    # Parse the PDF once and reuse the reader for the page check and the text.
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    num_pages = len(pdf_reader.pages)
    if num_pages > MAX_PAGES:
        # The message now states the limit the condition actually enforces.
        st.error(f"PDF file should have at most {MAX_PAGES} pages.")
        st.stop()

    ############################ 1. Extract text from PDF ############################
    st.write(f"Number of pages in PDF file: {num_pages}")
    text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

    ############################ 2. Sentiment Analysis ############################
    text = text.replace("\n", " ")
    sentences = sent_tokenize(text)
    if not sentences:
        # E.g. a PDF without extractable text; the first sentence is needed
        # as the report title below.
        st.error("No text could be extracted from this PDF file.")
        st.stop()

    # The first sentence doubles as the report title.
    title = sentences[0]
    # Only sentences of 50 to 510 characters are classified.
    useful_sentence = [sentence for sentence in sentences if 50 <= len(sentence) <= 510]
    del sentences
    if not useful_sentence:
        st.error("No sentences suitable for sentiment analysis were found in this PDF file.")
        st.stop()

    with st.spinner('Processing please wait...'):
        # Reuse the module-level cached tokenizer/model instead of loading the
        # checkpoint again on every run.
        classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
        output = classifier(useful_sentence)

        df = pd.DataFrame(output)
        df['Sentence'] = useful_sentence

        # value_counts() orders by frequency, so align the counts with the
        # fixed label order explicitly (labels that never occur count as 0).
        labels = SENTIMENT_LABELS
        values = df['label'].value_counts().reindex(labels, fill_value=0).tolist()

        # Remove the excluded words as whole words only (case-insensitive).
        # Longer entries go first so "last year" is tried before "last".
        excluded = sorted(WORDCLOUD_EXCLUDED_WORDS, key=len, reverse=True)
        excluded_pattern = re.compile(
            r"\b(?:" + "|".join(re.escape(word) for word in excluded) + r")\b",
            re.IGNORECASE,
        )
        cloud_text = excluded_pattern.sub("", text)
        wordcloud = WordCloud(background_color='white', width=800, height=400).generate(cloud_text)
        image = wordcloud.to_image()

        pos_df, pos_df_mean = _label_table(df, 'positive', 'Positive Sentences')
        neg_df, neg_df_mean = _label_table(df, 'negative', 'Negative Sentences')
        neu_df, _ = _label_table(df, 'neutral', 'Neutral Sentences')

        # Signed mean: negative sentences count with a minus sign, neutral
        # ones are ignored. Without any positive/negative sentence the mean
        # is NaN; treat that as 0 so the gauge reads "Neutral".
        signed_mean = pd.concat([-neg_df['score'], pos_df['score']]).mean()
        if pd.isna(signed_mean):
            signed_mean = 0.0
        # A class without sentences contributes 0 to the threshold marker.
        pos_mean = 0.0 if pd.isna(pos_df_mean) else pos_df_mean
        neg_mean = 0.0 if pd.isna(neg_df_mean) else neg_df_mean

        # 26 x 6 grid: rows 6-11 hold the pie and the two indicators, rows
        # 12-26 hold the word cloud (left) and three stacked tables (right).
        specs = [[None] * 6 for _ in range(26)]
        specs[5] = [
            {"type": "pie", "rowspan": 6, "colspan": 2}, None,
            {"type": "indicator", "rowspan": 6, "colspan": 2}, None,
            {"type": "indicator", "rowspan": 6, "colspan": 2}, None,
        ]
        specs[11] = [
            {"type": "image", "rowspan": 15, "colspan": 3}, None, None,
            {"type": "table", "rowspan": 5, "colspan": 3}, None, None,
        ]
        specs[16] = [None, None, None, {"type": "table", "rowspan": 5, "colspan": 3}, None, None]
        specs[21] = [None, None, None, {"type": "table", "rowspan": 5, "colspan": 3}, None, None]
        fig = make_subplots(rows=26, cols=6, specs=specs)

        colors = px.colors.diverging.Portland
        fig.add_trace(go.Pie(labels=labels, values=values, hole=0.5,
                             title='Count by label',
                             marker=dict(colors=colors,
                                         line=dict(width=2, color='white'))),
                      row=6, col=1)

        fig.add_trace(go.Indicator(
            mode="number",
            value=len(df),
            title={"text": "Count of Sentence"}), row=6, col=3)

        fig.add_trace(go.Indicator(
            mode="gauge+number",
            value=signed_mean,
            domain={'x': [0, 1], 'y': [0, 1]},
            title={'text': _sentiment_title(signed_mean), 'font': {'size': 16}},
            gauge={
                'axis': {'range': [-1, 1], 'tickwidth': 1, 'tickcolor': "darkblue"},
                'bar': {'color': "darkblue"},
                'steps': [
                    {'range': [-0.29, 0.29], 'color': 'white'},
                    {'range': [0.3, 1], 'color': 'green'},
                    {'range': [-1, -0.3], 'color': 'red'}
                ],
                'threshold': {
                    'line': {'color': "black", 'width': 4},
                    'thickness': 0.75,
                    'value': abs(pos_mean - neg_mean)
                }
            }
        ), row=6, col=5)

        fig.add_trace(go.Image(z=image), row=12, col=1)
        fig.update_xaxes(visible=False, row=12, col=1)
        fig.update_yaxes(visible=False, row=12, col=1)

        fig.add_trace(_sentence_table_trace(pos_df), row=12, col=4)
        fig.add_trace(_sentence_table_trace(neg_df), row=17, col=4)
        fig.add_trace(_sentence_table_trace(neu_df), row=22, col=4)

        # Wrap the title at 50 characters; plotly needs <br> for line breaks.
        wrapped_title = "<br>".join(textwrap.wrap(title, width=50))

        fig.update_layout(height=700, showlegend=False, title={'text': f"<b>{wrapped_title} - Sentiment Analysis Report</b>", 'x': 0.5, 'xanchor': 'center','font': {'size': 32}})

        pyo.plot(fig, filename='report.html')

        # Embed the figure HTML as a base64 data URI behind a download link.
        fig_html = pio.to_html(fig, full_html=False)
        b64 = base64.b64encode(fig_html.encode()).decode()

        filename = "figure.html"
        href = f'<a href="data:file/html;base64,{b64}" download="{filename}">Download Report</a>'

        st.markdown(href, unsafe_allow_html=True)