peter2000 committed on
Commit
1786cfb
•
1 Parent(s): 72663fa

Create new file

Files changed (1)
  1. appStore/sdg_analysis.py +179 -0
appStore/sdg_analysis.py ADDED
@@ -0,0 +1,179 @@
+ # set path
+ import glob, os, sys; sys.path.append('../udfPreprocess')
+
+ # import helpers
+ import udfPreprocess.docPreprocessing as pre
+ import udfPreprocess.cleaning as clean
+
+ # import needed libraries
+ import seaborn as sns
+ from pandas import DataFrame
+ from keybert import KeyBERT
+ from transformers import pipeline
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import streamlit as st
+ import pandas as pd
+
+ import tempfile
+ import sqlite3
+
+ def app():
+
+     with st.container():
+         st.markdown("<h1 style='text-align: center; color: black;'> Analyse Policy Document</h1>", unsafe_allow_html=True)
+         st.write(' ')
+         st.write(' ')
+
+         with st.expander("ℹ️ - About this app", expanded=True):
+
+             st.write(
+                 """
+                 The *Analyse Policy Document* app is an easy-to-use interface built in Streamlit for analyzing policy documents, developed by GIZ Data and the Sustainable Development Solutions Network (SDSN). \n
+                 1. Keyword heatmap \n
+                 2. SDG classification for the paragraphs/texts in the document
+                 """
+             )
+
+             st.markdown("")
+
+     st.markdown("")
+     st.markdown("## 📌 Step One: Upload document")
+
+     with st.container():
+
+         file = st.file_uploader('Upload PDF, DOCX, or TXT file', type=['pdf', 'docx', 'txt'])
+
+         if file is not None:
+
+             with tempfile.NamedTemporaryFile(mode="wb") as temp:
+                 bytes_data = file.getvalue()
+                 temp.write(bytes_data)
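+                 # the temp file exists for the duration of the with block, so
+                 # temp.name below can be passed as a readable path to the loader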
+
+                 st.write("Filename: ", file.name)
+
+                 # load document
+                 docs = pre.load_document(temp.name, file)
+
+                 # preprocess document
+                 docs_processed, df, all_text, par_list = clean.preprocessingForSDG(docs)
+
+                 # testing
+                 # st.write(len(all_text))
+                 # for i in par_list:
+                 #     st.write(i)
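+
+                 # st.cache keeps the KeyBERT model loaded across Streamlit reruns;
+                 # allow_output_mutation=True skips hashing the returned model object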
+                 @st.cache(allow_output_mutation=True)
+                 def load_keyBert():
+                     return KeyBERT()
+
+                 kw_model = load_keyBert()
+
+                 keywords = kw_model.extract_keywords(
+                     all_text,
+                     keyphrase_ngram_range=(1, 3),
+                     use_mmr=True,
+                     stop_words="english",
+                     top_n=5,
+                     diversity=0.7,
+                 )
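+                 # extract_keywords returns (keyphrase, score) tuples; MMR with
+                 # diversity=0.7 trades raw relevance for less redundant phrases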
+
+                 st.markdown("## 🎈 What is my document about?")
+
+                 df = (
+                     DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
+                     .sort_values(by="Relevancy", ascending=False)
+                     .reset_index(drop=True)
+                 )
+
+                 df.index += 1
+
+                 # Add styling
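+                 # note: background_gradient returns a pandas Styler, not a DataFrame;
+                 # format() below then renders the Relevancy scores as percentages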
+                 cmGreen = sns.light_palette("green", as_cmap=True)
+                 cmRed = sns.light_palette("red", as_cmap=True)
+                 df = df.style.background_gradient(
+                     cmap=cmGreen,
+                     subset=[
+                         "Relevancy",
+                     ],
+                 )
+                 c1, c2, c3 = st.columns([1, 3, 1])
+
+                 format_dictionary = {
+                     "Relevancy": "{:.1%}",
+                 }
+
+                 df = df.format(format_dictionary)
+
+                 with c2:
+                     st.table(df)
+
+                 ######## SDG classification
+                 # @st.cache(allow_output_mutation=True)
+                 # def load_sdgClassifier():
+                 #     classifier = pipeline("text-classification", model="../models/osdg_sdg/")
+                 #     return classifier
+
+                 # load the classifier from the Hugging Face Hub (the local-disk
+                 # variant above is kept for reference)
+                 @st.cache(allow_output_mutation=True)
+                 def load_sdgClassifier():
+                     classifier = pipeline("text-classification", model="jonas/sdg_classifier_osdg")
+                     return classifier
+
+                 classifier = load_sdgClassifier()
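+                 # the text-classification pipeline returns one dict per paragraph,
+                 # each with a 'label' (predicted SDG class) and a 'score' (confidence)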
+
+                 # not needed: par_list already comes from the preprocessing function
+                 # word_list = all_text.split()
+                 # len_word_list = len(word_list)
+                 # par_list = []
+                 # par_len = 130
+                 # for i in range(0, len_word_list // par_len):
+                 #     string_part = ' '.join(word_list[i*par_len:(i+1)*par_len])
+                 #     par_list.append(string_part)
+
+                 labels = classifier(par_list)
+                 labels_ = [(l['label'], l['score']) for l in labels]
+                 df = DataFrame(labels_, columns=["SDG", "Relevancy"])
+                 df['text'] = par_list
+                 df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
+                 df.index += 1
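+                 # keep only paragraphs classified with more than 85% confidence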
+                 df = df[df['Relevancy'] > 0.85]
+                 x = df['SDG'].value_counts()
+
+                 plt.rcParams['font.size'] = 25
+                 colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
+                 # plot
+                 fig, ax = plt.subplots()
+                 ax.pie(x, colors=colors, radius=2, center=(4, 4),
+                        wedgeprops={"linewidth": 1, "edgecolor": "white"}, frame=False, labels=list(x.index))
+
+                 st.markdown("## 🎈 Anything related to SDGs?")
+
+                 c4, c5, c6 = st.columns([5, 7, 1])
+
+                 # Add styling
+                 cmGreen = sns.light_palette("green", as_cmap=True)
+                 cmRed = sns.light_palette("red", as_cmap=True)
+                 df = df.style.background_gradient(
+                     cmap=cmGreen,
+                     subset=[
+                         "Relevancy",
+                     ],
+                 )
+
+                 format_dictionary = {
+                     "Relevancy": "{:.1%}",
+                 }
+
+                 df = df.format(format_dictionary)
+
+                 with c4:
+                     st.pyplot(fig)
+                 with c5:
+                     st.table(df)
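
`app()` follows the page-module pattern implied by the `appStore/` layout. A minimal sketch of a host script that could mount this page (the `main.py` name and wiring are assumptions, not part of this commit):

```python
# main.py - hypothetical entry point; run with: streamlit run main.py
import appStore.sdg_analysis as sdg_analysis

# render the SDG analysis page
sdg_analysis.app()
```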