peter2000 committed on
Commit
41460de
β€’
1 Parent(s): 5dc6b7b

Create new file

Browse files
Files changed (1) hide show
  1. app.py +149 -0
app.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ st.set_page_config(f'SDSN x GIZ Policy Tracing', layout="wide")
3
+
4
+ import seaborn as sns
5
+ import pdfplumber
6
+ from pandas import DataFrame
7
+ from keybert import KeyBERT
8
+ import matplotlib.pyplot as plt
9
+ import numpy as np
10
+ import streamlit as st
11
+
12
+ def app():
13
+
14
+ with st.container():
15
+ st.markdown("<h1 style='text-align: center; color: black;'> Policy Action Tracking</h1>", unsafe_allow_html=True)
16
+ st.write(' ')
17
+ st.write(' ')
18
+
19
+ with st.expander("ℹ️ - About this app", expanded=True):
20
+
21
+ st.write(
22
+ """
23
+ The *Policy Action Tracker* app is an easy-to-use interface built in Streamlit for analyzing policy documents - developed by GIZ Data and the Sustainable Development Solution Network.
24
+
25
+ It uses a minimal keyword extraction technique that leverages multiple NLP embeddings and relies on [Transformers] (https://huggingface.co/transformers/) πŸ€— to create keywords/keyphrases that are most similar to a document.
26
+ """
27
+ )
28
+
29
+ st.markdown("")
30
+
31
+ st.markdown("")
32
+ st.markdown("## πŸ“Œ Step One: Upload document ")
33
+
34
+ with st.container():
35
+
36
+ file = st.file_uploader('Upload PDF File', type=['pdf'])
37
+
38
+ if file is not None:
39
+ text = []
40
+ with pdfplumber.open(file) as pdf:
41
+ for page in pdf.pages:
42
+ text.append(page.extract_text())
43
+ text_str = ' '.join([page for page in text])
44
+
45
+ st.write('Number of pages:',len(pdf.pages))
46
+
47
+ @st.cache(allow_output_mutation=True)
48
+ def load_model():
49
+ return KeyBERT()
50
+
51
+ kw_model = load_model()
52
+
53
+ keywords = kw_model.extract_keywords(
54
+ text_str,
55
+ keyphrase_ngram_range=(1, 2),
56
+ use_mmr=True,
57
+ stop_words="english",
58
+ top_n=15,
59
+ diversity=0.7,
60
+ )
61
+
62
+ st.markdown("## 🎈 What is my document about?")
63
+
64
+ df = (
65
+ DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
66
+ .sort_values(by="Relevancy", ascending=False)
67
+ .reset_index(drop=True)
68
+ )
69
+
70
+ df.index += 1
71
+
72
+ # Add styling
73
+ cmGreen = sns.light_palette("green", as_cmap=True)
74
+ cmRed = sns.light_palette("red", as_cmap=True)
75
+ df = df.style.background_gradient(
76
+ cmap=cmGreen,
77
+ subset=[
78
+ "Relevancy",
79
+ ],
80
+ )
81
+ c1, c2, c3 = st.columns([1, 3, 1])
82
+
83
+ format_dictionary = {
84
+ "Relevancy": "{:.1%}",
85
+ }
86
+
87
+ df = df.format(format_dictionary)
88
+
89
+ with c2:
90
+ st.table(df)
91
+
92
+ ######## SDG!
93
+ from transformers import pipeline
94
+
95
+ finetuned_checkpoint = "peter2000/roberta-base-finetuned-osdg"
96
+ classifier = pipeline("text-classification", model=finetuned_checkpoint)
97
+
98
+ word_list = text_str.split()
99
+ len_word_list = len(word_list)
100
+ par_list = []
101
+ par_len = 130
102
+ for i in range(0,len_word_list // par_len):
103
+ string_part = ' '.join(word_list[i*par_len:(i+1)*par_len])
104
+ par_list.append(string_part)
105
+
106
+ labels = classifier(par_list)
107
+ labels_= [(l['label'],l['score']) for l in labels]
108
+ df = DataFrame(labels_, columns=["SDG", "Relevancy"])
109
+ df['text'] = par_list
110
+ df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
111
+ df.index += 1
112
+ #df =df[df['Relevancy']>.95]
113
+ x = df['SDG'].value_counts()
114
+
115
+ plt.rcParams['font.size'] = 25
116
+ colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
117
+ # plot
118
+ fig, ax = plt.subplots()
119
+ ax.pie(x, colors=colors, radius=2, center=(4, 4),
120
+ wedgeprops={"linewidth": 1, "edgecolor": "white"}, frame=False,labels =list(x.index))
121
+
122
+ st.markdown("## 🎈 Anything related to SDGs?")
123
+
124
+ c4, c5, c6 = st.columns([5, 7, 1])
125
+
126
+ # Add styling
127
+ cmGreen = sns.light_palette("green", as_cmap=True)
128
+ cmRed = sns.light_palette("red", as_cmap=True)
129
+ df = df.style.background_gradient(
130
+ cmap=cmGreen,
131
+ subset=[
132
+ "Relevancy",
133
+ ],
134
+ )
135
+
136
+ format_dictionary = {
137
+ "Relevancy": "{:.1%}",
138
+ }
139
+
140
+ df = df.format(format_dictionary)
141
+
142
+ with c4:
143
+ st.pyplot(fig)
144
+ with c5:
145
+ st.table(df)
146
+
147
+
148
+
149
+ app.run()