peter2000 committed on
Commit c5118ce • 1 Parent(s): 65fe02c

Update app.py

Files changed (1):
  1. app.py +128 -131
app.py CHANGED
@@ -9,141 +9,138 @@ import matplotlib.pyplot as plt
 import numpy as np
 import streamlit as st
 
-def app():
-
-    with st.container():
-        st.markdown("<h1 style='text-align: center; color: black;'> Policy Action Tracking</h1>", unsafe_allow_html=True)
-        st.write(' ')
-        st.write(' ')
-
-    with st.expander("ℹ️ - About this app", expanded=True):
-
-        st.write(
-            """
-            The *Policy Action Tracker* app is an easy-to-use interface built in Streamlit for analyzing policy documents - developed by GIZ Data and the Sustainable Development Solutions Network.
-
-            It uses a minimal keyword extraction technique that leverages multiple NLP embeddings and relies on [Transformers](https://huggingface.co/transformers/) 🤗 to create keywords/keyphrases that are most similar to a document.
-            """
-        )
-
-        st.markdown("")
-
-    st.markdown("")
-    st.markdown("## 📌 Step One: Upload document ")
-
-    with st.container():
-
-        file = st.file_uploader('Upload PDF File', type=['pdf'])
-
-        if file is not None:
-            text = []
-            with pdfplumber.open(file) as pdf:
-                for page in pdf.pages:
-                    text.append(page.extract_text())
-            text_str = ' '.join([page for page in text])
-
-            st.write('Number of pages:', len(pdf.pages))
-
-            @st.cache(allow_output_mutation=True)
-            def load_model():
-                return KeyBERT()
-
-            kw_model = load_model()
-
-            keywords = kw_model.extract_keywords(
-                text_str,
-                keyphrase_ngram_range=(1, 2),
-                use_mmr=True,
-                stop_words="english",
-                top_n=15,
-                diversity=0.7,
-            )
-
-            st.markdown("## 🎈 What is my document about?")
-
-            df = (
-                DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
-                .sort_values(by="Relevancy", ascending=False)
-                .reset_index(drop=True)
-            )
-
-            df.index += 1
-
-            # Add styling
-            cmGreen = sns.light_palette("green", as_cmap=True)
-            cmRed = sns.light_palette("red", as_cmap=True)
-            df = df.style.background_gradient(
-                cmap=cmGreen,
-                subset=[
-                    "Relevancy",
-                ],
-            )
-            c1, c2, c3 = st.columns([1, 3, 1])
-
-            format_dictionary = {
-                "Relevancy": "{:.1%}",
-            }
-
-            df = df.format(format_dictionary)
-
-            with c2:
-                st.table(df)
-
-            ######## SDG!
-            from transformers import pipeline
-
-            finetuned_checkpoint = "peter2000/roberta-base-finetuned-osdg"
-            classifier = pipeline("text-classification", model=finetuned_checkpoint)
-
-            word_list = text_str.split()
-            len_word_list = len(word_list)
-            par_list = []
-            par_len = 130
-            for i in range(0, len_word_list // par_len):
-                string_part = ' '.join(word_list[i*par_len:(i+1)*par_len])
-                par_list.append(string_part)
-
-            labels = classifier(par_list)
-            labels_ = [(l['label'], l['score']) for l in labels]
-            df = DataFrame(labels_, columns=["SDG", "Relevancy"])
-            df['text'] = par_list
-            df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
-            df.index += 1
-            # df = df[df['Relevancy'] > .95]
-            x = df['SDG'].value_counts()
-
-            plt.rcParams['font.size'] = 25
-            colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
-            # plot
-            fig, ax = plt.subplots()
-            ax.pie(x, colors=colors, radius=2, center=(4, 4),
-                   wedgeprops={"linewidth": 1, "edgecolor": "white"}, frame=False, labels=list(x.index))
-
-            st.markdown("## 🎈 Anything related to SDGs?")
-
-            c4, c5, c6 = st.columns([5, 7, 1])
-
-            # Add styling
-            cmGreen = sns.light_palette("green", as_cmap=True)
-            cmRed = sns.light_palette("red", as_cmap=True)
-            df = df.style.background_gradient(
-                cmap=cmGreen,
-                subset=[
-                    "Relevancy",
-                ],
-            )
-
-            format_dictionary = {
-                "Relevancy": "{:.1%}",
-            }
-
-            df = df.format(format_dictionary)
-
-            with c4:
-                st.pyplot(fig)
-            with c5:
-                st.table(df)
-
-app.run()
+
+with st.container():
+    st.markdown("<h1 style='text-align: center; color: black;'> Policy Action Tracking</h1>", unsafe_allow_html=True)
+    st.write(' ')
+    st.write(' ')
+
+with st.expander("ℹ️ - About this app", expanded=True):
+
+    st.write(
+        """
+        The *Policy Action Tracker* app is an easy-to-use interface built in Streamlit for analyzing policy documents - developed by GIZ Data and the Sustainable Development Solutions Network.
+
+        It uses a minimal keyword extraction technique that leverages multiple NLP embeddings and relies on [Transformers](https://huggingface.co/transformers/) 🤗 to create keywords/keyphrases that are most similar to a document.
+        """
+    )
+
+st.markdown("")
+
+st.markdown("")
+st.markdown("## 📌 Step One: Upload document ")
+
+with st.container():
+
+    file = st.file_uploader('Upload PDF File', type=['pdf'])
+
+    if file is not None:
+        text = []
+        with pdfplumber.open(file) as pdf:
+            for page in pdf.pages:
+                text.append(page.extract_text())
+        text_str = ' '.join([page for page in text])
+
+        st.write('Number of pages:', len(pdf.pages))
+
+        @st.cache(allow_output_mutation=True)
+        def load_model():
+            return KeyBERT()
+
+        kw_model = load_model()
+
+        keywords = kw_model.extract_keywords(
+            text_str,
+            keyphrase_ngram_range=(1, 2),
+            use_mmr=True,
+            stop_words="english",
+            top_n=15,
+            diversity=0.7,
+        )
+
+        st.markdown("## 🎈 What is my document about?")
+
+        df = (
+            DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
+            .sort_values(by="Relevancy", ascending=False)
+            .reset_index(drop=True)
+        )
+
+        df.index += 1
+
+        # Add styling
+        cmGreen = sns.light_palette("green", as_cmap=True)
+        cmRed = sns.light_palette("red", as_cmap=True)
+        df = df.style.background_gradient(
+            cmap=cmGreen,
+            subset=[
+                "Relevancy",
+            ],
+        )
+        c1, c2, c3 = st.columns([1, 3, 1])
+
+        format_dictionary = {
+            "Relevancy": "{:.1%}",
+        }
+
+        df = df.format(format_dictionary)
+
+        with c2:
+            st.table(df)
+
+        ######## SDG!
+        from transformers import pipeline
+
+        finetuned_checkpoint = "peter2000/roberta-base-finetuned-osdg"
+        classifier = pipeline("text-classification", model=finetuned_checkpoint)
+
+        word_list = text_str.split()
+        len_word_list = len(word_list)
+        par_list = []
+        par_len = 130
+        for i in range(0, len_word_list // par_len):
+            string_part = ' '.join(word_list[i*par_len:(i+1)*par_len])
+            par_list.append(string_part)
+
+        labels = classifier(par_list)
+        labels_ = [(l['label'], l['score']) for l in labels]
+        df = DataFrame(labels_, columns=["SDG", "Relevancy"])
+        df['text'] = par_list
+        df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
+        df.index += 1
+        # df = df[df['Relevancy'] > .95]
+        x = df['SDG'].value_counts()
+
+        plt.rcParams['font.size'] = 25
+        colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
+        # plot
+        fig, ax = plt.subplots()
+        ax.pie(x, colors=colors, radius=2, center=(4, 4),
+               wedgeprops={"linewidth": 1, "edgecolor": "white"}, frame=False, labels=list(x.index))
+
+        st.markdown("## 🎈 Anything related to SDGs?")
+
+        c4, c5, c6 = st.columns([5, 7, 1])
+
+        # Add styling
+        cmGreen = sns.light_palette("green", as_cmap=True)
+        cmRed = sns.light_palette("red", as_cmap=True)
+        df = df.style.background_gradient(
+            cmap=cmGreen,
+            subset=[
+                "Relevancy",
+            ],
+        )
+
+        format_dictionary = {
+            "Relevancy": "{:.1%}",
+        }
+
+        df = df.format(format_dictionary)
+
+        with c4:
+            st.pyplot(fig)
+        with c5:
+            st.table(df)
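
A few notes on the code this commit leaves in place. First, pdfplumber's page.extract_text() returns None for pages with no extractable text (e.g. scanned images), and a None in the list would make ' '.join(...) raise a TypeError. A minimal defensive sketch of the extraction step, keeping the app's variable names:

    import pdfplumber

    text = []
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:  # skip image-only pages, which return None
                text.append(page_text)
    text_str = ' '.join(text)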
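Second, @st.cache(allow_output_mutation=True) was the standard caching idiom when this commit was made; later Streamlit releases deprecate st.cache in favor of st.cache_resource for heavyweight objects like models. Note also that the transformers pipeline is rebuilt on every rerun, since it is constructed inside the if-block without any caching. A sketch of caching both models together, assuming a newer Streamlit (1.18+):

    import streamlit as st
    from keybert import KeyBERT
    from transformers import pipeline

    @st.cache_resource
    def load_models():
        # runs once per process; later reruns reuse the same objects
        kw_model = KeyBERT()
        classifier = pipeline("text-classification",
                              model="peter2000/roberta-base-finetuned-osdg")
        return kw_model, classifier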
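Third, the keyword step uses KeyBERT's maximal-marginal-relevance mode: use_mmr=True re-ranks candidate phrases to trade similarity to the document against redundancy among the picks, and diversity=0.7 leans toward varied phrases. A self-contained sketch of the same call (the sample doc is illustrative only):

    from keybert import KeyBERT

    doc = "National policies should accelerate renewable energy deployment."
    kw_model = KeyBERT()
    # returns up to 15 (keyphrase, similarity) pairs, most relevant first
    keywords = kw_model.extract_keywords(
        doc,
        keyphrase_ngram_range=(1, 2),
        stop_words="english",
        use_mmr=True,
        diversity=0.7,
        top_n=15,
    )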
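Finally, the paragraph splitter iterates range(0, len_word_list // par_len), so a trailing chunk shorter than 130 words is silently dropped, and a document under 130 words yields an empty par_list (and nothing for the classifier to score). A sketch that keeps the remainder, under the same names:

    par_len = 130
    word_list = text_str.split()
    # fixed-size word windows; the final slice keeps the remainder
    par_list = [' '.join(word_list[i:i + par_len])
                for i in range(0, len(word_list), par_len)]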