Spaces:
GIZ
/
Running on CPU Upgrade

prashant committed on
Commit
40debb1
1 Parent(s): a3c251d

trying streamlit-aggrid

Browse files
appStore/sdg_analysis.py CHANGED
@@ -12,6 +12,7 @@ import docx
12
  from docx.shared import Inches
13
  from docx.shared import Pt
14
  from docx.enum.style import WD_STYLE_TYPE
 
15
  from utils.sdg_classifier import sdg_classification
16
  from utils.sdg_classifier import runSDGPreprocessingPipeline
17
  from utils.keyword_extraction import keywordExtraction, textrank
@@ -22,6 +23,7 @@ logger = logging.getLogger(__name__)
22
 
23
  def app():
24
 
 
25
  with st.container():
26
  st.markdown("<h2 style='text-align: center; color: black;'> SDG Classification and Keyphrase Extraction </h2>", unsafe_allow_html=True)
27
  st.write(' ')
@@ -72,7 +74,25 @@ def app():
72
  """)
73
  st.markdown("")
74
 
75
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  with st.container():
77
  if st.button("RUN SDG Analysis"):
78
 
@@ -90,15 +110,15 @@ def app():
90
 
91
  df, x = sdg_classification(allDocuments['documents'])
92
  sdg_labels = df.SDG.unique()
93
- # tfidfkeywordList = []
94
  textrankkeywordlist = []
95
  for label in sdg_labels:
96
  sdgdata = " ".join(df[df.SDG == label].text.to_list())
97
  # tfidflist_ = keywordExtraction(label,[sdgdata])
98
- textranklist_ = textrank(sdgdata, words = 20)
99
- tfidfkeywordList.append({'SDG':label, 'TFIDF Keywords':tfidflist_})
100
- textrankkeywordlist.append({'SDG':label, 'TextRank Keywords':textranklist_})
101
- tfidfkeywordsDf = pd.DataFrame(tfidfkeywordList)
 
102
  tRkeywordsDf = pd.DataFrame(textrankkeywordlist)
103
 
104
 
@@ -106,9 +126,9 @@ def app():
106
  colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
107
  # plot
108
  fig, ax = plt.subplots()
109
- ax.pie(x, colors=colors, radius=2, center=(4, 4),
110
  wedgeprops={"linewidth": 1, "edgecolor": "white"},
111
- frame=False,labels =list(x.index))
112
  # fig.savefig('temp.png', bbox_inches='tight',dpi= 100)
113
 
114
 
@@ -120,20 +140,15 @@ def app():
120
  st.pyplot(fig)
121
 
122
  st.markdown("##### What keywords are present under SDG classified text? #####")
123
- st.write("TFIDF BASED")
124
 
125
  c1, c2, c3 = st.columns([1, 10, 1])
126
  with c2:
127
- st.table(tfidfkeywordsDf)
128
-
129
- st.write("TextRank BASED")
130
 
131
- c11, c12, c13 = st.columns([1, 10, 1])
132
- with c12:
133
- st.table(tRkeywordsDf)
134
  c7, c8, c9 = st.columns([1, 10, 1])
135
  with c8:
136
- st.table(df)
137
  else:
138
  st.info("🤔 No document found, please try to upload it at the sidebar!")
139
  logging.warning("Terminated as no document provided")
 
12
  from docx.shared import Inches
13
  from docx.shared import Pt
14
  from docx.enum.style import WD_STYLE_TYPE
15
+ from st_aggrid import AgGrid
16
  from utils.sdg_classifier import sdg_classification
17
  from utils.sdg_classifier import runSDGPreprocessingPipeline
18
  from utils.keyword_extraction import keywordExtraction, textrank
 
23
 
24
  def app():
25
 
26
+ #### APP INFO #####
27
  with st.container():
28
  st.markdown("<h2 style='text-align: center; color: black;'> SDG Classification and Keyphrase Extraction </h2>", unsafe_allow_html=True)
29
  st.write(' ')
 
74
  """)
75
  st.markdown("")
76
 
77
+ _lab_dict = {0: 'no_cat',
78
+ 1:'SDG 1 - No poverty',
79
+ 2:'SDG 2 - Zero hunger',
80
+ 3:'SDG 3 - Good health and well-being',
81
+ 4:'SDG 4 - Quality education',
82
+ 5:'SDG 5 - Gender equality',
83
+ 6:'SDG 6 - Clean water and sanitation',
84
+ 7:'SDG 7 - Affordable and clean energy',
85
+ 8:'SDG 8 - Decent work and economic growth',
86
+ 9:'SDG 9 - Industry, Innovation and Infrastructure',
87
+ 10:'SDG 10 - Reduced inequality',
88
+ 11:'SDG 11 - Sustainable cities and communities',
89
+ 12:'SDG 12 - Responsible consumption and production',
90
+ 13:'SDG 13 - Climate action',
91
+ 14:'SDG 14 - Life below water',
92
+ 15:'SDG 15 - Life on land',
93
+ 16:'SDG 16 - Peace, justice and strong institutions',
94
+ 17:'SDG 17 - Partnership for the goals',}
95
+
96
  with st.container():
97
  if st.button("RUN SDG Analysis"):
98
 
 
110
 
111
  df, x = sdg_classification(allDocuments['documents'])
112
  sdg_labels = df.SDG.unique()
 
113
  textrankkeywordlist = []
114
  for label in sdg_labels:
115
  sdgdata = " ".join(df[df.SDG == label].text.to_list())
116
  # tfidflist_ = keywordExtraction(label,[sdgdata])
117
+ textranklist_ = textrank(sdgdata)
118
+ if len(textranklist_) > 0:
119
+ # tfidfkeywordList.append({'SDG':label, 'TFIDF Keywords':tfidflist_})
120
+ textrankkeywordlist.append({'SDG':label, 'TextRank Keywords':textranklist_})
121
+ # tfidfkeywordsDf = pd.DataFrame(tfidfkeywordList)
122
  tRkeywordsDf = pd.DataFrame(textrankkeywordlist)
123
 
124
 
 
126
  colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
127
  # plot
128
  fig, ax = plt.subplots()
129
+ ax.pie(x.count, colors=colors, radius=3, center=(4, 4),
130
  wedgeprops={"linewidth": 1, "edgecolor": "white"},
131
+ frame=False,labels =list(x.SDG_name))
132
  # fig.savefig('temp.png', bbox_inches='tight',dpi= 100)
133
 
134
 
 
140
  st.pyplot(fig)
141
 
142
  st.markdown("##### What keywords are present under SDG classified text? #####")
 
143
 
144
  c1, c2, c3 = st.columns([1, 10, 1])
145
  with c2:
146
+ st.table(tRkeywordsDf)
 
 
147
 
148
+ st.markdown("##### Top few SDG Classified paragraph/text results #####")
 
 
149
  c7, c8, c9 = st.columns([1, 10, 1])
150
  with c8:
151
+ AgGrid(df)
152
  else:
153
  st.info("🤔 No document found, please try to upload it at the sidebar!")
154
  logging.warning("Terminated as no document provided")
paramconfig.cfg CHANGED
@@ -25,6 +25,7 @@ REMOVE_PUNC = 0
25
  SPLIT_LENGTH = 120
26
  SPLIT_OVERLAP = 10
27
  RESPECT_SENTENCE_BOUNDARY = 1
 
28
 
29
  [preprocessor]
30
  SPLIT_OVERLAP_WORD = 10
 
25
  SPLIT_LENGTH = 120
26
  SPLIT_OVERLAP = 10
27
  RESPECT_SENTENCE_BOUNDARY = 1
28
+ TOP_KEY = 15
29
 
30
  [preprocessor]
31
  SPLIT_OVERLAP_WORD = 10
requirements.txt CHANGED
@@ -14,5 +14,6 @@ transformers==4.21.2
14
  st-annotated-text==3.0.0
15
  markdown==3.4.1
16
  summa==1.2.0
 
17
  python-docx
18
  streamlit_option_menu
 
14
  st-annotated-text==3.0.0
15
  markdown==3.4.1
16
  summa==1.2.0
17
+ streamlit-aggrid
18
  python-docx
19
  streamlit_option_menu
utils/keyword_extraction.py CHANGED
@@ -66,7 +66,12 @@ def keywordExtraction(sdg:int,sdgdata:List[Text]):
66
 
67
  def textrank(textdata, ratio = 0.1, words = 0):
68
  if words == 0:
69
- results = keywords.keywords(textdata, ratio= ratio).split("\n")
 
 
 
 
 
70
  else:
71
  results = keywords.keywords(textdata, words= words).split("\n")
72
 
 
66
 
67
  def textrank(textdata, ratio = 0.1, words = 0):
68
  if words == 0:
69
+ try:
70
+ words = config.get('sdg','TOP_KEY')
71
+ results = keywords.keywords(textdata, words = ratio).split("\n")
72
+ except:
73
+ logging.warning("paramconfig not found, running textrank with ratio")
74
+ results = keywords.keywords(textdata, ratio= ratio).split("\n")
75
  else:
76
  results = keywords.keywords(textdata, words= words).split("\n")
77
 
utils/sdg_classifier.py CHANGED
@@ -3,6 +3,7 @@ from haystack.schema import Document
3
  from typing import List, Tuple
4
  import configparser
5
  import logging
 
6
  from pandas import DataFrame, Series
7
  from utils.preprocessing import processingpipeline
8
  try:
@@ -17,6 +18,25 @@ except Exception:
17
  st.info("Please place the paramconfig file in the same directory as app.py")
18
 
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  @st.cache(allow_output_mutation=True)
21
  def load_sdgClassifier():
22
  """
@@ -73,6 +93,10 @@ def sdg_classification(haystackdoc:List[Document])->Tuple[DataFrame,Series]:
73
  df.index += 1
74
  df =df[df['Relevancy']>threshold]
75
  x = df['SDG'].value_counts()
 
 
 
 
76
  df= df.drop(['Relevancy'], axis = 1)
77
 
78
 
 
3
  from typing import List, Tuple
4
  import configparser
5
  import logging
6
+ import pandas as pd
7
  from pandas import DataFrame, Series
8
  from utils.preprocessing import processingpipeline
9
  try:
 
18
  st.info("Please place the paramconfig file in the same directory as app.py")
19
 
20
 
21
+ _lab_dict = {0: 'no_cat',
22
+ 1:'SDG 1 - No poverty',
23
+ 2:'SDG 2 - Zero hunger',
24
+ 3:'SDG 3 - Good health and well-being',
25
+ 4:'SDG 4 - Quality education',
26
+ 5:'SDG 5 - Gender equality',
27
+ 6:'SDG 6 - Clean water and sanitation',
28
+ 7:'SDG 7 - Affordable and clean energy',
29
+ 8:'SDG 8 - Decent work and economic growth',
30
+ 9:'SDG 9 - Industry, Innovation and Infrastructure',
31
+ 10:'SDG 10 - Reduced inequality',
32
+ 11:'SDG 11 - Sustainable cities and communities',
33
+ 12:'SDG 12 - Responsible consumption and production',
34
+ 13:'SDG 13 - Climate action',
35
+ 14:'SDG 14 - Life below water',
36
+ 15:'SDG 15 - Life on land',
37
+ 16:'SDG 16 - Peace, justice and strong institutions',
38
+ 17:'SDG 17 - Partnership for the goals',}
39
+
40
  @st.cache(allow_output_mutation=True)
41
  def load_sdgClassifier():
42
  """
 
93
  df.index += 1
94
  df =df[df['Relevancy']>threshold]
95
  x = df['SDG'].value_counts()
96
+ x = x.rename('count')
97
+ x = x.rename_axis('SDG').reset_index()
98
+ x["SDG"] = pd.to_numeric(x["SDG"])
99
+ x['SDG_name'] = x['SDG'].apply(lambda x: _lab_dict[x])
100
  df= df.drop(['Relevancy'], axis = 1)
101
 
102