
Commit ce1209f (1 parent: 1984bd1), committed by prashant

info and sdg update

appStore/info.py CHANGED
@@ -28,7 +28,7 @@ def app():
         </div>
     """
     st.markdown(footer, unsafe_allow_html=True)
-    # <div class="text">
+
     c1, c2, c3 = st.columns([8,1,12])
     with c1:
         st.image("docStore/img/ndc.png")
@@ -42,13 +42,12 @@ def app():
         evaluation of stated goals and targets and their actual implementation on \
         the ground – arises. Luckily, Artificial Intelligence (AI) and Natural \
         Language Processing (NLP) methods can help in shortening and easing this \
-        task for policy analysts.</div>',
+        task for policy analysts.</div><br>',
         unsafe_allow_html=True)

     intro = """
     <div style="text-align: justify;">

-
     For this purpose, the United Nations Sustainable Development Solutions \
     Network (SDSN) and the Deutsche Gesellschaft für Internationale \
     Zusammenarbeit (GIZ) GmbH are collaborating since 2021 in the development \
appStore/sdg_analysis.py CHANGED
@@ -23,7 +23,7 @@ logger = logging.getLogger(__name__)
 def app():

     with st.container():
-        st.markdown("<h2 style='text-align: center; color: black;'> SDG Analysis on Polcy Document</h2>", unsafe_allow_html=True)
+        st.markdown("<h2 style='text-align: center; color: black;'> SDG Classification and Keyphrase Extraction </h2>", unsafe_allow_html=True)
         st.write(' ')
         st.write(' ')

@@ -31,12 +31,45 @@ def app():

         st.write(
             """
-            The *SDG Analysis on Polcy Document* app is an easy-to-use interface built \
+            The *SDG Analysis* app is an easy-to-use interface built \
             in Streamlit for analyzing policy documents with respect to SDG \
             Classification for the paragraphs/texts in the document and \
             extracting the keyphrase per SDG label - developed by GIZ Data \
             and the Sustainable Development Solution Network. \n
             """)
+        st.write("""Document Processing: The Uploaded/Selected document is \
+        automatically cleaned and split into paragraphs with a maximum \
+        length of 120 words using a Haystack preprocessing pipeline. The \
+        length of 120 is an empirical value which should reflect the length \
+        of a “context” and should limit the paragraph length deviation. \
+        However, since we want to respect the sentence boundary the limit \
+        can breach and hence this limit of 120 is tentative.\n
+
+        SDG cLassification: The application assigns paragraphs to 15 of \
+        the 17 United Nations Sustainable Development Goals (SDGs). SDG 16 \
+        “Peace, Justice and Strong Institutions” and SDG 17 \
+        “Partnerships for the Goals” are excluded from the analysis due to \
+        their broad nature which could potentially inflate the results. \
+        Each paragraph is assigned to one SDG only. Again, the results are \
+        displayed in a summary table including the number of the SDG, a \
+        relevancy score highlighted through a green color shading, and the \
+        respective text of the analyzed paragraph. Additionally, a pie \
+        chart with a blue color shading is displayed which illustrates the \
+        three most prominent SDGs in the document. The SDG classification \
+        uses open-source training [data](https://zenodo.org/record/5550238#.Y25ICHbMJPY) \
+        from [OSDG.ai](https://osdg.ai/) which is a global \
+        partnerships and growing community of researchers and institutions \
+        interested in the classification of research according to the \
+        Sustainable Development Goals. The summary table only displays \
+        paragraphs with a calculated relevancy score above 85%.\n
+
+        Keyphrase Extraction: The application extracts 15 keyphrases from \
+        the document, calculates a respective relevancy score, and displays \
+        the results in a summary table. The keyphrases are extracted using \
+        using [Textrank](https://github.com/summanlp/textrank) which is an \
+        easy-to-use computational less expensive \
+        model leveraging combination of TFIDF and Graph networks.
+        """)
         st.markdown("")


@@ -57,11 +90,11 @@ def app():

         df, x = sdg_classification(allDocuments['documents'])
         sdg_labels = df.SDG.unique()
-        tfidfkeywordList = []
+        # tfidfkeywordList = []
         textrankkeywordlist = []
         for label in sdg_labels:
             sdgdata = " ".join(df[df.SDG == label].text.to_list())
-            tfidflist_ = keywordExtraction(label,[sdgdata])
+            # tfidflist_ = keywordExtraction(label,[sdgdata])
             textranklist_ = textrank(sdgdata, words = 20)
             tfidfkeywordList.append({'SDG':label, 'TFIDF Keywords':tfidflist_})
             textrankkeywordlist.append({'SDG':label, 'TextRank Keywords':textranklist_})
@@ -69,8 +102,6 @@ def app():
         tRkeywordsDf = pd.DataFrame(textrankkeywordlist)


-
-
         plt.rcParams['font.size'] = 25
         colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
         # plot
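For reference, a minimal sketch of the TextRank keyphrase step this file calls via `textrank(sdgdata, words=20)`, assuming the `summa` package (the PyPI release of the summanlp/textrank repo linked in the new description). The app's own `textrank` helper lives elsewhere in the repo and may differ in signature; the sample text is invented for illustration.

```python
# Minimal TextRank keyphrase sketch, assuming the summa package
# (pip install summa), i.e. the PyPI release of summanlp/textrank.
from summa import keywords


def textrank(textdata: str, words: int = 20) -> list:
    """Return up to `words` TextRank keyphrases for the given text."""
    # split=True makes summa return a list instead of a newline-joined string
    return keywords.keywords(textdata, words=words, split=True)


sample = ("Renewable energy targets aim to expand solar and wind capacity, "
          "reduce fossil fuel dependence, and improve energy efficiency "
          "across the national power grid by 2030.")
print(textrank(sample, words=5))
```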
paramconfig.cfg CHANGED
@@ -22,8 +22,9 @@ THRESHOLD = 0.85
 MODEL = jonas/sdg_classifier_osdg
 SPLIT_BY = word
 REMOVE_PUNC = 0
-SPLIT_LENGTH = 110
+SPLIT_LENGTH = 120
 SPLIT_OVERLAP = 10
+RESPECT_SENTENCE_BOUNDARY = 1

 [preprocessor]
 SPLIT_OVERLAP_WORD = 10
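As a sanity check, the new keys parse cleanly with the standard-library configparser, following the same `bool(int(...))` pattern utils/sdg_classifier.py uses further down; a minimal sketch:

```python
# Sketch: reading the updated [sdg] section of paramconfig.cfg with the
# standard-library configparser; comments show the values set in this commit.
import configparser

config = configparser.ConfigParser()
config.read('paramconfig.cfg')

split_length = int(config.get('sdg', 'SPLIT_LENGTH'))    # 120
split_overlap = int(config.get('sdg', 'SPLIT_OVERLAP'))  # 10
respect_boundary = bool(int(config.get('sdg', 'RESPECT_SENTENCE_BOUNDARY')))  # True
```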
utils/preprocessing.py CHANGED
@@ -9,10 +9,6 @@ import logging
 import re
 import string
 from haystack.pipelines import Pipeline
-import configparser
-config = configparser.ConfigParser()
-config.read_file(open('paramconfig.cfg'))
-top_k = int(config.get('lexical_search','TOP_K'))

 def useOCR(file_path: str)-> Text:
     """
@@ -167,11 +163,10 @@ class UdfPreProcessor(BaseComponent):

     """
     outgoing_edges = 1
-    # split_overlap_word = int(config.get('preprocessor','SPLIT_OVERLAP_WORD'))
-    # split_overlap_sentence = int(config.get('preprocessor','SPLIT_OVERLAP_SENTENCE'))

     def run(self, documents:List[Document], removePunc:bool,
             split_by: Literal["sentence", "word"] = 'sentence',
+            split_respect_sentence_boundary = False,
             split_length:int = 2, split_overlap = 0):

         """ this is required method to invoke the component in
@@ -198,11 +193,9 @@ class UdfPreProcessor(BaseComponent):

         if split_by == 'sentence':
             split_respect_sentence_boundary = False
-            # split_overlap=self.split_overlap_sentence

         else:
-            split_respect_sentence_boundary = True
-            # split_overlap= self.split_overlap_word
+            split_respect_sentence_boundary = split_respect_sentence_boundary

         preprocessor = PreProcessor(
             clean_empty_lines=True,
@@ -218,6 +211,8 @@ class UdfPreProcessor(BaseComponent):
         )

         for i in documents:
+            # # basic cleaning before passing it to preprocessor.
+            # i = basic(i)
             docs_processed = preprocessor.process([i])
             for item in docs_processed:
                 item.content = basic(item.content, removePunc= removePunc)
@@ -243,7 +238,7 @@ class UdfPreProcessor(BaseComponent):
 def processingpipeline():
     """
     Returns the preprocessing pipeline. Will use FileConverter and UdfPreProcesor
-    from utils.
+    from utils.preprocessing

     """

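To see where the new flag ends up, here is a minimal sketch of the Haystack 1.x PreProcessor call that UdfPreProcessor wraps (the 1.x API is suggested by the repo's `haystack.pipelines` import). The split settings mirror the updated paramconfig.cfg; the keyword names are Haystack's own, and the sample document is invented.

```python
# Sketch: the Haystack 1.x PreProcessor that UdfPreProcessor configures,
# with split settings mirroring the updated paramconfig.cfg values.
from haystack.nodes import PreProcessor
from haystack.schema import Document

preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    split_by="word",                       # SPLIT_BY = word
    split_length=120,                      # SPLIT_LENGTH = 120
    split_overlap=10,                      # SPLIT_OVERLAP = 10
    split_respect_sentence_boundary=True,  # RESPECT_SENTENCE_BOUNDARY = 1
)

# Paragraphs may slightly exceed 120 words, since sentences are kept whole.
docs = preprocessor.process([Document(content="Some long policy text ...")])
```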
utils/sdg_classifier.py CHANGED
@@ -106,7 +106,7 @@ def runSDGPreprocessingPipeline(filePath, fileName)->List[Document]:
     split_length = int(config.get('sdg','SPLIT_LENGTH'))
     split_overlap = int(config.get('sdg','SPLIT_OVERLAP'))
     remove_punc = bool(int(config.get('sdg','REMOVE_PUNC')))
-
+    split_respect_sentence_boundary = bool(int(config.get('sdg','RESPECT_SENTENCE_BOUNDARY')))

     output_sdg_pre = sdg_processing_pipeline.run(file_paths = filePath,
                 params= {"FileConverter": {"file_path": filePath, \
@@ -114,6 +114,7 @@ def runSDGPreprocessingPipeline(filePath, fileName)->List[Document]:
                 "UdfPreProcessor": {"removePunc": remove_punc, \
                 "split_by": split_by, \
                 "split_length":split_length,\
-                "split_overlap": split_overlap}})
+                "split_overlap": split_overlap, \
+                "split_respect_sentence_boundary":split_respect_sentence_boundary}})

     return output_sdg_pre
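Put together, invoking the updated pipeline could look like the following sketch. The function signature comes from the diff above; the example file path and the 'documents' key on the output (read the same way in appStore/sdg_analysis.py) are assumptions about the surrounding repo.

```python
# Sketch: invoking the updated preprocessing pipeline end to end.
# The signature comes from the diff above; the PDF path is illustrative.
from utils.sdg_classifier import runSDGPreprocessingPipeline

output = runSDGPreprocessingPipeline(filePath='docStore/sample/policy.pdf',
                                     fileName='policy.pdf')
paragraphs = output['documents']  # same key appStore/sdg_analysis.py reads
print(len(paragraphs), 'paragraphs after cleaning and splitting')
```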
utils/uploadAndExample.py CHANGED
@@ -8,7 +8,6 @@ def add_upload(choice):
     the 'file' to streamlit session_state which then can be fetched later.

     """
-

     if choice == 'Upload Document':
         uploaded_file = st.sidebar.file_uploader('Upload the File',
@@ -21,7 +20,6 @@ def add_upload(choice):
         st.session_state['filepath'] = temp.name


-
     else:
         # listing the options
         option = st.sidebar.selectbox('Select the example document',