Spaces: GIZ

prashant committed
Commit 685552c
1 Parent(s): 9119fa1

update sdg
appStore/sdg_analysis.py CHANGED
@@ -2,9 +2,6 @@
 import glob, os, sys;
 sys.path.append('../utils')
 
-#import helper
-
-
 #import needed libraries
 import seaborn as sns
 import matplotlib.pyplot as plt
@@ -16,9 +13,6 @@ from docx.shared import Pt
 from docx.enum.style import WD_STYLE_TYPE
 from utils.sdg_classifier import sdg_classification
 from utils.sdg_classifier import runSDGPreprocessingPipeline
-# from utils.streamlitcheck import check_streamlit
-import tempfile
-import sqlite3
 import logging
 logger = logging.getLogger(__name__)
 
@@ -47,15 +41,16 @@ def app():
 
 
     if 'filepath' in st.session_state:
-        paraList = runSDGPreprocessingPipeline()
-        if len(paraList) > 150:
+        allDocuments = runSDGPreprocessingPipeline(st.session_state['filepath'],
+                                                   st.session_state['filename'])
+        if len(allDocuments['documents']) > 100:
             warning_msg = ": This might take sometime, please sit back and relax."
         else:
             warning_msg = ""
 
         with st.spinner("Running SDG Classification{}".format(warning_msg)):
 
-            df, x = sdg_classification(paraList)
+            df, x = sdg_classification(allDocuments['documents'])
 
             plt.rcParams['font.size'] = 25
             colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
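
For reference, a minimal sketch of how the updated call site behaves (names taken from the diff; the wrapper function and the upload step that fills session state are assumptions, not part of this commit):

    import streamlit as st
    from utils.sdg_classifier import runSDGPreprocessingPipeline, sdg_classification

    def classify_current_upload():
        # assumes an earlier step stored 'filepath' and 'filename' in session state
        if 'filepath' in st.session_state:
            allDocuments = runSDGPreprocessingPipeline(st.session_state['filepath'],
                                                       st.session_state['filename'])
            # the pipeline now returns the full output dict; paragraphs sit under 'documents'
            if len(allDocuments['documents']) > 100:
                st.info("This might take some time, please sit back and relax.")
            df, x = sdg_classification(allDocuments['documents'])
            return df, x
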
paramconfig.cfg CHANGED
@@ -22,6 +22,7 @@ SPLIT_OVERLAP = 0
 THRESHOLD = 0.85
 MODEL = jonas/sdg_classifier_osdg
 SPLIT_BY = word
+REMOVE_PUNC = 0
 SPLIT_LENGTH = 110
 SPLIT_OVERLAP = 10
 
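The new REMOVE_PUNC option is stored as 0/1 rather than as a Python boolean, since configparser returns strings. A small sketch of how it is parsed (mirroring the change in utils/sdg_classifier.py below):

    import configparser

    config = configparser.ConfigParser()
    config.read_file(open('paramconfig.cfg'))  # assumes the file sits next to the app

    # "0" -> False, "1" -> True; cast through int because the raw value is a string
    remove_punc = bool(int(config.get('sdg', 'REMOVE_PUNC')))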
 
utils/sdg_classifier.py CHANGED
@@ -2,21 +2,28 @@ from haystack.nodes import TransformersDocumentClassifier
 from haystack.schema import Document
 from typing import List, Tuple
 import configparser
-import streamlit as st
-from utils.streamlitcheck import check_streamlit
-from pandas import DataFrame, Series
 import logging
+from pandas import DataFrame, Series
 from utils.preprocessing import processingpipeline
+try:
+    import streamlit as st
+except ImportError:
+    logging.info("Streamlit not installed")
 config = configparser.ConfigParser()
-config.read_file(open('paramconfig.cfg'))
+try:
+    config.read_file(open('paramconfig.cfg'))
+except Exception:
+    logging.info("paramconfig file not found")
+    st.info("Please place the paramconfig file in the same directory as app.py")
 
 
+@st.cache
 def load_sdgClassifier():
     """
     loads the document classifier using haystack, where the name/path of model
     in HF-hub as string is used to fetch the model object.
-    1. https://docs.haystack.deepset.ai/reference/document-classifier-api
-    2. https://docs.haystack.deepset.ai/docs/document_classifier
+    1. https://docs.haystack.deepset.ai/reference/document-classifier-api
+    2. https://docs.haystack.deepset.ai/docs/document_classifier
 
     Return: document classifier model
     """
@@ -28,6 +35,8 @@ def load_sdgClassifier():
     return doc_classifier
 
 
+
+@st.cache
 def sdg_classification(haystackdoc:List[Document])->Tuple[DataFrame,Series]:
     """
     Text-Classification on the list of texts provided. Classifier provides the
@@ -50,16 +59,13 @@ def sdg_classification(haystackdoc:List[Document])->Tuple[DataFrame,Series]:
     logging.info("running SDG classifiication")
     threshold = float(config.get('sdg','THRESHOLD'))
 
-    if check_streamlit():
-        st.write("caching model")
-        classifier = st.cache(load_sdgClassifier, allow_output_mutation=True)
-    else:
-        classifier = load_sdgClassifier()
+
+    classifier = load_sdgClassifier()
     results = classifier.predict(haystackdoc)
 
 
     labels_= [(l.meta['classification']['label'],
-               l.meta['classification']['score'],l.content,) for l in results]
+               l.meta['classification']['score'],l.content,) for l in results]
 
     df = DataFrame(labels_, columns=["SDG","Relevancy","text"])
 
@@ -72,7 +78,7 @@ def sdg_classification(haystackdoc:List[Document])->Tuple[DataFrame,Series]:
 
     return df, x
 
-def runSDGPreprocessingPipeline(file_path = None, file_name = None)->List[Document]:
+def runSDGPreprocessingPipeline(file_path, file_name)->List[Document]:
     """
     creates the pipeline and runs the preprocessing pipeline,
     the params for pipeline are fetched from paramconfig
@@ -80,12 +86,12 @@ def runSDGPreprocessingPipeline(file_path = None, file_name = None)->List[Document]:
     Param
     ------------
 
-    file_path: filepath, if not given will check for file_path in streamlit
-    session_state, else will return
+    file_name: filename, in case of streamlit application use
+    st.session_state['filename']
+    file_path: filepath, in case of streamlit application use
+    st.session_state['filepath']
+
 
-    file_name: filename, if not given will check for file_name in streamlit
-    session_state
-
     Return
     --------------
     List[Document]: When preprocessing pipeline is run, the output dictionary
@@ -94,21 +100,20 @@ def runSDGPreprocessingPipeline(file_path = None, file_name = None)->List[Document]:
     key = 'documents' on output.
 
     """
-    # if file_path:
-    file_path = st.session_state['filepath']
-    file_name = st.session_state['filename']
+
    sdg_processing_pipeline = processingpipeline()
    split_by = config.get('sdg','SPLIT_BY')
    split_length = int(config.get('sdg','SPLIT_LENGTH'))
    split_overlap = int(config.get('sdg','SPLIT_OVERLAP'))
+    remove_punc = bool(int(config.get('sdg','REMOVE_PUNC')))
 
 
     output_sdg_pre = sdg_processing_pipeline.run(file_paths = file_path,
                             params= {"FileConverter": {"file_path": file_path, \
                                         "file_name": file_name},
-                                     "UdfPreProcessor": {"removePunc": False, \
+                                     "UdfPreProcessor": {"removePunc": remove_punc, \
                                         "split_by": split_by, \
                                         "split_length":split_length,\
                                         "split_overlap": split_overlap}})
 
-    return output_sdg_pre['documents']
+    return output_sdg_pre
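
Since the module no longer reads the file path from Streamlit session state, it can also be driven from a plain script. A rough usage sketch (the input path is hypothetical; the return value is the whole pipeline output dict, with the split paragraphs under the 'documents' key):

    from utils.sdg_classifier import runSDGPreprocessingPipeline, sdg_classification

    output = runSDGPreprocessingPipeline(file_path='sample_policy.pdf',
                                         file_name='sample_policy.pdf')
    docs = output['documents']

    # df holds (SDG, Relevancy, text) rows; x is the aggregated Series used for plotting
    df, x = sdg_classification(docs)
    print(df.head())
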
utils/streamlitcheck.py DELETED
@@ -1,19 +0,0 @@
-def check_streamlit():
-    """
-    Function to check whether python code is run within streamlit
-
-    Returns
-    -------
-    use_streamlit : boolean
-        True if code is run within streamlit, else False
-    """
-    try:
-        from streamlit.scriptrunner.script_run_context import get_script_run_ctx
-        if not get_script_run_ctx():
-            use_streamlit = False
-        else:
-            use_streamlit = True
-    except ModuleNotFoundError:
-        use_streamlit = False
-    return use_streamlit
-
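
The deleted helper detected whether code was running inside Streamlit; after this commit that role is covered by the guarded import added to utils/sdg_classifier.py. A minimal sketch of the replacement pattern:

    import logging

    try:
        import streamlit as st
    except ImportError:
        # outside a Streamlit app the import simply logs and the name st stays undefined
        logging.info("Streamlit not installed")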