Spaces:

GIZ
/

cpu_tracs

Sleeping

App Files Community

ppsingh committed on Jul 18, 2023

Commit

44da430

•

1 Parent(s): 5ac96cb

target code refactor

Browse files

Files changed (8) hide show

appStore/adapmit.py +21 -23
appStore/ghg.py +27 -28
appStore/netzero.py +29 -29
appStore/sector.py +45 -46
utils/adapmit_classifier.py +7 -9
utils/ghg_classifier.py +6 -5
utils/netzero_classifier.py +5 -5
utils/sector_classifier.py +7 -6

appStore/adapmit.py CHANGED Viewed

@@ -22,32 +22,10 @@ import plotly.express as px
 classifier_identifier = 'adapmit'
 params  = get_classifier_params(classifier_identifier)
-@st.cache_data
-def to_excel(df):
-    len_df = len(df)
-    output = BytesIO()
-    writer = pd.ExcelWriter(output, engine='xlsxwriter')
-    df.to_excel(writer, index=False, sheet_name='Sheet1')
-    workbook = writer.book
-    worksheet = writer.sheets['Sheet1']
-    worksheet.data_validation('E2:E{}'.format(len_df),
-                              {'validate': 'list',
-                               'source': ['No', 'Yes', 'Discard']})
-    worksheet.data_validation('F2:F{}'.format(len_df),
-                              {'validate': 'list',
-                               'source': ['No', 'Yes', 'Discard']})
-    worksheet.data_validation('G2:G{}'.format(len_df),
-                              {'validate': 'list',
-                               'source': ['No', 'Yes', 'Discard']})
-    writer.save()
-    processed_data = output.getvalue()
-    return processed_data
 def app():
     ### Main app code ###
-    with st.container():
         if 'key1' in st.session_state:
             df = st.session_state.key1
@@ -63,6 +41,26 @@ def app():
             st.session_state.key1 = df

 classifier_identifier = 'adapmit'
 params  = get_classifier_params(classifier_identifier)
 def app():
     ### Main app code ###
+    with st.container():
         if 'key1' in st.session_state:
             df = st.session_state.key1
             st.session_state.key1 = df
+# @st.cache_data
+# def to_excel(df):
+#     len_df = len(df)
+#     output = BytesIO()
+#     writer = pd.ExcelWriter(output, engine='xlsxwriter')
+#     df.to_excel(writer, index=False, sheet_name='Sheet1')
+#     workbook = writer.book
+#     worksheet = writer.sheets['Sheet1']
+#     worksheet.data_validation('E2:E{}'.format(len_df),
+#                               {'validate': 'list',
+#                                'source': ['No', 'Yes', 'Discard']})
+#     worksheet.data_validation('F2:F{}'.format(len_df),
+#                               {'validate': 'list',
+#                                'source': ['No', 'Yes', 'Discard']})
+#     worksheet.data_validation('G2:G{}'.format(len_df),
+#                               {'validate': 'list',
+#                                'source': ['No', 'Yes', 'Discard']})
+#     writer.save()
+#     processed_data = output.getvalue()
+#     return processed_data

appStore/ghg.py CHANGED Viewed

@@ -29,41 +29,40 @@ _lab_dict = {
             }
-@st.cache_data
-def to_excel(df):
-    len_df = len(df)
-    output = BytesIO()
-    writer = pd.ExcelWriter(output, engine='xlsxwriter')
-    df.to_excel(writer, index=False, sheet_name='Sheet1')
-    workbook = writer.book
-    worksheet = writer.sheets['Sheet1']
-    worksheet.data_validation('E2:E{}'.format(len_df),
-                              {'validate': 'list',
-                               'source': ['No', 'Yes', 'Discard']})
-    writer.save()
-    processed_data = output.getvalue()
-    return processed_data
 def app():
     ### Main app code ###
     with st.container():
-            if 'key1' in st.session_state:
-                df = st.session_state.key1
-                # Load the classifier model
-                classifier = load_ghgClassifier(classifier_name=params['model_name'])
-                st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
-                if sum(df['Target Label'] == 'TARGET') > 100:
-                    warning_msg = ": This might take sometime, please sit back and relax."
-                else:
-                    warning_msg = ""
-                df = ghg_classification(haystack_doc=df,
-                                            threshold= params['threshold'])
-                st.session_state.key1 = df
 # def netzero_display():
 #   if 'key1' in st.session_state:
 #       df = st.session_state.key2

             }
 def app():
     ### Main app code ###
     with st.container():
+        if 'key1' in st.session_state:
+            df = st.session_state.key1
+            # Load the classifier model
+            classifier = load_ghgClassifier(classifier_name=params['model_name'])
+            st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
+            if sum(df['Target Label'] == 'TARGET') > 100:
+                warning_msg = ": This might take sometime, please sit back and relax."
+            else:
+                warning_msg = ""
+            df = ghg_classification(haystack_doc=df,
+                                        threshold= params['threshold'])
+            st.session_state.key1 = df
+# @st.cache_data
+# def to_excel(df):
+#     len_df = len(df)
+#     output = BytesIO()
+#     writer = pd.ExcelWriter(output, engine='xlsxwriter')
+#     df.to_excel(writer, index=False, sheet_name='Sheet1')
+#     workbook = writer.book
+#     worksheet = writer.sheets['Sheet1']
+#     worksheet.data_validation('E2:E{}'.format(len_df),
+#                               {'validate': 'list',
+#                                'source': ['No', 'Yes', 'Discard']})
+#     writer.save()
+#     processed_data = output.getvalue()
+    # return processed_data
 # def netzero_display():
 #   if 'key1' in st.session_state:
 #       df = st.session_state.key2

appStore/netzero.py CHANGED Viewed

@@ -28,41 +28,41 @@ _lab_dict = {
             'NETZERO':'NETZERO TARGET',
             }
-@st.cache_data
-def to_excel(df):
-    len_df = len(df)
-    output = BytesIO()
-    writer = pd.ExcelWriter(output, engine='xlsxwriter')
-    df.to_excel(writer, index=False, sheet_name='Sheet1')
-    workbook = writer.book
-    worksheet = writer.sheets['Sheet1']
-    worksheet.data_validation('E2:E{}'.format(len_df),
-                              {'validate': 'list',
-                               'source': ['No', 'Yes', 'Discard']})
-    writer.save()
-    processed_data = output.getvalue()
-    return processed_data
 def app():
     ### Main app code ###
     with st.container():
-            if 'key1' in st.session_state:
-                df = st.session_state.key1
-                # Load the classifier model
-                classifier = load_netzeroClassifier(classifier_name=params['model_name'])
-                st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
-                if sum(df['Target Label'] == 'TARGET') > 100:
-                    warning_msg = ": This might take sometime, please sit back and relax."
-                else:
-                    warning_msg = ""
-                df = netzero_classification(haystack_doc=df,
-                                            threshold= params['threshold'])
-                st.session_state.key1 = df
 # def netzero_display():
 #   if 'key1' in st.session_state:

             'NETZERO':'NETZERO TARGET',
             }
 def app():
     ### Main app code ###
     with st.container():
+        if 'key1' in st.session_state:
+            df = st.session_state.key1
+            # Load the classifier model
+            classifier = load_netzeroClassifier(classifier_name=params['model_name'])
+            st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
+            if sum(df['Target Label'] == 'TARGET') > 100:
+                warning_msg = ": This might take sometime, please sit back and relax."
+            else:
+                warning_msg = ""
+            df = netzero_classification(haystack_doc=df,
+                                        threshold= params['threshold'])
+            st.session_state.key1 = df
+# @st.cache_data
+# def to_excel(df):
+#     len_df = len(df)
+#     output = BytesIO()
+#     writer = pd.ExcelWriter(output, engine='xlsxwriter')
+#     df.to_excel(writer, index=False, sheet_name='Sheet1')
+#     workbook = writer.book
+#     worksheet = writer.sheets['Sheet1']
+#     worksheet.data_validation('E2:E{}'.format(len_df),
+#                               {'validate': 'list',
+#                                'source': ['No', 'Yes', 'Discard']})
+#     writer.save()
+#     processed_data = output.getvalue()
+#     return processed_data
 # def netzero_display():
 #   if 'key1' in st.session_state:

appStore/sector.py CHANGED Viewed

@@ -22,56 +22,55 @@ import plotly.express as px
 classifier_identifier = 'sector'
 params  = get_classifier_params(classifier_identifier)
-@st.cache_data
-def to_excel(df,sectorlist):
-    len_df = len(df)
-    output = BytesIO()
-    writer = pd.ExcelWriter(output, engine='xlsxwriter')
-    df.to_excel(writer, index=False, sheet_name='Sheet1')
-    workbook = writer.book
-    worksheet = writer.sheets['Sheet1']
-    worksheet.data_validation('S2:S{}'.format(len_df),
-                              {'validate': 'list',
-                               'source': ['No', 'Yes', 'Discard']})
-    worksheet.data_validation('X2:X{}'.format(len_df),
-                              {'validate': 'list',
-                               'source': sectorlist + ['Blank']})
-    worksheet.data_validation('T2:T{}'.format(len_df),
-                              {'validate': 'list',
-                               'source': sectorlist + ['Blank']})
-    worksheet.data_validation('U2:U{}'.format(len_df),
-                              {'validate': 'list',
-                               'source': sectorlist + ['Blank']})
-    worksheet.data_validation('V2:V{}'.format(len_df),
-                              {'validate': 'list',
-                               'source': sectorlist + ['Blank']})
-    worksheet.data_validation('W2:U{}'.format(len_df),
-                              {'validate': 'list',
-                               'source': sectorlist + ['Blank']})
-    writer.save()
-    processed_data = output.getvalue()
-    return processed_data
 def app():
     ### Main app code ###
     with st.container():
-            if 'key1' in st.session_state:
-                df = st.session_state.key1
-                classifier = load_sectorClassifier(classifier_name=params['model_name'])
-                st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
-                if sum(df['Target Label'] == 'TARGET') > 100:
-                    warning_msg = ": This might take sometime, please sit back and relax."
-                else:
-                    warning_msg = ""
-                df = sector_classification(haystack_doc=df,
-                                            threshold= params['threshold'])
-                st.session_state.key1 = df
                 # # st.write(df)
                 # threshold= params['threshold']

 classifier_identifier = 'sector'
 params  = get_classifier_params(classifier_identifier)
 def app():
     ### Main app code ###
     with st.container():
+        if 'key1' in st.session_state:
+            df = st.session_state.key1
+            classifier = load_sectorClassifier(classifier_name=params['model_name'])
+            st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
+            if sum(df['Target Label'] == 'TARGET') > 100:
+                warning_msg = ": This might take sometime, please sit back and relax."
+            else:
+                warning_msg = ""
+            df = sector_classification(haystack_doc=df,
+                                        threshold= params['threshold'])
+            st.session_state.key1 = df
+# @st.cache_data
+# def to_excel(df,sectorlist):
+#     len_df = len(df)
+#     output = BytesIO()
+#     writer = pd.ExcelWriter(output, engine='xlsxwriter')
+#     df.to_excel(writer, index=False, sheet_name='Sheet1')
+#     workbook = writer.book
+#     worksheet = writer.sheets['Sheet1']
+#     worksheet.data_validation('S2:S{}'.format(len_df),
+#                               {'validate': 'list',
+#                                'source': ['No', 'Yes', 'Discard']})
+#     worksheet.data_validation('X2:X{}'.format(len_df),
+#                               {'validate': 'list',
+#                                'source': sectorlist + ['Blank']})
+#     worksheet.data_validation('T2:T{}'.format(len_df),
+#                               {'validate': 'list',
+#                                'source': sectorlist + ['Blank']})
+#     worksheet.data_validation('U2:U{}'.format(len_df),
+#                               {'validate': 'list',
+#                                'source': sectorlist + ['Blank']})
+#     worksheet.data_validation('V2:V{}'.format(len_df),
+#                               {'validate': 'list',
+#                                'source': sectorlist + ['Blank']})
+#     worksheet.data_validation('W2:U{}'.format(len_df),
+#                               {'validate': 'list',
+#                                'source': sectorlist + ['Blank']})
+#     writer.save()
+#     processed_data = output.getvalue()
+#     return processed_data
                 # # st.write(df)
                 # threshold= params['threshold']

utils/adapmit_classifier.py CHANGED Viewed

@@ -35,9 +35,7 @@ def load_adapmitClassifier(config_file:str = None, classifier_name:str = None):
     doc_classifier = pipeline("text-classification",
                             model=classifier_name,
                             return_all_scores=True,
-                            function_to_apply= "sigmoid")
     return doc_classifier
@@ -61,14 +59,10 @@ def adapmit_classification(haystack_doc:pd.DataFrame,
     In case of streamlit avoid passing the model directly.
     Returns
     ----------
-    df: Dataframe with two columns['SDG:int', 'text']
-    x: Series object with the unique SDG covered in the document uploaded and
-    the number of times it is covered/discussed/count_of_paragraphs.
     """
     logging.info("Working on Adaptation-Mitigation Identification")
     haystack_doc['Adapt-Mitig Label'] = 'NA'
-   # df1 = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
-   # df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
     if not classifier_model:
         classifier_model = st.session_state['adapmit_classifier']
@@ -86,14 +80,18 @@ def adapmit_classification(haystack_doc:pd.DataFrame,
     labels_ = [{**list_[l]} for l in range(len(predictions))]
     truth_df = DataFrame.from_dict(labels_)
     truth_df = truth_df.round(2)
     truth_df = truth_df.astype(float) >= threshold
     truth_df = truth_df.astype(str)
     categories = list(truth_df.columns)
     truth_df['Adapt-Mitig Label'] = truth_df.apply(lambda x: {i if x[i]=='True'
                                         else None for i in categories}, axis=1)
     truth_df['Adapt-Mitig Label'] = truth_df.apply(lambda x:
                                   list(x['Adapt-Mitig Label'] -{None}),axis=1)
     haystack_doc['Adapt-Mitig Label'] = list(truth_df['Adapt-Mitig Label'])
-    #df = pd.concat([df,df1])
     return haystack_doc

     doc_classifier = pipeline("text-classification",
                             model=classifier_name,
                             return_all_scores=True,
+                            function_to_apply= "sigmoid")
     return doc_classifier
     In case of streamlit avoid passing the model directly.
     Returns
     ----------
+    df: Dataframe
     """
     logging.info("Working on Adaptation-Mitigation Identification")
     haystack_doc['Adapt-Mitig Label'] = 'NA'
     if not classifier_model:
         classifier_model = st.session_state['adapmit_classifier']
     labels_ = [{**list_[l]} for l in range(len(predictions))]
     truth_df = DataFrame.from_dict(labels_)
     truth_df = truth_df.round(2)
+    # convert the labels score into boolean based on threshold value
     truth_df = truth_df.astype(float) >= threshold
     truth_df = truth_df.astype(str)
+    # list of labels
     categories = list(truth_df.columns)
+    # collecting the labels, None is passed to overcome comprehension syntax
     truth_df['Adapt-Mitig Label'] = truth_df.apply(lambda x: {i if x[i]=='True'
                                         else None for i in categories}, axis=1)
     truth_df['Adapt-Mitig Label'] = truth_df.apply(lambda x:
                                   list(x['Adapt-Mitig Label'] -{None}),axis=1)
+    # adding Adaptation-Mitigation label
     haystack_doc['Adapt-Mitig Label'] = list(truth_df['Adapt-Mitig Label'])
     return haystack_doc

utils/ghg_classifier.py CHANGED Viewed

@@ -55,8 +55,8 @@ def ghg_classification(haystack_doc:pd.DataFrame,
                         )->Tuple[DataFrame,Series]:
     """
     Text-Classification on the list of texts provided. Classifier provides the
-    most appropriate label for each text. these labels are in terms of if text
-    belongs to which particular Sustainable Development Goal (SDG).
     Params
     ---------
     haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
@@ -68,13 +68,12 @@ def ghg_classification(haystack_doc:pd.DataFrame,
     In case of streamlit avoid passing the model directly.
     Returns
     ----------
-    df: Dataframe with two columns['SDG:int', 'text']
-    x: Series object with the unique SDG covered in the document uploaded and
-    the number of times it is covered/discussed/count_of_paragraphs.
     """
     logging.info("Working on GHG Extraction")
     haystack_doc['GHG Label'] = 'NA'
     haystack_doc['GHG Score'] = 'NA'
     temp = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
     temp = temp.reset_index(drop=True)
     df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
@@ -86,6 +85,8 @@ def ghg_classification(haystack_doc:pd.DataFrame,
     results = classifier_model(list(temp.text))
     labels_= [(l[0]['label'],l[0]['score']) for l in results]
     temp['GHG Label'],temp['GHG Score'] = zip(*labels_)
     df = pd.concat([df,temp])
     df['GHG Label'] = df['GHG Label'].apply(lambda i: _lab_dict[i])
     df = df.reset_index(drop =True)

                         )->Tuple[DataFrame,Series]:
     """
     Text-Classification on the list of texts provided. Classifier provides the
+    most appropriate label for each text. It identifies if text contains 'GHG'
+    related information or not.
     Params
     ---------
     haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
     In case of streamlit avoid passing the model directly.
     Returns
     ----------
+    df: Dataframe
     """
     logging.info("Working on GHG Extraction")
     haystack_doc['GHG Label'] = 'NA'
     haystack_doc['GHG Score'] = 'NA'
+    # applying GHG Identifier to only 'Target' paragraphs.
     temp = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
     temp = temp.reset_index(drop=True)
     df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
     results = classifier_model(list(temp.text))
     labels_= [(l[0]['label'],l[0]['score']) for l in results]
     temp['GHG Label'],temp['GHG Score'] = zip(*labels_)
+    # merge back Target and non-Target dataframe
     df = pd.concat([df,temp])
     df['GHG Label'] = df['GHG Label'].apply(lambda i: _lab_dict[i])
     df = df.reset_index(drop =True)

utils/netzero_classifier.py CHANGED Viewed

@@ -52,8 +52,8 @@ def netzero_classification(haystack_doc:pd.DataFrame,
                         )->Tuple[DataFrame,Series]:
     """
     Text-Classification on the list of texts provided. Classifier provides the
-    most appropriate label for each text. these labels are in terms of if text
-    belongs to which particular Sustainable Development Goal (SDG).
     Params
     ---------
     haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
@@ -65,13 +65,12 @@ def netzero_classification(haystack_doc:pd.DataFrame,
     In case of streamlit avoid passing the model directly.
     Returns
     ----------
-    df: Dataframe with two columns['SDG:int', 'text']
-    x: Series object with the unique SDG covered in the document uploaded and
-    the number of times it is covered/discussed/count_of_paragraphs.
     """
     logging.info("Working on Netzero Extraction")
     haystack_doc['Netzero Label'] = 'NA'
     haystack_doc['Netzero Score'] = 'NA'
     temp = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
     temp = temp.reset_index(drop=True)
     df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
@@ -83,6 +82,7 @@ def netzero_classification(haystack_doc:pd.DataFrame,
     results = classifier_model(list(temp.text))
     labels_= [(l[0]['label'],l[0]['score']) for l in results]
     temp['Netzero Label'],temp['Netzero Score'] = zip(*labels_)
     df = pd.concat([df,temp])
     df = df.reset_index(drop =True)
     df.index += 1

                         )->Tuple[DataFrame,Series]:
     """
     Text-Classification on the list of texts provided. Classifier provides the
+    most appropriate label for each text. It informs if paragraph contains any
+    netzero information or not.
     Params
     ---------
     haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
     In case of streamlit avoid passing the model directly.
     Returns
     ----------
+    df: Dataframe
     """
     logging.info("Working on Netzero Extraction")
     haystack_doc['Netzero Label'] = 'NA'
     haystack_doc['Netzero Score'] = 'NA'
+    # we apply Netzero to only paragraphs which are classified as 'Target' related
     temp = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
     temp = temp.reset_index(drop=True)
     df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
     results = classifier_model(list(temp.text))
     labels_= [(l[0]['label'],l[0]['score']) for l in results]
     temp['Netzero Label'],temp['Netzero Score'] = zip(*labels_)
+    # merging Target with Non Target dataframe
     df = pd.concat([df,temp])
     df = df.reset_index(drop =True)
     df.index += 1

utils/sector_classifier.py CHANGED Viewed

@@ -70,19 +70,16 @@ def sector_classification(haystack_doc:pd.DataFrame,
     In case of streamlit avoid passing the model directly.
     Returns
     ----------
-    df: Dataframe with two columns['SDG:int', 'text']
-    x: Series object with the unique SDG covered in the document uploaded and
-    the number of times it is covered/discussed/count_of_paragraphs.
     """
     logging.info("Working on Sector Identification")
     haystack_doc['Sector Label'] = 'NA'
- #   df1 = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
-  #  df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
     if not classifier_model:
         classifier_model = st.session_state['sector_classifier']
         predictions = classifier_model(list(haystack_doc.text))
     list_ = []
     for i in range(len(predictions)):
@@ -94,13 +91,17 @@ def sector_classification(haystack_doc:pd.DataFrame,
     labels_ = [{**list_[l]} for l in range(len(predictions))]
     truth_df = DataFrame.from_dict(labels_)
     truth_df = truth_df.round(2)
     truth_df = truth_df.astype(float) >= threshold
     truth_df = truth_df.astype(str)
     categories = list(truth_df.columns)
     truth_df['Sector Label'] = truth_df.apply(lambda x: {i if x[i]=='True' else
                                               None for i in categories}, axis=1)
     truth_df['Sector Label'] = truth_df.apply(lambda x: list(x['Sector Label']
                                                             -{None}),axis=1)
     haystack_doc['Sector Label'] = list(truth_df['Sector Label'])
-  #  df = pd.concat([df,df1])
     return haystack_doc

     In case of streamlit avoid passing the model directly.
     Returns
     ----------
+    df: Dataframe
     """
     logging.info("Working on Sector Identification")
     haystack_doc['Sector Label'] = 'NA'
     if not classifier_model:
         classifier_model = st.session_state['sector_classifier']
         predictions = classifier_model(list(haystack_doc.text))
+    # getting the sector label and scores
     list_ = []
     for i in range(len(predictions)):
     labels_ = [{**list_[l]} for l in range(len(predictions))]
     truth_df = DataFrame.from_dict(labels_)
     truth_df = truth_df.round(2)
+    # based on threshold value, we convert each sector score into boolean
     truth_df = truth_df.astype(float) >= threshold
     truth_df = truth_df.astype(str)
+    # collecting list of Sector Labels
     categories = list(truth_df.columns)
+    # we collect the Sector Labels as a set; None is a placeholder for every
+    # category whose thresholded value is not 'True'.
     truth_df['Sector Label'] = truth_df.apply(lambda x: {i if x[i]=='True' else
                                               None for i in categories}, axis=1)
+    # we keep all Sector label except None
     truth_df['Sector Label'] = truth_df.apply(lambda x: list(x['Sector Label']
                                                             -{None}),axis=1)
     haystack_doc['Sector Label'] = list(truth_df['Sector Label'])
     return haystack_doc