Spaces:

GIZ
/

cpu_tracs

Sleeping

App Files Files Community

ppsingh committed on Jul 25, 2023

Commit

f3a3954

•

1 Parent(s): 338b9ad

add reader

Browse files

Files changed (4) hide show

app.py +2 -1
appStore/reader.py +89 -0
paramconfig.cfg +11 -1
utils/reader_qa.py +110 -0

app.py CHANGED Viewed

@@ -6,6 +6,7 @@ import appStore.ghg as ghg
 import appStore.policyaction as policyaction
 import appStore.conditional as conditional
 import appStore.indicator as indicator
 import appStore.doc_processing as processing
 from utils.uploadAndExample import add_upload
 import streamlit as st
@@ -88,7 +89,7 @@ with st.expander("ℹ️ - About this app", expanded=False):
     st.write("")
 apps = [processing.app, target_extraction.app, netzero.app, ghg.app,
-        policyaction.app, conditional.app, sector.app, adapmit.app,indicator.app]
  #conditional.app, sector.app]
         #adapmit.app]

 import appStore.policyaction as policyaction
 import appStore.conditional as conditional
 import appStore.indicator as indicator
+import appStore.reader as reader
 import appStore.doc_processing as processing
 from utils.uploadAndExample import add_upload
 import streamlit as st
     st.write("")
 apps = [processing.app, target_extraction.app, netzero.app, ghg.app,
+        policyaction.app, conditional.app, sector.app, adapmit.app,indicator.app, reader.app]
  #conditional.app, sector.app]
         #adapmit.app]

appStore/reader.py ADDED Viewed

	@@ -0,0 +1,89 @@

+# set path
+import glob, os, sys;
+sys.path.append('../utils')
+#import needed libraries
+import seaborn as sns
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import streamlit as st
+from utils.reader_qa import load_reader, reader_highlight
+import logging
+logger = logging.getLogger(__name__)
+from utils.config import get_classifier_params
+from io import BytesIO
+import xlsxwriter
+import plotly.express as px
+# Declare all the necessary variables
+classifier_identifier = 'reader'
+params  = get_classifier_params(classifier_identifier)
+def app():
+    ### Main app code ###
+    with st.container():
+        if 'key1' in st.session_state:
+            df = st.session_state.key1
+            # Load the classifier model
+            classifier = load_reader(classifier_name=params['model_name'])
+            st.session_state['{}_qa'.format(classifier_identifier)] = classifier
+            if sum(df['Target Label'] == 'TARGET') > 100:
+                warning_msg = ": This might take sometime, please sit back and relax."
+            else:
+                warning_msg = ""
+            reader_highlight(haystack_doc=df,
+                                        threshold= params['threshold'])
+            # st.session_state.key1 = df
+# @st.cache_data
+# def to_excel(df):
+#     len_df = len(df)
+#     output = BytesIO()
+#     writer = pd.ExcelWriter(output, engine='xlsxwriter')
+#     df.to_excel(writer, index=False, sheet_name='Sheet1')
+#     workbook = writer.book
+#     worksheet = writer.sheets['Sheet1']
+#     worksheet.data_validation('E2:E{}'.format(len_df),
+#                               {'validate': 'list',
+#                                'source': ['No', 'Yes', 'Discard']})
+#     writer.save()
+#     processed_data = output.getvalue()
+#     return processed_data
+# def netzero_display():
+#   if 'key1' in st.session_state:
+#       df = st.session_state.key2
+#       hits  = df[df['Netzero Label'] == 'NETZERO']
+#       range_val = min(5,len(hits))
+#       if range_val !=0:
+#           count_df = df['Netzero Label'].value_counts()
+#           count_df = count_df.rename('count')
+#           count_df = count_df.rename_axis('Netzero Label').reset_index()
+#           count_df['Label_def'] = count_df['Netzero Label'].apply(lambda x: _lab_dict[x])
+#           fig = px.bar(count_df, y="Label_def", x="count", orientation='h', height =200)
+#           c1, c2 = st.columns([1,1])
+#           with c1:
+#               st.plotly_chart(fig,use_container_width= True)
+#           hits = hits.sort_values(by=['Netzero Score'], ascending=False)
+#           st.write("")
+#           st.markdown("###### Top few NetZero Target Classified paragraph/text results ######")
+#           range_val = min(5,len(hits))
+#           for i in range(range_val):
+#               # the page number reflects the page that contains the main paragraph
+#               # according to split limit, the overlapping part can be on a separate page
+#               st.write('**Result {}** `page {}` (Relevancy Score: {:.2f})'.format(i+1,hits.iloc[i]['page'],hits.iloc[i]['Netzero Score']))
+#               st.write("\t Text: \t{}".format(hits.iloc[i]['text']))
+#       else:
+#           st.info("🤔 No Netzero target found")

paramconfig.cfg CHANGED Viewed

@@ -33,7 +33,7 @@ THRESHOLD = 0.50
 MODEL = ppsingh/mpnet-multilabel-sector-classifier
 SPLIT_BY = word
 REMOVE_PUNC = 0
-SPLIT_LENGTH = 60
 SPLIT_OVERLAP = 10
 RESPECT_SENTENCE_BOUNDARY = 1
 TOP_KEY = 10
@@ -86,4 +86,14 @@ REMOVE_PUNC = 0
 SPLIT_LENGTH = 80
 SPLIT_OVERLAP = 10
 RESPECT_SENTENCE_BOUNDARY = 1
 TOP_KEY = 10

 MODEL = ppsingh/mpnet-multilabel-sector-classifier
 SPLIT_BY = word
 REMOVE_PUNC = 0
+SPLIT_LENGTH = 80
 SPLIT_OVERLAP = 10
 RESPECT_SENTENCE_BOUNDARY = 1
 TOP_KEY = 10
 SPLIT_LENGTH = 80
 SPLIT_OVERLAP = 10
 RESPECT_SENTENCE_BOUNDARY = 1
+TOP_KEY = 10
+[reader]
+THRESHOLD = 0.50
+MODEL = ppsingh/roberta-finetuned-qa-policy_0.1
+SPLIT_BY = word
+REMOVE_PUNC = 0
+SPLIT_LENGTH = 80
+SPLIT_OVERLAP = 10
+RESPECT_SENTENCE_BOUNDARY = 1
 TOP_KEY = 10

utils/reader_qa.py ADDED Viewed

	@@ -0,0 +1,110 @@

+from typing import List, Tuple
+from typing_extensions import Literal
+import logging
+import pandas as pd
+from pandas import DataFrame, Series
+from utils.config import getconfig
+from utils.preprocessing import processingpipeline
+import streamlit as st
+from transformers import pipeline
+@st.cache_resource
+def load_reader(config_file:str = None, classifier_name:str = None):
+    """
+    loads the document classifier using haystack, where the name/path of model
+    in HF-hub as string is used to fetch the model object.Either configfile or
+    model should be passed.
+    1. https://docs.haystack.deepset.ai/reference/document-classifier-api
+    2. https://docs.haystack.deepset.ai/docs/document_classifier
+    Params
+    --------
+    config_file: config file path from which to read the model name
+    classifier_name: if modelname is passed, it takes a priority if not \
+    found then will look for configfile, else raise error.
+    Return: document classifier model
+    """
+    if not classifier_name:
+        if not config_file:
+            logging.warning("Pass either model name or config file")
+            return
+        else:
+            config = getconfig(config_file)
+            classifier_name = config.get('reader','MODEL')
+    logging.info("Loading Reader")
+    # we are using the pipeline as the model is multilabel and DocumentClassifier
+    # from Haystack doesnt support multilabel
+    # in pipeline we use 'sigmoid' to explicitly tell pipeline to make it multilabel
+    # if not then it will automatically use softmax, which is not a desired thing.
+    # doc_classifier = TransformersDocumentClassifier(
+    #                     model_name_or_path=classifier_name,
+    #                     task="text-classification",
+    #                     top_k = None)
+    qa_model = pipeline("question-answering", model=classifier_name )
+    return qa_model
+@st.cache_data
+def reader_highlight(haystack_doc:pd.DataFrame,
+                        threshold:float = 0.5,
+                        classifier_model:pipeline= None
+                        )->Tuple[DataFrame,Series]:
+    """
+    Text-Classification on the list of texts provided. Classifier provides the
+    most appropriate label for each text. these labels are in terms of if text
+    belongs to which particular Sustainable Devleopment Goal (SDG).
+    Params
+    ---------
+    haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
+    contains the list of paragraphs in different format,here the list of
+    Haystack Documents is used.
+    threshold: threshold value for the model to keep the results from classifier
+    classifiermodel: you can pass the classifier model directly,which takes priority
+    however if not then looks for model in streamlit session.
+    In case of streamlit avoid passing the model directly.
+    Returns
+    ----------
+    df: Dataframe
+    """
+    logging.info("Working on Reader")
+    haystack_doc['Extracted Text'] = 'NA'
+    df1 = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
+    df1 = df1.reset_index(drop=True)
+    df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
+    df = df.reset_index(drop=True)
+    if not classifier_model:
+        reader_model = st.session_state['reader_qa']
+        ques_ = ['What Target/commitments have been made ?'] * len(df1)
+        predictions = reader_model(ques_, list(df1.text))
+    st.write(predictions)
+    # # getting the sector label and scores
+    # list_ = []
+    # for i in range(len(predictions)):
+    #   temp = predictions[i]
+    #   placeholder = {}
+    #   for j in range(len(temp)):
+    #     placeholder[temp[j]['label']] = temp[j]['score']
+    #   list_.append(placeholder)
+    # labels_ = [{**list_[l]} for l in range(len(predictions))]
+    # truth_df = DataFrame.from_dict(labels_)
+    # truth_df = truth_df.round(2)
+    # # based on threshold value, we convert each sector score into boolean
+    # truth_df = truth_df.astype(float) >= threshold
+    # truth_df = truth_df.astype(str)
+    # # collecting list of Sector Labels
+    # categories = list(truth_df.columns)
+    # # we collect the Sector Labels as set, None represent the value at the index
+    # # in the list of Sector Labels.
+    # truth_df['Sector Label'] = truth_df.apply(lambda x: {i if x[i]=='True' else
+    #                                           None for i in categories}, axis=1)
+    # # we keep all Sector label except None
+    # truth_df['Sector Label'] = truth_df.apply(lambda x: list(x['Sector Label']
+    #                                                         -{None}),axis=1)
+    # haystack_doc['Sector Label'] = list(truth_df['Sector Label'])
+    # return haystack_doc