Spaces:

GIZ
/

cpu_tracs

Sleeping

App Files Files Community

ppsingh committed on Jul 24, 2023

Commit

3353eb1

•

1 Parent(s): 7de8f90

add conditional

Browse files

Files changed (3) hide show

appStore/conditional.py +89 -0
paramconfig.cfg +10 -0
utils/conditional_classifier.py +92 -0

appStore/conditional.py ADDED Viewed

	@@ -0,0 +1,89 @@

+# set path
+import glob, os, sys;
+sys.path.append('../utils')
+#import needed libraries
+import seaborn as sns
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import streamlit as st
+from utils.netzero_classifier import load_netzeroClassifier, netzero_classification
+import logging
+logger = logging.getLogger(__name__)
+from utils.config import get_classifier_params
+from io import BytesIO
+import xlsxwriter
+import plotly.express as px
+# Declare all the necessary variables
+# classifier_identifier is passed to get_classifier_params below and is used
+# in app() to build the session-state key '<identifier>_classifier'.
+# NOTE(review): this file is appStore/conditional.py but the identifier is
+# 'netzero' -- confirm whether 'conditional' was intended.
+classifier_identifier = 'netzero'
+# params is read in app() through the keys 'model_name' and 'threshold'
+params  = get_classifier_params(classifier_identifier)
+def app():
+    ### Main app code ###
+    with st.container():
+        if 'key1' in st.session_state:
+            df = st.session_state.key1
+            # Load the classifier model
+            classifier = load_netzeroClassifier(classifier_name=params['model_name'])
+            st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
+            if sum(df['Target Label'] == 'TARGET') > 100:
+                warning_msg = ": This might take sometime, please sit back and relax."
+            else:
+                warning_msg = ""
+            df = netzero_classification(haystack_doc=df,
+                                        threshold= params['threshold'])
+            st.session_state.key1 = df
+# @st.cache_data
+# def to_excel(df):
+#     len_df = len(df)
+#     output = BytesIO()
+#     writer = pd.ExcelWriter(output, engine='xlsxwriter')
+#     df.to_excel(writer, index=False, sheet_name='Sheet1')
+#     workbook = writer.book
+#     worksheet = writer.sheets['Sheet1']
+#     worksheet.data_validation('E2:E{}'.format(len_df),
+#                               {'validate': 'list',
+#                                'source': ['No', 'Yes', 'Discard']})
+#     writer.save()
+#     processed_data = output.getvalue()
+#     return processed_data
+# def netzero_display():
+#   if 'key1' in st.session_state:
+#       df = st.session_state.key2
+#       hits  = df[df['Netzero Label'] == 'NETZERO']
+#       range_val = min(5,len(hits))
+#       if range_val !=0:
+#           count_df = df['Netzero Label'].value_counts()
+#           count_df = count_df.rename('count')
+#           count_df = count_df.rename_axis('Netzero Label').reset_index()
+#           count_df['Label_def'] = count_df['Netzero Label'].apply(lambda x: _lab_dict[x])
+#           fig = px.bar(count_df, y="Label_def", x="count", orientation='h', height =200)
+#           c1, c2 = st.columns([1,1])
+#           with c1:
+#               st.plotly_chart(fig,use_container_width= True)
+#           hits = hits.sort_values(by=['Netzero Score'], ascending=False)
+#           st.write("")
+#           st.markdown("###### Top few NetZero Target Classified paragraph/text results ######")
+#           range_val = min(5,len(hits))
+#           for i in range(range_val):
+#               # the page number reflects the page that contains the main paragraph
+#               # according to split limit, the overlapping part can be on a separate page
+#               st.write('**Result {}** `page {}` (Relevancy Score: {:.2f})'.format(i+1,hits.iloc[i]['page'],hits.iloc[i]['Netzero Score']))
+#               st.write("\t Text: \t{}".format(hits.iloc[i]['text']))
+#       else:
+#           st.info("🤔 No Netzero target found")

paramconfig.cfg CHANGED Viewed

@@ -76,4 +76,14 @@ REMOVE_PUNC = 0
 SPLIT_LENGTH = 80
 SPLIT_OVERLAP = 10
 RESPECT_SENTENCE_BOUNDARY = 1
 TOP_KEY = 10

 SPLIT_LENGTH = 80
 SPLIT_OVERLAP = 10
 RESPECT_SENTENCE_BOUNDARY = 1
+TOP_KEY = 10
+[conditional]
+THRESHOLD = 0.50
+MODEL = mtyrrell/mtyrrell/CPU_Conditional_Classifier
+SPLIT_BY = word
+REMOVE_PUNC = 0
+SPLIT_LENGTH = 80
+SPLIT_OVERLAP = 10
+RESPECT_SENTENCE_BOUNDARY = 1
 TOP_KEY = 10

utils/conditional_classifier.py ADDED Viewed

	@@ -0,0 +1,92 @@

+from typing import List, Tuple
+from typing_extensions import Literal
+import logging
+import pandas as pd
+from pandas import DataFrame, Series
+from utils.config import getconfig
+from utils.preprocessing import processingpipeline
+import streamlit as st
+from transformers import pipeline
+# Labels dictionary ###
+# Maps the raw label string returned by the classifier (key) to the label
+# written into the 'Netzero Label' column by netzero_classification (value).
+_lab_dict = {
+            'NEGATIVE':'NO NETZERO TARGET',
+            'NET-ZERO':'NETZERO TARGET',
+            'TARGET_FREE':'OTHERS'
+            }
+@st.cache_resource
+def load_netzeroClassifier(config_file:str = None, classifier_name:str = None):
+    """
+    loads the document classifier using haystack, where the name/path of model
+    in HF-hub as string is used to fetch the model object.Either configfile or
+    model should be passed.
+    1. https://docs.haystack.deepset.ai/reference/document-classifier-api
+    2. https://docs.haystack.deepset.ai/docs/document_classifier
+    Params
+    --------
+    config_file: config file path from which to read the model name
+    classifier_name: if modelname is passed, it takes a priority if not \
+    found then will look for configfile, else raise error.
+    Return: document classifier model
+    """
+    if not classifier_name:
+        if not config_file:
+            logging.warning("Pass either model name or config file")
+            return
+        else:
+            config = getconfig(config_file)
+            classifier_name = config.get('netzero','MODEL')
+    logging.info("Loading netzero classifier")
+    doc_classifier = pipeline("text-classification",
+                            model=classifier_name,
+                            top_k =1)
+    return doc_classifier
+@st.cache_data
+def netzero_classification(haystack_doc:pd.DataFrame,
+                        threshold:float = 0.8,
+                        classifier_model:pipeline= None
+                        )->Tuple[DataFrame,Series]:
+    """
+    Text-Classification on the list of texts provided. Classifier provides the
+    most appropriate label for each text. It informs if paragraph contains any
+    netzero information or not.
+    Params
+    ---------
+    haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
+    contains the list of paragraphs in different format,here the list of
+    Haystack Documents is used.
+    threshold: threshold value for the model to keep the results from classifier
+    classifiermodel: you can pass the classifier model directly,which takes priority
+    however if not then looks for model in streamlit session.
+    In case of streamlit avoid passing the model directly.
+    Returns
+    ----------
+    df: Dataframe
+    """
+    logging.info("Working on Netzero Extraction")
+    haystack_doc['Netzero Label'] = 'NA'
+    haystack_doc['Netzero Score'] = 'NA'
+    # we apply Netzero to only paragraphs which are classified as 'Target' related
+    temp = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
+    temp = temp.reset_index(drop=True)
+    df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
+    df = df.reset_index(drop=True)
+    if not classifier_model:
+        classifier_model = st.session_state['netzero_classifier']
+    results = classifier_model(list(temp.text))
+    labels_= [(l[0]['label'],l[0]['score']) for l in results]
+    temp['Netzero Label'],temp['Netzero Score'] = zip(*labels_)
+    temp['Netzero Label'] = temp['Netzero Label'].apply(lambda x: _lab_dict[x])
+    # merging Target with Non Target dataframe
+    df = pd.concat([df,temp])
+    df = df.reset_index(drop =True)
+    df.index += 1
+    return df