ppsingh commited on
Commit
3353eb1
1 Parent(s): 7de8f90

add conditional

Browse files
appStore/conditional.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # set path
2
+ import glob, os, sys;
3
+ sys.path.append('../utils')
4
+
5
+ #import needed libraries
6
+ import seaborn as sns
7
+ import matplotlib.pyplot as plt
8
+ import numpy as np
9
+ import pandas as pd
10
+ import streamlit as st
11
+ from utils.netzero_classifier import load_netzeroClassifier, netzero_classification
12
+ import logging
13
+ logger = logging.getLogger(__name__)
14
+ from utils.config import get_classifier_params
15
+ from io import BytesIO
16
+ import xlsxwriter
17
+ import plotly.express as px
18
+
19
+
20
# Declare all the necessary variables
# classifier_identifier selects which [section] of paramconfig.cfg to read
# via get_classifier_params().
# NOTE(review): this file is appStore/conditional.py but the identifier is
# 'netzero' — presumably copied from the netzero app; confirm whether it
# should be 'conditional' (the section this commit adds to paramconfig.cfg).
classifier_identifier = 'netzero'
params = get_classifier_params(classifier_identifier)
23
+
24
+
25
def app():
    """Run the netzero classification step of the Streamlit app.

    Reads the paragraph dataframe from ``st.session_state.key1`` (produced by
    an earlier pipeline step), loads the cached classifier, classifies the
    paragraphs and writes the augmented dataframe back to the same
    session-state key. Does nothing when ``key1`` is absent.
    """
    ### Main app code ###
    with st.container():
        if 'key1' in st.session_state:
            df = st.session_state.key1

            # Load the classifier model and stash it in session state so
            # netzero_classification() can fetch it without a model argument.
            classifier = load_netzeroClassifier(classifier_name=params['model_name'])
            st.session_state['{}_classifier'.format(classifier_identifier)] = classifier

            # Long documents (>100 TARGET paragraphs) take noticeably longer.
            if sum(df['Target Label'] == 'TARGET') > 100:
                warning_msg = ": This might take sometime, please sit back and relax."
            else:
                warning_msg = ""
            # Bug fix: warning_msg was computed but never shown to the user.
            if warning_msg:
                st.info("Netzero classification is running" + warning_msg)

            df = netzero_classification(haystack_doc=df,
                                        threshold=params['threshold'])
            st.session_state.key1 = df
43
+
44
+
45
+
46
+ # @st.cache_data
47
+ # def to_excel(df):
48
+ # len_df = len(df)
49
+ # output = BytesIO()
50
+ # writer = pd.ExcelWriter(output, engine='xlsxwriter')
51
+ # df.to_excel(writer, index=False, sheet_name='Sheet1')
52
+ # workbook = writer.book
53
+ # worksheet = writer.sheets['Sheet1']
54
+ # worksheet.data_validation('E2:E{}'.format(len_df),
55
+ # {'validate': 'list',
56
+ # 'source': ['No', 'Yes', 'Discard']})
57
+ # writer.save()
58
+ # processed_data = output.getvalue()
59
+ # return processed_data
60
+
61
+ # def netzero_display():
62
+ # if 'key1' in st.session_state:
63
+ # df = st.session_state.key2
64
+ # hits = df[df['Netzero Label'] == 'NETZERO']
65
+ # range_val = min(5,len(hits))
66
+ # if range_val !=0:
67
+ # count_df = df['Netzero Label'].value_counts()
68
+ # count_df = count_df.rename('count')
69
+ # count_df = count_df.rename_axis('Netzero Label').reset_index()
70
+ # count_df['Label_def'] = count_df['Netzero Label'].apply(lambda x: _lab_dict[x])
71
+
72
+ # fig = px.bar(count_df, y="Label_def", x="count", orientation='h', height =200)
73
+ # c1, c2 = st.columns([1,1])
74
+ # with c1:
75
+ # st.plotly_chart(fig,use_container_width= True)
76
+
77
+ # hits = hits.sort_values(by=['Netzero Score'], ascending=False)
78
+ # st.write("")
79
+ # st.markdown("###### Top few NetZero Target Classified paragraph/text results ######")
80
+ # range_val = min(5,len(hits))
81
+ # for i in range(range_val):
82
+ # # the page number reflects the page that contains the main paragraph
83
+ # # according to split limit, the overlapping part can be on a separate page
84
+ # st.write('**Result {}** `page {}` (Relevancy Score: {:.2f})'.format(i+1,hits.iloc[i]['page'],hits.iloc[i]['Netzero Score']))
85
+ # st.write("\t Text: \t{}".format(hits.iloc[i]['text']))
86
+ # else:
87
+ # st.info("🤔 No Netzero target found")
88
+
89
+
paramconfig.cfg CHANGED
@@ -76,4 +76,14 @@ REMOVE_PUNC = 0
76
  SPLIT_LENGTH = 80
77
  SPLIT_OVERLAP = 10
78
  RESPECT_SENTENCE_BOUNDARY = 1
 
 
 
 
 
 
 
 
 
 
79
  TOP_KEY = 10
 
76
  SPLIT_LENGTH = 80
77
  SPLIT_OVERLAP = 10
78
  RESPECT_SENTENCE_BOUNDARY = 1
79
+ TOP_KEY = 10
80
+
81
+ [conditional]
82
+ THRESHOLD = 0.50
83
+ MODEL = mtyrrell/CPU_Conditional_Classifier
84
+ SPLIT_BY = word
85
+ REMOVE_PUNC = 0
86
+ SPLIT_LENGTH = 80
87
+ SPLIT_OVERLAP = 10
88
+ RESPECT_SENTENCE_BOUNDARY = 1
89
  TOP_KEY = 10
utils/conditional_classifier.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple
2
+ from typing_extensions import Literal
3
+ import logging
4
+ import pandas as pd
5
+ from pandas import DataFrame, Series
6
+ from utils.config import getconfig
7
+ from utils.preprocessing import processingpipeline
8
+ import streamlit as st
9
+ from transformers import pipeline
10
+
11
# Labels dictionary ###
# Maps the raw labels emitted by the classification model to the
# human-readable labels displayed in the app.
# NOTE(review): this file is utils/conditional_classifier.py, yet the mapped
# values still read "NETZERO" — presumably copied from the netzero
# classifier module; confirm these labels match the conditional model.
_lab_dict = {
    'NEGATIVE':'NO NETZERO TARGET',
    'NET-ZERO':'NETZERO TARGET',
    'TARGET_FREE':'OTHERS'
}
17
+
18
@st.cache_resource
def load_netzeroClassifier(config_file: str = None, classifier_name: str = None):
    """Load (and cache via Streamlit) the netzero text-classification model.

    The model name/path on the HF hub is taken from ``classifier_name`` when
    given; otherwise it is read from the config file. If neither argument is
    supplied, a warning is logged and ``None`` is returned.

    1. https://docs.haystack.deepset.ai/reference/document-classifier-api
    2. https://docs.haystack.deepset.ai/docs/document_classifier

    Params
    --------
    config_file: config file path from which to read the model name
    classifier_name: model name; takes priority over the config file.

    Return: document classifier model (a transformers ``pipeline``)
    """
    # Guard: nothing to load from.
    if not classifier_name and not config_file:
        logging.warning("Pass either model name or config file")
        return

    if not classifier_name:
        # NOTE(review): reads the 'netzero' config section although this
        # module is conditional_classifier.py — confirm whether the new
        # [conditional] section was intended here.
        classifier_name = getconfig(config_file).get('netzero', 'MODEL')

    logging.info("Loading netzero classifier")
    # top_k=1 keeps only the single best label per text.
    return pipeline("text-classification",
                    model=classifier_name,
                    top_k=1)
47
+
48
+
49
@st.cache_data
def netzero_classification(haystack_doc: pd.DataFrame,
                           threshold: float = 0.8,
                           classifier_model: pipeline = None
                           ) -> DataFrame:
    """
    Text-Classification on the texts provided. The classifier assigns the
    most appropriate label to each text, indicating whether a paragraph
    contains netzero information or not.

    Params
    ---------
    haystack_doc: DataFrame of paragraphs; must contain 'Target Label' and
        'text' columns produced by the preprocessing/target pipeline.
    threshold: threshold value for the model to keep the results from
        classifier. NOTE(review): currently unused in the body — confirm
        whether score filtering was intended.
    classifier_model: pass the classifier model directly, which takes
        priority; otherwise the model is looked up in the Streamlit session.
        In case of streamlit avoid passing the model directly.

    Returns
    ----------
    df: DataFrame with added 'Netzero Label' and 'Netzero Score' columns
        (non-TARGET rows keep the 'NA' placeholder).
    """
    logging.info("Working on Netzero Extraction")
    haystack_doc['Netzero Label'] = 'NA'
    haystack_doc['Netzero Score'] = 'NA'
    # We apply Netzero only to paragraphs classified as 'Target' related.
    temp = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
    temp = temp.reset_index(drop=True)
    # Bug fix: use the complement of the TARGET mask rather than
    # == 'NEGATIVE', so rows carrying any other Target Label are not
    # silently dropped from the output.
    df = haystack_doc[haystack_doc['Target Label'] != 'TARGET']
    df = df.reset_index(drop=True)

    if not classifier_model:
        classifier_model = st.session_state['netzero_classifier']

    # Robustness: with no TARGET rows, zip(*[]) would raise ValueError.
    if not temp.empty:
        results = classifier_model(list(temp.text))
        labels_ = [(l[0]['label'], l[0]['score']) for l in results]
        temp['Netzero Label'], temp['Netzero Score'] = zip(*labels_)
        temp['Netzero Label'] = temp['Netzero Label'].apply(lambda x: _lab_dict[x])
    # merging Target with Non Target dataframe
    df = pd.concat([df, temp])
    df = df.reset_index(drop=True)
    df.index += 1

    return df