ppsingh committed on
Commit f3a3954
1 Parent(s): 338b9ad

add reader

Files changed (4)
  1. app.py +2 -1
  2. appStore/reader.py +89 -0
  3. paramconfig.cfg +11 -1
  4. utils/reader_qa.py +110 -0
app.py CHANGED
@@ -6,6 +6,7 @@ import appStore.ghg as ghg
 import appStore.policyaction as policyaction
 import appStore.conditional as conditional
 import appStore.indicator as indicator
+import appStore.reader as reader
 import appStore.doc_processing as processing
 from utils.uploadAndExample import add_upload
 import streamlit as st
@@ -88,7 +89,7 @@ with st.expander("ℹ️ - About this app", expanded=False):
 
 st.write("")
 apps = [processing.app, target_extraction.app, netzero.app, ghg.app,
-        policyaction.app, conditional.app, sector.app, adapmit.app,indicator.app]
+        policyaction.app, conditional.app, sector.app, adapmit.app, indicator.app, reader.app]
         #conditional.app, sector.app]
         #adapmit.app]
 
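The selector that consumes this apps list sits outside the hunks shown above. As a rough sketch only (the widget label and helper name below are assumptions, not code from app.py), a Streamlit multi-app dispatcher over such a list of app() callables could look like:

import streamlit as st

def run_selected(apps, names):
    # hypothetical helper: names[i] labels apps[i];
    # calling the chosen entry renders that page
    choice = st.sidebar.selectbox("Select the task", names)
    apps[names.index(choice)]()
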
appStore/reader.py ADDED
@@ -0,0 +1,89 @@
+# set path
+import glob, os, sys
+sys.path.append('../utils')
+
+# import needed libraries
+import seaborn as sns
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import streamlit as st
+from utils.reader_qa import load_reader, reader_highlight
+import logging
+logger = logging.getLogger(__name__)
+from utils.config import get_classifier_params
+from io import BytesIO
+import xlsxwriter
+import plotly.express as px
+
+
+# Declare all the necessary variables
+classifier_identifier = 'reader'
+params = get_classifier_params(classifier_identifier)
+
+
+def app():
+    ### Main app code ###
+    with st.container():
+        if 'key1' in st.session_state:
+            df = st.session_state.key1
+
+            # Load the QA reader model and keep it in session state
+            # so that reader_highlight can pick it up.
+            classifier = load_reader(classifier_name=params['model_name'])
+            st.session_state['{}_qa'.format(classifier_identifier)] = classifier
+
+            # Let the user know when many paragraphs must pass through the reader.
+            if sum(df['Target Label'] == 'TARGET') > 100:
+                warning_msg = "This might take some time, please sit back and relax."
+            else:
+                warning_msg = ""
+            if warning_msg:
+                st.info(warning_msg)
+
+            reader_highlight(haystack_doc=df,
+                             threshold=params['threshold'])
+            # st.session_state.key1 = df
+
+
+
+# @st.cache_data
+# def to_excel(df):
+#     len_df = len(df)
+#     output = BytesIO()
+#     writer = pd.ExcelWriter(output, engine='xlsxwriter')
+#     df.to_excel(writer, index=False, sheet_name='Sheet1')
+#     workbook = writer.book
+#     worksheet = writer.sheets['Sheet1']
+#     worksheet.data_validation('E2:E{}'.format(len_df),
+#                               {'validate': 'list',
+#                                'source': ['No', 'Yes', 'Discard']})
+#     writer.save()
+#     processed_data = output.getvalue()
+#     return processed_data
+
+# def netzero_display():
+#     if 'key1' in st.session_state:
+#         df = st.session_state.key2
+#         hits = df[df['Netzero Label'] == 'NETZERO']
+#         range_val = min(5, len(hits))
+#         if range_val != 0:
+#             count_df = df['Netzero Label'].value_counts()
+#             count_df = count_df.rename('count')
+#             count_df = count_df.rename_axis('Netzero Label').reset_index()
+#             count_df['Label_def'] = count_df['Netzero Label'].apply(lambda x: _lab_dict[x])
+
+#             fig = px.bar(count_df, y="Label_def", x="count", orientation='h', height=200)
+#             c1, c2 = st.columns([1, 1])
+#             with c1:
+#                 st.plotly_chart(fig, use_container_width=True)
+
+#             hits = hits.sort_values(by=['Netzero Score'], ascending=False)
+#             st.write("")
+#             st.markdown("###### Top few NetZero Target Classified paragraph/text results ######")
+#             range_val = min(5, len(hits))
+#             for i in range(range_val):
+#                 # the page number reflects the page that contains the main paragraph;
+#                 # depending on the split limit, the overlapping part can be on a separate page
+#                 st.write('**Result {}** `page {}` (Relevancy Score: {:.2f})'.format(i + 1, hits.iloc[i]['page'], hits.iloc[i]['Netzero Score']))
+#                 st.write("\t Text: \t{}".format(hits.iloc[i]['text']))
+#         else:
+#             st.info("🤔 No Netzero target found")
+
+
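For reference, a hedged sketch of the input reader.app() expects (not part of the commit; the sample rows below are invented, only the key and column names come from the code above): the processing step must have stored a DataFrame under st.session_state.key1 with at least 'text' and 'Target Label' columns.

import pandas as pd
import streamlit as st

# invented sample rows for illustration
st.session_state['key1'] = pd.DataFrame({
    'text': ["We commit to reducing emissions by 50% by 2030.",
             "This chapter describes the national context."],
    'Target Label': ['TARGET', 'NEGATIVE'],
})
# reader.app() can now load the QA model and run reader_highlight on these rows.
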
paramconfig.cfg CHANGED
@@ -33,7 +33,7 @@ THRESHOLD = 0.50
 MODEL = ppsingh/mpnet-multilabel-sector-classifier
 SPLIT_BY = word
 REMOVE_PUNC = 0
-SPLIT_LENGTH = 60
+SPLIT_LENGTH = 80
 SPLIT_OVERLAP = 10
 RESPECT_SENTENCE_BOUNDARY = 1
 TOP_KEY = 10
@@ -86,4 +86,14 @@ REMOVE_PUNC = 0
 SPLIT_LENGTH = 80
 SPLIT_OVERLAP = 10
 RESPECT_SENTENCE_BOUNDARY = 1
+TOP_KEY = 10
+
+[reader]
+THRESHOLD = 0.50
+MODEL = ppsingh/roberta-finetuned-qa-policy_0.1
+SPLIT_BY = word
+REMOVE_PUNC = 0
+SPLIT_LENGTH = 80
+SPLIT_OVERLAP = 10
+RESPECT_SENTENCE_BOUNDARY = 1
 TOP_KEY = 10
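The new [reader] block is consumed through utils/config.py (get_classifier_params), which is not part of this diff; a minimal stand-in using only the standard library would read it like this:

import configparser

# minimal sketch; the real helpers live in utils/config.py
config = configparser.ConfigParser()
config.read('paramconfig.cfg')
model_name = config.get('reader', 'MODEL')              # ppsingh/roberta-finetuned-qa-policy_0.1
threshold = config.getfloat('reader', 'THRESHOLD')      # 0.50
split_length = config.getint('reader', 'SPLIT_LENGTH')  # 80
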
utils/reader_qa.py ADDED
@@ -0,0 +1,110 @@
+from typing import List, Tuple
+from typing_extensions import Literal
+import logging
+import pandas as pd
+from pandas import DataFrame, Series
+from utils.config import getconfig
+from utils.preprocessing import processingpipeline
+import streamlit as st
+from transformers import pipeline
+
+
+@st.cache_resource
+def load_reader(config_file:str = None, classifier_name:str = None):
+    """
+    Loads the question-answering reader model, where the name/path of the
+    model on the HF hub is used to fetch the model object. Either config_file
+    or classifier_name should be passed.
+    Params
+    --------
+    config_file: config file path from which to read the model name
+    classifier_name: if a model name is passed it takes priority; if it is
+                     not given, the config file is used, else a warning is
+                     logged and nothing is returned.
+    Return: question-answering pipeline object
+    """
+    if not classifier_name:
+        if not config_file:
+            logging.warning("Pass either model name or config file")
+            return
+        else:
+            config = getconfig(config_file)
+            classifier_name = config.get('reader', 'MODEL')
+
+    logging.info("Loading Reader")
+    # The reader is a plain transformers question-answering pipeline;
+    # a Haystack TransformersDocumentClassifier is not needed here.
+    # doc_classifier = TransformersDocumentClassifier(
+    #     model_name_or_path=classifier_name,
+    #     task="text-classification",
+    #     top_k=None)
+
+    qa_model = pipeline("question-answering", model=classifier_name)
+
+    return qa_model
+
+
+@st.cache_data
+def reader_highlight(haystack_doc:pd.DataFrame,
+                     threshold:float = 0.5,
+                     classifier_model:pipeline = None
+                     ) -> Tuple[DataFrame, Series]:
+    """
+    Runs extractive question answering on the paragraphs classified as
+    targets: for each TARGET-labelled paragraph the reader extracts the text
+    span that answers which targets/commitments have been made.
+    Params
+    ---------
+    haystack_doc: DataFrame output of the preprocessing pipeline, holding the
+                  paragraph texts along with their Target Label classification.
+    threshold: threshold value for the model to keep the results from the reader
+    classifier_model: the reader model can be passed directly, which takes
+                      priority; otherwise the model is looked up in the
+                      Streamlit session state. In a Streamlit app, avoid
+                      passing the model directly.
+    Returns
+    ----------
+    df: DataFrame
+    """
+    logging.info("Working on Reader")
+    haystack_doc['Extracted Text'] = 'NA'
+    df1 = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
+    df1 = df1.reset_index(drop=True)
+    df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
+    df = df.reset_index(drop=True)
+    if not classifier_model:
+        reader_model = st.session_state['reader_qa']
+    else:
+        reader_model = classifier_model
+    ques_ = ['What Target/commitments have been made ?'] * len(df1)
+
+    predictions = reader_model(question=ques_, context=list(df1.text))
+
+    st.write(predictions)
+    # # getting the sector label and scores
+    # list_ = []
+    # for i in range(len(predictions)):
+
+    #     temp = predictions[i]
+    #     placeholder = {}
+    #     for j in range(len(temp)):
+    #         placeholder[temp[j]['label']] = temp[j]['score']
+    #     list_.append(placeholder)
+    # labels_ = [{**list_[l]} for l in range(len(predictions))]
+    # truth_df = DataFrame.from_dict(labels_)
+    # truth_df = truth_df.round(2)
+    # # based on threshold value, we convert each sector score into boolean
+    # truth_df = truth_df.astype(float) >= threshold
+    # truth_df = truth_df.astype(str)
+    # # collecting list of Sector Labels
+    # categories = list(truth_df.columns)
+    # # we collect the Sector Labels as a set; None represents the value at the
+    # # index in the list of Sector Labels.
+    # truth_df['Sector Label'] = truth_df.apply(lambda x: {i if x[i] == 'True' else
+    #                                           None for i in categories}, axis=1)
+    # # we keep all Sector Labels except None
+    # truth_df['Sector Label'] = truth_df.apply(lambda x: list(x['Sector Label']
+    #                                           - {None}), axis=1)
+    # haystack_doc['Sector Label'] = list(truth_df['Sector Label'])
+    # return haystack_doc
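For reference (not part of the diff): the transformers question-answering pipeline returned by load_reader() takes question/context pairs and yields one dict per pair with 'answer', 'score', 'start' and 'end'. The model name and output values below are illustrative only.

from transformers import pipeline

# any extractive QA model works here; this one is just an example
qa = pipeline("question-answering", model="deepset/roberta-base-squad2")
pred = qa(question="What Target/commitments have been made ?",
          context="We commit to reducing emissions by 50% by 2030.")
# pred is a dict, e.g. {'score': 0.8, 'start': 13, 'end': 46,
#                       'answer': 'reducing emissions by 50% by 2030'}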