ppsingh committed on
Commit 6d6a8a4
1 Parent(s): 7532d3e

Delete utils

utils/__init__.py DELETED
@@ -1 +0,0 @@
# adding for package implementation
 
 
utils/adapmit_classifier.py DELETED
@@ -1,110 +0,0 @@
from typing import List, Tuple
from typing_extensions import Literal
import logging
import pandas as pd
from pandas import DataFrame, Series
from utils.config import getconfig
from utils.preprocessing import processingpipeline
import streamlit as st
from transformers import pipeline

@st.cache_resource
def load_adapmitClassifier(config_file:str = None, classifier_name:str = None):
    """
    loads the document classifier using haystack, where the name/path of model
    in HF-hub as string is used to fetch the model object.Either configfile or
    model should be passed.
    1. https://docs.haystack.deepset.ai/reference/document-classifier-api
    2. https://docs.haystack.deepset.ai/docs/document_classifier
    Params
    --------
    config_file: config file path from which to read the model name
    classifier_name: if modelname is passed, it takes a priority if not \
    found then will look for configfile, else raise error.
    Return: document classifier model
    """
    if not classifier_name:
        if not config_file:
            logging.warning("Pass either model name or config file")
            return
        else:
            config = getconfig(config_file)
            classifier_name = config.get('adapmit','MODEL')

    logging.info("Loading Adaptation Mitigation classifier")
    doc_classifier = pipeline("text-classification",
                              model=classifier_name,
                              return_all_scores=True,
                              function_to_apply= "sigmoid")
    return doc_classifier


@st.cache_data
def adapmit_classification(haystack_doc:pd.DataFrame,
                           threshold:float = 0.5,
                           classifier_model:pipeline= None
                           )->Tuple[DataFrame,Series]:
    """
    Text-Classification on the list of texts provided. Classifier provides the
    most appropriate label for each text. these labels are in terms of if text
    belongs to which particular Sustainable Devleopment Goal (SDG).
    Params
    ---------
    haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
    contains the list of paragraphs in different format,here the list of
    Haystack Documents is used.
    threshold: threshold value for the model to keep the results from classifier
    classifiermodel: you can pass the classifier model directly,which takes priority
    however if not then looks for model in streamlit session.
    In case of streamlit avoid passing the model directly.
    Returns
    ----------
    df: Dataframe
    """
    logging.info("Working on Adaptation-Mitigation Identification")
    haystack_doc['Adapt-Mitig Label'] = 'NA'
    haystack_doc['cond_check'] = haystack_doc.apply(lambda x: True if (
        (x['Target Label'] == 'TARGET') | (x['Action Label'] == 'Action') |
        (x['Policies_Plans Label'] == 'Policies and Plans')) else
        False, axis=1)
    # we apply Netzero to only paragraphs which are classified as 'Target' related
    df1 = haystack_doc[haystack_doc['cond_check'] == True]
    df1 = df1.reset_index(drop=True)
    df = haystack_doc[haystack_doc['cond_check'] == False]
    df = df.reset_index(drop=True)

    if not classifier_model:
        classifier_model = st.session_state['adapmit_classifier']

    predictions = classifier_model(list(df1.text))
    # converting the predictions to desired format
    list_ = []
    for i in range(len(predictions)):

        temp = predictions[i]
        placeholder = {}
        for j in range(len(temp)):
            placeholder[temp[j]['label']] = temp[j]['score']
        list_.append(placeholder)
    labels_ = [{**list_[l]} for l in range(len(predictions))]
    truth_df = DataFrame.from_dict(labels_)
    truth_df = truth_df.round(2)
    # convert the labels score into boolean based on threshold value
    truth_df = truth_df.astype(float) >= threshold
    truth_df = truth_df.astype(str)
    # list of labels
    categories = list(truth_df.columns)

    # collecting the labels, None is passed to overcome comprehension syntax
    truth_df['Adapt-Mitig Label'] = truth_df.apply(lambda x: {i if x[i]=='True'
                                    else None for i in categories}, axis=1)
    truth_df['Adapt-Mitig Label'] = truth_df.apply(lambda x:
                                    list(x['Adapt-Mitig Label'] -{None}),axis=1)
    # adding Adaptation-Mitigation label
    df1['Adapt-Mitig Label'] = list(truth_df['Adapt-Mitig Label'])
    df = pd.concat([df,df1])
    df = df.drop(columns = ['cond_check'])
    df = df.reset_index(drop =True)
    df.index += 1

    return df
 
utils/conditional_classifier.py DELETED
@@ -1,92 +0,0 @@
from typing import List, Tuple
from typing_extensions import Literal
import logging
import pandas as pd
from pandas import DataFrame, Series
from utils.config import getconfig
from utils.preprocessing import processingpipeline
import streamlit as st
from transformers import pipeline


@st.cache_resource
def load_conditionalClassifier(config_file:str = None, classifier_name:str = None):
    """
    loads the document classifier using haystack, where the name/path of model
    in HF-hub as string is used to fetch the model object.Either configfile or
    model should be passed.
    1. https://docs.haystack.deepset.ai/reference/document-classifier-api
    2. https://docs.haystack.deepset.ai/docs/document_classifier
    Params
    --------
    config_file: config file path from which to read the model name
    classifier_name: if modelname is passed, it takes a priority if not \
    found then will look for configfile, else raise error.
    Return: document classifier model
    """
    if not classifier_name:
        if not config_file:
            logging.warning("Pass either model name or config file")
            return
        else:
            config = getconfig(config_file)
            classifier_name = config.get('conditional','MODEL')

    logging.info("Loading conditional classifier")
    doc_classifier = pipeline("text-classification",
                              model=classifier_name,
                              top_k =1)

    return doc_classifier


@st.cache_data
def conditional_classification(haystack_doc:pd.DataFrame,
                               threshold:float = 0.8,
                               classifier_model:pipeline= None
                               )->Tuple[DataFrame,Series]:
    """
    Text-Classification on the list of texts provided. Classifier provides the
    most appropriate label for each text. It informs if paragraph contains any
    netzero information or not.
    Params
    ---------
    haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
    contains the list of paragraphs in different format,here the list of
    Haystack Documents is used.
    threshold: threshold value for the model to keep the results from classifier
    classifiermodel: you can pass the classifier model directly,which takes priority
    however if not then looks for model in streamlit session.
    In case of streamlit avoid passing the model directly.
    Returns
    ----------
    df: Dataframe
    """
    logging.info("Working on Conditionality Identification")
    haystack_doc['Conditional Label'] = 'NA'
    haystack_doc['Conditional Score'] = 0.0
    haystack_doc['cond_check'] = False
    haystack_doc['cond_check'] = haystack_doc.apply(lambda x: True if (
        (x['Target Label'] == 'TARGET') | (x['Action Label'] == 'Action') |
        (x['Policies_Plans Label'] == 'Policies and Plans')) else
        False, axis=1)
    # we apply Netzero to only paragraphs which are classified as 'Target' related
    temp = haystack_doc[haystack_doc['cond_check'] == True]
    temp = temp.reset_index(drop=True)
    df = haystack_doc[haystack_doc['cond_check'] == False]
    df = df.reset_index(drop=True)

    if not classifier_model:
        classifier_model = st.session_state['conditional_classifier']

    results = classifier_model(list(temp.text))
    labels_= [(l[0]['label'],l[0]['score']) for l in results]
    temp['Conditional Label'],temp['Conditional Score'] = zip(*labels_)
    # temp[' Label'] = temp['Netzero Label'].apply(lambda x: _lab_dict[x])
    # merging Target with Non Target dataframe
    df = pd.concat([df,temp])
    df = df.drop(columns = ['cond_check'])
    df = df.reset_index(drop =True)
    df.index += 1

    return df
 
utils/config.py DELETED
@@ -1,31 +0,0 @@
import configparser
import logging

def getconfig(configfile_path:str):
    """
    configfile_path: file path of .cfg file
    """

    config = configparser.ConfigParser()

    try:
        config.read_file(open(configfile_path))
        return config
    except:
        logging.warning("config file not found")


# Declare all the necessary variables
def get_classifier_params(model_name):
    config = getconfig('paramconfig.cfg')
    params = {}
    params['model_name'] = config.get(model_name,'MODEL')
    params['split_by'] = config.get(model_name,'SPLIT_BY')
    params['split_length'] = int(config.get(model_name,'SPLIT_LENGTH'))
    params['split_overlap'] = int(config.get(model_name,'SPLIT_OVERLAP'))
    params['remove_punc'] = bool(int(config.get(model_name,'REMOVE_PUNC')))
    params['split_respect_sentence_boundary'] = bool(int(config.get(model_name,'RESPECT_SENTENCE_BOUNDARY')))
    params['threshold'] = float(config.get(model_name,'THRESHOLD'))
    params['top_n'] = int(config.get(model_name,'TOP_KEY'))

    return params
 
utils/ghg_classifier.py DELETED
@@ -1,94 +0,0 @@
from typing import List, Tuple
from typing_extensions import Literal
import logging
import pandas as pd
from pandas import DataFrame, Series
from utils.config import getconfig
from utils.preprocessing import processingpipeline
import streamlit as st
from transformers import pipeline

# Labels dictionary ###
_lab_dict = {
    'GHG':'GHG',
    'NOT_GHG':'NON GHG TRANSPORT TARGET',
    'NEGATIVE':'OTHERS',
}


@st.cache_resource
def load_ghgClassifier(config_file:str = None, classifier_name:str = None):
    """
    loads the document classifier using haystack, where the name/path of model
    in HF-hub as string is used to fetch the model object.Either configfile or
    model should be passed.
    1. https://docs.haystack.deepset.ai/reference/document-classifier-api
    2. https://docs.haystack.deepset.ai/docs/document_classifier
    Params
    --------
    config_file: config file path from which to read the model name
    classifier_name: if modelname is passed, it takes a priority if not \
    found then will look for configfile, else raise error.
    Return: document classifier model
    """
    if not classifier_name:
        if not config_file:
            logging.warning("Pass either model name or config file")
            return
        else:
            config = getconfig(config_file)
            classifier_name = config.get('ghg','MODEL')

    logging.info("Loading ghg classifier")
    doc_classifier = pipeline("text-classification",
                              model=classifier_name,
                              top_k =1)

    return doc_classifier


@st.cache_data
def ghg_classification(haystack_doc:pd.DataFrame,
                       threshold:float = 0.5,
                       classifier_model:pipeline= None
                       )->Tuple[DataFrame,Series]:
    """
    Text-Classification on the list of texts provided. Classifier provides the
    most appropriate label for each text. It identifies if text contains 'GHG'
    related information or not.
    Params
    ---------
    haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
    contains the list of paragraphs in different format,here the list of
    Haystack Documents is used.
    threshold: threshold value for the model to keep the results from classifier
    classifiermodel: you can pass the classifier model directly,which takes priority
    however if not then looks for model in streamlit session.
    In case of streamlit avoid passing the model directly.
    Returns
    ----------
    df: Dataframe
    """
    logging.info("Working on GHG Extraction")
    haystack_doc['GHG Label'] = 'NA'
    haystack_doc['GHG Score'] = 0.0
    # applying GHG Identifier to only 'Target' paragraphs.
    temp = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
    temp = temp.reset_index(drop=True)
    df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
    df = df.reset_index(drop=True)

    if not classifier_model:
        classifier_model = st.session_state['ghg_classifier']

    results = classifier_model(list(temp.text))
    labels_= [(l[0]['label'],l[0]['score']) for l in results]
    temp['GHG Label'],temp['GHG Score'] = zip(*labels_)
    temp['GHG Label'] = temp['GHG Label'].apply(lambda x: _lab_dict[x])
    # merge back Target and non-Target dataframe
    df = pd.concat([df,temp])
    df = df.reset_index(drop =True)
    df['GHG Score'] = df['GHG Score'].round(2)
    df.index += 1

    return df
 
utils/indicator_classifier.py DELETED
@@ -1,117 +0,0 @@
from typing import List, Tuple
from typing_extensions import Literal
import logging
import pandas as pd
from pandas import DataFrame, Series
from utils.config import getconfig
from utils.preprocessing import processingpipeline
import streamlit as st
from transformers import pipeline


@st.cache_resource
def load_indicatorClassifier(config_file:str = None, classifier_name:str = None):
    """
    loads the document classifier using haystack, where the name/path of model
    in HF-hub as string is used to fetch the model object.Either configfile or
    model should be passed.
    1. https://docs.haystack.deepset.ai/reference/document-classifier-api
    2. https://docs.haystack.deepset.ai/docs/document_classifier
    Params
    --------
    config_file: config file path from which to read the model name
    classifier_name: if modelname is passed, it takes a priority if not \
    found then will look for configfile, else raise error.
    Return: document classifier model
    """
    if not classifier_name:
        if not config_file:
            logging.warning("Pass either model name or config file")
            return
        else:
            config = getconfig(config_file)
            classifier_name = config.get('indicator','MODEL')

    logging.info("Loading indicator classifier")
    # we are using the pipeline as the model is multilabel and DocumentClassifier
    # from Haystack doesnt support multilabel
    # in pipeline we use 'sigmoid' to explicitly tell pipeline to make it multilabel
    # if not then it will automatically use softmax, which is not a desired thing.
    # doc_classifier = TransformersDocumentClassifier(
    #     model_name_or_path=classifier_name,
    #     task="text-classification",
    #     top_k = None)

    doc_classifier = pipeline("text-classification",
                              model=classifier_name,
                              return_all_scores=True,
                              function_to_apply= "sigmoid")

    return doc_classifier


@st.cache_data
def indicator_classification(haystack_doc:pd.DataFrame,
                             threshold:float = 0.5,
                             classifier_model:pipeline= None
                             )->Tuple[DataFrame,Series]:
    """
    Text-Classification on the list of texts provided. Classifier provides the
    most appropriate label for each text. these labels are in terms of if text
    belongs to which particular Sustainable Devleopment Goal (SDG).
    Params
    ---------
    haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
    contains the list of paragraphs in different format,here the list of
    Haystack Documents is used.
    threshold: threshold value for the model to keep the results from classifier
    classifiermodel: you can pass the classifier model directly,which takes priority
    however if not then looks for model in streamlit session.
    In case of streamlit avoid passing the model directly.
    Returns
    ----------
    df: Dataframe with two columns['SDG:int', 'text']
    x: Series object with the unique SDG covered in the document uploaded and
    the number of times it is covered/discussed/count_of_paragraphs.
    """
    logging.info("Working on Indicator Identification")
    haystack_doc['Indicator Label'] = 'NA'
    haystack_doc['cond_check'] = False
    haystack_doc['cond_check'] = haystack_doc.apply(lambda x: True if (
        (x['Action Label'] == 'Action') |
        (x['Policies_Plans Label'] == 'Policies and Plans')) else
        False, axis=1)
    # we apply Netzero to only paragraphs which are classified as 'Target' related
    df1 = haystack_doc[haystack_doc['cond_check'] == True]
    df1 = df1.reset_index(drop=True)
    df = haystack_doc[haystack_doc['cond_check'] == False]
    df = df.reset_index(drop=True)
    if not classifier_model:
        classifier_model = st.session_state['indicator_classifier']

    predictions = classifier_model(list(df1.text))

    list_ = []
    for i in range(len(predictions)):

        temp = predictions[i]
        placeholder = {}
        for j in range(len(temp)):
            placeholder[temp[j]['label']] = temp[j]['score']
        list_.append(placeholder)
    labels_ = [{**list_[l]} for l in range(len(predictions))]
    truth_df = DataFrame.from_dict(labels_)
    truth_df = truth_df.round(2)
    truth_df = truth_df.astype(float) >= threshold
    truth_df = truth_df.astype(str)
    categories = list(truth_df.columns)
    truth_df['Indicator Label'] = truth_df.apply(lambda x: {i if x[i]=='True' else
                                  None for i in categories}, axis=1)
    truth_df['Indicator Label'] = truth_df.apply(lambda x: list(x['Indicator Label']
                                  -{None}),axis=1)
    df1['Indicator Label'] = list(truth_df['Indicator Label'])
    df = pd.concat([df,df1])
    df = df.drop(columns = ['cond_check'])
    df = df.reset_index(drop =True)
    df.index += 1
    return df
 
utils/netzero_classifier.py DELETED
@@ -1,93 +0,0 @@
from typing import List, Tuple
from typing_extensions import Literal
import logging
import pandas as pd
from pandas import DataFrame, Series
from utils.config import getconfig
from utils.preprocessing import processingpipeline
import streamlit as st
from transformers import pipeline

# Labels dictionary ###
_lab_dict = {
    'NEGATIVE':'NON NETZERO TARGET',
    'NET-ZERO':'NETZERO TARGET',
    'TARGET_FREE':'OTHERS'
}

@st.cache_resource
def load_netzeroClassifier(config_file:str = None, classifier_name:str = None):
    """
    loads the document classifier using haystack, where the name/path of model
    in HF-hub as string is used to fetch the model object.Either configfile or
    model should be passed.
    1. https://docs.haystack.deepset.ai/reference/document-classifier-api
    2. https://docs.haystack.deepset.ai/docs/document_classifier
    Params
    --------
    config_file: config file path from which to read the model name
    classifier_name: if modelname is passed, it takes a priority if not \
    found then will look for configfile, else raise error.
    Return: document classifier model
    """
    if not classifier_name:
        if not config_file:
            logging.warning("Pass either model name or config file")
            return
        else:
            config = getconfig(config_file)
            classifier_name = config.get('netzero','MODEL')

    logging.info("Loading netzero classifier")
    doc_classifier = pipeline("text-classification",
                              model=classifier_name,
                              top_k =1)

    return doc_classifier


@st.cache_data
def netzero_classification(haystack_doc:pd.DataFrame,
                           threshold:float = 0.8,
                           classifier_model:pipeline= None
                           )->Tuple[DataFrame,Series]:
    """
    Text-Classification on the list of texts provided. Classifier provides the
    most appropriate label for each text. It informs if paragraph contains any
    netzero information or not.
    Params
    ---------
    haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
    contains the list of paragraphs in different format,here the list of
    Haystack Documents is used.
    threshold: threshold value for the model to keep the results from classifier
    classifiermodel: you can pass the classifier model directly,which takes priority
    however if not then looks for model in streamlit session.
    In case of streamlit avoid passing the model directly.
    Returns
    ----------
    df: Dataframe
    """
    logging.info("Working on Netzero Extraction")
    haystack_doc['Netzero Label'] = 'NA'
    haystack_doc['Netzero Score'] = 0.0
    # we apply Netzero to only paragraphs which are classified as 'Target' related
    temp = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
    temp = temp.reset_index(drop=True)
    df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
    df = df.reset_index(drop=True)

    if not classifier_model:
        classifier_model = st.session_state['netzero_classifier']

    results = classifier_model(list(temp.text))
    labels_= [(l[0]['label'],l[0]['score']) for l in results]
    temp['Netzero Label'],temp['Netzero Score'] = zip(*labels_)
    temp['Netzero Label'] = temp['Netzero Label'].apply(lambda x: _lab_dict[x])
    # merging Target with Non Target dataframe
    df = pd.concat([df,temp])
    df = df.reset_index(drop =True)
    df['Netzero Score'] = df['Netzero Score'].round(2)
    df.index += 1

    return df
 
utils/policyaction_classifier.py DELETED
@@ -1,113 +0,0 @@
from typing import List, Tuple
from typing_extensions import Literal
import logging
import pandas as pd
from pandas import DataFrame, Series
from utils.config import getconfig
from utils.preprocessing import processingpipeline
import streamlit as st
from transformers import pipeline


@st.cache_resource
def load_policyactionClassifier(config_file:str = None, classifier_name:str = None):
    """
    loads the document classifier using haystack, where the name/path of model
    in HF-hub as string is used to fetch the model object.Either configfile or
    model should be passed.
    1. https://docs.haystack.deepset.ai/reference/document-classifier-api
    2. https://docs.haystack.deepset.ai/docs/document_classifier
    Params
    --------
    config_file: config file path from which to read the model name
    classifier_name: if modelname is passed, it takes a priority if not \
    found then will look for configfile, else raise error.
    Return: document classifier model
    """
    if not classifier_name:
        if not config_file:
            logging.warning("Pass either model name or config file")
            return
        else:
            config = getconfig(config_file)
            classifier_name = config.get('policyaction','MODEL')

    logging.info("Loading classifier")

    doc_classifier = pipeline("text-classification",
                              model=classifier_name,
                              return_all_scores=True,
                              function_to_apply= "sigmoid")

    return doc_classifier


@st.cache_data
def policyaction_classification(haystack_doc:pd.DataFrame,
                                threshold:float = 0.5,
                                classifier_model:pipeline= None
                                )->Tuple[DataFrame,Series]:
    """
    Text-Classification on the list of texts provided. Classifier provides the
    most appropriate label for each text. these labels are in terms of if text
    belongs to which particular Sustainable Devleopment Goal (SDG).
    Params
    ---------
    haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
    contains the list of paragraphs in different format,here the list of
    Haystack Documents is used.
    threshold: threshold value for the model to keep the results from classifier
    classifiermodel: you can pass the classifier model directly,which takes priority
    however if not then looks for model in streamlit session.
    In case of streamlit avoid passing the model directly.
    Returns
    ----------
    df: Dataframe with two columns['SDG:int', 'text']
    x: Series object with the unique SDG covered in the document uploaded and
    the number of times it is covered/discussed/count_of_paragraphs.
    """
    logging.info("Working on Policy/Action. Extraction")
    # haystack_doc['Policy-Action Label'] = 'NA'
    if not classifier_model:
        classifier_model = st.session_state['policyaction_classifier']

    predictions = classifier_model(list(haystack_doc.text))
    list_ = []
    for i in range(len(predictions)):

        temp = predictions[i]
        placeholder = {}
        for j in range(len(temp)):
            placeholder[temp[j]['label']] = temp[j]['score']
        list_.append(placeholder)
    labels_ = [{**list_[l]} for l in range(len(predictions))]
    truth_df = DataFrame.from_dict(labels_)
    truth_df = truth_df.round(2)
    temp = truth_df.copy()
    truth_df = truth_df.astype(float) >= threshold
    truth_df = truth_df.astype(str)
    truth_df.rename(columns = {'Action':'Action Label','Policies & Plans':'Policies_Plans Label'},
                    inplace =True)
    # st.dataframe(truth_df)
    truth_df['Action Label'] = truth_df['Action Label'].apply(lambda x: 'Action' if x == 'True'
                               else 'NEGATIVE')
    truth_df['Policies_Plans Label'] = truth_df['Policies_Plans Label'].apply(lambda x:
                               'Policy and Plans' if x == 'True' else 'NEGATIVE')
    temp.rename(columns = {'Action':'Action Score','Policies & Plans':'Policies_Plans Score'},
                inplace=True)
    truth_df = pd.concat([truth_df, temp],axis=1)
    truth_df.index += 1
    # categories = list(truth_df.columns)
    # truth_df['Policy-Action Label'] = truth_df.apply(lambda x: {i if x[i]=='True'
    #                                   else None for i in categories}, axis=1)
    # st.dataframe(truth_df)
    # st.dataframe(temp)
    # truth_df['Policy-Action Label'] = truth_df.apply(lambda x:
    #                                   list(x['Policy-Action Label'] -{None}),axis=1)

    # haystack_doc['Policy-Action Label'] = list(truth_df['Policy-Action Label'])
    # haystack_doc['Policy-Action Label'] = haystack_doc['Policy-Action Label'].apply(
    #                                   lambda x:x if x else ['NA'])
    haystack_doc = pd.concat([haystack_doc,truth_df], axis=1)

    return haystack_doc
 
utils/preprocessing.py DELETED
@@ -1,291 +0,0 @@
from haystack.nodes.base import BaseComponent
from haystack.schema import Document
from haystack.nodes import PDFToTextOCRConverter, PDFToTextConverter
from haystack.nodes import TextConverter, DocxToTextConverter, PreProcessor
from typing import Callable, Dict, List, Optional, Text, Tuple, Union
from typing_extensions import Literal
import pandas as pd
import logging
import re
import string
from haystack.pipelines import Pipeline

def useOCR(file_path: str)-> Text:
    """
    Converts image pdfs into text, Using the Farm-haystack[OCR]

    Params
    ----------
    file_path: file_path of uploade file, returned by add_upload function in
    uploadAndExample.py

    Returns the text file as string.
    """


    converter = PDFToTextOCRConverter(remove_numeric_tables=True,
                                      valid_languages=["eng"])
    docs = converter.convert(file_path=file_path, meta=None)
    return docs[0].content




class FileConverter(BaseComponent):
    """
    Wrapper class to convert uploaded document into text by calling appropriate
    Converter class, will use internally haystack PDFToTextOCR in case of image
    pdf. Cannot use the FileClassifier from haystack as its doesnt has any
    label/output class for image.

    1. https://haystack.deepset.ai/pipeline_nodes/custom-nodes
    2. https://docs.haystack.deepset.ai/docs/file_converters
    3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/file_converter
    4. https://docs.haystack.deepset.ai/reference/file-converters-api


    """

    outgoing_edges = 1

    def run(self, file_name: str , file_path: str, encoding: Optional[str]=None,
            id_hash_keys: Optional[List[str]] = None,
            ) -> Tuple[dict,str]:
        """ this is required method to invoke the component in
        the pipeline implementation.

        Params
        ----------
        file_name: name of file
        file_path: file_path of uploade file, returned by add_upload function in
        uploadAndExample.py

        See the links provided in Class docstring/description to see other params

        Return
        ---------
        output: dictionary, with key as identifier and value could be anything
        we need to return. In this case its the List of Hasyatck Document

        output_1: As there is only one outgoing edge, we pass 'output_1' string
        """
        try:
            if file_name.endswith('.pdf'):
                converter = PDFToTextConverter(remove_numeric_tables=True)
            if file_name.endswith('.txt'):
                converter = TextConverter(remove_numeric_tables=True)
            if file_name.endswith('.docx'):
                converter = DocxToTextConverter()
        except Exception as e:
            logging.error(e)
            return



        documents = []


        # encoding is empty, probably should be utf-8
        document = converter.convert(
            file_path=file_path, meta=None,
            encoding=encoding, id_hash_keys=id_hash_keys
        )[0]

        text = document.content

        # in case of scanned/images only PDF the content might contain only
        # the page separator (\f or \x0c). We check if is so and use
        # use the OCR to get the text.
        filtered = re.sub(r'\x0c', '', text)

        if filtered == "":
            logging.info("Using OCR")
            text = useOCR(file_path)

        documents.append(Document(content=text,
                                  meta={"name": file_name},
                                  id_hash_keys=id_hash_keys))

        logging.info('file conversion succesful')
        output = {'documents': documents}
        return output, 'output_1'

    def run_batch():
        """
        we dont have requirement to process the multiple files in one go
        therefore nothing here, however to use the custom node we need to have
        this method for the class.
        """

        return


def basic(s:str, remove_punc:bool = False):

    """
    Performs basic cleaning of text.

    Params
    ----------
    s: string to be processed
    removePunc: to remove all Punctuation including ',' and '.' or not

    Returns: processed string: see comments in the source code for more info
    """

    # Remove URLs
    s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
    s = re.sub(r"http\S+", " ", s)

    # Remove new line characters
    s = re.sub('\n', ' ', s)

    # Remove punctuations
    if remove_punc == True:
        translator = str.maketrans(' ', ' ', string.punctuation)
        s = s.translate(translator)
    # Remove distracting single quotes and dotted pattern
    s = re.sub("\'", " ", s)
    s = s.replace("..","")

    return s.strip()

def paraLengthCheck(paraList, max_len = 100):
    """
    There are cases where preprocessor cannot respect word limit, when using
    respect sentence boundary flag due to missing sentence boundaries.
    Therefore we run one more round of split here for those paragraphs

    Params
    ---------------
    paraList : list of paragraphs/text
    max_len : max length to be respected by sentences which bypassed
    preprocessor strategy

    """
    new_para_list = []
    for passage in paraList:
        # check if para exceeds words limit
        if len(passage.content.split()) > max_len:
            # we might need few iterations example if para = 512 tokens
            # we need to iterate 5 times to reduce para to size limit of '100'
            iterations = int(len(passage.content.split())/max_len)
            for i in range(iterations):
                temp = " ".join(passage.content.split()[max_len*i:max_len*(i+1)])
                new_para_list.append((temp,passage.meta['page']))
            temp = " ".join(passage.content.split()[max_len*(i+1):])
            new_para_list.append((temp,passage.meta['page']))
        else:
            # paragraphs which dont need any splitting
            new_para_list.append((passage.content, passage.meta['page']))

    logging.info("New paragraphs length {}".format(len(new_para_list)))
    return new_para_list

class UdfPreProcessor(BaseComponent):
    """
    class to preprocess the document returned by FileConverter. It will check
    for splitting strategy and splits the document by word or sentences and then
    synthetically create the paragraphs.

    1. https://docs.haystack.deepset.ai/docs/preprocessor
    2. https://docs.haystack.deepset.ai/reference/preprocessor-api
    3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/preprocessor

    """
    outgoing_edges = 1

    def run(self, documents:List[Document], remove_punc:bool=False,
            split_by: Literal["sentence", "word"] = 'sentence',
            split_length:int = 2, split_respect_sentence_boundary:bool = False,
            split_overlap:int = 0):

        """ this is required method to invoke the component in
        the pipeline implementation.

        Params
        ----------
        documents: documents from the output dictionary returned by Fileconverter
        remove_punc: to remove all Punctuation including ',' and '.' or not
        split_by: document splitting strategy either as word or sentence
        split_length: when synthetically creating the paragrpahs from document,
        it defines the length of paragraph.
        split_respect_sentence_boundary: Used when using 'word' strategy for
        splititng of text.
        split_overlap: Number of words or sentences that overlap when creating
        the paragraphs. This is done as one sentence or 'some words' make sense
        when read in together with others. Therefore the overlap is used.

        Return
        ---------
        output: dictionary, with key as identifier and value could be anything
        we need to return. In this case the output will contain 4 objects
        the paragraphs text list as List, Haystack document, Dataframe and
        one raw text file.

        output_1: As there is only one outgoing edge, we pass 'output_1' string

        """

        if split_by == 'sentence':
            split_respect_sentence_boundary = False

        else:
            split_respect_sentence_boundary = split_respect_sentence_boundary

        preprocessor = PreProcessor(
            clean_empty_lines=True,
            clean_whitespace=True,
            clean_header_footer=True,
            split_by=split_by,
            split_length=split_length,
            split_respect_sentence_boundary= split_respect_sentence_boundary,
            split_overlap=split_overlap,

            # will add page number only in case of PDF not for text/docx file.
            add_page_number=True
        )

        for i in documents:
            # # basic cleaning before passing it to preprocessor.
            # i = basic(i)
            docs_processed = preprocessor.process([i])
            for item in docs_processed:
                item.content = basic(item.content, remove_punc= remove_punc)

        df = pd.DataFrame(docs_processed)
        all_text = " ".join(df.content.to_list())
        para_list = df.content.to_list()
        logging.info('document split into {} paragraphs'.format(len(para_list)))
        output = {'documents': docs_processed,
                  'dataframe': df,
                  'text': all_text,
                  'paraList': para_list
                 }
        return output, "output_1"
    def run_batch():
        """
        we dont have requirement to process the multiple files in one go
        therefore nothing here, however to use the custom node we need to have
        this method for the class.
        """
        return

def processingpipeline():
    """
    Returns the preprocessing pipeline. Will use FileConverter and UdfPreProcesor
    from utils.preprocessing

    """

    preprocessing_pipeline = Pipeline()
    file_converter = FileConverter()
    custom_preprocessor = UdfPreProcessor()

    preprocessing_pipeline.add_node(component=file_converter,
                                    name="FileConverter", inputs=["File"])
    preprocessing_pipeline.add_node(component = custom_preprocessor,
                                    name ='UdfPreProcessor', inputs=["FileConverter"])

    return preprocessing_pipeline
 
utils/reader_qa.py DELETED
@@ -1,118 +0,0 @@
from typing import List, Tuple
from typing_extensions import Literal
import logging
import pandas as pd
from pandas import DataFrame, Series
from utils.config import getconfig
from utils.preprocessing import processingpipeline
import streamlit as st
from transformers import pipeline


@st.cache_resource
def load_reader(config_file:str = None, classifier_name:str = None):
    """
    loads the document classifier using haystack, where the name/path of model
    in HF-hub as string is used to fetch the model object.Either configfile or
    model should be passed.
    1. https://docs.haystack.deepset.ai/reference/document-classifier-api
    2. https://docs.haystack.deepset.ai/docs/document_classifier
    Params
    --------
    config_file: config file path from which to read the model name
    classifier_name: if modelname is passed, it takes a priority if not \
    found then will look for configfile, else raise error.
    Return: document classifier model
    """
    if not classifier_name:
        if not config_file:
            logging.warning("Pass either model name or config file")
            return
        else:
            config = getconfig(config_file)
            classifier_name = config.get('reader','MODEL')

    logging.info("Loading Reader")
    # we are using the pipeline as the model is multilabel and DocumentClassifier
    # from Haystack doesnt support multilabel
    # in pipeline we use 'sigmoid' to explicitly tell pipeline to make it multilabel
    # if not then it will automatically use softmax, which is not a desired thing.
    # doc_classifier = TransformersDocumentClassifier(
    #     model_name_or_path=classifier_name,
    #     task="text-classification",
    #     top_k = None)

    qa_model = pipeline("question-answering", model=classifier_name )

    return qa_model


@st.cache_data
def reader_highlight(haystack_doc:pd.DataFrame,
                     threshold:float = 0.5,
                     classifier_model:pipeline= None
                     )->Tuple[DataFrame,Series]:
    """
    Text-Classification on the list of texts provided. Classifier provides the
    most appropriate label for each text. these labels are in terms of if text
    belongs to which particular Sustainable Devleopment Goal (SDG).
    Params
    ---------
    haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
    contains the list of paragraphs in different format,here the list of
    Haystack Documents is used.
    threshold: threshold value for the model to keep the results from classifier
    classifiermodel: you can pass the classifier model directly,which takes priority
    however if not then looks for model in streamlit session.
    In case of streamlit avoid passing the model directly.
    Returns
    ----------
    df: Dataframe
    """
    logging.info("Working on Reader")
    haystack_doc['Extracted Text'] = 'NA'
    df1 = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
    df1 = df1.reset_index(drop=True)
    df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
    df = df.reset_index(drop=True)
    predictions = []
    if not classifier_model:
        reader_model = st.session_state['reader_qa']
    ques_ = 'What Target/commitments have been made ?'
    for text in list(df1.text):
        predictions.append(reader_model(question = ques_, context = text))

    # # getting the sector label and scores
    list_ = []
    for i in range(len(predictions)):
        list_.append(predictions[i]['answer'])
    df1['Extracted Text'] = list_

    df = pd.concat([df,df1])
    df = df.reset_index(drop =True)
    df.index += 1

    return df

# temp = predictions[i]
# placeholder = {}
# for j in range(len(temp)):
#     placeholder[temp[j]['label']] = temp[j]['score']
# list_.append(placeholder)
# labels_ = [{**list_[l]} for l in range(len(predictions))]
# truth_df = DataFrame.from_dict(labels_)
# truth_df = truth_df.round(2)
# # based on threshold value, we convert each sector score into boolean
# truth_df = truth_df.astype(float) >= threshold
# truth_df = truth_df.astype(str)
# # collecting list of Sector Labels
# categories = list(truth_df.columns)
# # we collect the Sector Labels as set, None represent the value at the index
# # in the list of Sector Labels.
# truth_df['Sector Label'] = truth_df.apply(lambda x: {i if x[i]=='True' else
#     None for i in categories}, axis=1)
# # we keep all Sector label except None
# truth_df['Sector Label'] = truth_df.apply(lambda x: list(x['Sector Label']
#     -{None}),axis=1)
# haystack_doc['Sector Label'] = list(truth_df['Sector Label'])
# return haystack_doc
 
utils/sector_classifier.py DELETED
@@ -1,107 +0,0 @@
from typing import List, Tuple
from typing_extensions import Literal
import logging
import pandas as pd
from pandas import DataFrame, Series
from utils.config import getconfig
from utils.preprocessing import processingpipeline
import streamlit as st
from transformers import pipeline


@st.cache_resource
def load_sectorClassifier(config_file:str = None, classifier_name:str = None):
    """
    loads the document classifier using haystack, where the name/path of model
    in HF-hub as string is used to fetch the model object.Either configfile or
    model should be passed.
    1. https://docs.haystack.deepset.ai/reference/document-classifier-api
    2. https://docs.haystack.deepset.ai/docs/document_classifier
    Params
    --------
    config_file: config file path from which to read the model name
    classifier_name: if modelname is passed, it takes a priority if not \
    found then will look for configfile, else raise error.
    Return: document classifier model
    """
    if not classifier_name:
        if not config_file:
            logging.warning("Pass either model name or config file")
            return
        else:
            config = getconfig(config_file)
            classifier_name = config.get('sector','MODEL')

    logging.info("Loading sector classifier")
    # we are using the pipeline as the model is multilabel and DocumentClassifier
    # from Haystack doesnt support multilabel
    # in pipeline we use 'sigmoid' to explicitly tell pipeline to make it multilabel
    # if not then it will automatically use softmax, which is not a desired thing.
    # doc_classifier = TransformersDocumentClassifier(
    #     model_name_or_path=classifier_name,
    #     task="text-classification",
    #     top_k = None)

    doc_classifier = pipeline("text-classification",
                              model=classifier_name,
                              return_all_scores=True,
                              function_to_apply= "sigmoid")

    return doc_classifier


@st.cache_data
def sector_classification(haystack_doc:pd.DataFrame,
                          threshold:float = 0.5,
                          classifier_model:pipeline= None
                          )->Tuple[DataFrame,Series]:
    """
    Text-Classification on the list of texts provided. Classifier provides the
    most appropriate label for each text. these labels are in terms of if text
    belongs to which particular Sustainable Devleopment Goal (SDG).
    Params
    ---------
    haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
    contains the list of paragraphs in different format,here the list of
    Haystack Documents is used.
    threshold: threshold value for the model to keep the results from classifier
    classifiermodel: you can pass the classifier model directly,which takes priority
    however if not then looks for model in streamlit session.
    In case of streamlit avoid passing the model directly.
    Returns
    ----------
    df: Dataframe
    """
    logging.info("Working on Sector Identification")
    haystack_doc['Sector Label'] = 'NA'
    if not classifier_model:
        classifier_model = st.session_state['sector_classifier']

    predictions = classifier_model(list(haystack_doc.text))

    # getting the sector label and scores
    list_ = []
    for i in range(len(predictions)):

        temp = predictions[i]
        placeholder = {}
        for j in range(len(temp)):
            placeholder[temp[j]['label']] = temp[j]['score']
        list_.append(placeholder)
    labels_ = [{**list_[l]} for l in range(len(predictions))]
    truth_df = DataFrame.from_dict(labels_)
    truth_df = truth_df.round(2)
    # based on threshold value, we convert each sector score into boolean
    truth_df = truth_df.astype(float) >= threshold
    truth_df = truth_df.astype(str)
    # collecting list of Sector Labels
    categories = list(truth_df.columns)
    # we collect the Sector Labels as set, None represent the value at the index
    # in the list of Sector Labels.
    truth_df['Sector Label'] = truth_df.apply(lambda x: {i if x[i]=='True' else
                               None for i in categories}, axis=1)
    # we keep all Sector label except None
    truth_df['Sector Label'] = truth_df.apply(lambda x: list(x['Sector Label']
                               -{None}),axis=1)
    haystack_doc['Sector Label'] = list(truth_df['Sector Label'])
    return haystack_doc
 
utils/target_classifier.py DELETED
@@ -1,90 +0,0 @@
from typing import List, Tuple
from typing_extensions import Literal
import logging
import pandas as pd
from pandas import DataFrame, Series
from utils.config import getconfig
from utils.preprocessing import processingpipeline
import streamlit as st
from transformers import pipeline

## Labels dictionary ###
_lab_dict = {
    'NEGATIVE':'NO TARGET INFO',
    'TARGET':'TARGET',
}

@st.cache_resource
def load_targetClassifier(config_file:str = None, classifier_name:str = None):
    """
    loads the document classifier using haystack, where the name/path of model
    in HF-hub as string is used to fetch the model object.Either configfile or
    model should be passed.
    1. https://docs.haystack.deepset.ai/reference/document-classifier-api
    2. https://docs.haystack.deepset.ai/docs/document_classifier
    Params
    --------
    config_file: config file path from which to read the model name
    classifier_name: if modelname is passed, it takes a priority if not \
    found then will look for configfile, else raise error.
    Return: document classifier model
    """
    if not classifier_name:
        if not config_file:
            logging.warning("Pass either model name or config file")
            return
        else:
            config = getconfig(config_file)
            classifier_name = config.get('target','MODEL')

    logging.info("Loading classifier")

    doc_classifier = pipeline("text-classification",
                              model=classifier_name,
                              top_k =1)

    return doc_classifier


@st.cache_data
def target_classification(haystack_doc:pd.DataFrame,
                          threshold:float = 0.5,
                          classifier_model:pipeline= None
                          )->Tuple[DataFrame,Series]:
    """
    Text-Classification on the list of texts provided. Classifier provides the
    most appropriate label for each text. these labels are in terms of if text
    belongs to which particular Sustainable Devleopment Goal (SDG).
    Params
    ---------
    haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
    contains the list of paragraphs in different format,here the list of
    Haystack Documents is used.
    threshold: threshold value for the model to keep the results from classifier
    classifiermodel: you can pass the classifier model directly,which takes priority
    however if not then looks for model in streamlit session.
    In case of streamlit avoid passing the model directly.
    Returns
    ----------
    df: Dataframe with two columns['SDG:int', 'text']
    x: Series object with the unique SDG covered in the document uploaded and
    the number of times it is covered/discussed/count_of_paragraphs.
    """
    logging.info("Working on Target Extraction")
    if not classifier_model:
        classifier_model = st.session_state['target_classifier']

    results = classifier_model(list(haystack_doc.text))
    labels_= [(l[0]['label'],
               l[0]['score']) for l in results]


    df1 = DataFrame(labels_, columns=["Target Label","Target Score"])
    df = pd.concat([haystack_doc,df1],axis=1)

    df = df.sort_values(by="Target Score", ascending=False).reset_index(drop=True)
    df['Target Score'] = df['Target Score'].round(2)
    df.index += 1
    # df['Label_def'] = df['Target Label'].apply(lambda i: _lab_dict[i])

    return df
 
utils/uploadAndExample.py DELETED
@@ -1,38 +0,0 @@
import streamlit as st
import tempfile
import json
import time

def add_upload(choice):
    """
    Provdies the user with choice to either 'Upload Document' or 'Try Example'.
    Based on user choice runs streamlit processes and save the path and name of
    the 'file' to streamlit session_state which then can be fetched later.

    """

    if choice == 'Upload Document':

        uploaded_file = st.sidebar.file_uploader('Upload the File',
                                                 type=['pdf', 'docx', 'txt'])
        if uploaded_file is not None:
            with tempfile.NamedTemporaryFile(mode="wb", delete = False) as temp:
                bytes_data = uploaded_file.getvalue()
                temp.write(bytes_data)
                st.session_state['filename'] = uploaded_file.name
                st.session_state['filepath'] = temp.name
                succes = st.success("Upload succesful")
                time.sleep(3)
                succes.empty()


    else:
        # listing the options
        with open('docStore/sample/files.json','r') as json_file:
            files = json.load(json_file)

        option = st.sidebar.selectbox('Select the example document',
                                      list(files.keys()))
        file_name = file_path = files[option]
        st.session_state['filename'] = file_name
        st.session_state['filepath'] = file_path