ppsingh committed on
Commit
ae31548
·
1 Parent(s): 91648ac

Update utils/adapmit_classifier.py

Browse files
Files changed (1) hide show
  1. utils/adapmit_classifier.py +100 -101
utils/adapmit_classifier.py CHANGED
@@ -1,101 +1,100 @@
1
- from haystack.schema import Document
2
- from typing import List, Tuple
3
- from typing_extensions import Literal
4
- import logging
5
- import pandas as pd
6
- from pandas import DataFrame, Series
7
- from utils.config import getconfig
8
- from utils.preprocessing import processingpipeline
9
- import streamlit as st
10
- from haystack.nodes import TransformersDocumentClassifier
11
- from transformers import pipeline
12
-
13
@st.cache_resource
def load_adapmitClassifier(config_file:str = None, classifier_name:str = None):
    """
    Build the Adaptation/Mitigation text classifier as a transformers
    "text-classification" pipeline. The model is identified by its
    name/path string; either the model name or a config file should be
    passed.

    Params
    --------
    config_file: config file path from which to read the model name
        (section 'adapmit', option 'MODEL'). Only consulted when
        classifier_name is not given.
    classifier_name: model name/path; takes priority over config_file.

    Return: the text-classification pipeline, or None (after logging a
        warning) when neither classifier_name nor config_file is provided.
    """
    if not classifier_name:
        if not config_file:
            # Nothing tells us which model to load: warn and give back None.
            logging.warning("Pass either model name or config file")
            return None
        classifier_name = getconfig(config_file).get('adapmit','MODEL')

    logging.info("Loading Adaptation Mitigation classifier")
    # return_all_scores=True asks for the score of every label, and the
    # scores are produced with a sigmoid applied to the model output.
    # NOTE(review): newer transformers releases appear to deprecate
    # return_all_scores in favour of top_k=None - confirm before upgrading.
    return pipeline(
        "text-classification",
        model=classifier_name,
        return_all_scores=True,
        function_to_apply="sigmoid",
    )
44
-
45
-
46
@st.cache_data
def adapmit_classification(haystack_doc:pd.DataFrame,
                        threshold:float = 0.5,
                        classifier_model:pipeline= None
                        )->DataFrame:
    """
    Tag every 'TARGET' paragraph with its Adaptation/Mitigation labels.

    Rows whose 'Target Label' is 'TARGET' are sent through the classifier;
    each label whose score (rounded to two decimals) reaches the threshold
    is kept for that row. Rows whose 'Target Label' is 'NEGATIVE' are not
    classified and keep the placeholder 'NA'.

    Params
    ---------
    haystack_doc: DataFrame with (at least) the columns 'text' and
        'Target Label'. Note that the column 'Adapt-Mitig Label' is added
        to this very frame (filled with 'NA') as a side effect.
    threshold: threshold value for the model to keep the results from
        classifier; compared against the score rounded to two decimals.
    classifier_model: you can pass the classifier model directly, which takes
        priority; if not passed the model is read from
        st.session_state['adapmit_classifier'].
        In case of streamlit avoid passing the model directly.
        NOTE(review): st.cache_data hashes the arguments, and a pipeline
        object is presumably not hashable - confirm.

    Returns
    ----------
    df: DataFrame holding the 'NEGATIVE' rows followed by the 'TARGET' rows.
        Column 'Adapt-Mitig Label' is 'NA' for the former and a list of
        label names (possibly empty) for the latter. Rows with any other
        'Target Label' value are not part of the result.
    """
    logging.info("Working on Adaptation-Mitigation Identification")
    # Kept for backward compatibility: the caller's frame gains the column.
    haystack_doc['Adapt-Mitig Label'] = 'NA'
    # .copy() gives an independent frame, so the label assignment below does
    # not write into a slice of haystack_doc (chained-assignment pitfall).
    targets = haystack_doc[haystack_doc['Target Label'] == 'TARGET'].copy()
    negatives = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']

    # Nothing to classify: skip the model lookup and the model call entirely.
    if targets.empty:
        return pd.concat([negatives, targets])

    if not classifier_model:
        classifier_model = st.session_state['adapmit_classifier']

    predictions = classifier_model(list(targets.text))
    targets['Adapt-Mitig Label'] = _labels_above_threshold(predictions,
                                                           threshold)
    return pd.concat([negatives, targets])


def _labels_above_threshold(predictions, threshold):
    """
    Turn classifier output into one list of accepted labels per text.

    predictions is expected to hold, for each text, a sequence of
    {'label': ..., 'score': ...} dicts. The result lists labels in the
    column order of the intermediate score table (order of first appearance).
    """
    # One row per text, one column per label. A label missing from a
    # prediction becomes NaN, which never satisfies the comparison.
    scores = DataFrame(
        [{item['label']: item['score'] for item in prediction}
         for prediction in predictions]
    )
    # Same rounding as before (DataFrame.round) so borderline scores are
    # accepted or rejected exactly as they used to be.
    passed = scores.round(2).astype(float) >= threshold
    return [list(passed.columns[row]) for row in passed.to_numpy()]
 
1
+ from haystack.schema import Document
2
+ from typing import List, Tuple
3
+ from typing_extensions import Literal
4
+ import logging
5
+ import pandas as pd
6
+ from pandas import DataFrame, Series
7
+ from utils.config import getconfig
8
+ from utils.preprocessing import processingpipeline
9
+ import streamlit as st
10
+ from transformers import pipeline
11
+
12
@st.cache_resource
def load_adapmitClassifier(config_file:str = None, classifier_name:str = None):
    """
    Build the Adaptation/Mitigation text classifier as a transformers
    "text-classification" pipeline. The model is identified by its
    name/path string; either the model name or a config file should be
    passed.

    Params
    --------
    config_file: config file path from which to read the model name
        (section 'adapmit', option 'MODEL'). Only consulted when
        classifier_name is not given.
    classifier_name: model name/path; takes priority over config_file.

    Return: the text-classification pipeline, or None (after logging a
        warning) when neither classifier_name nor config_file is provided.
    """
    if not classifier_name:
        if not config_file:
            # Nothing tells us which model to load: warn and give back None.
            logging.warning("Pass either model name or config file")
            return None
        classifier_name = getconfig(config_file).get('adapmit','MODEL')

    logging.info("Loading Adaptation Mitigation classifier")
    # return_all_scores=True asks for the score of every label, and the
    # scores are produced with a sigmoid applied to the model output.
    # NOTE(review): newer transformers releases appear to deprecate
    # return_all_scores in favour of top_k=None - confirm before upgrading.
    return pipeline(
        "text-classification",
        model=classifier_name,
        return_all_scores=True,
        function_to_apply="sigmoid",
    )
43
+
44
+
45
@st.cache_data
def adapmit_classification(haystack_doc:pd.DataFrame,
                        threshold:float = 0.5,
                        classifier_model:pipeline= None
                        )->DataFrame:
    """
    Tag every 'TARGET' paragraph with its Adaptation/Mitigation labels.

    Rows whose 'Target Label' is 'TARGET' are sent through the classifier;
    each label whose score (rounded to two decimals) reaches the threshold
    is kept for that row. Rows whose 'Target Label' is 'NEGATIVE' are not
    classified and keep the placeholder 'NA'.

    Params
    ---------
    haystack_doc: DataFrame with (at least) the columns 'text' and
        'Target Label'. Note that the column 'Adapt-Mitig Label' is added
        to this very frame (filled with 'NA') as a side effect.
    threshold: threshold value for the model to keep the results from
        classifier; compared against the score rounded to two decimals.
    classifier_model: you can pass the classifier model directly, which takes
        priority; if not passed the model is read from
        st.session_state['adapmit_classifier'].
        In case of streamlit avoid passing the model directly.
        NOTE(review): st.cache_data hashes the arguments, and a pipeline
        object is presumably not hashable - confirm.

    Returns
    ----------
    df: DataFrame holding the 'NEGATIVE' rows followed by the 'TARGET' rows.
        Column 'Adapt-Mitig Label' is 'NA' for the former and a list of
        label names (possibly empty) for the latter. Rows with any other
        'Target Label' value are not part of the result.
    """
    logging.info("Working on Adaptation-Mitigation Identification")
    # Kept for backward compatibility: the caller's frame gains the column.
    haystack_doc['Adapt-Mitig Label'] = 'NA'
    # .copy() gives an independent frame, so the label assignment below does
    # not write into a slice of haystack_doc (chained-assignment pitfall).
    targets = haystack_doc[haystack_doc['Target Label'] == 'TARGET'].copy()
    negatives = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']

    # Nothing to classify: skip the model lookup and the model call entirely.
    if targets.empty:
        return pd.concat([negatives, targets])

    if not classifier_model:
        classifier_model = st.session_state['adapmit_classifier']

    predictions = classifier_model(list(targets.text))
    targets['Adapt-Mitig Label'] = _labels_above_threshold(predictions,
                                                           threshold)
    return pd.concat([negatives, targets])


def _labels_above_threshold(predictions, threshold):
    """
    Turn classifier output into one list of accepted labels per text.

    predictions is expected to hold, for each text, a sequence of
    {'label': ..., 'score': ...} dicts. The result lists labels in the
    column order of the intermediate score table (order of first appearance).
    """
    # One row per text, one column per label. A label missing from a
    # prediction becomes NaN, which never satisfies the comparison.
    scores = DataFrame(
        [{item['label']: item['score'] for item in prediction}
         for prediction in predictions]
    )
    # Same rounding as before (DataFrame.round) so borderline scores are
    # accepted or rejected exactly as they used to be.
    passed = scores.round(2).astype(float) >= threshold
    return [list(passed.columns[row]) for row in passed.to_numpy()]