leavoigt commited on
Commit
2fcb20e
·
1 Parent(s): cb80a49

Delete utils

Browse files
utils/__init__ DELETED
File without changes
utils/adapmit_classifier.py DELETED
@@ -1,99 +0,0 @@
1
- from typing import List, Tuple
2
- from typing_extensions import Literal
3
- import logging
4
- import pandas as pd
5
- from pandas import DataFrame, Series
6
- from utils.config import getconfig
7
- from utils.preprocessing import processingpipeline
8
- import streamlit as st
9
- from transformers import pipeline
10
-
11
- @st.cache_resource
12
- def load_adapmitClassifier(config_file:str = None, classifier_name:str = None):
13
- """
14
- loads the document classifier using haystack, where the name/path of model
15
- in HF-hub as string is used to fetch the model object.Either configfile or
16
- model should be passed.
17
- 1. https://docs.haystack.deepset.ai/reference/document-classifier-api
18
- 2. https://docs.haystack.deepset.ai/docs/document_classifier
19
- Params
20
- --------
21
- config_file: config file path from which to read the model name
22
- classifier_name: if modelname is passed, it takes a priority if not \
23
- found then will look for configfile, else raise error.
24
- Return: document classifier model
25
- """
26
- if not classifier_name:
27
- if not config_file:
28
- logging.warning("Pass either model name or config file")
29
- return
30
- else:
31
- config = getconfig(config_file)
32
- classifier_name = config.get('adapmit','MODEL')
33
-
34
- logging.info("Loading Adaptation Mitigation classifier")
35
- doc_classifier = pipeline("text-classification",
36
- model=classifier_name,
37
- return_all_scores=True,
38
- function_to_apply= "sigmoid")
39
-
40
-
41
- return doc_classifier
42
-
43
-
44
- @st.cache_data
45
- def adapmit_classification(haystack_doc:pd.DataFrame,
46
- threshold:float = 0.5,
47
- classifier_model:pipeline= None
48
- )->Tuple[DataFrame,Series]:
49
- """
50
- Text-Classification on the list of texts provided. Classifier provides the
51
- most appropriate label for each text. these labels are in terms of if text
52
- belongs to which particular Sustainable Devleopment Goal (SDG).
53
- Params
54
- ---------
55
- haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
56
- contains the list of paragraphs in different format,here the list of
57
- Haystack Documents is used.
58
- threshold: threshold value for the model to keep the results from classifier
59
- classifiermodel: you can pass the classifier model directly,which takes priority
60
- however if not then looks for model in streamlit session.
61
- In case of streamlit avoid passing the model directly.
62
- Returns
63
- ----------
64
- df: Dataframe with two columns['SDG:int', 'text']
65
- x: Series object with the unique SDG covered in the document uploaded and
66
- the number of times it is covered/discussed/count_of_paragraphs.
67
- """
68
- logging.info("Working on Adaptation-Mitigation Identification")
69
- haystack_doc['Adapt-Mitig Label'] = 'NA'
70
- # df1 = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
71
- # df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
72
-
73
- if not classifier_model:
74
- classifier_model = st.session_state['adapmit_classifier']
75
-
76
- predictions = classifier_model(list(haystack_doc.text))
77
- # converting the predictions to desired format
78
- list_ = []
79
- for i in range(len(predictions)):
80
-
81
- temp = predictions[i]
82
- placeholder = {}
83
- for j in range(len(temp)):
84
- placeholder[temp[j]['label']] = temp[j]['score']
85
- list_.append(placeholder)
86
- labels_ = [{**list_[l]} for l in range(len(predictions))]
87
- truth_df = DataFrame.from_dict(labels_)
88
- truth_df = truth_df.round(2)
89
- truth_df = truth_df.astype(float) >= threshold
90
- truth_df = truth_df.astype(str)
91
- categories = list(truth_df.columns)
92
- truth_df['Adapt-Mitig Label'] = truth_df.apply(lambda x: {i if x[i]=='True'
93
- else None for i in categories}, axis=1)
94
- truth_df['Adapt-Mitig Label'] = truth_df.apply(lambda x:
95
- list(x['Adapt-Mitig Label'] -{None}),axis=1)
96
- haystack_doc['Adapt-Mitig Label'] = list(truth_df['Adapt-Mitig Label'])
97
- #df = pd.concat([df,df1])
98
-
99
- return haystack_doc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/conditional_classifier.py DELETED
@@ -1,95 +0,0 @@
1
- from typing import List, Tuple
2
- from typing_extensions import Literal
3
- import logging
4
- import pandas as pd
5
- from pandas import DataFrame, Series
6
- from utils.config import getconfig
7
- from utils.preprocessing import processingpipeline
8
- import streamlit as st
9
- from transformers import pipeline
10
-
11
-
12
- @st.cache_resource
13
- def load_conditionalClassifier(config_file:str = None, classifier_name:str = None):
14
- """
15
- loads the document classifier using haystack, where the name/path of model
16
- in HF-hub as string is used to fetch the model object.Either configfile or
17
- model should be passed.
18
- 1. https://docs.haystack.deepset.ai/reference/document-classifier-api
19
- 2. https://docs.haystack.deepset.ai/docs/document_classifier
20
- Params
21
- --------
22
- config_file: config file path from which to read the model name
23
- classifier_name: if modelname is passed, it takes a priority if not \
24
- found then will look for configfile, else raise error.
25
- Return: document classifier model
26
- """
27
- if not classifier_name:
28
- if not config_file:
29
- logging.warning("Pass either model name or config file")
30
- return
31
- else:
32
- config = getconfig(config_file)
33
- classifier_name = config.get('conditional','MODEL')
34
-
35
- logging.info("Loading conditional classifier")
36
- doc_classifier = pipeline("text-classification",
37
- model=classifier_name,
38
- top_k =1)
39
-
40
- return doc_classifier
41
-
42
-
43
- @st.cache_data
44
- def conditional_classification(haystack_doc:pd.DataFrame,
45
- threshold:float = 0.8,
46
- classifier_model:pipeline= None
47
- )->Tuple[DataFrame,Series]:
48
- """
49
- Text-Classification on the list of texts provided. Classifier provides the
50
- most appropriate label for each text. It informs if paragraph contains any
51
- netzero information or not.
52
- Params
53
- ---------
54
- haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
55
- contains the list of paragraphs in different format,here the list of
56
- Haystack Documents is used.
57
- threshold: threshold value for the model to keep the results from classifier
58
- classifiermodel: you can pass the classifier model directly,which takes priority
59
- however if not then looks for model in streamlit session.
60
- In case of streamlit avoid passing the model directly.
61
- Returns
62
- ----------
63
- df: Dataframe
64
- """
65
- logging.info("Working on Conditionality Identification")
66
- haystack_doc['Conditional Label'] = 'NA'
67
- haystack_doc['Conditional Score'] = 0.0
68
- haystack_doc['cond_check'] = False
69
- haystack_doc['PA_check'] = haystack_doc['Policy-Action Label'].apply(lambda x: True if len(x) != 0 else False)
70
-
71
- #df1 = haystack_doc[haystack_doc['PA_check'] == True]
72
- #df = haystack_doc[haystack_doc['PA_check'] == False]
73
- haystack_doc['cond_check'] = haystack_doc.apply(lambda x: True if (
74
- (x['Target Label'] == 'TARGET') | (x['PA_check'] == True)) else
75
- False, axis=1)
76
- # we apply Netzero to only paragraphs which are classified as 'Target' related
77
- temp = haystack_doc[haystack_doc['cond_check'] == True]
78
- temp = temp.reset_index(drop=True)
79
- df = haystack_doc[haystack_doc['cond_check'] == False]
80
- df = df.reset_index(drop=True)
81
-
82
- if not classifier_model:
83
- classifier_model = st.session_state['conditional_classifier']
84
-
85
- results = classifier_model(list(temp.text))
86
- labels_= [(l[0]['label'],l[0]['score']) for l in results]
87
- temp['Conditional Label'],temp['Conditional Score'] = zip(*labels_)
88
- # temp[' Label'] = temp['Netzero Label'].apply(lambda x: _lab_dict[x])
89
- # merging Target with Non Target dataframe
90
- df = pd.concat([df,temp])
91
- df = df.drop(columns = ['cond_check','PA_check'])
92
- df = df.reset_index(drop =True)
93
- df.index += 1
94
-
95
- return df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/config.py DELETED
@@ -1,31 +0,0 @@
1
- import configparser
2
- import logging
3
-
4
- def getconfig(configfile_path:str):
5
- """
6
- configfile_path: file path of .cfg file
7
- """
8
-
9
- config = configparser.ConfigParser()
10
-
11
- try:
12
- config.read_file(open(configfile_path))
13
- return config
14
- except:
15
- logging.warning("config file not found")
16
-
17
-
18
- # Declare all the necessary variables
19
- def get_classifier_params(model_name):
20
- config = getconfig('paramconfig.cfg')
21
- params = {}
22
- params['model_name'] = config.get(model_name,'MODEL')
23
- params['split_by'] = config.get(model_name,'SPLIT_BY')
24
- params['split_length'] = int(config.get(model_name,'SPLIT_LENGTH'))
25
- params['split_overlap'] = int(config.get(model_name,'SPLIT_OVERLAP'))
26
- params['remove_punc'] = bool(int(config.get(model_name,'REMOVE_PUNC')))
27
- params['split_respect_sentence_boundary'] = bool(int(config.get(model_name,'RESPECT_SENTENCE_BOUNDARY')))
28
- params['threshold'] = float(config.get(model_name,'THRESHOLD'))
29
- params['top_n'] = int(config.get(model_name,'TOP_KEY'))
30
-
31
- return params
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/ghg_classifier.py DELETED
@@ -1,96 +0,0 @@
1
- from typing import List, Tuple
2
- from typing_extensions import Literal
3
- import logging
4
- import pandas as pd
5
- from pandas import DataFrame, Series
6
- from utils.config import getconfig
7
- from utils.preprocessing import processingpipeline
8
- import streamlit as st
9
- from transformers import pipeline
10
-
11
- # Labels dictionary ###
12
- _lab_dict = {
13
- 'GHG':'GHG',
14
- 'NOT_GHG':'NON GHG TRANSPORT TARGET',
15
- 'NEGATIVE':'OTHERS',
16
- }
17
-
18
-
19
- @st.cache_resource
20
- def load_ghgClassifier(config_file:str = None, classifier_name:str = None):
21
- """
22
- loads the document classifier using haystack, where the name/path of model
23
- in HF-hub as string is used to fetch the model object.Either configfile or
24
- model should be passed.
25
- 1. https://docs.haystack.deepset.ai/reference/document-classifier-api
26
- 2. https://docs.haystack.deepset.ai/docs/document_classifier
27
- Params
28
- --------
29
- config_file: config file path from which to read the model name
30
- classifier_name: if modelname is passed, it takes a priority if not \
31
- found then will look for configfile, else raise error.
32
- Return: document classifier model
33
- """
34
- if not classifier_name:
35
- if not config_file:
36
- logging.warning("Pass either model name or config file")
37
- return
38
- else:
39
- config = getconfig(config_file)
40
- classifier_name = config.get('ghg','MODEL')
41
-
42
- logging.info("Loading ghg classifier")
43
- doc_classifier = pipeline("text-classification",
44
- model=classifier_name,
45
- top_k =1)
46
-
47
- return doc_classifier
48
-
49
-
50
- @st.cache_data
51
- def ghg_classification(haystack_doc:pd.DataFrame,
52
- threshold:float = 0.5,
53
- classifier_model:pipeline= None
54
- )->Tuple[DataFrame,Series]:
55
- """
56
- Text-Classification on the list of texts provided. Classifier provides the
57
- most appropriate label for each text. these labels are in terms of if text
58
- belongs to which particular Sustainable Devleopment Goal (SDG).
59
- Params
60
- ---------
61
- haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
62
- contains the list of paragraphs in different format,here the list of
63
- Haystack Documents is used.
64
- threshold: threshold value for the model to keep the results from classifier
65
- classifiermodel: you can pass the classifier model directly,which takes priority
66
- however if not then looks for model in streamlit session.
67
- In case of streamlit avoid passing the model directly.
68
- Returns
69
- ----------
70
- df: Dataframe with two columns['SDG:int', 'text']
71
- x: Series object with the unique SDG covered in the document uploaded and
72
- the number of times it is covered/discussed/count_of_paragraphs.
73
- """
74
- logging.info("Working on GHG Extraction")
75
- haystack_doc['GHG Label'] = 'NA'
76
- haystack_doc['GHG Score'] = 0.0
77
- # applying GHG Identifier to only 'Target' paragraphs.
78
- temp = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
79
- temp = temp.reset_index(drop=True)
80
- df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
81
- df = df.reset_index(drop=True)
82
-
83
- if not classifier_model:
84
- classifier_model = st.session_state['ghg_classifier']
85
-
86
- results = classifier_model(list(temp.text))
87
- labels_= [(l[0]['label'],l[0]['score']) for l in results]
88
- temp['GHG Label'],temp['GHG Score'] = zip(*labels_)
89
- temp['GHG Label'] = temp['GHG Label'].apply(lambda x: _lab_dict[x])
90
- # merge back Target and non-Target dataframe
91
- df = pd.concat([df,temp])
92
- df = df.reset_index(drop =True)
93
- df['GHG Score'] = df['GHG Score'].round(2)
94
- df.index += 1
95
-
96
- return df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/indicator_classifier.py DELETED
@@ -1,109 +0,0 @@
1
- from typing import List, Tuple
2
- from typing_extensions import Literal
3
- import logging
4
- import pandas as pd
5
- from pandas import DataFrame, Series
6
- from utils.config import getconfig
7
- from utils.preprocessing import processingpipeline
8
- import streamlit as st
9
- from transformers import pipeline
10
-
11
-
12
- @st.cache_resource
13
- def load_indicatorClassifier(config_file:str = None, classifier_name:str = None):
14
- """
15
- loads the document classifier using haystack, where the name/path of model
16
- in HF-hub as string is used to fetch the model object.Either configfile or
17
- model should be passed.
18
- 1. https://docs.haystack.deepset.ai/reference/document-classifier-api
19
- 2. https://docs.haystack.deepset.ai/docs/document_classifier
20
- Params
21
- --------
22
- config_file: config file path from which to read the model name
23
- classifier_name: if modelname is passed, it takes a priority if not \
24
- found then will look for configfile, else raise error.
25
- Return: document classifier model
26
- """
27
- if not classifier_name:
28
- if not config_file:
29
- logging.warning("Pass either model name or config file")
30
- return
31
- else:
32
- config = getconfig(config_file)
33
- classifier_name = config.get('indicator','MODEL')
34
-
35
- logging.info("Loading indicator classifier")
36
- # we are using the pipeline as the model is multilabel and DocumentClassifier
37
- # from Haystack doesnt support multilabel
38
- # in pipeline we use 'sigmoid' to explicitly tell pipeline to make it multilabel
39
- # if not then it will automatically use softmax, which is not a desired thing.
40
- # doc_classifier = TransformersDocumentClassifier(
41
- # model_name_or_path=classifier_name,
42
- # task="text-classification",
43
- # top_k = None)
44
-
45
- doc_classifier = pipeline("text-classification",
46
- model=classifier_name,
47
- return_all_scores=True,
48
- function_to_apply= "sigmoid")
49
-
50
- return doc_classifier
51
-
52
-
53
- @st.cache_data
54
- def indicator_classification(haystack_doc:pd.DataFrame,
55
- threshold:float = 0.5,
56
- classifier_model:pipeline= None
57
- )->Tuple[DataFrame,Series]:
58
- """
59
- Text-Classification on the list of texts provided. Classifier provides the
60
- most appropriate label for each text. these labels are in terms of if text
61
- belongs to which particular Sustainable Devleopment Goal (SDG).
62
- Params
63
- ---------
64
- haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
65
- contains the list of paragraphs in different format,here the list of
66
- Haystack Documents is used.
67
- threshold: threshold value for the model to keep the results from classifier
68
- classifiermodel: you can pass the classifier model directly,which takes priority
69
- however if not then looks for model in streamlit session.
70
- In case of streamlit avoid passing the model directly.
71
- Returns
72
- ----------
73
- df: Dataframe with two columns['SDG:int', 'text']
74
- x: Series object with the unique SDG covered in the document uploaded and
75
- the number of times it is covered/discussed/count_of_paragraphs.
76
- """
77
- logging.info("Working on Indicator Identification")
78
- haystack_doc['Indicator Label'] = 'NA'
79
- haystack_doc['PA_check'] = haystack_doc['Policy-Action Label'].apply(lambda x: True if len(x) != 0 else False)
80
-
81
- df1 = haystack_doc[haystack_doc['PA_check'] == True]
82
- df = haystack_doc[haystack_doc['PA_check'] == False]
83
- if not classifier_model:
84
- classifier_model = st.session_state['indicator_classifier']
85
-
86
- predictions = classifier_model(list(df1.text))
87
-
88
- list_ = []
89
- for i in range(len(predictions)):
90
-
91
- temp = predictions[i]
92
- placeholder = {}
93
- for j in range(len(temp)):
94
- placeholder[temp[j]['label']] = temp[j]['score']
95
- list_.append(placeholder)
96
- labels_ = [{**list_[l]} for l in range(len(predictions))]
97
- truth_df = DataFrame.from_dict(labels_)
98
- truth_df = truth_df.round(2)
99
- truth_df = truth_df.astype(float) >= threshold
100
- truth_df = truth_df.astype(str)
101
- categories = list(truth_df.columns)
102
- truth_df['Indicator Label'] = truth_df.apply(lambda x: {i if x[i]=='True' else
103
- None for i in categories}, axis=1)
104
- truth_df['Indicator Label'] = truth_df.apply(lambda x: list(x['Indicator Label']
105
- -{None}),axis=1)
106
- df1['Indicator Label'] = list(truth_df['Indicator Label'])
107
- df = pd.concat([df,df1])
108
- df = df.drop(columns = ['PA_check'])
109
- return df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/netzero_classifier.py DELETED
@@ -1,88 +0,0 @@
1
- from typing import List, Tuple
2
- from typing_extensions import Literal
3
- import logging
4
- import pandas as pd
5
- from pandas import DataFrame, Series
6
- from utils.config import getconfig
7
- from utils.preprocessing import processingpipeline
8
- import streamlit as st
9
- from transformers import pipeline
10
-
11
- # Labels dictionary ###
12
- _lab_dict = {
13
- 'NEGATIVE':'NO NETZERO TARGET',
14
- 'NETZERO':'NETZERO TARGET',
15
- }
16
-
17
- @st.cache_resource
18
- def load_netzeroClassifier(config_file:str = None, classifier_name:str = None):
19
- """
20
- loads the document classifier using haystack, where the name/path of model
21
- in HF-hub as string is used to fetch the model object.Either configfile or
22
- model should be passed.
23
- 1. https://docs.haystack.deepset.ai/reference/document-classifier-api
24
- 2. https://docs.haystack.deepset.ai/docs/document_classifier
25
- Params
26
- --------
27
- config_file: config file path from which to read the model name
28
- classifier_name: if modelname is passed, it takes a priority if not \
29
- found then will look for configfile, else raise error.
30
- Return: document classifier model
31
- """
32
- if not classifier_name:
33
- if not config_file:
34
- logging.warning("Pass either model name or config file")
35
- return
36
- else:
37
- config = getconfig(config_file)
38
- classifier_name = config.get('netzero','MODEL')
39
-
40
- logging.info("Loading netzero classifier")
41
- doc_classifier = pipeline("text-classification",
42
- model=classifier_name,
43
- top_k =1)
44
-
45
- return doc_classifier
46
-
47
-
48
- @st.cache_data
49
- def netzero_classification(haystack_doc:pd.DataFrame,
50
- threshold:float = 0.8,
51
- classifier_model:pipeline= None
52
- )->Tuple[DataFrame,Series]:
53
- """
54
- Text-Classification on the list of texts provided. Classifier provides the
55
- most appropriate label for each text. these labels are in terms of if text
56
- belongs to which particular Sustainable Devleopment Goal (SDG).
57
- Params
58
- ---------
59
- haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
60
- contains the list of paragraphs in different format,here the list of
61
- Haystack Documents is used.
62
- threshold: threshold value for the model to keep the results from classifier
63
- classifiermodel: you can pass the classifier model directly,which takes priority
64
- however if not then looks for model in streamlit session.
65
- In case of streamlit avoid passing the model directly.
66
- Returns
67
- ----------
68
- df: Dataframe with two columns['SDG:int', 'text']
69
- x: Series object with the unique SDG covered in the document uploaded and
70
- the number of times it is covered/discussed/count_of_paragraphs.
71
- """
72
- logging.info("Working on Netzero Extraction")
73
- haystack_doc['Netzero Label'] = 'NA'
74
- haystack_doc['Netzero Score'] = 'NA'
75
- temp = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
76
- df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
77
-
78
- if not classifier_model:
79
- classifier_model = st.session_state['netzero_classifier']
80
-
81
- results = classifier_model(list(temp.text))
82
- labels_= [(l[0]['label'],l[0]['score']) for l in results]
83
- temp['Netzero Label'],temp['Netzero Score'] = zip(*labels_)
84
- df = pd.concat([df,temp])
85
- df = df.reset_index(drop =True)
86
- df.index += 1
87
-
88
- return df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/policyaction_classifier.py DELETED
@@ -1,101 +0,0 @@
1
- from typing import List, Tuple
2
- from typing_extensions import Literal
3
- import logging
4
- import pandas as pd
5
- from pandas import DataFrame, Series
6
- from utils.config import getconfig
7
- from utils.preprocessing import processingpipeline
8
- import streamlit as st
9
- from transformers import pipeline
10
-
11
- ## Labels dictionary ###
12
- _lab_dict = {
13
- 'NEGATIVE':'NO TARGET INFO',
14
- 'TARGET':'TARGET',
15
- }
16
-
17
- @st.cache_resource
18
- def load_policyactionClassifier(config_file:str = None, classifier_name:str = None):
19
- """
20
- loads the document classifier using haystack, where the name/path of model
21
- in HF-hub as string is used to fetch the model object.Either configfile or
22
- model should be passed.
23
- 1. https://docs.haystack.deepset.ai/reference/document-classifier-api
24
- 2. https://docs.haystack.deepset.ai/docs/document_classifier
25
- Params
26
- --------
27
- config_file: config file path from which to read the model name
28
- classifier_name: if modelname is passed, it takes a priority if not \
29
- found then will look for configfile, else raise error.
30
- Return: document classifier model
31
- """
32
- if not classifier_name:
33
- if not config_file:
34
- logging.warning("Pass either model name or config file")
35
- return
36
- else:
37
- config = getconfig(config_file)
38
- classifier_name = config.get('policyaction','MODEL')
39
-
40
- logging.info("Loading classifier")
41
-
42
- doc_classifier = pipeline("text-classification",
43
- model=classifier_name,
44
- return_all_scores=True,
45
- function_to_apply= "sigmoid")
46
-
47
- return doc_classifier
48
-
49
-
50
- @st.cache_data
51
- def policyaction_classification(haystack_doc:pd.DataFrame,
52
- threshold:float = 0.5,
53
- classifier_model:pipeline= None
54
- )->Tuple[DataFrame,Series]:
55
- """
56
- Text-Classification on the list of texts provided. Classifier provides the
57
- most appropriate label for each text. these labels are in terms of if text
58
- belongs to which particular Sustainable Devleopment Goal (SDG).
59
- Params
60
- ---------
61
- haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
62
- contains the list of paragraphs in different format,here the list of
63
- Haystack Documents is used.
64
- threshold: threshold value for the model to keep the results from classifier
65
- classifiermodel: you can pass the classifier model directly,which takes priority
66
- however if not then looks for model in streamlit session.
67
- In case of streamlit avoid passing the model directly.
68
- Returns
69
- ----------
70
- df: Dataframe with two columns['SDG:int', 'text']
71
- x: Series object with the unique SDG covered in the document uploaded and
72
- the number of times it is covered/discussed/count_of_paragraphs.
73
- """
74
- logging.info("Working on Policy/Action. Extraction")
75
- haystack_doc['Policy-Action Label'] = 'NA'
76
- if not classifier_model:
77
- classifier_model = st.session_state['policyaction_classifier']
78
-
79
- predictions = classifier_model(list(haystack_doc.text))
80
- list_ = []
81
- for i in range(len(predictions)):
82
-
83
- temp = predictions[i]
84
- placeholder = {}
85
- for j in range(len(temp)):
86
- placeholder[temp[j]['label']] = temp[j]['score']
87
- list_.append(placeholder)
88
- labels_ = [{**list_[l]} for l in range(len(predictions))]
89
- truth_df = DataFrame.from_dict(labels_)
90
- truth_df = truth_df.round(2)
91
- truth_df = truth_df.astype(float) >= threshold
92
- truth_df = truth_df.astype(str)
93
- categories = list(truth_df.columns)
94
- truth_df['Policy-Action Label'] = truth_df.apply(lambda x: {i if x[i]=='True'
95
- else None for i in categories}, axis=1)
96
- truth_df['Policy-Action Label'] = truth_df.apply(lambda x:
97
- list(x['Policy-Action Label'] -{None}),axis=1)
98
-
99
- haystack_doc['Policy-Action Label'] = list(truth_df['Policy-Action Label'])
100
-
101
- return haystack_doc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/preprocessing.py DELETED
@@ -1,291 +0,0 @@
1
- from haystack.nodes.base import BaseComponent
2
- from haystack.schema import Document
3
- from haystack.nodes import PDFToTextOCRConverter, PDFToTextConverter
4
- from haystack.nodes import TextConverter, DocxToTextConverter, PreProcessor
5
- from typing import Callable, Dict, List, Optional, Text, Tuple, Union
6
- from typing_extensions import Literal
7
- import pandas as pd
8
- import logging
9
- import re
10
- import string
11
- from haystack.pipelines import Pipeline
12
-
13
def useOCR(file_path: str) -> Text:
    """
    Extracts the text of an image-only (scanned) PDF by running it through
    the Haystack OCR converter (English only, numeric tables removed).

    Params
    ----------
    file_path: path of the PDF file on disk.

    Returns the content of the first document produced by the converter.
    """
    ocr_converter = PDFToTextOCRConverter(remove_numeric_tables=True,
                                          valid_languages=["eng"])
    return ocr_converter.convert(file_path=file_path, meta=None)[0].content
30
-
31
-
32
-
33
-
34
class FileConverter(BaseComponent):
    """
    Wrapper class to convert an uploaded document into text by calling the
    appropriate Converter class; it internally falls back to the OCR helper
    (useOCR) when the converted text is empty, as happens with image-only
    PDFs. The FileClassifier from haystack cannot be used as it does not have
    any label/output class for images.

    1. https://haystack.deepset.ai/pipeline_nodes/custom-nodes
    2. https://docs.haystack.deepset.ai/docs/file_converters
    3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/file_converter
    4. https://docs.haystack.deepset.ai/reference/file-converters-api
    """

    outgoing_edges = 1

    def run(self, file_name: str, file_path: str, encoding: Optional[str] = None,
            id_hash_keys: Optional[List[str]] = None,
            ) -> Tuple[dict, str]:
        """ this is the required method to invoke the component in
        the pipeline implementation.

        Params
        ----------
        file_name: name of the file; its extension (.pdf, .txt or .docx,
            matched case-insensitively) selects the converter.
        file_path: path of the file on disk.
        encoding: forwarded unchanged to the converter.
        id_hash_keys: forwarded to the converter and to the created Document.

        See the links provided in Class docstring/description to see other params

        Return
        ---------
        output: dictionary with the key 'documents' holding a list with one
            Haystack Document (content = extracted text, meta = file name).

        output_1: As there is only one outgoing edge, we pass 'output_1' string

        Raises
        ---------
        ValueError: if the file extension is not one of .pdf, .txt, .docx.
        Any exception raised while building the converter is logged and
        re-raised.
        """
        lowered_name = file_name.lower()
        try:
            if lowered_name.endswith('.pdf'):
                converter = PDFToTextConverter(remove_numeric_tables=True)
            elif lowered_name.endswith('.txt'):
                converter = TextConverter(remove_numeric_tables=True)
            elif lowered_name.endswith('.docx'):
                converter = DocxToTextConverter()
            else:
                # Previously this fell through and crashed later with an
                # unbound 'converter'; fail early with a clear message.
                raise ValueError(
                    "Unsupported file type for '{}': expected .pdf, .txt "
                    "or .docx".format(file_name))
        except Exception as e:
            # Returning None here would break the (output, edge) contract of
            # a pipeline node, so log and propagate instead.
            logging.error(e)
            raise

        # encoding is empty, probably should be utf-8
        document = converter.convert(
            file_path=file_path, meta=None,
            encoding=encoding, id_hash_keys=id_hash_keys
        )[0]

        text = document.content

        # in case of scanned/images only PDF the content might contain only
        # the page separator (\f or \x0c). We check if that is so and use
        # the OCR to get the text.
        if text.replace('\x0c', '') == "":
            logging.info("Using OCR")
            text = useOCR(file_path)

        documents = [Document(content=text,
                              meta={"name": file_name},
                              id_hash_keys=id_hash_keys)]

        logging.info('file conversion succesful')
        output = {'documents': documents}
        return output, 'output_1'

    def run_batch(self):
        """
        we dont have requirement to process the multiple files in one go
        therefore nothing here, however to use the custom node we need to have
        this method for the class.
        """
        return
121
-
122
-
123
def basic(s: str, remove_punc: bool = False):
    """
    Performs basic cleaning of text.

    Params
    ----------
    s: string to be processed
    remove_punc: to remove all Punctuation including ',' and '.' or not

    Returns: processed string, with URLs and newlines replaced by spaces,
    single quotes replaced by spaces, '..' sequences removed and
    leading/trailing whitespace stripped.
    """

    # Remove URLs: first whole lines that start with a URL, then any
    # remaining 'http...' token inside a line.
    s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
    s = re.sub(r"http\S+", " ", s)

    # Remove new line characters
    s = s.replace('\n', ' ')

    # Remove punctuations
    if remove_punc:
        s = s.translate(str.maketrans('', '', string.punctuation))

    # Remove distracting single quotes and dotted pattern
    s = s.replace("'", " ").replace("..", "")

    return s.strip()
152
-
153
def paraLengthCheck(paraList, max_len = 100):
    """
    There are cases where preprocessor cannot respect word limit, when using
    respect sentence boundary flag due to missing sentence boundaries.
    Therefore we run one more round of split here for those paragraphs.

    Params
    ---------------
    paraList : list of paragraph objects exposing `.content` (text) and
        `.meta['page']`.
    max_len : max number of whitespace-separated words allowed per paragraph.

    Returns
    ---------------
    list of (text, page) tuples. Paragraphs within the limit are passed
    through with their content unchanged; longer ones are cut into
    consecutive chunks of at most max_len words, joined by single spaces.
    """
    new_para_list = []
    for passage in paraList:
        words = passage.content.split()
        page = passage.meta['page']
        if len(words) > max_len:
            # Stepping by max_len yields only non-empty chunks, so a word
            # count that is an exact multiple of max_len no longer produces
            # a trailing empty paragraph.
            new_para_list.extend(
                (" ".join(words[start:start + max_len]), page)
                for start in range(0, len(words), max_len))
        else:
            # paragraphs which dont need any splitting
            new_para_list.append((passage.content, page))

    logging.info("New paragraphs length {}".format(len(new_para_list)))
    return new_para_list
184
-
185
class UdfPreProcessor(BaseComponent):
    """
    class to preprocess the document returned by FileConverter. It will check
    for splitting strategy and splits the document by word or sentences and then
    synthetically create the paragraphs.

    1. https://docs.haystack.deepset.ai/docs/preprocessor
    2. https://docs.haystack.deepset.ai/reference/preprocessor-api
    3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/preprocessor
    """
    outgoing_edges = 1

    def run(self, documents: List[Document], remove_punc: bool = False,
            split_by: Literal["sentence", "word"] = 'sentence',
            split_length: int = 2, split_respect_sentence_boundary: bool = False,
            split_overlap: int = 0):
        """ this is the required method to invoke the component in
        the pipeline implementation.

        Params
        ----------
        documents: documents from the output dictionary returned by Fileconverter
        remove_punc: to remove all Punctuation including ',' and '.' or not
        split_by: document splitting strategy either as word or sentence
        split_length: when synthetically creating the paragraphs from document,
            it defines the length of paragraph.
        split_respect_sentence_boundary: Used when using 'word' strategy for
            splitting of text; it is forced to False for the 'sentence' strategy.
        split_overlap: Number of words or sentences that overlap when creating
            the paragraphs. This is done as one sentence or 'some words' make sense
            when read in together with others. Therefore the overlap is used.

        Return
        ---------
        output: dictionary with four entries: 'documents' (the processed
            Haystack documents), 'dataframe' (a DataFrame built from them),
            'text' (all paragraph texts joined by a space) and 'paraList'
            (the paragraph texts as a list).

        output_1: As there is only one outgoing edge, we pass 'output_1' string
        """

        if split_by == 'sentence':
            split_respect_sentence_boundary = False

        preprocessor = PreProcessor(
            clean_empty_lines=True,
            clean_whitespace=True,
            clean_header_footer=True,
            split_by=split_by,
            split_length=split_length,
            split_respect_sentence_boundary=split_respect_sentence_boundary,
            split_overlap=split_overlap,

            # will add page number only in case of PDF not for text/docx file.
            add_page_number=True
        )

        # Collect the paragraphs of every input document; previously each
        # iteration overwrote the result, so only the last document survived
        # and an empty input left the variable unbound.
        docs_processed = []
        for document in documents:
            for item in preprocessor.process([document]):
                item.content = basic(item.content, remove_punc=remove_punc)
                docs_processed.append(item)

        df = pd.DataFrame(docs_processed)
        # Read the texts from the documents themselves so that an empty
        # result (a frame without a 'content' column) does not fail.
        para_list = [item.content for item in docs_processed]
        all_text = " ".join(para_list)
        logging.info('document split into {} paragraphs'.format(len(para_list)))
        output = {'documents': docs_processed,
                  'dataframe': df,
                  'text': all_text,
                  'paraList': para_list
                  }
        return output, "output_1"

    def run_batch(self):
        """
        we dont have requirement to process the multiple files in one go
        therefore nothing here, however to use the custom node we need to have
        this method for the class.
        """
        return
273
-
274
def processingpipeline():
    """
    Builds and returns the preprocessing pipeline: a FileConverter node fed
    by the "File" input, followed by a UdfPreProcessor node fed by the
    FileConverter node.
    """
    pipeline_ = Pipeline()

    # (component, node name, inputs) in the order the nodes are chained.
    stages = [
        (FileConverter(), "FileConverter", ["File"]),
        (UdfPreProcessor(), 'UdfPreProcessor', ["FileConverter"]),
    ]
    for component, node_name, node_inputs in stages:
        pipeline_.add_node(component=component, name=node_name,
                           inputs=node_inputs)

    return pipeline_
291
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/sector_classifier.py DELETED
@@ -1,106 +0,0 @@
1
- from typing import List, Tuple
2
- from typing_extensions import Literal
3
- import logging
4
- import pandas as pd
5
- from pandas import DataFrame, Series
6
- from utils.config import getconfig
7
- from utils.preprocessing import processingpipeline
8
- import streamlit as st
9
- from transformers import pipeline
10
-
11
-
12
@st.cache_resource
def load_sectorClassifier(config_file:str = None, classifier_name:str = None):
    """
    Builds the sector text-classification pipeline (transformers `pipeline`)
    from a model name/path. Either config_file or classifier_name should be
    passed.

    Params
    --------
    config_file: config file path from which to read the model name
        (section 'sector', option 'MODEL'); used only when classifier_name
        is not given.
    classifier_name: model name/path; takes priority over config_file.

    Return: the classifier pipeline, or None (after logging a warning) when
    neither argument is provided.
    """
    if not classifier_name and not config_file:
        logging.warning("Pass either model name or config file")
        return None

    if not classifier_name:
        classifier_name = getconfig(config_file).get('sector','MODEL')

    logging.info("Loading sector classifier")
    # The transformers pipeline is used because the model is multilabel and
    # the DocumentClassifier from Haystack doesnt support multilabel.
    # 'sigmoid' is passed explicitly to make the pipeline multilabel;
    # otherwise it would automatically use softmax, which is not desired.
    return pipeline("text-classification",
                    model=classifier_name,
                    return_all_scores=True,
                    function_to_apply= "sigmoid")
51
-
52
-
53
@st.cache_data
def sector_classification(haystack_doc:pd.DataFrame,
                          threshold:float = 0.5,
                          classifier_model:pipeline= None
                          )->DataFrame:
    """
    Multi-label sector classification of the texts in a DataFrame. Every text
    is scored for every label by the classifier; the labels whose score
    (rounded to 2 decimals) is >= threshold are kept for that text.

    Params
    ---------
    haystack_doc: DataFrame with a 'text' column holding the paragraphs.
        A 'Sector Label' column is added to this same DataFrame.
    threshold: minimum (rounded) score for a label to be kept.
    classifier_model: callable classifier; takes priority when passed,
        otherwise the model stored in st.session_state['sector_classifier']
        is used.

    Returns
    ----------
    haystack_doc: the input DataFrame, where 'Sector Label' holds for each
        row the list of labels that passed the threshold (possibly empty).
    """
    logging.info("Working on Sector Identification")
    haystack_doc['Sector Label'] = 'NA'
    if classifier_model is None:
        classifier_model = st.session_state['sector_classifier']

    # One entry per text; each entry is a list of {'label', 'score'} dicts.
    predictions = classifier_model(list(haystack_doc.text))
    score_records = [{entry['label']: entry['score'] for entry in prediction}
                     for prediction in predictions]

    scores = DataFrame.from_dict(score_records).round(2).astype(float)
    # A label missing for a row is NaN, and NaN >= threshold is False.
    passed = scores >= threshold

    # Select the label names straight from the boolean mask. This keeps the
    # labels in column order; the former set-based version produced a
    # hash-dependent order.
    haystack_doc['Sector Label'] = [list(passed.columns[row_mask])
                                    for row_mask in passed.to_numpy()]
    return haystack_doc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/target_classifier.py DELETED
@@ -1,89 +0,0 @@
1
- from typing import List, Tuple
2
- from typing_extensions import Literal
3
- import logging
4
- import pandas as pd
5
- from pandas import DataFrame, Series
6
- from utils.config import getconfig
7
- from utils.preprocessing import processingpipeline
8
- import streamlit as st
9
- from transformers import pipeline
10
-
11
- ## Labels dictionary ###
12
- _lab_dict = {
13
- 'NEGATIVE':'NO TARGET INFO',
14
- 'TARGET':'TARGET',
15
- }
16
-
17
@st.cache_resource
def load_targetClassifier(config_file:str = None, classifier_name:str = None):
    """
    Builds the target text-classification pipeline (transformers `pipeline`,
    top_k=1) from a model name/path. Either config_file or classifier_name
    should be passed.

    Params
    --------
    config_file: config file path from which to read the model name
        (section 'target', option 'MODEL'); used only when classifier_name
        is not given.
    classifier_name: model name/path; takes priority over config_file.

    Return: the classifier pipeline, or None (after logging a warning) when
    neither argument is provided.
    """
    if not classifier_name and not config_file:
        logging.warning("Pass either model name or config file")
        return None

    if not classifier_name:
        classifier_name = getconfig(config_file).get('target','MODEL')

    logging.info("Loading classifier")
    return pipeline("text-classification",
                    model=classifier_name,
                    top_k =1)
47
-
48
-
49
@st.cache_data
def target_classification(haystack_doc:pd.DataFrame,
                          threshold:float = 0.5,
                          classifier_model:pipeline= None
                          )->DataFrame:
    """
    Target classification of the texts in a DataFrame. The first
    (label, score) entry returned by the classifier for each text is attached
    to that text, and the rows are ranked by score.

    Params
    ---------
    haystack_doc: DataFrame with a 'text' column holding the paragraphs.
    threshold: accepted for interface compatibility; not used by this
        function.
    classifier_model: callable classifier; takes priority when passed,
        otherwise the model stored in st.session_state['target_classifier']
        is used.

    Returns
    ----------
    df: copy of the input with the extra columns 'Target Label',
        'Relevancy' (score) and 'Label_def' (description looked up in
        _lab_dict), sorted by 'Relevancy' in descending order and
        re-indexed starting at 1.
    """
    logging.info("Working on Target Extraction")
    if classifier_model is None:
        classifier_model = st.session_state['target_classifier']

    results = classifier_model(list(haystack_doc.text))
    labels_ = [(result[0]['label'], result[0]['score']) for result in results]

    # Reuse the input index: concat(axis=1) aligns on index, so predictions
    # built on a fresh 0..n-1 index would be paired with the wrong rows
    # whenever haystack_doc carries a non-default index.
    df1 = DataFrame(labels_, columns=["Target Label","Relevancy"],
                    index=haystack_doc.index)
    df = pd.concat([haystack_doc,df1],axis=1)

    df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
    df.index += 1
    df['Label_def'] = df['Target Label'].apply(lambda i: _lab_dict[i])

    return df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/uploadAndExample.py DELETED
@@ -1,39 +0,0 @@
1
- import streamlit as st
2
- import tempfile
3
- import json
4
-
5
def add_upload(choice):
    """
    Renders the sidebar widget matching the user's choice and records the
    selected file in the streamlit session_state under the keys 'filename'
    and 'filepath', from where it can be fetched later.

    choice == 'Upload Document': shows a file uploader (pdf/docx/txt); an
    uploaded file is written to a named temporary file that is kept on disk.
    Any other choice: shows a select box with the example documents listed in
    'docStore/sample/files.json'; the selected entry's value is used as both
    file name and file path.
    """
    if choice != 'Upload Document':
        # listing the options
        with open('docStore/sample/files.json','r') as json_file:
            examples = json.load(json_file)

        selected = st.sidebar.selectbox('Select the example document',
                                        list(examples.keys()))
        example_path = examples[selected]
        st.session_state['filename'] = example_path
        st.session_state['filepath'] = example_path
        return

    uploaded_file = st.sidebar.file_uploader('Upload the File',
                                             type=['pdf', 'docx', 'txt'])
    if uploaded_file is None:
        return

    # delete=False keeps the file on disk after the handle is closed so the
    # stored path stays usable.
    with tempfile.NamedTemporaryFile(mode="wb", delete = False) as temp:
        temp.write(uploaded_file.getvalue())
        st.session_state['filename'] = uploaded_file.name
        st.session_state['filepath'] = temp.name