prashant committed

Commit 685552c • Parent(s): 9119fa1

update sdg

Files changed:
- appStore/sdg_analysis.py   +4 -9
- paramconfig.cfg            +1 -0
- utils/sdg_classifier.py    +28 -23
- utils/streamlitcheck.py    +0 -19
appStore/sdg_analysis.py CHANGED

@@ -2,9 +2,6 @@
 import glob, os, sys;
 sys.path.append('../utils')
 
-#import helper
-
-
 #import needed libraries
 import seaborn as sns
 import matplotlib.pyplot as plt
@@ -16,9 +13,6 @@ from docx.shared import Pt
 from docx.enum.style import WD_STYLE_TYPE
 from utils.sdg_classifier import sdg_classification
 from utils.sdg_classifier import runSDGPreprocessingPipeline
-# from utils.streamlitcheck import check_streamlit
-import tempfile
-import sqlite3
 import logging
 logger = logging.getLogger(__name__)
 
@@ -47,15 +41,16 @@ def app():
 
 
     if 'filepath' in st.session_state:
-        allDocuments = runSDGPreprocessingPipeline()
-        if len(allDocuments['documents']) > 100:
+        allDocuments = runSDGPreprocessingPipeline(st.session_state['filepath'],
+                                                   st.session_state['filename'])
+        if len(allDocuments['documents']) > 100:
            warning_msg = ": This might take sometime, please sit back and relax."
        else:
            warning_msg = ""
 
        with st.spinner("Running SDG Classification{}".format(warning_msg)):
 
-            df, x = sdg_classification(
+            df, x = sdg_classification(allDocuments['documents'])
 
            plt.rcParams['font.size'] = 25
            colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
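Note: with the session-state lookups moved out of runSDGPreprocessingPipeline, the helper now takes the file path and name explicitly. A minimal usage sketch (not part of the commit; "policy.pdf" is a hypothetical input) of how the two helpers chain outside Streamlit:

    # Hypothetical standalone usage of the updated helpers.
    from utils.sdg_classifier import runSDGPreprocessingPipeline, sdg_classification

    # The preprocessing pipeline returns a dict; paragraphs live under 'documents'.
    output = runSDGPreprocessingPipeline("policy.pdf", "policy.pdf")
    df, x = sdg_classification(output['documents'])
    print(df[["SDG", "Relevancy"]].head())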
paramconfig.cfg CHANGED

@@ -22,6 +22,7 @@ SPLIT_OVERLAP = 0
 THRESHOLD = 0.85
 MODEL = jonas/sdg_classifier_osdg
 SPLIT_BY = word
+REMOVE_PUNC = 0
 SPLIT_LENGTH = 110
 SPLIT_OVERLAP = 10
 
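Note: the new REMOVE_PUNC flag is stored as 0/1. configparser returns it as a string, so the reader added in utils/sdg_classifier.py converts via int() before bool() — a short sketch of the parsing (bool("0") alone would be True, since any non-empty string is truthy):

    import configparser

    config = configparser.ConfigParser()
    config.read('paramconfig.cfg')
    # "0" -> 0 -> False, "1" -> 1 -> True
    remove_punc = bool(int(config.get('sdg', 'REMOVE_PUNC')))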
utils/sdg_classifier.py CHANGED

@@ -2,21 +2,28 @@ from haystack.nodes import TransformersDocumentClassifier
 from haystack.schema import Document
 from typing import List, Tuple
 import configparser
-import streamlit as st
-from utils.streamlitcheck import check_streamlit
-from pandas import DataFrame, Series
 import logging
+from pandas import DataFrame, Series
 from utils.preprocessing import processingpipeline
+try:
+    import streamlit as st
+except ImportError:
+    logging.info("Streamlit not installed")
 config = configparser.ConfigParser()
-config.read_file(open('paramconfig.cfg'))
+try:
+    config.read_file(open('paramconfig.cfg'))
+except Exception:
+    logging.info("paramconfig file not found")
+    st.info("Please place the paramconfig file in the same directory as app.py")
 
 
+@st.cache
 def load_sdgClassifier():
     """
     loads the document classifier using haystack, where the name/path of model
     in HF-hub as string is used to fetch the model object.
-
-
+    1. https://docs.haystack.deepset.ai/reference/document-classifier-api
+    2. https://docs.haystack.deepset.ai/docs/document_classifier
 
     Return: document classifier model
     """
@@ -28,6 +35,8 @@ def load_sdgClassifier():
     return doc_classifier
 
 
+
+@st.cache
 def sdg_classification(haystackdoc:List[Document])->Tuple[DataFrame,Series]:
     """
     Text-Classification on the list of texts provided. Classifier provides the
@@ -50,16 +59,13 @@ def sdg_classification(haystackdoc:List[Document])->Tuple[DataFrame,Series]:
     logging.info("running SDG classifiication")
     threshold = float(config.get('sdg','THRESHOLD'))
 
-
-    if check_streamlit():
-        classifier = st.cache(load_sdgClassifier, allow_output_mutation=True)
-    else:
-        classifier = load_sdgClassifier()
+
+    classifier = load_sdgClassifier()
     results = classifier.predict(haystackdoc)
 
 
     labels_= [(l.meta['classification']['label'],
-
+               l.meta['classification']['score'],l.content,) for l in results]
 
     df = DataFrame(labels_, columns=["SDG","Relevancy","text"])
 
@@ -72,7 +78,7 @@ def sdg_classification(haystackdoc:List[Document])->Tuple[DataFrame,Series]:
 
     return df, x
 
-def runSDGPreprocessingPipeline(file_path = None, file_name = None)->List[Document]:
+def runSDGPreprocessingPipeline(file_path, file_name)->List[Document]:
     """
     creates the pipeline and runs the preprocessing pipeline,
     the params for pipeline are fetched from paramconfig
@@ -80,12 +86,12 @@ def runSDGPreprocessingPipeline(file_path = None, file_name = None)->List[Document]:
     Param
     ------------
 
-    file_path: filepath, if not given will check for filepath in streamlit
-                session_state
+    file_name: filename, in case of streamlit application use
+                st.session_state['filename']
+    file_path: filepath, in case of streamlit application use
+                st.session_state['filepath']
+
 
-    file_name: filename, if not given will check for file_name in streamlit
-                session_state
-
     Return
     --------------
     List[Document]: When preprocessing pipeline is run, the output dictionary
@@ -94,21 +100,20 @@ def runSDGPreprocessingPipeline(file_path = None, file_name = None)->List[Document]:
     key = 'documents' on output.
 
     """
-
-    file_path = st.session_state['filepath']
-    file_name = st.session_state['filename']
+
    sdg_processing_pipeline = processingpipeline()
    split_by = config.get('sdg','SPLIT_BY')
    split_length = int(config.get('sdg','SPLIT_LENGTH'))
    split_overlap = int(config.get('sdg','SPLIT_OVERLAP'))
+    remove_punc = bool(int(config.get('sdg','REMOVE_PUNC')))
 
 
    output_sdg_pre = sdg_processing_pipeline.run(file_paths = file_path,
                            params= {"FileConverter": {"file_path": file_path, \
                                        "file_name": file_name},
-                                     "UdfPreProcessor": {"removePunc":
+                                     "UdfPreProcessor": {"removePunc": remove_punc, \
                                        "split_by": split_by, \
                                        "split_length":split_length,\
                                        "split_overlap": split_overlap}})
 
-    return output_sdg_pre
+    return output_sdg_pre
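Note on the pattern: the deleted run-time branch (classifier = st.cache(load_sdgClassifier, allow_output_mutation=True) under check_streamlit()) is replaced by @st.cache at definition time plus an import guard. A condensed sketch of the guard as committed — the logging fallback keeps the import statement itself from failing, but the decorators further down still reference st:

    import logging

    # Import guard as committed: tolerate a missing streamlit package here...
    try:
        import streamlit as st
    except ImportError:
        logging.info("Streamlit not installed")

    # ...note that module-level decorators such as @st.cache still evaluate
    # st when the module loads, so the module as a whole continues to assume
    # Streamlit is installed.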
utils/streamlitcheck.py DELETED

@@ -1,19 +0,0 @@
-def check_streamlit():
-    """
-    Function to check whether python code is run within streamlit
-
-    Returns
-    -------
-    use_streamlit : boolean
-        True if code is run within streamlit, else False
-    """
-    try:
-        from streamlit.scriptrunner.script_run_context import get_script_run_ctx
-        if not get_script_run_ctx():
-            use_streamlit = False
-        else:
-            use_streamlit = True
-    except ModuleNotFoundError:
-        use_streamlit = False
-    return use_streamlit
-
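Note: callers now rely on the import-time guard in utils/sdg_classifier.py instead of this helper. The two checks are not equivalent — a contrast sketch (illustrative only; the get_script_run_ctx import path is the one the deleted file used, valid for the Streamlit version pinned in this repo):

    # Replacement pattern: only detects whether the package is importable,
    # which is True even for plain `python script.py` runs.
    try:
        import streamlit
        streamlit_installed = True
    except ImportError:
        streamlit_installed = False

    # Deleted helper's approach: detect a live Streamlit script run at call time.
    if streamlit_installed:
        from streamlit.scriptrunner.script_run_context import get_script_run_ctx
        in_streamlit_run = get_script_run_ctx() is not None
    else:
        in_streamlit_run = False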