TeresaK committed on
Commit a5e9cde
1 Parent(s): b8fac0d

Upload 38 files
.DS_Store ADDED
Binary file (6.15 kB)
 
.gitignore ADDED
@@ -0,0 +1 @@
+ cpv_v2/
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: Vulnerable Groups
+ emoji: 🦀
+ colorFrom: blue
+ colorTo: pink
+ sdk: streamlit
+ sdk_version: 1.21.0
+ app_file: app.py
+ pinned: false
+ license: openrail
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,148 @@
+ # hacky fix for HF environment issues
+ import os
+ os.system("pip uninstall -y spaces")
+ os.system('pip install spaces==0.17.0')
+ os.system("pip uninstall -y gradio")
+ os.system("pip uninstall -y pydantic")
+ os.system("pip uninstall -y typer")
+ os.system('pip install typer==0.4.0')
+ os.system('pip install pydantic==1.8.2 --use-deprecated=legacy-resolver')
+
+ import appStore.vulnerability_analysis as vulnerability_analysis
+ import appStore.doc_processing as processing
+ from appStore.rag import run_query
+ from utils.uploadAndExample import add_upload, get_tabs
+ from utils.vulnerability_classifier import label_dict
+ import streamlit as st
+ import pandas as pd
+ import plotly.express as px
+
+
+ st.set_page_config(page_title = 'Vulnerability Analysis',
+                    initial_sidebar_state='expanded', layout="wide")
+
+ with st.sidebar:
+     # upload and example doc
+     choice = st.sidebar.radio(label = 'Select the Document',
+                               help = 'You can upload your own documents \
+                               or use the example document',
+                               options = ('Upload Document', 'Try Example'),
+                               horizontal = True)
+     add_upload(choice)
+
+ with st.container():
+     st.markdown("<h2 style='text-align: center;'> Vulnerability Analysis </h2>", unsafe_allow_html=True)
+     st.write(' ')
+
+ with st.expander("ℹ️ - About this app", expanded=False):
+     st.write(
+         """
+         The Vulnerability Analysis App is an open-source \
+         digital tool which aims to assist policy analysts and \
+         other users in extracting and filtering references \
+         to different vulnerable groups from public documents.
+         """)
+
+     st.write("""
+     What happens in the background?
+
+     - Step 1: Once the document is provided to the app, it undergoes *Pre-processing*: \
+     the document is broken into smaller paragraphs \
+     (based on word/sentence count).
+     - Step 2: The paragraphs are then fed to the **Vulnerability Classifier**, which detects
+     whether the paragraph contains any references to vulnerable groups.
+     """)
+
+     st.write("")
+
+
+ # Define the apps used
+ apps = [processing.app, vulnerability_analysis.app]
+
+ multiplier_val = 1 / len(apps)
+ if st.button("Analyze Documents"):
+     prg = st.progress(0.0)
+     for i, func in enumerate(apps):
+         func()
+         prg.progress((i + 1) * multiplier_val)
+
+ if 'combined_files_df' in st.session_state:  # check for existence of processed documents
+     # get the filenames from the processed docs dataframe so we can use them for tab names
+     uploaded_docs = [value for key, value in st.session_state.items() if key.startswith('filename_')]
+     tab_titles = get_tabs(uploaded_docs)
+
+     if tab_titles:
+         tabs = st.tabs(tab_titles)
+
+         # Render the results (pie chart, summary and table) in individual tabs for each doc
+         for tab, doc in zip(tabs, uploaded_docs):
+             with tab:
+                 # Main app code
+                 with st.container():
+                     st.write(' ')
+
+                     # Assign dataframe a name
+                     df_vul = st.session_state['combined_files_df']
+                     df_vul = df_vul[df_vul['filename'] == doc]
+
+                     col1, col2 = st.columns([1, 1])
+
+                     with col1:
+                         # Header
+                         st.subheader("Explore references to vulnerable groups:")
+
+                         # Text
+                         num_paragraphs = len(df_vul['Vulnerability Label'])
+                         num_references = len(df_vul[df_vul['Vulnerability Label'] != 'Other'])
+
+                         st.markdown(f"""<div style="text-align: justify;"> The document contains a
+                                     total of <span style="color: red;">{num_paragraphs}</span> paragraphs.
+                                     We identified <span style="color: red;">{num_references}</span>
+                                     references to vulnerable groups.
+                                     <br>
+                                     In the pie chart on the right you can see the distribution of the different
+                                     groups defined. For a more detailed view in the text, see the paragraphs and
+                                     their respective labels in the table below.</div>""", unsafe_allow_html=True)
+
+                     with col2:
+                         ### Pie chart
+
+                         # Create a df that stores all the labels
+                         df_labels = pd.DataFrame(list(label_dict.items()), columns=['Label ID', 'Label'])
+
+                         # Count how often each label appears in the "Vulnerability Label" column
+                         label_counts = df_vul['Vulnerability Label'].value_counts().reset_index()
+                         label_counts.columns = ['Label', 'Count']
+
+                         # Merge the label counts with the df_labels DataFrame
+                         df_labels = df_labels.merge(label_counts, on='Label', how='left')
+
+                         # Configure graph
+                         fig = px.pie(df_labels,
+                                      names="Label",
+                                      values="Count",
+                                      title='Label Counts',
+                                      hover_name="Count",
+                                      color_discrete_sequence=px.colors.qualitative.Plotly
+                                      )
+
+                         # Show plot
+                         st.plotly_chart(fig, use_container_width=True)
+
+                     ### Document summary
+                     st.markdown("----")
+                     st.markdown('**DOCUMENT FINDINGS SUMMARY:**')
+
+                     # filter out 'Other' because we don't want it in the table (and it is far too big for the summary)
+                     df_docs = df_vul[df_vul['Vulnerability Label'] != 'Other']
+                     # construct RAG query, send to openai and process response
+                     run_query(df_docs)
+
+                     st.markdown("----")
+
+                     with st.expander("ℹ️ - Document Text Classifications", expanded=False):
+                         ### Table
+                         st.table(df_docs)
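
For reference, a minimal, self-contained sketch of the label-counting step that feeds the pie chart above (toy data; `label_dict_example` is a shortened stand-in for `utils.vulnerability_classifier.label_dict`):

import pandas as pd

label_dict_example = {0: 'Children', 1: 'Women and other genders', 2: 'Other'}
df_vul = pd.DataFrame({'text': ['p1', 'p2', 'p3'],
                       'Vulnerability Label': ['Children', 'Other', 'Children']})

# same steps as in the col2 block of app.py
df_labels = pd.DataFrame(list(label_dict_example.items()), columns=['Label ID', 'Label'])
label_counts = df_vul['Vulnerability Label'].value_counts().reset_index()
label_counts.columns = ['Label', 'Count']
df_labels = df_labels.merge(label_counts, on='Label', how='left')
print(df_labels)  # 'Children' -> 2, 'Other' -> 1, 'Women and other genders' -> NaN (not mentioned)
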
appStore/.DS_Store ADDED
Binary file (6.15 kB)
 
appStore/__init__.py ADDED
@@ -0,0 +1 @@
+ # adding for package implementation
appStore/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (138 Bytes)
 
appStore/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (240 Bytes)
 
appStore/__pycache__/doc_processing.cpython-310.pyc ADDED
Binary file (3.42 kB)
 
appStore/__pycache__/vulnerability_analysis.cpython-310.pyc ADDED
Binary file (2.01 kB)
 
appStore/__pycache__/vulnerability_analysis.cpython-38.pyc ADDED
Binary file (2.05 kB)
 
appStore/doc_processing.py ADDED
@@ -0,0 +1,87 @@
+ # set path
+ import glob, os, sys;
+ sys.path.append('../utils')
+ from typing import List, Tuple
+ from typing_extensions import Literal
+ from haystack.schema import Document
+ from utils.config import get_classifier_params
+ from utils.preprocessing import processingpipeline, paraLengthCheck
+ import streamlit as st
+ import logging
+ import pandas as pd
+ params = get_classifier_params("preprocessing")
+
+ @st.cache_data
+ def runPreprocessingPipeline(file_name:str, file_path:str,
+                              split_by: Literal["sentence", "word"] = 'sentence',
+                              split_length:int = 2, split_respect_sentence_boundary:bool = False,
+                              split_overlap:int = 0, remove_punc:bool = False) -> List[Document]:
+     """
+     Creates and runs the preprocessing pipeline;
+     the params for the pipeline are fetched from paramconfig.
+
+     Params
+     ------------
+     file_name: filename; in a streamlit application use
+                st.session_state['filename']
+     file_path: filepath; in a streamlit application use st.session_state['filepath']
+     split_by: document splitting strategy, either by word or sentence
+     split_length: when synthetically creating the paragraphs from the document,
+                   it defines the length of a paragraph.
+     split_respect_sentence_boundary: used with the 'word' strategy for
+                                      splitting of text.
+     split_overlap: number of words or sentences that overlap when creating
+                    the paragraphs. This is done as one sentence or 'some words' make sense
+                    when read together with others. Therefore the overlap is used.
+     remove_punc: whether to remove all punctuation, including ',' and '.'
+
+     Return
+     --------------
+     List[Document]: when the preprocessing pipeline is run, the output dictionary
+     has four objects. For the Haystack implementation of the classification we
+     need the list of Haystack Documents, which can be fetched with
+     key = 'documents' on the output.
+     """
+
+     processing_pipeline = processingpipeline()
+
+     output_pre = processing_pipeline.run(file_paths = file_path,
+                                          params= {"FileConverter": {"file_path": file_path, \
+                                                                     "file_name": file_name},
+                                                   "UdfPreProcessor": {"remove_punc": remove_punc, \
+                                                                       "split_by": split_by, \
+                                                                       "split_length": split_length, \
+                                                                       "split_overlap": split_overlap, \
+                                                                       "split_respect_sentence_boundary": split_respect_sentence_boundary}})
+
+     return output_pre
+
+
+ def app():
+     with st.container():
+         all_files_df = pd.DataFrame()  # Initialize an empty DataFrame to store data from all files
+
+         for key in st.session_state:
+             if key.startswith('filepath_'):
+                 file_path = st.session_state[key]
+                 file_name = st.session_state['filename' + key[-2:]]
+
+                 all_documents = runPreprocessingPipeline(file_name=file_name,
+                                                          file_path=file_path, split_by=params['split_by'],
+                                                          split_length=params['split_length'],
+                                                          split_respect_sentence_boundary=params['split_respect_sentence_boundary'],
+                                                          split_overlap=params['split_overlap'], remove_punc=params['remove_punc'])
+                 paralist = paraLengthCheck(all_documents['documents'], 100)
+                 file_df = pd.DataFrame(paralist, columns=['text', 'page'])
+                 file_df['filename'] = file_name  # Add a column for the file name
+
+                 all_files_df = pd.concat([all_files_df, file_df], ignore_index=True)
+
+         if not all_files_df.empty:
+             st.session_state['combined_files_df'] = all_files_df
+         else:
+             st.info("🤔 No document found, please try to upload it at the sidebar!")
+             logging.warning("Terminated as no document provided")
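
A note on the key matching in `app()` above: the `'filename' + key[-2:]` lookup pairs `filepath_i` with `filename_i` and therefore assumes single-digit indices. A minimal sketch of the convention, using a plain dict in place of `st.session_state`:

# stand-in for st.session_state as populated by utils.uploadAndExample.add_upload
session_state = {'filepath_0': '/tmp/tmpabc123', 'filename_0': 'KE_First_NDC.pdf'}

for key in session_state:
    if key.startswith('filepath_'):
        file_path = session_state[key]
        file_name = session_state['filename' + key[-2:]]  # the '_0' suffix pairs the two keys
        print(file_name, '->', file_path)
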
appStore/rag.py ADDED
@@ -0,0 +1,80 @@
+ import os
+ # import json
+ import numpy as np
+ import pandas as pd
+ import openai
+ from haystack.schema import Document
+ import streamlit as st
+ from tenacity import retry, stop_after_attempt, wait_random_exponential
+
+
+ # Get openai API key
+ openai.api_key = os.environ["OPENAI_API_KEY"]
+ model_select = "gpt-3.5-turbo-1106"
+
+
+ # define a special function for putting the prompt together (as we can't use haystack)
+ def get_prompt(docs):
+     base_prompt = "Provide a single paragraph summary of the documents provided below. \
+                    Formulate your answer in the style of an academic report."
+     # Add the metadata for references
+     context = ' - '.join([d.content for d in docs])
+     prompt = base_prompt + "; Context: " + context + "; Answer:"
+
+     return prompt
+
+
+ # convert df rows to Document objects so we can feed them into the summarizer easily
+ def get_document(df):
+     # we take a list of each extract
+     ls_dict = []
+     for index, row in df.iterrows():
+         # Create a Document object for each row (we only need the text)
+         doc = Document(
+             row['text'],
+             meta={
+                 'filename': row['filename']}
+         )
+         # Append the Document object to the documents list
+         ls_dict.append(doc)
+
+     return ls_dict
+
+
+ # exception handling for issuing multiple API calls to openai (exponential backoff)
+ @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
+ def completion_with_backoff(**kwargs):
+     return openai.ChatCompletion.create(**kwargs)
+
+
+ # construct RAG query, send to openai and process response
+ def run_query(df):
+     docs = get_document(df)
+
+     '''
+     For non-streamed completion, enable the following 2 lines and comment out the code below
+     '''
+     # res = openai.ChatCompletion.create(model=model_select, messages=[{"role": "user", "content": get_prompt(docs)}])
+     # result = res.choices[0].message.content
+
+     # instantiate ChatCompletion as a generator object (stream is set to True)
+     response = completion_with_backoff(model=model_select, messages=[{"role": "user", "content": get_prompt(docs)}], stream=True)
+     # iterate through the streamed output
+     report = []
+     res_box = st.empty()
+     for chunk in response:
+         # extract the object containing the text (totally different structure when streaming)
+         chunk_message = chunk['choices'][0]['delta']
+         # test to make sure there is text in the object (some chunks don't have any)
+         if 'content' in chunk_message:
+             report.append(chunk_message.content)  # extract the message
+             # add the latest text and merge it with all previous chunks
+             result = "".join(report).strip()
+             res_box.success(result)  # output to the response text box
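
A hedged usage sketch of `run_query` (not part of rag.py): it expects a DataFrame with `text` and `filename` columns, an `OPENAI_API_KEY` set in the environment, and a Streamlit context for the `st.empty()` placeholder. The row values below are illustrative only.

import pandas as pd
from appStore.rag import run_query  # importing this module requires OPENAI_API_KEY to be set

df_docs = pd.DataFrame({
    'text': ['Women and children in coastal communities are disproportionately affected ...'],
    'filename': ['KE_First_NDC.pdf'],
})
run_query(df_docs)  # streams a one-paragraph summary of the provided paragraphs into the app
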
appStore/vulnerability_analysis.py ADDED
@@ -0,0 +1,69 @@
+ # set path
+ import glob, os, sys;
+ sys.path.append('../utils')
+
+ # import needed libraries
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import pandas as pd
+ import streamlit as st
+ from utils.vulnerability_classifier import load_vulnerabilityClassifier, vulnerability_classification
+ import logging
+ logger = logging.getLogger(__name__)
+ from utils.config import get_classifier_params
+ from utils.preprocessing import paraLengthCheck
+ from io import BytesIO
+ import xlsxwriter
+ import plotly.express as px
+
+
+ # Declare all the necessary variables
+ classifier_identifier = 'vulnerability'
+ params = get_classifier_params(classifier_identifier)
+
+ @st.cache_data
+ def to_excel(df, sectorlist):
+     len_df = len(df)
+     output = BytesIO()
+     writer = pd.ExcelWriter(output, engine='xlsxwriter')
+     df.to_excel(writer, index=False, sheet_name='Sheet1')
+     workbook = writer.book
+     worksheet = writer.sheets['Sheet1']
+     worksheet.data_validation('S2:S{}'.format(len_df),
+                               {'validate': 'list',
+                                'source': ['No', 'Yes', 'Discard']})
+     worksheet.data_validation('X2:X{}'.format(len_df),
+                               {'validate': 'list',
+                                'source': sectorlist + ['Blank']})
+     worksheet.data_validation('T2:T{}'.format(len_df),
+                               {'validate': 'list',
+                                'source': sectorlist + ['Blank']})
+     worksheet.data_validation('U2:U{}'.format(len_df),
+                               {'validate': 'list',
+                                'source': sectorlist + ['Blank']})
+     worksheet.data_validation('V2:V{}'.format(len_df),
+                               {'validate': 'list',
+                                'source': sectorlist + ['Blank']})
+     worksheet.data_validation('W2:W{}'.format(len_df),
+                               {'validate': 'list',
+                                'source': sectorlist + ['Blank']})
+     writer.save()
+     processed_data = output.getvalue()
+     return processed_data
+
+
+ def app():
+     with st.container():
+         if 'combined_files_df' in st.session_state:
+             combined_files_df = st.session_state['combined_files_df']
+             classifier = load_vulnerabilityClassifier(classifier_name=params['model_name'])
+             st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
+
+             combined_files_df = vulnerability_classification(haystack_doc=combined_files_df,
+                                                              threshold=params['threshold'])
+
+             st.session_state['combined_files_df'] = combined_files_df
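
`to_excel` is defined here but not yet called from `app()`. A hypothetical wiring via `st.download_button` is sketched below; the `sector_list` values are placeholders, and the snippet assumes it lives inside this module so that `to_excel` is in scope.

import streamlit as st

sector_list = ['Energy', 'Agriculture', 'Transport']  # placeholder values for the drop-down validation
if 'combined_files_df' in st.session_state:
    excel_bytes = to_excel(st.session_state['combined_files_df'], sector_list)
    st.download_button(label='Download results as Excel',
                       data=excel_bytes,
                       file_name='vulnerability_results.xlsx',
                       mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
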
docStore/.DS_Store ADDED
Binary file (6.15 kB)
 
docStore/sample/KE_First_NDC.pdf ADDED
Binary file (214 kB)
 
docStore/sample/PH_First_NDC.pdf ADDED
Binary file (136 kB)
 
docStore/sample/files.json ADDED
@@ -0,0 +1,3 @@
+ {"Kenya: First NDC": "docStore/sample/KE_First_NDC.pdf",
+  "Philippines: First NDC": "docStore/sample/PH_First_NDC.pdf"
+ }
packages.txt ADDED
@@ -0,0 +1,4 @@
+ poppler-utils
+ xpdf
+ tesseract-ocr
+ libtesseract-dev
paramconfig.cfg ADDED
@@ -0,0 +1,19 @@
+ [preprocessing]
+ THRESHOLD = 0.50
+ MODEL = garbage
+ SPLIT_BY = word
+ REMOVE_PUNC = 0
+ SPLIT_LENGTH = 60
+ SPLIT_OVERLAP = 5
+ RESPECT_SENTENCE_BOUNDARY = 1
+ TOP_KEY = 10
+
+ [vulnerability]
+ THRESHOLD = 0.50
+ MODEL = leavoigt/vulnerable_groups
+ SPLIT_BY = word
+ REMOVE_PUNC = 0
+ SPLIT_LENGTH = 60
+ SPLIT_OVERLAP = 5
+ RESPECT_SENTENCE_BOUNDARY = 1
+ TOP_KEY = 10
requirements.txt ADDED
@@ -0,0 +1,24 @@
+ farm-haystack == 1.16
+ farm-haystack[ocr,pdf]==1.16.0
+ spacy==3.2.0
+ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz#egg=en_core_web_sm
+ matplotlib==3.5.1
+ nltk==3.7
+ numpy==1.22.1
+ pandas==1.4.0
+ pdfplumber==0.6.2
+ Pillow==9.1.1
+ seaborn==0.11.2
+ transformers==4.25.1
+ st-annotated-text==3.0.0
+ markdown==3.4.1
+ summa==1.2.0
+ plotly
+ xlsxwriter
+ altair==4.0
+ streamlit-aggrid
+ python-docx
+ setfit
+ plotly.express
+ openai==0.27.9
+ pydantic==1.8.2
style.css ADDED
@@ -0,0 +1,179 @@
+
+ .row-widget.stTextInput > div:first-of-type {
+     background: #fff;
+     display: flex;
+     border: 1px solid #dfe1e5;
+     box-shadow: none;
+     border-radius: 24px;
+     height: 50px;
+     width: auto;
+     margin: 10px auto 30px;
+ }
+
+ .row-widget.stTextInput > div:first-of-type:hover,
+ .row-widget.stTextInput > div:first-of-type:focus {
+     box-shadow: 1px 1px 2px 1px rgba(0, 0, 0, 0.2);
+ }
+
+ .row-widget.stTextInput .st-bq {
+     background-color: #fff;
+ }
+
+ .row-widget.stTextInput > label {
+     color: #b3b3b3;
+ }
+
+ .row-widget.stButton > button {
+     border-radius: 24px;
+     background-color: #B6C9B1;
+     color: #fff;
+     border: none;
+     padding: 6px 20px;
+     float: right;
+     background-image: none;
+ }
+
+ .row-widget.stButton > button:hover {
+     box-shadow: 1px 1px 2px 1px rgba(0, 0, 0, 0.2);
+ }
+
+ .row-widget.stButton > button:focus {
+     border: none;
+     color: #fff;
+ }
+
+ .footer-custom {
+     position: fixed;
+     bottom: 0;
+     width: 100%;
+     color: var(--text-color);
+     max-width: 698px;
+     font-size: 14px;
+     height: 50px;
+     padding: 10px 0;
+     z-index: 50;
+ }
+
+ .main {
+     padding: 20px;
+ }
+
+ footer {
+     display: none !important;
+ }
+
+ .footer-custom a {
+     color: var(--text-color);
+ }
+
+ #wikipedia-assistant {
+     font-size: 36px;
+ }
+
+ .generated-answer p {
+     font-size: 16px;
+     font-weight: bold;
+ }
+
+ .react-json-view {
+     margin: 40px 0 80px;
+ }
+
+ .tooltip {
+     text-align: center;
+     line-height: 20px;
+     display: table-caption;
+     font-size: 10px;
+     border-radius: 50%;
+     height: 20px;
+     width: 20px;
+     position: relative;
+     cursor: pointer;
+     color: #000;
+ }
+
+ .tooltip .tooltiptext {
+     visibility: hidden;
+     width: 280px;
+     text-align: center;
+     border-radius: 6px;
+     padding: 10px;
+     position: absolute;
+     z-index: 1;
+     top: 25px;
+     left: 50%;
+     margin-left: -140px;
+     font-size: 14px;
+     background-color: #fff;
+     border: 1px solid #ccc;
+     box-shadow: 0px 0px 3px 1px rgba(0, 0, 0, 0.16);
+     color: #000;
+ }
+
+ .tooltip:hover .tooltiptext {
+     visibility: visible;
+ }
+
+ .sentence-wrapper {
+     border-left: 4px solid #ffc423;
+     padding-left: 20px;
+     margin-bottom: 40px;
+ }
+
+ #context {
+     padding: 2rem 0 1rem;
+ }
+
+ hr {
+     margin: 2em 0 1em;
+ }
+
+
+ .technical-details-info {
+     margin-bottom: 100px;
+ }
+
+ .loader-wrapper {
+     display: flex;
+     align-items: center;
+     background-color: rgba(250, 202, 43, 0.2);
+     padding: 15px 20px;
+     border-radius: 6px;
+ }
+
+ .loader-wrapper p {
+     margin-bottom: 0;
+     margin-left: 20px;
+ }
+
+ .loader {
+     width: 30px;
+     height: 30px;
+     border: dotted 5px #868686;
+     border-radius: 100%;
+     animation: spin 1s linear infinite;
+ }
+
+ .loader-note {
+     font-size: 14px;
+     color: #b3b3b3;
+     margin-left: 5px;
+ }
+
+ @keyframes spin {
+     0% {
+         transform: rotate(0deg) scale(0.8);
+         border-top-color: transparent;
+         border-right-color: transparent;
+     }
+     50% {
+         transform: rotate(180deg) scale(1.2);
+         border-color: #949494;
+         border-top-color: transparent;
+         border-right-color: transparent;
+     }
+     100% {
+         transform: rotate(360deg) scale(0.8);
+         border-color: #bbbbbb;
+         border-top-color: transparent;
+         border-right-color: transparent;
+     }
+ }
utils/.DS_Store ADDED
Binary file (6.15 kB)
 
utils/__init__.py ADDED
File without changes
utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (135 Bytes)
 
utils/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (237 Bytes)
 
utils/__pycache__/config.cpython-310.pyc ADDED
Binary file (1.08 kB)
 
utils/__pycache__/config.cpython-38.pyc ADDED
Binary file (1.19 kB)
 
utils/__pycache__/preprocessing.cpython-310.pyc ADDED
Binary file (9.05 kB)
 
utils/__pycache__/preprocessing.cpython-38.pyc ADDED
Binary file (9.13 kB)
 
utils/__pycache__/uploadAndExample.cpython-310.pyc ADDED
Binary file (1.27 kB)
 
utils/__pycache__/vulnerability_classifier.cpython-310.pyc ADDED
Binary file (4.2 kB)
 
utils/__pycache__/vulnerability_classifier.cpython-38.pyc ADDED
Binary file (4.2 kB)
 
utils/config.py ADDED
@@ -0,0 +1,31 @@
+ import configparser
+ import logging
+
+ def getconfig(configfile_path:str):
+     """
+     configfile_path: file path of the .cfg file
+     """
+
+     config = configparser.ConfigParser()
+
+     try:
+         config.read_file(open(configfile_path))
+         return config
+     except:
+         logging.warning("config file not found")
+
+
+ # Declare all the necessary variables
+ def get_classifier_params(model_name):
+     config = getconfig('paramconfig.cfg')
+     params = {}
+     params['model_name'] = config.get(model_name, 'MODEL')
+     params['split_by'] = config.get(model_name, 'SPLIT_BY')
+     params['split_length'] = int(config.get(model_name, 'SPLIT_LENGTH'))
+     params['split_overlap'] = int(config.get(model_name, 'SPLIT_OVERLAP'))
+     params['remove_punc'] = bool(int(config.get(model_name, 'REMOVE_PUNC')))
+     params['split_respect_sentence_boundary'] = bool(int(config.get(model_name, 'RESPECT_SENTENCE_BOUNDARY')))
+     params['threshold'] = float(config.get(model_name, 'THRESHOLD'))
+     params['top_n'] = int(config.get(model_name, 'TOP_KEY'))
+
+     return params
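
Illustrative usage, assuming `paramconfig.cfg` sits in the working directory as it does for app.py; the commented result reflects the `[vulnerability]` section of that file:

from utils.config import get_classifier_params

params = get_classifier_params("vulnerability")
# -> {'model_name': 'leavoigt/vulnerable_groups', 'split_by': 'word', 'split_length': 60,
#     'split_overlap': 5, 'remove_punc': False, 'split_respect_sentence_boundary': True,
#     'threshold': 0.5, 'top_n': 10}
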
utils/preprocessing.py ADDED
@@ -0,0 +1,291 @@
+ from haystack.nodes.base import BaseComponent
+ from haystack.schema import Document
+ from haystack.nodes import PDFToTextOCRConverter, PDFToTextConverter
+ from haystack.nodes import TextConverter, DocxToTextConverter, PreProcessor
+ from typing import Callable, Dict, List, Optional, Text, Tuple, Union
+ from typing_extensions import Literal
+ import pandas as pd
+ import logging
+ import re
+ import string
+ from haystack.pipelines import Pipeline
+
+ def useOCR(file_path: str) -> Text:
+     """
+     Converts image PDFs into text, using farm-haystack[OCR].
+
+     Params
+     ----------
+     file_path: file_path of the uploaded file, returned by the add_upload function in
+                uploadAndExample.py
+
+     Returns the text of the file as a string.
+     """
+
+     converter = PDFToTextOCRConverter(remove_numeric_tables=True,
+                                       valid_languages=["eng"])
+     docs = converter.convert(file_path=file_path, meta=None)
+     return docs[0].content
+
+
+ class FileConverter(BaseComponent):
+     """
+     Wrapper class to convert an uploaded document into text by calling the appropriate
+     converter class; internally uses the haystack PDFToTextOCR converter in case of
+     image PDFs. Cannot use the FileClassifier from haystack as it doesn't have any
+     label/output class for images.
+
+     1. https://haystack.deepset.ai/pipeline_nodes/custom-nodes
+     2. https://docs.haystack.deepset.ai/docs/file_converters
+     3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/file_converter
+     4. https://docs.haystack.deepset.ai/reference/file-converters-api
+     """
+
+     outgoing_edges = 1
+
+     def run(self, file_name: str, file_path: str, encoding: Optional[str] = None,
+             id_hash_keys: Optional[List[str]] = None,
+             ) -> Tuple[dict, str]:
+         """ This is the required method to invoke the component in
+         the pipeline implementation.
+
+         Params
+         ----------
+         file_name: name of the file
+         file_path: file_path of the uploaded file, returned by the add_upload function in
+                    uploadAndExample.py
+
+         See the links provided in the class docstring/description for other params.
+
+         Return
+         ---------
+         output: dictionary, with key as identifier and value could be anything
+                 we need to return. In this case it is the list of Haystack Documents.
+
+         output_1: as there is only one outgoing edge, we pass the 'output_1' string
+         """
+         try:
+             if file_name.endswith('.pdf'):
+                 converter = PDFToTextConverter(remove_numeric_tables=True)
+             if file_name.endswith('.txt'):
+                 converter = TextConverter(remove_numeric_tables=True)
+             if file_name.endswith('.docx'):
+                 converter = DocxToTextConverter()
+         except Exception as e:
+             logging.error(e)
+             return
+
+         documents = []
+
+         # encoding is empty, probably should be utf-8
+         document = converter.convert(
+             file_path=file_path, meta=None,
+             encoding=encoding, id_hash_keys=id_hash_keys
+         )[0]
+
+         text = document.content
+
+         # in case of scanned/image-only PDFs the content might contain only
+         # the page separator (\f or \x0c). We check if that is the case and
+         # use the OCR to get the text.
+         filtered = re.sub(r'\x0c', '', text)
+
+         if filtered == "":
+             logging.info("Using OCR")
+             text = useOCR(file_path)
+
+         documents.append(Document(content=text,
+                                   meta={"name": file_name},
+                                   id_hash_keys=id_hash_keys))
+
+         logging.info('file conversion successful')
+         output = {'documents': documents}
+         return output, 'output_1'
+
+     def run_batch(self):
+         """
+         We don't have a requirement to process multiple files in one go,
+         therefore nothing here; however, to use the custom node we need to have
+         this method for the class.
+         """
+         return
+
+
+ def basic(s: str, remove_punc: bool = False):
+     """
+     Performs basic cleaning of text.
+
+     Params
+     ----------
+     s: string to be processed
+     remove_punc: whether to remove all punctuation, including ',' and '.'
+
+     Returns: processed string; see comments in the source code for more info
+     """
+
+     # Remove URLs
+     s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
+     s = re.sub(r"http\S+", " ", s)
+
+     # Remove new line characters
+     s = re.sub('\n', ' ', s)
+
+     # Remove punctuation
+     if remove_punc == True:
+         translator = str.maketrans(' ', ' ', string.punctuation)
+         s = s.translate(translator)
+     # Remove distracting single quotes and dotted pattern
+     s = re.sub("\'", " ", s)
+     s = s.replace("..", "")
+
+     return s.strip()
+
+ def paraLengthCheck(paraList, max_len=100):
+     """
+     There are cases where the preprocessor cannot respect the word limit when using the
+     respect-sentence-boundary flag, due to missing sentence boundaries.
+     Therefore we run one more round of splitting here for those paragraphs.
+
+     Params
+     ---------------
+     paraList : list of paragraphs/text
+     max_len : max length to be respected by sentences which bypassed the
+               preprocessor strategy
+     """
+     new_para_list = []
+     for passage in paraList:
+         # check if para exceeds the word limit
+         if len(passage.content.split()) > max_len:
+             # we might need a few iterations, e.g. if para = 512 tokens
+             # we need to iterate 5 times to reduce the para to the size limit of 100
+             iterations = int(len(passage.content.split()) / max_len)
+             for i in range(iterations):
+                 temp = " ".join(passage.content.split()[max_len * i:max_len * (i + 1)])
+                 new_para_list.append((temp, passage.meta['page']))
+             # append the remainder after the last full chunk
+             temp = " ".join(passage.content.split()[max_len * (i + 1):])
+             new_para_list.append((temp, passage.meta['page']))
+         else:
+             # paragraphs which don't need any splitting
+             new_para_list.append((passage.content, passage.meta['page']))
+
+     logging.info("New paragraphs length {}".format(len(new_para_list)))
+     return new_para_list
+
+ class UdfPreProcessor(BaseComponent):
+     """
+     Class to preprocess the document returned by FileConverter. It will check
+     for the splitting strategy, split the document by word or sentence and then
+     synthetically create the paragraphs.
+
+     1. https://docs.haystack.deepset.ai/docs/preprocessor
+     2. https://docs.haystack.deepset.ai/reference/preprocessor-api
+     3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/preprocessor
+     """
+     outgoing_edges = 1
+
+     def run(self, documents: List[Document], remove_punc: bool = False,
+             split_by: Literal["sentence", "word"] = 'sentence',
+             split_length: int = 2, split_respect_sentence_boundary: bool = False,
+             split_overlap: int = 0):
+
+         """ This is the required method to invoke the component in
+         the pipeline implementation.
+
+         Params
+         ----------
+         documents: documents from the output dictionary returned by FileConverter
+         remove_punc: whether to remove all punctuation, including ',' and '.'
+         split_by: document splitting strategy, either by word or sentence
+         split_length: when synthetically creating the paragraphs from the document,
+                       it defines the length of a paragraph.
+         split_respect_sentence_boundary: used with the 'word' strategy for
+                                          splitting of text.
+         split_overlap: number of words or sentences that overlap when creating
+                        the paragraphs. This is done as one sentence or 'some words' make sense
+                        when read together with others. Therefore the overlap is used.
+
+         Return
+         ---------
+         output: dictionary, with key as identifier and value could be anything
+                 we need to return. In this case the output will contain 4 objects:
+                 the paragraph text list as List, the Haystack Documents, a DataFrame and
+                 one raw text string.
+
+         output_1: as there is only one outgoing edge, we pass the 'output_1' string
+         """
+
+         if split_by == 'sentence':
+             split_respect_sentence_boundary = False
+         else:
+             split_respect_sentence_boundary = split_respect_sentence_boundary
+
+         preprocessor = PreProcessor(
+             clean_empty_lines=True,
+             clean_whitespace=True,
+             clean_header_footer=True,
+             split_by=split_by,
+             split_length=split_length,
+             split_respect_sentence_boundary=split_respect_sentence_boundary,
+             split_overlap=split_overlap,
+
+             # will add page number only in case of PDF, not for text/docx files.
+             add_page_number=True
+         )
+
+         for i in documents:
+             # # basic cleaning before passing it to the preprocessor.
+             # i = basic(i)
+             docs_processed = preprocessor.process([i])
+             for item in docs_processed:
+                 item.content = basic(item.content, remove_punc=remove_punc)
+
+         df = pd.DataFrame(docs_processed)
+         all_text = " ".join(df.content.to_list())
+         para_list = df.content.to_list()
+         logging.info('document split into {} paragraphs'.format(len(para_list)))
+         output = {'documents': docs_processed,
+                   'dataframe': df,
+                   'text': all_text,
+                   'paraList': para_list
+                   }
+         return output, "output_1"
+
+     def run_batch(self):
+         """
+         We don't have a requirement to process multiple files in one go,
+         therefore nothing here; however, to use the custom node we need to have
+         this method for the class.
+         """
+         return
+
+
+ def processingpipeline():
+     """
+     Returns the preprocessing pipeline. Will use FileConverter and UdfPreProcessor
+     from utils.preprocessing.
+     """
+
+     preprocessing_pipeline = Pipeline()
+     file_converter = FileConverter()
+     custom_preprocessor = UdfPreProcessor()
+
+     preprocessing_pipeline.add_node(component=file_converter,
+                                     name="FileConverter", inputs=["File"])
+     preprocessing_pipeline.add_node(component=custom_preprocessor,
+                                     name='UdfPreProcessor', inputs=["FileConverter"])
+
+     return preprocessing_pipeline
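
A usage sketch of the pipeline, mirroring `runPreprocessingPipeline` in appStore/doc_processing.py and the `[vulnerability]` parameters in paramconfig.cfg; the sample PDF path is taken from docStore/sample/files.json:

from utils.preprocessing import processingpipeline, paraLengthCheck

pipe = processingpipeline()
output = pipe.run(file_paths="docStore/sample/KE_First_NDC.pdf",
                  params={"FileConverter": {"file_path": "docStore/sample/KE_First_NDC.pdf",
                                            "file_name": "KE_First_NDC.pdf"},
                          "UdfPreProcessor": {"remove_punc": False, "split_by": "word",
                                              "split_length": 60, "split_overlap": 5,
                                              "split_respect_sentence_boundary": True}})
paragraphs = paraLengthCheck(output['documents'], 100)  # list of (text, page) tuples
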
utils/uploadAndExample.py ADDED
@@ -0,0 +1,44 @@
+ import streamlit as st
+ import tempfile
+ import json
+
+ def add_upload(choice):
+     if choice == 'Upload Document':
+         uploaded_files = st.sidebar.file_uploader('Upload Files',
+                                                   type=['pdf', 'docx', 'txt'],
+                                                   accept_multiple_files=True)
+
+         if uploaded_files is not None:
+             # Clear previously uploaded files from the session state
+             for key in list(st.session_state.keys()):
+                 if key.startswith('filename') or key.startswith('filepath'):
+                     del st.session_state[key]
+
+             # Process and store each uploaded file
+             for index, uploaded_file in enumerate(uploaded_files):
+                 with tempfile.NamedTemporaryFile(mode="wb", delete=False) as temp:
+                     bytes_data = uploaded_file.getvalue()
+                     temp.write(bytes_data)
+                     st.session_state[f'filename_{index}'] = uploaded_file.name
+                     st.session_state[f'filepath_{index}'] = temp.name
+
+     else:  # Handle example document selection
+         # listing the options
+         with open('docStore/sample/files.json', 'r') as json_file:
+             files = json.load(json_file)
+
+         option = st.sidebar.selectbox('Select the example document',
+                                       list(files.keys()))
+         file_path = files[option]
+         st.session_state['filename_0'] = file_path  # Use 'filename_0' to align with the upload naming convention
+         st.session_state['filepath_0'] = file_path  # Use 'filepath_0' for consistency
+
+
+ # get the filenames from the processed docs dataframe so we can use them for tab names
+ def get_tabs(uploaded_docs):
+     tabs = []
+     for doc_name in uploaded_docs:
+         tab_title = doc_name  # Assuming doc_name is a string with the file name
+         tabs.append(tab_title)
+     return tabs
utils/vulnerability_classifier.py ADDED
@@ -0,0 +1,156 @@
+ from typing import List, Tuple
+ from typing_extensions import Literal
+ import logging
+ import pandas as pd
+ from pandas import DataFrame, Series
+ from utils.config import getconfig
+ from utils.preprocessing import processingpipeline
+ import streamlit as st
+ from transformers import pipeline
+ from setfit import SetFitModel
+
+ label_dict = {0: 'Agricultural communities',
+               1: 'Children',
+               2: 'Coastal communities',
+               3: 'Ethnic, racial or other minorities',
+               4: 'Fishery communities',
+               5: 'Informal sector workers',
+               6: 'Members of indigenous and local communities',
+               7: 'Migrants and displaced persons',
+               8: 'Older persons',
+               9: 'Other',
+               10: 'Persons living in poverty',
+               11: 'Persons with disabilities',
+               12: 'Persons with pre-existing health conditions',
+               13: 'Residents of drought-prone regions',
+               14: 'Rural populations',
+               15: 'Sexual minorities (LGBTQI+)',
+               16: 'Urban populations',
+               17: 'Women and other genders'}
+
+ def getlabels(preds):
+     # Get label names
+     preds_list = preds.tolist()
+
+     predictions_names = []
+
+     # loop through each prediction
+     for ele in preds_list:
+
+         # see if there is a value 1 and retrieve its index
+         try:
+             index_of_one = ele.index(1)
+         except ValueError:
+             index_of_one = "NA"
+
+         # Retrieve the name of the label (if no prediction was made = NA)
+         if index_of_one != "NA":
+             name = label_dict[index_of_one]
+         else:
+             name = "Other"
+
+         # Append name to list
+         predictions_names.append(name)
+
+     return predictions_names
+
+ @st.cache_resource
+ def load_vulnerabilityClassifier(config_file: str = None, classifier_name: str = None):
+     """
+     Loads the document classifier, where the name/path of the model
+     on the HF hub (as a string) is used to fetch the model object. Either a config file
+     or a model name should be passed.
+     1. https://docs.haystack.deepset.ai/reference/document-classifier-api
+     2. https://docs.haystack.deepset.ai/docs/document_classifier
+     Params
+     --------
+     config_file: config file path from which to read the model name
+     classifier_name: if a model name is passed, it takes priority; if not
+                      found then we look for the config file, else raise an error.
+     Return: document classifier model
+     """
+     if not classifier_name:
+         if not config_file:
+             logging.warning("Pass either model name or config file")
+             return
+         else:
+             config = getconfig(config_file)
+             classifier_name = config.get('vulnerability', 'MODEL')
+
+     logging.info("Loading vulnerability classifier")
+     # we are using the pipeline as the model is multilabel and the DocumentClassifier
+     # from Haystack doesn't support multilabel.
+     # in the pipeline we use 'sigmoid' to explicitly make it multilabel;
+     # if not, it will automatically use softmax, which is not desired here.
+     # doc_classifier = TransformersDocumentClassifier(
+     #                     model_name_or_path=classifier_name,
+     #                     task="text-classification",
+     #                     top_k=None)
+
+     # Download model from HF Hub
+     doc_classifier = SetFitModel.from_pretrained("leavoigt/vulnerable_groups")
+
+     # doc_classifier = pipeline("text-classification",
+     #                           model=classifier_name,
+     #                           return_all_scores=True,
+     #                           function_to_apply="sigmoid")
+
+     return doc_classifier
+
+
+ @st.cache_data
+ def vulnerability_classification(haystack_doc: pd.DataFrame,
+                                  threshold: float = 0.5,
+                                  classifier_model: pipeline = None
+                                  ) -> DataFrame:
+     """
+     Text classification on the list of texts provided. The classifier provides the
+     most appropriate label for each text; the label indicates which vulnerable
+     group, if any, the paragraph refers to.
+     Params
+     ---------
+     haystack_doc: the DataFrame of paragraphs produced by the preprocessing pipeline
+                   (one row per paragraph, with a 'text' column).
+     threshold: threshold value for the model to keep the results from the classifier
+     classifier_model: you can pass the classifier model directly, which takes priority;
+                       however, if not, the model is looked up in the streamlit session.
+                       In case of streamlit avoid passing the model directly.
+     Returns
+     ----------
+     haystack_doc: the input DataFrame with an added 'Vulnerability Label' column.
+     """
+     logging.info("Working on vulnerability identification")
+     haystack_doc['Vulnerability Label'] = 'NA'
+     # haystack_doc['PA_check'] = haystack_doc['Policy-Action Label'].apply(lambda x: True if len(x) != 0 else False)
+
+     # df1 = haystack_doc[haystack_doc['PA_check'] == True]
+     # df = haystack_doc[haystack_doc['PA_check'] == False]
+     if not classifier_model:
+         classifier_model = st.session_state['vulnerability_classifier']
+
+     predictions = classifier_model(list(haystack_doc.text))
+
+     pred_labels = getlabels(predictions)
+
+     haystack_doc['Vulnerability Label'] = pred_labels
+     # placeholder = {}
+     # for j in range(len(temp)):
+     #     placeholder[temp[j]['label']] = temp[j]['score']
+     # list_.append(placeholder)
+     # labels_ = [{**list_[l]} for l in range(len(predictions))]
+     # truth_df = DataFrame.from_dict(labels_)
+     # truth_df = truth_df.round(2)
+     # truth_df = truth_df.astype(float) >= threshold
+     # truth_df = truth_df.astype(str)
+     # categories = list(truth_df.columns)
+     # truth_df['Vulnerability Label'] = truth_df.apply(lambda x: {i if x[i]=='True' else
+     #                                                             None for i in categories}, axis=1)
+     # truth_df['Vulnerability Label'] = truth_df.apply(lambda x: list(x['Vulnerability Label']
+     #                                                                 -{None}), axis=1)
+     # haystack_doc['Vulnerability Label'] = list(truth_df['Vulnerability Label'])
+     return haystack_doc
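
A toy sketch of the label decoding performed by `getlabels` (assumes multilabel one-hot predictions with 18 label columns, matching `label_dict`; rows without any positive label fall back to 'Other'):

import numpy as np
from utils.vulnerability_classifier import getlabels

toy_preds = np.array([[0] * 18, [0] * 18, [0] * 18])
toy_preds[0, 1] = 1   # -> 'Children'
toy_preds[2, 17] = 1  # -> 'Women and other genders'
print(getlabels(toy_preds))  # ['Children', 'Other', 'Women and other genders']
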