Spaces:
Running
Running
# set path | |
import glob, os, sys; | |
sys.path.append('../utils') | |
#import needed libraries | |
import seaborn as sns | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import pandas as pd | |
import streamlit as st | |
from utils.vulnerability_classifier import load_vulnerabilityClassifier, vulnerability_classification | |
import logging | |
logger = logging.getLogger(__name__) | |
from utils.config import get_classifier_params | |
from utils.preprocessing import paraLengthCheck | |
from io import BytesIO | |
import xlsxwriter | |
import plotly.express as px | |
from utils.vulnerability_classifier import label_dict | |
# Module-level configuration for the vulnerability classifier.
# `classifier_identifier` selects which parameter set is fetched below and is
# also used in app() to build the '<identifier>_classifier' session-state key.
classifier_identifier = 'vulnerability'
# Parameters for this classifier; app() reads the 'model_name' and
# 'threshold' entries from it.
params = get_classifier_params(classifier_identifier)
def to_excel(df, sectorlist):
    """Serialize *df* to an in-memory ``.xlsx`` workbook with dropdown cells.

    The frame is written (without its index) to ``Sheet1``. List-type data
    validations are then attached so that column S offers
    ``No``/``Yes``/``Discard`` and columns T, U, V, W and X each offer the
    entries of *sectorlist* plus ``Blank``.

    Args:
        df: The pandas DataFrame to export.
        sectorlist: Sequence of strings offered as dropdown choices in the
            sector columns.

    Returns:
        bytes: The content of the generated workbook.
    """
    # The header occupies row 1, so data lives in rows 2 .. len(df) + 1.
    # Clamp to row 2 so an empty frame still yields a valid range.
    last_row = max(len(df), 1) + 1
    sector_choices = [*sectorlist, 'Blank']

    output = BytesIO()
    # The context manager saves and closes the workbook on exit;
    # ExcelWriter.save() no longer exists in pandas >= 2.0.
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name='Sheet1')
        worksheet = writer.sheets['Sheet1']

        worksheet.data_validation(
            'S2:S{}'.format(last_row),
            {'validate': 'list', 'source': ['No', 'Yes', 'Discard']},
        )
        # One independent dropdown range per sector column. The original
        # 'W2:U{}' range was a typo that left W without its own range.
        for column in ('T', 'U', 'V', 'W', 'X'):
            worksheet.data_validation(
                '{0}2:{0}{1}'.format(column, last_row),
                {'validate': 'list', 'source': sector_choices},
            )

    return output.getvalue()
def app():
    """Run the vulnerability classifier on the processed document, if any.

    When ``st.session_state`` holds a ``key0`` entry, the classifier named by
    ``params['model_name']`` is loaded and stored in session state under
    ``'<classifier_identifier>_classifier'``. ``vulnerability_classification``
    is then called on the ``key0`` data with ``params['threshold']`` and its
    result is stored under ``key1``. Without ``key0`` nothing happens.
    """
    with st.container():
        # Nothing to classify until a document has been processed.
        if 'key0' not in st.session_state:
            return

        paragraphs = st.session_state.key0

        # Load the model and register it in session state.
        # NOTE(review): presumably vulnerability_classification picks the
        # classifier up from session state — confirm in utils.
        model = load_vulnerabilityClassifier(
            classifier_name=params['model_name'])
        state_key = '{}_classifier'.format(classifier_identifier)
        st.session_state[state_key] = model

        # Predict and keep the result under key1.
        predictions = vulnerability_classification(
            haystack_doc=paragraphs,
            threshold=params['threshold'])
        st.session_state.key1 = predictions
def vulnerability_display():
    """Render the vulnerable-groups overview: summary text, bar chart, table.

    Reads the dataframe stored under ``st.session_state['key0']`` and uses its
    ``'Vulnerability Label'`` column, whose cells are iterated as collections
    of label names. A paragraph counts as a reference when its labels do not
    contain ``'Other'``.

    Layout:
        * left column  - header and a summary sentence with the counts,
        * right column - bar chart of how often each known label occurs
          (``'Other'`` excluded),
        * below        - table of all paragraphs that are references.
    """
    # Function-scope imports keep this block self-contained.
    from collections import Counter
    from itertools import chain

    df_vul = st.session_state['key0']
    labels = df_vul['Vulnerability Label']
    # Computed once; reused for the summary count and the table filter.
    is_reference = labels.apply(lambda x: 'Other' not in x)

    col1, col2 = st.columns([1, 1])

    with col1:
        st.subheader("Explore references to vulnerable groups:")

        num_paragraphs = len(labels)
        num_references = is_reference.sum()

        # The snippet previously closed two <div>s for one opening tag.
        st.markdown(f"""<div style="text-align: justify;"> The document contains a
        total of <span style="color: red;">{num_paragraphs}</span> paragraphs.
        We identified <span style="color: red;">{num_references}</span>
        references to groups in vulnerable situations.</div>
        <br>
        In the chart on the right you can see how often each group has been referenced.
        For a more detailed view in the text, see the paragraphs and
        their respective labels in the table below.""", unsafe_allow_html=True)

    with col2:
        # All known labels, so groups without any hit still appear.
        df_labels = pd.DataFrame(list(label_dict.items()),
                                 columns=['Label ID', 'Label'])

        # Count every label occurrence across all paragraphs.
        group_counts = Counter(chain.from_iterable(labels))
        df_label_count = pd.DataFrame(list(group_counts.items()),
                                      columns=['Label', 'Count'])

        # The left merge keeps every known label; labels that never occurred
        # come back as NaN, so make them an explicit zero.
        df_label_count = df_labels.merge(df_label_count, on='Label', how='left')
        df_label_count['Count'] = df_label_count['Count'].fillna(0).astype(int)

        # Exclude the "Other" group from the chart.
        df_bar_chart = df_label_count[df_label_count['Label'] != 'Other']

        fig = px.bar(df_bar_chart,
                     x='Label',
                     y='Count',
                     title='How many references have been found to each group?',
                     labels={'Count': 'Frequency'})
        st.plotly_chart(fig, use_container_width=True)

    # Table of the paragraphs that reference a vulnerable group.
    st.write(df_vul[is_reference])