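# Streamlit app: upload a .txt, .pdf, or .docx file, split it into sentences,
# and highlight the sentences a Keras text classifier flags as likely
# QuillBot-paraphrased.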
import streamlit as st
import pandas as pd
import pickle
from io import StringIO
from annotated_text import annotated_text
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len = 175
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'  # out-of-vocabulary token
vocab_size = 5000  # hopefully this isn't too big...
MIN_STRING_LENGTH = 25
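
# Convert a list of sentences into padded integer sequences of length max_len,
# using the tokenizer loaded below.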
def tokenize_text(x):
    training_sequences = tokenizer.texts_to_sequences(x)
    return pad_sequences(training_sequences,
                         maxlen=max_len,
                         padding=padding_type,
                         truncating=trunc_type)
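
# Naive sentence splitter: break the text on full stops.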
def expand_one_paragraph(para):
    sentences = para.split('.')
    return sentences
# expects a float value between 0.0 and 1.0
def get_rgb_val(float_val):
    if float_val < 0.0:
        return '#000'
    else:
        scaled_val = int(round(float_val * 15, 0))
        green_val = hex(15 - scaled_val)[-1]
        red_val = hex(scaled_val)[-1]
        return '#' + red_val + green_val + '0'
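
# Pairs a sentence with the model's confidence score; sentences shorter than
# MIN_STRING_LENGTH get confidence -1 and are never flagged.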
class OutputSentence:
    def __init__(self, text, confidence):
        self.text = text
        if len(self.text) < MIN_STRING_LENGTH:
            self.confidence = -1
            self.is_quillbotted = False
        else:
            self.confidence = confidence
            if self.confidence < 0.5:
                self.is_quillbotted = False
            else:
                self.is_quillbotted = True
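
# Sidebar controls: model variant, minimum sentence length, and display mode.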
with st.sidebar:
    option = st.selectbox(
        'Model Options',
        ('Limited-text-processing', 'Moderate-text-processing'))
    MIN_STRING_LENGTH = st.slider('Minimum String Length in Characters (Default 25)', 0, 200, 25)
    show_clear = st.checkbox('Show Cleared Sentences', value=True)
    display_type = st.radio("Select Display Output", ('Confidence', 'Binary'))
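
# Map the selected option to the saved model/tokenizer version.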
model_name = 'v1'
tokenizer_name = 'v1'
if option == 'Limited-text-processing':
    model_name = 'v1'
    tokenizer_name = 'v1'
elif option == 'Moderate-text-processing':
    model_name = 'v2'
    tokenizer_name = 'v2'
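
# Load the trained Keras model and the tokenizer pickled alongside it.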
reconstructed_model = keras.models.load_model('frontend_files/finalmodel' + model_name + '.h5')
with open('frontend_files/tokenizer' + tokenizer_name + '.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
# =========================================
# File upload handling adapted from
# https://pythonwife.com/file-upload-download-with-streamlit/
import docx2txt
import pdfplumber

text_data = None
docx_file = st.file_uploader("Choose a text file", type=['pdf', 'docx', 'txt'])
if docx_file is not None:
    if docx_file.type == "text/plain":
        # Read as string (decode bytes to string)
        text_data = str(docx_file.read(), "utf-8")
    elif docx_file.type == "application/pdf":
        try:
            with pdfplumber.open(docx_file) as pdf:
                pages = ''
                for page in pdf.pages:
                    # extract_text() can return None for pages without text
                    pages = pages + (page.extract_text() or '')
                text_data = pages
        except Exception:
            st.write("Could not extract any text from the PDF.")
    else:
        text_data = docx2txt.process(docx_file)
# =========================================
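
# Score the document only once an upload has produced some text.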
if text_data:
    input_list = expand_one_paragraph(text_data)
    vectorized_input = tokenize_text(input_list)
    model_output = reconstructed_model.predict(vectorized_input)

    # Keep only sentences long enough to score, paired with their confidence.
    output_paragraph = []
    for i in range(len(input_list)):
        if len(input_list[i]) > MIN_STRING_LENGTH:
            output_paragraph.append(OutputSentence(input_list[i], model_output[i][0]))

    # Aggregate counts and the average confidence of the flagged sentences.
    total_quillbotted = 0
    total_safe = 0
    average_confidence = 0
    for item in output_paragraph:
        if item.is_quillbotted:
            total_quillbotted = 1 + total_quillbotted
            average_confidence = item.confidence + average_confidence
        else:
            total_safe = 1 + total_safe
    if total_quillbotted > 0:
        average_confidence = average_confidence / total_quillbotted
    # Summary metrics; only meaningful when at least one sentence was scored.
    if output_paragraph:
        col1, col2, col3 = st.columns(3)
        per = str(round((total_quillbotted / len(output_paragraph)) * 100, 2)) + '%'
        conf_per = str(round(average_confidence * 100, 2)) + '%'
        fraction = str(total_quillbotted) + '/' + str(len(output_paragraph))
        col1.metric("Fraction of Suspect Sentences", fraction)
        col2.metric("Percentage of Suspect Sentences", per)
        col3.metric("Avg. Confidence of Suspect Sentences", conf_per)
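
    # Render each sentence with a red-green highlight keyed to the model's
    # confidence (green = likely clean, red = likely paraphrased). When
    # 'Show Cleared Sentences' is unchecked, cleared sentences are shown
    # without a highlight.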
    for item in output_paragraph:
        if display_type == 'Confidence':
            display_value = item.confidence
        elif display_type == 'Binary':
            if item.is_quillbotted:
                display_value = 0.85
            else:
                display_value = 0.2
        if item.confidence > 0.0:
            if (not show_clear) and (not item.is_quillbotted):
                annotated_text(item.text)
            else:
                if display_type == 'Binary':
                    annotated_text((item.text, '', get_rgb_val(display_value)))
                else:
                    annotated_text((item.text, str(display_value), get_rgb_val(display_value)))

    st.write('Finished...')