# fatpuma's picture
# Fixed pdf upload
# 78d14c2
import streamlit as st
import pandas as pd
import pickle
from io import StringIO
from annotated_text import annotated_text
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# --- Tokenizer vocabulary settings ---
vocab_size = 5000  # hopefully this isn't too big...
oov_tok = '<OOV>'  # out of vocabulary token

# --- Sequence shaping settings, passed to pad_sequences by tokenize_text ---
max_len = 175
padding_type = 'post'
trunc_type = 'post'

# Default minimum sentence length in characters; the sidebar slider further
# down rebinds this name on every run.
MIN_STRING_LENGTH = 25
def tokenize_text(x):
    """Encode texts with the loaded tokenizer and pad them for the model.

    ``x`` is passed to the module-level ``tokenizer.texts_to_sequences``;
    the resulting sequences are then padded/truncated by ``pad_sequences``
    using the module-level ``max_len``, ``padding_type`` and ``trunc_type``.
    The value returned is whatever ``pad_sequences`` produces.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(x),
        maxlen=max_len,
        padding=padding_type,
        truncating=trunc_type,
    )
def expand_one_paragraph(para):
    """Split ``para`` at every '.' character and return the list of pieces.

    The periods themselves are dropped, and text ending in a period yields
    a trailing empty string (standard ``str.split`` behaviour).
    """
    return para.split('.')
# expects a float value between 0.0 and 1.0
def get_rgb_val(float_val):
    """Convert a score into a 3-digit hex colour between green and red.

    0.0 maps to '#0f0' (green) and 1.0 to '#f00' (red); the blue channel is
    always 0. Scores above 1.0 are clamped to 1.0 so each channel stays a
    single hex digit. Negative scores and NaN return black ('#000').

    Args:
        float_val: score, nominally in the range 0.0 to 1.0.

    Returns:
        A string of the form '#rg0'.
    """
    # `not >=` (rather than `<`) also catches NaN, which would otherwise
    # raise inside int(round(...)).
    if not float_val >= 0.0:
        return '#000'
    # Without the clamp, e.g. 1.5 scaled to 22 and produced the colour '#670'.
    clamped = min(float_val, 1.0)
    red_level = int(round(clamped * 15, 0))
    green_level = 15 - red_level
    return '#' + format(red_level, 'x') + format(green_level, 'x') + '0'
class OutputSentence:
    """One sentence together with its score and the derived verdict.

    Attributes set by ``__init__``:
        text: the sentence as given.
        confidence: the supplied score, or -1 when the sentence is shorter
            than the module-level ``MIN_STRING_LENGTH``.
        is_quillbotted: False for too-short sentences and for scores below
            0.5, True otherwise.
    """

    def __init__(self, text, confidence):
        self.text = text
        too_short = len(text) < MIN_STRING_LENGTH
        # -1 is the marker for "not scored because the sentence is too short".
        self.confidence = -1 if too_short else confidence
        # Short-circuit keeps the score comparison out of the too-short case.
        self.is_quillbotted = not (too_short or self.confidence < 0.5)
# Sidebar controls, attached through the st.sidebar object rather than a
# `with st.sidebar:` block.
# NOTE(review): assumes all four widgets belong in the sidebar — confirm
# against the deployed layout.
option = st.sidebar.selectbox(
    'Model Options',
    ('Limited-text-processing', 'Moderate-text-processing',),
)
# Rebinds the module-level MIN_STRING_LENGTH that OutputSentence reads.
MIN_STRING_LENGTH = st.sidebar.slider(
    'Minimum String Length in Characters: (Default 25)',
    min_value=0,
    max_value=200,
    value=25,
)
show_clear = st.sidebar.checkbox('Show Cleared Sentences', value=True)
display_type = st.sidebar.radio("Select Display Output", ('Confidence', 'Binary'))
# Pick the file suffix shared by the saved model and tokenizer: only the
# 'Moderate-text-processing' option selects 'v2'; everything else uses 'v1'.
if option == 'Moderate-text-processing':
    model_name = tokenizer_name = 'v2'
else:
    model_name = tokenizer_name = 'v1'

model_path = 'frontend_files/finalmodel' + model_name + '.h5'
tokenizer_path = 'frontend_files/tokenizer' + tokenizer_name + '.pickle'

# Both artefacts are read from the app's own frontend_files directory.
reconstructed_model = keras.models.load_model(model_path)
with open(tokenizer_path, 'rb') as handle:
    tokenizer = pickle.load(handle)
# =========================================
# Upload handling adapted from
# https://pythonwife.com/file-upload-download-with-streamlit/
import docx2txt
import pdfplumber


def _read_uploaded_text(uploaded_file):
    """Return the text content of an uploaded txt, pdf or docx file.

    The reader is chosen from ``uploaded_file.type``: "text/plain" is decoded
    as UTF-8, "application/pdf" is read page by page with pdfplumber, and
    anything else is handed to docx2txt. Errors raised by the readers
    propagate to the caller.
    """
    if uploaded_file.type == "text/plain":
        # Read as string (decode bytes to string)
        return str(uploaded_file.read(), "utf-8")
    if uploaded_file.type == "application/pdf":
        with pdfplumber.open(uploaded_file) as pdf:
            # `or ''` guards against extract_text() yielding None for a page
            # with no extractable text (reported for older pdfplumber
            # releases — TODO confirm for the installed version).
            return ''.join(page.extract_text() or '' for page in pdf.pages)
    return docx2txt.process(uploaded_file)


docx_file = st.file_uploader("Choose a text file", type=['pdf', 'docx', 'txt'])
if docx_file is not None:
    try:
        text_data = _read_uploaded_text(docx_file)
    except Exception as err:  # UI boundary: report the failure, then halt
        # Previously a failed PDF read printed "None" and the script then
        # crashed with a NameError because text_data was never assigned.
        st.error('Could not read the uploaded file: ' + str(err))
        st.stop()
    # =========================================
    input_list = expand_one_paragraph(text_data)
    vectorized_input = tokenize_text(input_list)
    model_output = reconstructed_model.predict(vectorized_input)

    # Keep only sentences longer than the configured minimum, pairing each
    # with the first value of its prediction row.
    output_paragraph = [
        OutputSentence(sentence, scores[0])
        for sentence, scores in zip(input_list, model_output)
        if len(sentence) > MIN_STRING_LENGTH
    ]

    suspect_confidences = [
        item.confidence for item in output_paragraph if item.is_quillbotted
    ]
    total_quillbotted = len(suspect_confidences)
    total_safe = len(output_paragraph) - total_quillbotted
    # Guard both divisions: there may be no flagged sentences, or no
    # sentences at all above the minimum length.
    if total_quillbotted:
        average_confidence = sum(suspect_confidences) / total_quillbotted
    else:
        average_confidence = 0.0
    if output_paragraph:
        suspect_share = total_quillbotted / len(output_paragraph)
    else:
        suspect_share = 0.0

    col1, col2, col3 = st.columns(3)
    per = str(round(suspect_share * 100, 2)) + '%'
    conf_per = str(round(average_confidence * 100, 2)) + '%'
    fraction = str(total_quillbotted) + '/' + str(len(output_paragraph))
    col1.metric("Fraction of Suspect Sentences", fraction)
    col2.metric("Percentage of Suspect Sentences", per)
    col3.metric("avg. Confidence of Suspect Sentences", conf_per)

    for item in output_paragraph:
        # Sentences without a positive score are not rendered.
        if not item.confidence > 0.0:
            continue
        if (not show_clear) and (not item.is_quillbotted):
            # Cleared sentences are shown as plain text when highlighting of
            # cleared sentences is switched off.
            annotated_text(item.text)
            continue
        if display_type == 'Binary':
            # Fixed colours for the two verdicts, no numeric label.
            display_value = 0.85 if item.is_quillbotted else 0.2
            label = ''
        else:
            display_value = item.confidence
            label = str(display_value)
        annotated_text((item.text, label, get_rgb_val(display_value)),)
    st.write('Finished...')