Spaces:

fatpuma
/

quillbot-text-classifier

Runtime error

App Files Files Community

quillbot-text-classifier / app.py

fatpuma

Fixed pdf upload

78d14c2 over 2 years ago

raw

history blame contribute delete

5.2 kB

	import streamlit as st
	import pandas as pd
	import pickle
	from io import StringIO
	from annotated_text import annotated_text
	import tensorflow as tf
	from tensorflow import keras
	from tensorflow.keras.preprocessing.text import Tokenizer
	from tensorflow.keras.preprocessing.sequence import pad_sequences

	# Padding settings consumed by tokenize_text() when it calls pad_sequences.
	max_len = 175           # every sequence is padded/truncated to this length
	padding_type = 'post'   # passed as `padding`
	trunc_type = 'post'     # passed as `truncating`

	# Tokenizer settings; not referenced anywhere else in this file (the
	# tokenizer itself is unpickled from disk) -- presumably kept to mirror
	# the training configuration.
	oov_tok = '<OOV>'       # out-of-vocabulary token
	vocab_size = 5000

	# Default minimum sentence length in characters; the sidebar slider
	# rebinds this name on every run.
	MIN_STRING_LENGTH = 25

	def tokenize_text(x):
	    """Turn a list of texts into a padded batch of token-id sequences.

	    Uses the module-level ``tokenizer`` to map each text in *x* to a
	    sequence, then pads/truncates every sequence to ``max_len`` using the
	    module-level ``padding_type`` and ``trunc_type`` settings.

	    Returns whatever ``pad_sequences`` returns for those sequences.
	    """
	    sequences = tokenizer.texts_to_sequences(x)
	    pad_options = {
	        'maxlen': max_len,
	        'padding': padding_type,
	        'truncating': trunc_type,
	    }
	    return pad_sequences(sequences, **pad_options)

	def expand_one_paragraph(para):
	    """Split *para* on every '.' and return the pieces as a list.

	    The periods themselves are dropped and the pieces are not stripped, so
	    leading spaces and a trailing empty string (for text ending in '.')
	    are preserved.
	    """
	    return para.split('.')

	# expects a float value between 0.0 and 1.0
	def get_rgb_val(float_val):
	    """Map a score to a 3-digit hex colour running from green to red.

	    Args:
	        float_val: Score, expected in the range 0.0 to 1.0.

	    Returns:
	        ``'#000'`` for negative scores; otherwise ``'#RG0'`` where the red
	        digit grows and the green digit shrinks as the score rises
	        (0.0 -> ``'#0f0'``, 1.0 -> ``'#f00'``). Scores above 1.0 are
	        treated as 1.0.
	    """
	    if float_val < 0.0:
	        return '#000'

	    # Clamp the upper bound: an unclamped value above 1.0 scales past 15,
	    # which no longer fits in a single hex digit and makes the green
	    # component negative.
	    red_level = int(round(min(float_val, 1.0) * 15, 0))
	    green_level = 15 - red_level
	    return f'#{red_level:x}{green_level:x}0'

	class OutputSentence:
	    """One sentence together with its classifier score and verdict.

	    Attributes set by the constructor:
	        text: the sentence as given.
	        confidence: the supplied score, or -1 when the sentence is shorter
	            than the module-level ``MIN_STRING_LENGTH``.
	        is_quillbotted: True when the sentence is long enough and its
	            score is not below 0.5, otherwise False.
	    """

	    def __init__(self, text, confidence):
	        self.text = text

	        # MIN_STRING_LENGTH is looked up at call time, so the value chosen
	        # in the sidebar slider is the one that applies.
	        too_short = len(self.text) < MIN_STRING_LENGTH

	        # -1 marks a sentence that was too short to be scored.
	        self.confidence = -1 if too_short else confidence
	        self.is_quillbotted = (not too_short) and not (self.confidence < 0.5)


	# Sidebar controls: model choice, minimum sentence length, and display options.
	with st.sidebar:
	    option = st.selectbox(
	        'Model Options',
	        ('Limited-text-processing', 'Moderate-text-processing',),
	    )
	    MIN_STRING_LENGTH = st.slider(
	        'Minimum String Length in Characters: (Default 25)',
	        min_value=0,
	        max_value=200,
	        value=25,
	    )
	    show_clear = st.checkbox('Show Cleared Sentences', value=True)
	    display_type = st.radio("Select Display Output", ('Confidence', 'Binary'))

	# The model and tokenizer files share a version suffix chosen by the
	# selected option; anything unrecognised falls back to 'v1'.
	_version_by_option = {
	    'Limited-text-processing': 'v1',
	    'Moderate-text-processing': 'v2',
	}
	model_name = tokenizer_name = _version_by_option.get(option, 'v1')

	reconstructed_model = keras.models.load_model(
	    'frontend_files/finalmodel' + model_name + '.h5'
	)

	# NOTE(review): pickle runs arbitrary code on load; this is acceptable only
	# while the tokenizer file is a trusted artifact shipped with the app.
	tokenizer_path = 'frontend_files/tokenizer' + tokenizer_name + '.pickle'
	with open(tokenizer_path, 'rb') as handle:
	    tokenizer = pickle.load(handle)

	# =========================================

	# got from https://pythonwife.com/file-upload-download-with-streamlit/

	import docx2txt
	import pdfplumber

	# Upload step: read the chosen file into `text_data` as one string.
	docx_file = st.file_uploader("Choose a text file", type=['pdf', 'docx', 'txt'])

	# Nothing to analyse until a file is uploaded; halt this script run so
	# later code never sees an unbound `text_data`.
	if docx_file is None:
	    st.stop()

	if docx_file.type == "text/plain":
	    # Decode bytes to text; undecodable bytes become U+FFFD instead of
	    # aborting the run on a file that is not valid UTF-8.
	    text_data = docx_file.read().decode("utf-8", errors="replace")

	elif docx_file.type == "application/pdf":
	    try:
	        with pdfplumber.open(docx_file) as pdf:
	            # extract_text() may yield None for a page without text;
	            # treat that as an empty page rather than failing the join.
	            text_data = ''.join(page.extract_text() or '' for page in pdf.pages)
	    except Exception as exc:
	        # Report the failure and stop: continuing would leave `text_data`
	        # unbound for the analysis below.
	        st.error(f"Could not read the uploaded PDF: {exc}")
	        st.stop()

	else:
	    # Any other accepted upload is handed to docx2txt.
	    text_data = docx2txt.process(docx_file)


	# =========================================


	# Split the document into sentences and score all of them in one batch.
	input_list = expand_one_paragraph(text_data)
	vectorized_input = tokenize_text(input_list)
	model_output = reconstructed_model.predict(vectorized_input)

	# Keep only sentences longer than the minimum length, pairing each with
	# the first value of its prediction row.
	output_paragraph = [
	    OutputSentence(sentence, model_output[idx][0])
	    for idx, sentence in enumerate(input_list)
	    if len(sentence) > MIN_STRING_LENGTH
	]

	# Summary statistics over the sentences that passed the length filter.
	flagged_confidences = [
	    item.confidence for item in output_paragraph if item.is_quillbotted
	]
	total_sentences = len(output_paragraph)
	total_quillbotted = len(flagged_confidences)
	total_safe = total_sentences - total_quillbotted

	# Both ratios are undefined when their denominator is zero (no flagged
	# sentences, or no sentences at all); show 'N/A' instead of dividing.
	# str() is used rather than an f-string so that a NumPy scalar, if that
	# is what the model returns, is printed the same way as before.
	if total_quillbotted:
	    average_confidence = sum(flagged_confidences) / total_quillbotted
	    conf_per = str(round(average_confidence * 100, 2)) + '%'
	else:
	    average_confidence = 0
	    conf_per = 'N/A'

	if total_sentences:
	    per = str(round((total_quillbotted / total_sentences) * 100, 2)) + '%'
	else:
	    per = 'N/A'

	fraction = str(total_quillbotted) + '/' + str(total_sentences)

	col1, col2, col3 = st.columns(3)
	col1.metric("Fraction of Suspect Sentences", fraction)
	col2.metric("Percentage of Suspect Sentences", per)
	col3.metric("avg. Confidence of Suspect Sentences", conf_per)

	# Render each kept sentence, coloured by its score.
	for sentence in output_paragraph:
	    if display_type == 'Confidence':
	        display_value = sentence.confidence
	    elif display_type == 'Binary':
	        # Binary mode uses two fixed stand-in scores for the colour.
	        display_value = 0.85 if sentence.is_quillbotted else 0.2

	    # Sentences without a positive confidence are not rendered at all.
	    if not (sentence.confidence > 0.0):
	        continue

	    if show_clear or sentence.is_quillbotted:
	        # Binary mode shows the colour only; Confidence mode also labels
	        # the sentence with its score.
	        label = '' if display_type == 'Binary' else str(display_value)
	        annotated_text((sentence.text, label, get_rgb_val(display_value)),)
	    else:
	        # Cleared sentences are shown as plain text when highlighting of
	        # cleared sentences is switched off.
	        annotated_text(sentence.text)

	st.write('Finished...')