Spaces:

fatpuma
/

quillbot-text-classifier

Runtime error

File size: 5,197 Bytes

import streamlit as st
import pandas as pd
import pickle
from io import StringIO
from annotated_text import annotated_text
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Default minimum sentence length in characters; OutputSentence compares
# len(text) against this module-level name.
MIN_STRING_LENGTH = 25

# Tokenizer-related settings. Neither is referenced by the code visible in
# this file; presumably they mirror the training configuration -- TODO confirm.
vocab_size = 5000  # hopefully this isn't too big...
oov_tok = '<OOV>'  # out of vocabulary token

# Settings handed to pad_sequences() in tokenize_text().
max_len = 175
padding_type = 'post'
trunc_type = 'post'

def tokenize_text(x):
    """Turn texts into padded integer sequences.

    The module-level ``tokenizer`` converts each text in ``x`` to a sequence,
    and ``pad_sequences`` then pads/truncates every sequence to ``max_len``
    using the module-level ``padding_type`` and ``trunc_type`` settings.

    Args:
        x: The texts to convert (passed straight to
            ``tokenizer.texts_to_sequences``).

    Returns:
        Whatever ``pad_sequences`` returns for the converted sequences.
    """
    sequences = tokenizer.texts_to_sequences(x)
    padded = pad_sequences(
        sequences,
        maxlen=max_len,
        padding=padding_type,
        truncating=trunc_type,
    )
    return padded

def expand_one_paragraph(para):
    """Split ``para`` on every '.' and return the resulting list of pieces.

    The pieces are returned exactly as ``str.split`` produces them: the
    periods are removed, surrounding whitespace is kept, and a trailing
    period yields a final empty string.
    """
    return para.split('.')

def get_rgb_val(float_val):
    """Map a score to a 3-digit hex colour running from green to red.

    Args:
        float_val: Expected to lie between 0.0 and 1.0. Negative values are
            treated as "no score"; values above 1.0 are clamped to 1.0.

    Returns:
        '#000' for a negative input, otherwise '#RG0' where R grows and G
        shrinks with the score (0.0 -> '#0f0', 1.0 -> '#f00').
    """
    if float_val < 0.0:
        return '#000'

    # Clamp to a single hex digit. Without this, an input above 1.0 made
    # 15 - scaled_val negative (or scaled_val two digits wide) and the colour
    # was built from the wrong character of the hex() string.
    scaled_val = min(int(round(float_val * 15, 0)), 15)

    red_val = format(scaled_val, 'x')
    green_val = format(15 - scaled_val, 'x')
    return '#' + red_val + green_val + '0'
    
class OutputSentence:
    """A piece of input text together with its classification result.

    Attributes set by the constructor:
        text: The text as given.
        confidence: The supplied confidence, or -1 when the text is shorter
            than the module-level ``MIN_STRING_LENGTH``.
        is_quillbotted: False for short text or a confidence below 0.5,
            True otherwise.
    """

    def __init__(self, text, confidence):
        self.text = text

        # Text below the minimum length is not scored: -1 marks "no score".
        if len(text) < MIN_STRING_LENGTH:
            self.confidence = -1
            self.is_quillbotted = False
            return

        self.confidence = confidence
        # Flagged unless the confidence is strictly below 0.5.
        self.is_quillbotted = not (confidence < 0.5)
    

# Sidebar controls. Each widget's current value is bound to a module-level
# name that the rest of the script reads.
with st.sidebar:
    # Which saved model/tokenizer pair to use.
    option = st.selectbox(
        'Model Options',
        ('Limited-text-processing', 'Moderate-text-processing'),
    )

    # Rebinds the module-level MIN_STRING_LENGTH to the slider value.
    MIN_STRING_LENGTH = st.slider(
        'Minimum String Length in Characters: (Default 25)',
        min_value=0,
        max_value=200,
        value=25,
    )

    # When unchecked, non-flagged sentences are shown without highlighting.
    show_clear = st.checkbox('Show Cleared Sentences', value=True)

    # 'Confidence' labels sentences with their score; 'Binary' does not.
    display_type = st.radio("Select Display Output", ('Confidence', 'Binary'))
    
# Sidebar option -> version suffix of the saved artifacts. Any option that is
# not listed falls back to 'v1'.
_VERSION_BY_OPTION = {
    'Limited-text-processing': 'v1',
    'Moderate-text-processing': 'v2',
}
model_name = tokenizer_name = _VERSION_BY_OPTION.get(option, 'v1')

model_path = 'frontend_files/finalmodel' + model_name + '.h5'
tokenizer_path = 'frontend_files/tokenizer' + tokenizer_name + '.pickle'

# Load the Keras model and unpickle the matching tokenizer from disk.
reconstructed_model = keras.models.load_model(model_path)
with open(tokenizer_path, 'rb') as handle:
    tokenizer = pickle.load(handle)

# =========================================

# File-upload handling below is taken from https://pythonwife.com/file-upload-download-with-streamlit/

import docx2txt
import pdfplumber

def _read_uploaded_text(uploaded_file):
    """Return the text of an uploaded txt, pdf or docx file.

    Args:
        uploaded_file: The object returned by ``st.file_uploader``.

    Returns:
        The extracted text, or ``None`` when a PDF could not be read (an
        error message is shown on the page in that case).
    """
    if uploaded_file.type == "text/plain":
        # Read as string (decode bytes to string)
        return str(uploaded_file.read(), "utf-8")

    if uploaded_file.type == "application/pdf":
        try:
            with pdfplumber.open(uploaded_file) as pdf:
                # A page may yield None instead of a string; count it as
                # empty so the join cannot fail.
                return ''.join(page.extract_text() or '' for page in pdf.pages)
        except Exception as err:
            # Report the failure instead of swallowing it and then crashing
            # later on an undefined text variable.
            st.error("Could not read the uploaded PDF: " + str(err))
            return None

    # Every other accepted upload type is handed to docx2txt.
    return docx2txt.process(uploaded_file)


docx_file = st.file_uploader("Choose a text file", type=['pdf', 'docx', 'txt'])

text_data = None
if docx_file is not None:
    text_data = _read_uploaded_text(docx_file)

# =========================================

if text_data is not None:
    # Split into sentence candidates and score each one with the model.
    input_list = expand_one_paragraph(text_data)

    vectorized_input = tokenize_text(input_list)
    model_output = reconstructed_model.predict(vectorized_input)

    # Keep only sentences longer than the configured minimum length; the
    # first value of each prediction row is used as the confidence.
    output_paragraph = [
        OutputSentence(sentence, prediction[0])
        for sentence, prediction in zip(input_list, model_output)
        if len(sentence) > MIN_STRING_LENGTH
    ]

    suspect_items = [item for item in output_paragraph if item.is_quillbotted]
    total_quillbotted = len(suspect_items)
    total_safe = len(output_paragraph) - total_quillbotted

    if not output_paragraph:
        # Nothing passed the length filter: the ratios below would divide by
        # zero, so report that instead.
        st.warning("No sentences longer than the minimum string length were found.")
    else:
        # Average over the flagged sentences only; 0 when none are flagged
        # (previously a ZeroDivisionError).
        if total_quillbotted:
            average_confidence = (
                sum(item.confidence for item in suspect_items) / total_quillbotted
            )
        else:
            average_confidence = 0

        col1, col2, col3 = st.columns(3)
        per = str(round((total_quillbotted / len(output_paragraph)) * 100, 2)) + '%'
        conf_per = str(round(average_confidence * 100, 2)) + '%'
        fraction = str(total_quillbotted) + '/' + str(len(output_paragraph))

        col1.metric("Fraction of Suspect Sentences", fraction)
        col2.metric("Percentage of Suspect Sentences", per)
        col3.metric("avg. Confidence of Suspect Sentences", conf_per)

        for item in output_paragraph:
            # Negative confidence is the "not scored" marker; a score of
            # exactly 0.0 is a real score and must still be displayed.
            if item.confidence < 0.0:
                continue

            if display_type == 'Confidence':
                display_value = item.confidence
            elif display_type == 'Binary':
                # Fixed colour levels for the two classes.
                display_value = 0.85 if item.is_quillbotted else 0.2

            if (not show_clear) and (not item.is_quillbotted):
                # Cleared sentences are shown as plain, un-highlighted text.
                annotated_text(item.text)
            elif display_type == 'Binary':
                annotated_text((item.text, '', get_rgb_val(display_value)),)
            else:
                annotated_text((item.text, str(display_value), get_rgb_val(display_value)),)

    st.write('Finished...')