# -*- coding: utf-8 -*-
"""Model-Anxiety_label_training_bert.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1k4LW-0K6Y61H_UmqGwSe0h87qRaK8pIG

#Prediction of anxiety levels through text analysis

#Transcript loading method

When considering both the interviewer and the participant, the dataset is reduced to the sessions of 186 individuals, as 3 transcripts do not contain the text corresponding to Ellie, the virtual interviewer.
"""
#pip install transformers
#pip install -q gensim
#pip install Keras-Preprocessing

import transformers
"""#Importing the required libraries""" | |
import glob
import pandas as pd
import numpy as np
import re
import fnmatch
import os

import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Embedding, LSTM, Input, Activation, GlobalAveragePooling1D, Flatten, Concatenate, Conv1D, MaxPooling1D
from tensorflow.keras.layers import BatchNormalization
from keras.layers import concatenate
from keras.optimizers import SGD, RMSprop, Adagrad, Adam
from keras.preprocessing.text import one_hot, text_to_word_sequence, Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.utils.vis_utils import plot_model

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from string import punctuation
from scipy import stats

import matplotlib
import matplotlib.pyplot as plt
import itertools
import gensim
import nltk

# Download the NLTK resources needed for lemmatization and stopword removal
nltk.download('wordnet')
nltk.download('stopwords')

wordnet_lemmatizer = WordNetLemmatizer()
# Load the merged interview turns: one row per (Ellie question, participant answer)
all_participants = pd.read_csv('all.csv', sep=',')
all_participants.columns = ['index', 'personId', 'question', 'answer']
all_participants = all_participants.astype({"index": float, "personId": float, "question": str, "answer": str})
# The function "text_to_wordlist" is from
# https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text
def text_to_wordlist(text, remove_stopwords=True, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.

    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally remove stop words, lemmatizing the tokens that remain
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [wordnet_lemmatizer.lemmatize(w) for w in text if w not in stops]

    text = [w for w in text if w != "nan"]
    text = " ".join(text)

    # Clean the text: expand contractions and normalize punctuation
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"\<", " ", text)
    text = re.sub(r"\>", " ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    # Return the cleaned text as a single string
    return text
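# Quick check of the cleaning pipeline on an illustrative sentence
# (output is roughly "i am feeling quite anxious cannot sleep night"):
print(text_to_wordlist("I'm feeling quite anxious, I can't sleep at night."))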
# Clean every answer and build the vocabulary over all participant turns
all_participants_mix = all_participants.copy()
all_participants_mix['answer'] = all_participants_mix.apply(lambda row: text_to_wordlist(row.answer).split(), axis=1)

words = set(itertools.chain(*all_participants_mix['answer'].tolist()))
vocab_size = len(words)
windows_size = 10

# Fit a word-index tokenizer on the cleaned answers
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(all_participants_mix['answer'])
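# Sanity check (illustrative): encode one cleaned sentence and pad it to the
# 10-token window used throughout
sample = text_to_wordlist("I have been feeling nervous lately")
print(pad_sequences(tokenizer.texts_to_sequences([sample]), value=0, padding="post", maxlen=windows_size))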
# PHQ-8 severity levels, one per bin used in load_avec_dataset_file below
labels = ['None', 'Mild', 'Moderate', 'Moderately Severe', 'Severe']
num_classes = len(labels)
def plot_acc(history, title="Model Accuracy"):
    """Plots the accuracy per epoch obtained during a training run."""
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title(title)
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Val'], loc='upper left')
    plt.show()

def plot_loss(history, title="Model Loss"):
    """Plots the loss per epoch obtained during a training run."""
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title(title)
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Val'], loc='upper right')
    plt.show()
def plot_compare_losses(history1, history2, name1="Network 1",
                        name2="Network 2", title="Graph title"):
    """Compares the losses of two training runs named name1 and name2."""
    plt.plot(history1.history['loss'], color="green")
    plt.plot(history1.history['val_loss'], linestyle='--', color="green")
    plt.plot(history2.history['loss'], color="blue")
    plt.plot(history2.history['val_loss'], linestyle='--', color="blue")
    plt.title(title)
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train ' + name1, 'Val ' + name1,
                'Train ' + name2, 'Val ' + name2],
               loc='upper right')
    plt.show()
def plot_compare_accs(history1, history2, name1="Network 1",
                      name2="Network 2", title="Graph title"):
    """Compares the accuracies of two training runs named name1 and name2."""
    plt.plot(history1.history['accuracy'], color="green")
    plt.plot(history1.history['val_accuracy'], linestyle='--', color="green")
    plt.plot(history2.history['accuracy'], color="blue")
    plt.plot(history2.history['val_accuracy'], linestyle='--', color="blue")
    plt.title(title)
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train ' + name1, 'Val ' + name1,
                'Train ' + name2, 'Val ' + name2],
               loc='lower right')
    plt.show()
def plot_compare_multiple_metrics(history_array, names, colors, title="Graph title", metric='accuracy'):
    """Compares a given metric across several training runs."""
    legend = []
    for i in range(0, len(history_array)):
        plt.plot(history_array[i].history[metric], color=colors[i])
        plt.plot(history_array[i].history['val_' + metric], linestyle='--', color=colors[i])
        legend.append('Train ' + names[i])
        legend.append('Val ' + names[i])
    plt.title(title)
    plt.ylabel(metric.capitalize())
    plt.xlabel('Epoch')
    plt.legend(legend, loc='lower right')
    plt.show()
"""#Loading and preprocessing of transcripts | |
#Data analysis | |
#Auxiliary functions for text processing | |
Function taken from Kaggle for text cleaning | |
""" | |
#nltk.download('omw-1.4') | |
"""Text cleaning | |
Lemmatization | |
Separation into vectors | |
""" | |
def load_avec_dataset_file(path, score_column):
    """Loads an AVEC label file and bins the PHQ-8 score into 5 severity levels."""
    ds = pd.read_csv(path, sep=',')
    # PHQ-8 bins: 0-4 none, 5-9 mild, 10-14 moderate, 15-19 moderately severe, 20-24 severe
    ds['level'] = pd.cut(ds[score_column], bins=[-1, 4, 9, 14, 19, 24], labels=[0, 1, 2, 3, 4])
    ds['PHQ8_Score'] = ds[score_column]
    ds['cat_level'] = keras.utils.to_categorical(ds['level'], num_classes).tolist()
    ds = ds[['Participant_ID', 'level', 'cat_level', 'PHQ8_Score']]
    ds = ds.astype({"Participant_ID": float, "level": int, 'PHQ8_Score': int})
    return ds
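# Example usage. The label-file name below is an assumption (the AVEC 2017
# depression sub-challenge ships per-split CSVs with Participant_ID and
# PHQ8_Score columns); adjust the path to your local copy.
train_labels = load_avec_dataset_file('train_split_Depression_AVEC2017.csv', 'PHQ8_Score')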
def split_by_phq_level(ds):
    none_ds = ds[ds['level'] == 0]
    mild_ds = ds[ds['level'] == 1]
    moderate_ds = ds[ds['level'] == 2]
    moderate_severe_ds = ds[ds['level'] == 3]
    severe_ds = ds[ds['level'] == 4]
    return (none_ds, mild_ds, moderate_ds, moderate_severe_ds, severe_ds)
def distribute_instances(ds):
    """Stratified 70/14/16 split that preserves the PHQ-8 level proportions."""
    ds_shuffled = ds.sample(frac=1)
    none_ds, mild_ds, moderate_ds, moderate_severe_ds, severe_ds = split_by_phq_level(ds_shuffled)
    split = [70, 14, 16]
    eq_ds = {}
    prev_none = prev_mild = prev_moderate = prev_moderate_severe = prev_severe = 0
    for p in split:
        last_none = min(len(none_ds), prev_none + round(len(none_ds) * p/100))
        last_mild = min(len(mild_ds), prev_mild + round(len(mild_ds) * p/100))
        last_moderate = min(len(moderate_ds), prev_moderate + round(len(moderate_ds) * p/100))
        last_moderate_severe = min(len(moderate_severe_ds), prev_moderate_severe + round(len(moderate_severe_ds) * p/100))
        last_severe = min(len(severe_ds), prev_severe + round(len(severe_ds) * p/100))
        eq_ds["d"+str(p)] = pd.concat([none_ds[prev_none:last_none], mild_ds[prev_mild:last_mild], moderate_ds[prev_moderate:last_moderate], moderate_severe_ds[prev_moderate_severe:last_moderate_severe], severe_ds[prev_severe:last_severe]])
        prev_none = last_none
        prev_mild = last_mild
        prev_moderate = last_moderate
        prev_moderate_severe = last_moderate_severe
        prev_severe = last_severe
    return (eq_ds["d70"], eq_ds["d14"], eq_ds["d16"])
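# Stratified split of the labels loaded above (70% train, 14% validation,
# 16% test, keeping the severity-level proportions in each subset)
train_ds, val_ds, test_ds = distribute_instances(train_labels)
print(len(train_ds), len(val_ds), len(test_ds))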
def test_model(text, model):
    """Cleans, tokenizes and pads a sentence, then prints the model's prediction."""
    print(text)
    word_list = text_to_wordlist(text)
    sequences = tokenizer.texts_to_sequences([word_list])
    sequences_input = list(itertools.chain(*sequences))
    sequences_input = pad_sequences([sequences_input], value=0, padding="post", maxlen=windows_size).tolist()
    input_a = np.asarray(sequences_input)
    pred = model.predict(input_a, batch_size=None, verbose=0, steps=None)
    print(pred)
    predicted_class = np.argmax(pred)
    print(labels[predicted_class])
import pickle

windows_size = 10

# Load the trained model. It is bound to a distinct name because "Model"
# would shadow the keras.models.Model class imported above.
with open('model_bert.pkl', 'rb') as f:
    bert_model = pickle.load(f)
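# Smoke test with an illustrative sentence: prints the raw class
# probabilities followed by the predicted severity label
test_model("i have been feeling down and tired for weeks", bert_model)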
#pip install gradio
import gradio as gr

# Reload the pickled model for the demo app
with open('model_bert.pkl', 'rb') as f:
    gradio_model = pickle.load(f)
def predict(text):
    """Gradio handler: returns the predicted severity label for a sentence."""
    word_list = text_to_wordlist(text)
    sequences = tokenizer.texts_to_sequences([word_list])
    sequences_input = list(itertools.chain(*sequences))
    sequences_input = pad_sequences([sequences_input], value=0, padding="post", maxlen=windows_size).tolist()
    input_a = np.asarray(sequences_input)
    pred = gradio_model.predict(input_a, batch_size=None, verbose=0, steps=None)
    predicted_class = np.argmax(pred)
    return labels[predicted_class]
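# Quick local check of the handler before wiring it into Gradio (illustrative):
print(predict("lately i feel nervous all the time"))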
input_text = gr.inputs.Textbox(label="Enter a sentence")
output_text = gr.outputs.Textbox(label="Predicted label")

iface = gr.Interface(fn=predict, inputs=input_text, outputs=output_text, title="Depression Severity Analysis",
                     description="Enter text to classify its depression severity.")
iface.launch()