# -*- coding: utf-8 -*-
"""JointmBERT.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/17r68TnVYmGnoeNYZhhbEqOeGxCn3fN8x
"""

# from google.colab import drive
# drive.mount('/content/drive')

# !nvidia-smi

#!pip install seqeval -qqq
# TODO: update this notebook to work with the latest version of transformers
#!pip install -q transformers==2.11.0

import tensorflow as tf

tf.__version__

"""#Importing Libraries"""

import os
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import streamlit as st
from tqdm import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from transformers import BertTokenizer, TFBertModel, TFAutoModel

from tensorflow.keras.layers import Dropout, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, LearningRateScheduler, ModelCheckpoint, TensorBoard

from seqeval.metrics import classification_report

warnings.filterwarnings('ignore')

"""#Loading Data"""

uploaded_file = st.file_uploader("translated_swa_test_slot_labels.xlsx")
df = pd.read_excel(uploaded_file)

df.shape

df.head(2)

df.rename(columns={'utterance_swa ': 'words',
                   'slot_labels_swa': 'word_labels',
                   'intent_swa': 'intent_label'}, inplace=True)
# df.head(2)

# The rename above targets 'utterance_swa ' (with a trailing space); when the
# column name has no trailing space, copy it over to 'words' explicitly.
df["words"] = df["utterance_swa"]
del df["utterance_swa"]

df.head(2)

df_train, df_valid = train_test_split(df, test_size=0.2)

df_train.head(2)

df_valid.head(2)

"""## A First Model: Intent Classification (Sentence Level)

Let's ignore the slot filling task for now and build a sentence-level classifier by fine-tuning a pre-trained Transformer-based model, using the `huggingface/transformers` package that provides both TF2/Keras and PyTorch APIs.

### The BERT Tokenizer

First, let's load a pre-trained tokenizer and test it on a sentence from the training set:
"""

from transformers import BertTokenizer

model_name = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)

"""#Testing word tokenization using BERT"""

first_sentence = df_train.iloc[0]["words"]
first_sentence

encoding = tokenizer.encode(first_sentence)
print(tokenizer.convert_ids_to_tokens(encoding))

"""Notice that BERT uses subword tokens, so the tokenized sentence is usually longer than the number of words in the sentence.

Remarks:

- The first token `[CLS]` is used by the pre-training task for sequence classification.
- The last token `[SEP]` is a separator for the pre-training task that classifies whether a pair of sentences are consecutive in a corpus or not (next sentence prediction).
"""
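"""As a quick sanity check of the subword behaviour described above, here is a minimal sketch (reusing `first_sentence` and `tokenizer` from the cells above) that compares whitespace-separated words with the WordPiece tokens BERT actually sees:"""

# Sanity-check sketch: word count vs. subword count for the first sentence.
word_list = first_sentence.split()
subwords = tokenizer.tokenize(first_sentence)
print(len(word_list), "words ->", len(subwords), "subword tokens before adding [CLS]/[SEP]")
# Continuation pieces are prefixed with "##"; the label-alignment code later in
# this notebook has to account for them.
print([t for t in subwords if t.startswith("##")])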
"""#Data Preprocessing

Checking the length of the sequences after tokenization, so that we can pad them all to fixed-length vectors for the training set.
"""

train_sequence_lengths = [len(tokenizer.encode(text)) for text in df_train["words"]]
plt.hist(train_sequence_lengths, bins=30)
plt.title(f"max sequence length: {max(train_sequence_lengths)}")

"""The token-to-id mapping can be introspected in the `tokenizer.vocab` attribute.

### Encoding the Dataset with the Tokenizer

Encode the full train / valid (and test) sets with the BERT tokenizer to get padded integer numpy arrays:
"""

import numpy as np


def encode_dataset(tokenizer, text_sequences, max_length):
    # Assumes every encoded sequence fits in max_length (checked with the
    # histogram above).
    token_ids = np.zeros(shape=(len(text_sequences), max_length), dtype=np.int32)
    for i, text_sequence in enumerate(text_sequences):
        encoded = tokenizer.encode(text_sequence)
        token_ids[i, 0:len(encoded)] = encoded
    # transformers expects the key "attention_mask" (singular) when the inputs
    # are passed as a dict; with any other key the mask is silently ignored.
    attention_masks = (token_ids != 0).astype(np.int32)
    return {"input_ids": token_ids, "attention_mask": attention_masks}


encoded_train = encode_dataset(tokenizer, df_train["words"], 60)
encoded_train["input_ids"]

encoded_train["attention_mask"]

encoded_valid = encode_dataset(tokenizer, df_valid["words"], 60)
# encoded_test = encode_dataset(tokenizer, df_test["words"], 90)

"""### Encoding the Intent and Slot Targets

To do so we build simple mappings from the label columns:
"""

seq_out_tokenizer = Tokenizer(filters='!"#$%&()*+,/:;<=>?@[\\]^`{|}~\t\n',
                              oov_token="UNK", lower=False)

## Preprocessing: fill missing values with the "O" (outside) label
df_train.fillna('O', inplace=True)
df_valid.fillna('O', inplace=True)

seq_out_tokenizer.fit_on_texts(df_train["word_labels"].tolist())
seq_out_tokenizer.fit_on_texts(df_valid["word_labels"].tolist())

seq_out_word_to_index = seq_out_tokenizer.word_index
len(seq_out_word_to_index)

seq_out_word_to_index

intent_names = set(df_train.intent_label)
intent_map = dict((label, idx) for idx, label in enumerate(intent_names))
intent_map

intent_train = df_train["intent_label"].map(intent_map).values
intent_valid = df_valid["intent_label"].map(intent_map).values

"""### Loading and Feeding a Pretrained BERT Model

Loading a pretrained multilingual BERT model using the [huggingface transformers](https://github.com/huggingface/transformers) package:
"""

from transformers import TFAutoModel, AutoConfig

config = AutoConfig.from_pretrained("bert-base-multilingual-cased")
large_bert_model = TFAutoModel.from_pretrained("bert-base-multilingual-cased")

large_bert_model.summary()

encoded_valid

outputs = large_bert_model(encoded_valid)

len(outputs)

"""The **first output** of the BERT model is a tensor with shape `(batch_size, seq_len, output_dim)` which computes **features for each token in the input sequence**:"""

outputs[0].shape

"""The **second output** of the BERT model is a tensor with shape `(batch_size, output_dim)` which is the vector representation of the special token `[CLS]`. This vector is typically used as a **pooled representation for the sequence as a whole**."""

outputs[1].shape

"""#Mapping slots to the corresponding indexes"""

# Skip the "UNK" OOV entry and reserve index 0 for the "[PAD]" label.
slot_names = ["[PAD]"]
slot_names += list(seq_out_word_to_index.keys())[1:]
slot_names

slot_map = {}
for label in slot_names:
    slot_map[label] = len(slot_map)
slot_map
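"""One caveat with the intent mapping built above: `set(df_train.intent_label)` has no guaranteed ordering, so the label-to-index assignment can change between runs. An optional sketch (the name `stable_intent_map` is just for illustration, not part of the original pipeline) that sorts the label names first to make the mapping reproducible:"""

# Optional: a run-to-run stable intent mapping built from sorted label names.
stable_intent_map = {label: idx
                     for idx, label in enumerate(sorted(set(df_train.intent_label)))}
stable_intent_map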
"""The following function generates token-aligned integer labels from the BIO word-level annotations. In particular, if a word is split into several subword tokens, we expand its label to all the tokens of that word, taking care to use the "B-" label only for the first token and the matching "I-" label for the subsequent tokens of the same word:"""


def encode_token_labels(text_sequences, slot_names, tokenizer, slot_map, max_length):
    encoded = np.zeros(shape=(len(text_sequences), max_length), dtype=np.int32)
    for i, (text_sequence, word_labels) in enumerate(
            zip(text_sequences, slot_names)):
        encoded_labels = []
        for word, word_label in zip(text_sequence.split(), word_labels.split()):
            tokens = tokenizer.tokenize(word)
            encoded_labels.append(slot_map[word_label])
            expand_label = word_label.replace("B-", "I-")
            if expand_label not in slot_map:
                expand_label = word_label
            encoded_labels.extend([slot_map[expand_label]] * (len(tokens) - 1))
        # Position 0 is left as 0 for the [CLS] token.
        encoded[i, 1:len(encoded_labels) + 1] = encoded_labels
    return encoded


slot_train = encode_token_labels(
    df_train["words"], df_train["word_labels"], tokenizer, slot_map, 60)
slot_valid = encode_token_labels(
    df_valid["words"], df_valid["word_labels"], tokenizer, slot_map, 60)
# slot_test = encode_token_labels(
#     df_test["words"], df_test["word_labels"], tokenizer, slot_map, 90)

######### Debugging: inspect a few encoded label rows
slot_train[0]

slot_valid[0]

slot_map['O']

"""Note that the special tokens such as "[PAD]" and "[SEP]" and all padded positions receive a 0 label."""
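"""To make the alignment rule concrete, here is a small illustrative sketch (the word and slot label below are made up and may not appear in this dataset; the exact split depends on the mBERT vocabulary): a word that splits into several WordPiece tokens keeps its "B-" label on the first piece and the corresponding "I-" label on the remaining pieces."""

# Illustration only: hypothetical word / label pair, not taken from the data.
example_word = "ninapenda"
example_label = "B-location"
pieces = tokenizer.tokenize(example_word)
aligned = [example_label] + [example_label.replace("B-", "I-")] * (len(pieces) - 1)
print(list(zip(pieces, aligned)))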
"""#Joint Intent And Slot Filling Model"""


class JointIntentAndSlotFillingModel(tf.keras.Model):

    def __init__(self, intent_num_labels=None, slot_num_labels=None,
                 model_name="bert-base-multilingual-cased", dropout_prob=0.1):
        super().__init__(name="joint_intent_slot")
        self.bert = TFAutoModel.from_pretrained(model_name)
        self.dropout = Dropout(dropout_prob)
        self.intent_classifier = Dense(intent_num_labels,
                                       name="intent_classifier")
        self.slot_classifier = Dense(slot_num_labels,
                                     name="slot_classifier")

    def call(self, inputs, training=False):
        sequence_output, pooled_output = self.bert(inputs, training=training)

        # The first output of the main BERT layer has shape:
        # (batch_size, max_length, output_dim)
        sequence_output = self.dropout(sequence_output, training=training)
        slot_logits = self.slot_classifier(sequence_output)

        # The second output of the main BERT layer has shape:
        # (batch_size, output_dim)
        # and gives a "pooled" representation of the full sequence from the
        # hidden state that corresponds to the "[CLS]" token.
        pooled_output = self.dropout(pooled_output, training=training)
        intent_logits = self.intent_classifier(pooled_output)

        return slot_logits, intent_logits


joint_model = JointIntentAndSlotFillingModel(
    intent_num_labels=len(intent_map), slot_num_labels=len(slot_map))

# joint_model.compile(optimizer=Adam(learning_rate=3e-5, epsilon=1e-08), loss=losses)

opt = Adam(learning_rate=3e-5, epsilon=1e-08)
losses = [SparseCategoricalCrossentropy(from_logits=True),
          SparseCategoricalCrossentropy(from_logits=True)]
metrics = [SparseCategoricalAccuracy('accuracy')]
joint_model.compile(optimizer=opt, loss=losses, metrics=metrics)

#########
# checkpoint_path = "/content/drive/MyDrive/JP Morgan/Data/model_ester_swa/cp.ckpt"
# checkpoint_dir = os.path.dirname(checkpoint_path)

# # Create a callback that saves the model's weights
# cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
#                                                  save_weights_only=True,
#                                                  verbose=1)

#### Loading model weights for training ################
# joint_model.load_weights('/content/drive/MyDrive/JP Morgan/Data/model_ester_swa/cp.ckpt')

history = joint_model.fit(
    encoded_train, (slot_train, intent_train),
    validation_data=(encoded_valid, (slot_valid, intent_valid)),
    epochs=15, batch_size=8)

"""### Save the model"""

# model = joint_model
# model.save("/content/drive/MyDrive/JP Morgan/Data/model_ester_swa/best_working")
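"""This notebook pins `transformers==2.11.0`, where the BERT layer returns a plain `(sequence_output, pooled_output)` tuple, which is what `call` above unpacks. If the notebook is updated to a recent `transformers` release (see the TODO at the top), the layer returns an output object instead, so the unpacking would need to change. A hedged sketch of what that could look like, defined here but not called, since it only applies after upgrading:"""

def call_with_recent_transformers(model, inputs, training=False):
    # Sketch only: equivalent of JointIntentAndSlotFillingModel.call for
    # transformers >= 4.x, where self.bert(...) returns an output object
    # exposing .last_hidden_state and .pooler_output instead of a tuple.
    bert_output = model.bert(inputs, training=training)
    sequence_output = model.dropout(bert_output.last_hidden_state, training=training)
    slot_logits = model.slot_classifier(sequence_output)
    pooled_output = model.dropout(bert_output.pooler_output, training=training)
    intent_logits = model.intent_classifier(pooled_output)
    return slot_logits, intent_logits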
"""#Making a prediction on a single text sequence and displaying both the sequence-wise intent and the token-wise slot labels"""


def show_predictions(text, tokenizer, model, intent_names, slot_names):
    inputs = tf.constant(tokenizer.encode(text))[None, :]  # batch_size = 1
    outputs = model(inputs)
    slot_logits, intent_logits = outputs
    # Drop the [CLS] and [SEP] positions before decoding the slot labels.
    slot_ids = slot_logits.numpy().argmax(axis=-1)[0, 1:-1]
    intent_id = intent_logits.numpy().argmax(axis=-1)[0]
    intent = [k for k, v in intent_names.items() if v == intent_id]

    slot_pred = []
    text_split = text.split(' ')
    tokens_ids = [(i, j) for i, j in zip(tokenizer.tokenize(text), slot_ids)]
    # Drop "##" continuation tokens so that one prediction remains per word.
    tokens_ids_ = [x for x in tokens_ids if '##' not in x[0]]
    for token, slot_id in zip(text_split, tokens_ids_):
        slot_pred.append(slot_names[slot_id[1]])

    return intent, slot_pred


# slot_names

df_valid.iloc[55]["words"], df_valid.iloc[55]["word_labels"], df_valid.iloc[55]["intent_label"]

pred_intent, pred_slot = show_predictions(df_valid.iloc[55]["words"], tokenizer,
                                          joint_model, intent_map, slot_names)

pred_intent, pred_slot

"""#Evaluating intent and slot predictions on the validation set"""

slot_avg_f1_score = []
slot_avg_accuracy_score = []
intent_avg_f1_score = []
intent_avg_accuracy_score = []

for idx in range(len(df_valid)):
    try:
        pred_intent, pred_slot = show_predictions(df_valid.iloc[idx]["words"],
                                                  tokenizer, joint_model,
                                                  intent_map, slot_names)
        true_label_slot = df_valid.iloc[idx]["word_labels"].split()
        true_label_intent = df_valid.iloc[idx]["intent_label"].split()

        f1_slot = f1_score(true_label_slot, pred_slot, average="weighted")
        slot_accuracy_score = accuracy_score(true_label_slot, pred_slot)
        f1_intent = f1_score(true_label_intent, pred_intent, average="weighted")
        intent_accuracy_score = accuracy_score(true_label_intent, pred_intent)

        slot_avg_f1_score.append(f1_slot)
        slot_avg_accuracy_score.append(slot_accuracy_score)
        intent_avg_f1_score.append(f1_intent)
        intent_avg_accuracy_score.append(intent_accuracy_score)
    except Exception:
        # Skip examples where the predicted and gold slot sequences do not
        # have the same length (subword / whitespace mismatches).
        pass

print("F1 Score: ", np.mean(slot_avg_f1_score),
      "\nAccuracy:", np.mean(slot_avg_accuracy_score))

print("F1 Score Intent: ", np.mean(intent_avg_f1_score),
      "\nAccuracy Intent:", np.mean(intent_avg_accuracy_score))

# text = st.text_area("Please enter a text")

# if(text):
#     output = show_predictions(text, tokenizer, joint_model, intent_map, slot_names)
#     st.json(output)
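"""Finally, `classification_report` from `seqeval` is imported at the top but never used. An optional sketch of how it could give an entity-level, per-slot breakdown on the validation set (it expects lists of label sequences, so only examples where the predicted and gold sequences line up are kept):"""

# Optional sketch: entity-level slot evaluation with seqeval.
y_true_slots, y_pred_slots = [], []
for idx in range(len(df_valid)):
    try:
        _, pred_slot = show_predictions(df_valid.iloc[idx]["words"], tokenizer,
                                        joint_model, intent_map, slot_names)
        true_slot = df_valid.iloc[idx]["word_labels"].split()
        if len(true_slot) == len(pred_slot):
            y_true_slots.append(true_slot)
            y_pred_slots.append(pred_slot)
    except Exception:
        pass

print(classification_report(y_true_slots, y_pred_slots))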