# -*- coding: utf-8 -*-
"""
Created on Thu Feb 8 20:22:57 2024

@author: Dhrumit Patel
"""

"""
Milestone Project 2: SkimLit

The purpose is to build an NLP model to make reading medical abstracts easier.
"""

# Check for GPU?
# !nvidia-smi
# !nvidia-smi -L

"""
Get the data

Since we will be replicating the paper (PubMed 200K RCT), let's download the dataset they used.

We can do so from author's github

git clone https://github.com/Franck-Dernoncourt/pubmed-rct
dir pubmed-rct

# Check what files are in the PubMed_20K dataset
cd pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign
dir

Contains 3 files dev.txt, test.txt, train.txt
"""

# Start our experiments using the 20k dataset with numbers replaced by "@" sign
data_dir = "pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/"

# Check all the filenames in the target directory
import os
filenames = [data_dir + filename for filename in os.listdir(data_dir)]
filenames

"""
Preprocess the data
"""

# Create a function to read the lines of a document
def get_lines(filename):
    """
    Read a text file and return its lines as a list.

    Args:
        filename (str): path of the text file to read.

    Returns:
        list[str]: one string per line of the file (trailing newlines are kept).
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(filename, "r", encoding="utf-8") as f:
        return f.readlines()

# Let's read in the training lines
train_lines = get_lines(filename=data_dir + "train.txt")  # read the lines within the training file
train_lines[:20]

len(train_lines)


def _abstract_to_samples(abstract_lines):
    """
    Convert the raw labelled lines of ONE abstract into per-sentence dictionaries.

    Args:
        abstract_lines (list[str]): raw "LABEL<TAB>sentence" lines of a single abstract.

    Returns:
        list[dict]: one dictionary per sentence with the keys "target", "text",
        "line_number" and "total_lines" (in that order).
    """
    abstract_line_split = "".join(abstract_lines).splitlines()
    total_lines = len(abstract_line_split) - 1  # zero-based, so it equals the last line_number
    samples = []
    for abstract_line_number, abstract_line in enumerate(abstract_line_split):
        target_text_split = abstract_line.split("\t")  # split the target label from the text
        samples.append({
            "target": target_text_split[0],           # the label
            "text": target_text_split[1].lower(),     # the sentence, lower-cased
            "line_number": abstract_line_number,      # position of the sentence in the abstract
            "total_lines": total_lines,
        })
    return samples


# Let's write a function to preprocess our data as above (List of dictionaries)
def preprocess_text_with_line_numbers(filename):
    """
    Return a list of dictionaries of abstract line data.

    Reads filename and walks through its lines. A line starting with "###"
    opens a new abstract, a whitespace-only line closes the current abstract,
    and every other line is treated as a tab-separated "LABEL<TAB>sentence"
    entry of the current abstract.

    Args:
        filename (str): path of the text file to parse.

    Returns:
        list[dict]: one dictionary per sentence with the keys
            "target"      - the label in front of the tab,
            "text"        - the lower-cased sentence,
            "line_number" - zero-based position of the sentence in its abstract,
            "total_lines" - number of sentences in the abstract minus one.

    Raises:
        IndexError: if a labelled line does not contain a tab.
    """
    input_lines = get_lines(filename)  # get all lines from filename
    abstract_samples = []  # per-sentence dictionaries of every abstract
    abstract_lines = []    # raw labelled lines of the abstract currently being read

    for line in input_lines:
        if line.startswith("###"):  # ID line: a new abstract starts
            abstract_lines = []
        elif line.isspace():  # blank line: the current abstract is complete
            abstract_samples.extend(_abstract_to_samples(abstract_lines))
            # Clear the buffer so a second blank line (or the end-of-file
            # flush below) cannot emit the same abstract twice.
            abstract_lines = []
        else:  # anything else is a labelled sentence
            abstract_lines.append(line)

    # A file that does not end with a blank line would otherwise lose its last abstract.
    if abstract_lines:
        abstract_samples.extend(_abstract_to_samples(abstract_lines))

    return abstract_samples

# Get data from file and preprocess it
train_samples = preprocess_text_with_line_numbers(filename = data_dir + "train.txt")
val_samples = preprocess_text_with_line_numbers(filename = data_dir + "dev.txt")  # dev is another name for validation dataset
test_samples = preprocess_text_with_line_numbers(filename = data_dir + "test.txt")

len(train_samples), len(val_samples), len(test_samples)

# Check the first abstract of our training data
train_samples[:14]

"""
Now that our data is in the format of a list of dictionaries,
How about we turn it into a DataFrame to further visualize it?
"""
import pandas as pd
train_df = pd.DataFrame(train_samples)
val_df = pd.DataFrame(val_samples)
test_df = pd.DataFrame(test_samples)

train_df[:14]

# Distribution of labels in training data
train_df["target"].value_counts()

# Let's check length of different lines (Number of sentences per abstract (X-axis) vs Number of occurrences (Y-axis))
train_df["total_lines"].plot.hist()

"""
Get list of sentences
"""
# Convert abstract text lines into lists
train_sentences = train_df["text"].tolist()
val_sentences = val_df["text"].tolist()
test_sentences = test_df["text"].tolist()

len(train_sentences), len(val_sentences), len(test_sentences)

# View the first 10 lines of training sentences
train_sentences[:10]

"""
Making numeric labels (ML models require numeric labels)
"""
# One hot encode labels
from sklearn.preprocessing import OneHotEncoder
# We want a dense (non-sparse) matrix. The keyword was renamed from `sparse`
# to `sparse_output` in scikit-learn 1.2 and the old name was later removed,
# so try the current spelling first and fall back for older installations.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2
    one_hot_encoder = OneHotEncoder(sparse=False)
train_labels_one_hot = one_hot_encoder.fit_transform(train_df["target"].to_numpy().reshape(-1, 1))
val_labels_one_hot = one_hot_encoder.transform(val_df["target"].to_numpy().reshape(-1, 1))
test_labels_one_hot = one_hot_encoder.transform(test_df["target"].to_numpy().reshape(-1, 1))

# Check what one hot encoded labels look like
train_labels_one_hot, val_labels_one_hot, test_labels_one_hot

"""
Label encode labels
"""
# Extract labels ("target" columns) and encode them into integers
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_df["target"].to_numpy())
val_labels_encoded = label_encoder.transform(val_df["target"].to_numpy())
test_labels_encoded = label_encoder.transform(test_df["target"].to_numpy())

# Check what label encoded labels look like
train_labels_encoded, val_labels_encoded, test_labels_encoded

# Get class names and number of classes from LabelEncoder instance
num_classes = len(label_encoder.classes_)
class_names = label_encoder.classes_
num_classes, class_names

"""
Starting a series of Modelling experiments
"""

"""
Model 0: Getting a baseline model (TF-IDF Multinomial Naive Bayes Classifier)
"""
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create a pipeline
model_0 = Pipeline([
    ("tf-idf", TfidfVectorizer()),
    ("clf", MultinomialNB())
    ])

# Fit the pipeline on the training data
model_0.fit(train_sentences, train_labels_encoded)

# Evaluate baseline model on validation dataset
model_0.score(val_sentences, val_labels_encoded)

# Make predictions using our baseline model
baseline_preds = model_0.predict(val_sentences)
baseline_preds

"""
For classification evaluation metrics (accuracy, precision, recall, f1-score)
"""
from helper_functions import calculate_results

# Calculate baseline results
baseline_results = calculate_results(y_true=val_labels_encoded, y_pred=baseline_preds)
baseline_results

train_sentences[:10]

"""
Preparing our data (the text) for deep sequence model

Before we start builidng deeper models, we had got to create vectorization and embedding layers
"""
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

# Number of whitespace-separated tokens in every training sentence, and their mean
sent_lens = [len(text.split()) for text in train_sentences]
avg_sent_len = np.mean(sent_lens)
avg_sent_len

# Histogram of the sentence lengths
import matplotlib.pyplot as plt
plt.hist(sent_lens, bins=20)

# Sentence length (in tokens) at the 95th percentile of the training set
output_seq_length = int(np.percentile(sent_lens, 95))
output_seq_length

# Longest sentence in the training set
max(sent_lens)

"""
Create a TextVectorizer layer

We want to make a layer which maps our texts from words to numbers
"""
# Vocabulary size, taken from Table 2 of the paper
max_tokens = 68000

# Build the text vectorizer
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
text_vectorizer = TextVectorization(
    max_tokens=max_tokens,                     # number of words in the vocabulary
    output_sequence_length=output_seq_length,  # desired length of each vectorized sequence
)

# Learn the vocabulary from the training sentences
text_vectorizer.adapt(train_sentences)

# Inspect the learned vocabulary
rct_20k_text_vocab = text_vectorizer.get_vocabulary()
print(f"Number of words in vocab: {len(rct_20k_text_vocab)}")
print(f"Most common words in the vocab: {rct_20k_text_vocab[:5]}")
print(f"Least common words in the vocab: {rct_20k_text_vocab[-5:]}")

# Configuration of the text vectorizer
text_vectorizer.get_config()

from keras import layers

"""
Create a custom text embedding layer
"""
token_embed = layers.Embedding(
    input_dim=len(rct_20k_text_vocab),
    output_dim=128,   # note: the embedding size strongly affects the number of trainable parameters
    mask_zero=True,   # mask index 0 so variable-length sequences are handled
    name="token_embedding",
)

"""
Creating datasets (making sure our data loads as fast as possible)

We are going to setup our data to run as fast as poccible with TensorFlow tf.data API.
"""
# Wrap every (sentences, one-hot labels) pair in a tf.data.Dataset
train_dataset, valid_dataset, test_dataset = (
    tf.data.Dataset.from_tensor_slices(pair)
    for pair in (
        (train_sentences, train_labels_one_hot),
        (val_sentences, val_labels_one_hot),
        (test_sentences, test_labels_one_hot),
    )
)

train_dataset

# Batch (32 samples per batch) and prefetch each dataset
train_dataset, valid_dataset, test_dataset = (
    dataset.batch(32).prefetch(tf.data.AUTOTUNE)
    for dataset in (train_dataset, valid_dataset, test_dataset)
)

train_dataset, len(train_dataset)

"""
Model 1: Conv1D with token embeddings
"""
# 1D convolutional model over token embeddings
inputs = layers.Input(shape=(1,), dtype=tf.string)
text_vectors = text_vectorizer(inputs)        # strings -> token ids
token_embeddings = token_embed(text_vectors)  # token ids -> embeddings
x = layers.Conv1D(
    64,
    kernel_size=5,
    padding="same",
    activation="relu",
)(token_embeddings)
x = layers.GlobalAveragePooling1D()(x)  # condense the Conv1D feature map
outputs = layers.Dense(num_classes, activation="softmax")(x)
model_1 = tf.keras.Model(inputs, outputs)

# Compile
model_1.compile(
    loss="categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

model_1.summary()

# Train on 10% of the training batches per epoch and validate on 10% of the
# validation batches (to speed up the experiment)
history_model_1 = model_1.fit(
    train_dataset,
    epochs=3,
    steps_per_epoch=int(0.1 * len(train_dataset)),
    validation_data=valid_dataset,
    validation_steps=int(0.1 * len(valid_dataset)),
)

# Evaluate on the whole validation dataset
model_1.evaluate(valid_dataset)

# Class probabilities for every validation sample
model_1_pred_probs = model_1.predict(valid_dataset)
model_1_pred_probs, model_1_pred_probs.shape

# Probabilities -> class indices
model_1_preds = tf.argmax(model_1_pred_probs, axis=1)
model_1_preds
class_names
class_names[model_1_preds]

# Metrics of model_1 on the validation labels
model_1_results = calculate_results(
    y_true=val_labels_encoded,
    y_pred=model_1_preds,
)
model_1_results

"""
Model 2: Feature extraction with pretrained token embeddings

Now let's use pretrained word embeddings from TensorFlow Hub, more sepcifically the universal sentence encoder

The paper used originally used GloVe embeddings, however we are going to stick with the later created USE pretrained embeddings.
"""
# Load the pretrained Universal Sentence Encoder as a frozen Keras layer
import tensorflow_hub as hub
tf_hub_embedding_layer = hub.KerasLayer(
    "https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/universal-sentence-encoder/versions/2",
    trainable=False,
    name="universal_sentence_encoder",
)

"""
Building and fitting an NLP feature extraction model using pretrained embeddings TensorFlow Hub
"""
# Feature-extraction model on top of the TF Hub layer
inputs = layers.Input(shape=[], dtype=tf.string)
pretrained_embedding = tf_hub_embedding_layer(inputs)  # tokenize text and embed each sequence (512 long vector)
x = layers.Dense(128, activation="relu")(pretrained_embedding)  # more layers could be added here
outputs = layers.Dense(num_classes, activation="softmax")(x)    # output layer
model_2 = tf.keras.Model(inputs, outputs, name="model_2_USE_feature_extractor")

# Compile
model_2.compile(
    loss="categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

model_2.summary()

# Fit, evaluate and predict inside one CPU device scope
with tf.device('/CPU:0'):
    # Train on 10% of the batches per epoch
    history_model_2 = model_2.fit(
        train_dataset,
        epochs=3,
        steps_per_epoch=int(0.1 * len(train_dataset)),
        validation_data=valid_dataset,
        validation_steps=int(0.1 * len(valid_dataset)),
    )

    # Evaluate on the whole validation dataset
    model_2.evaluate(valid_dataset)

    # Class probabilities from the feature-extraction model
    model_2_pred_probs = model_2.predict(valid_dataset)
model_2_pred_probs, model_2_pred_probs.shape

# Probabilities of the feature-extraction model -> class indices
model_2_preds = tf.argmax(model_2_pred_probs, axis=1)
model_2_preds
class_names[model_2_preds]

# Metrics of the TF Hub pretrained-embedding model on the validation set
model_2_results = calculate_results(
    y_true=val_labels_encoded,
    y_pred=model_2_preds,
)
model_2_results

"""
Model 3: Conv1D with character embeddings

The paper which we are replicating states they used a combination of token and charcter level embeddings.

Previously, we have token level embeddings but we will need to do similar steps for characters if we want to use char-level embeddings.
"""

"""
Creating a charceter-level tokenizer
"""
train_sentences[:5]

# Helper that turns a sentence into space-separated characters
def split_chars(text):
    """Return *text* with a single space inserted between consecutive characters."""
    return " ".join(text)

# Character-level version of every data split
train_chars = list(map(split_chars, train_sentences))
val_chars = list(map(split_chars, val_sentences))
test_chars = list(map(split_chars, test_sentences))

train_chars, val_chars, test_chars

# What's the average character length?
char_lens = list(map(len, train_sentences))
mean_char_len = np.mean(char_lens)
mean_char_len

# Histogram of the sentence lengths measured in characters
import matplotlib.pyplot as plt
plt.hist(char_lens, bins=7)

# Character length at the 95th percentile of the training sentences
output_seq_char_len = int(np.percentile(char_lens, 95))
output_seq_char_len

# Lower-case letters, digits and punctuation
import string
alphabet = "".join((string.ascii_lowercase, string.digits, string.punctuation))
alphabet

len(alphabet)

# Character-level vectorizer; the +2 accounts for the space and the OOV
# (out of vocab, '[UNK]') token
NUM_CHAR_TOKENS = len(alphabet) + 2
char_vectorizer = TextVectorization(
    max_tokens=NUM_CHAR_TOKENS,
    output_sequence_length=output_seq_char_len,
    standardize="lower_and_strip_punctuation",  # the default standardization
    name="char_vectorizer",
)

# Learn the character vocabulary from the training characters
char_vectorizer.adapt(train_chars)

# Character vocabulary statistics
char_vocab = char_vectorizer.get_vocabulary()
print(f"Number of different characters in character vocab: {len(char_vocab)}")
print(f"5 most common character: {char_vocab[:5]}")
print(f"5 least common characters: {char_vocab[-5:]}")

"""
Creating a character-level embedding
"""
char_embed = layers.Embedding(
    input_dim=len(char_vocab),  # number of different characters
    output_dim=25,              # size of the char embedding in the paper
    mask_zero=True,
    name="char_embed",
)

"""
Model 3: Building a Conv1D model to fit on character embeddings
"""
# Conv1D model that only sees characters
inputs = layers.Input(shape=(1,), dtype="string")
char_vectors = char_vectorizer(inputs)
char_embeddings = char_embed(char_vectors)
x = layers.Conv1D(
    64,
    kernel_size=5,
    padding="same",
    activation="relu",
)(char_embeddings)
x = layers.GlobalMaxPool1D()(x)
outputs = layers.Dense(num_classes, activation="softmax")(x)
model_3 = tf.keras.Model(inputs, outputs, name="model_3_conv1d_char_embeddings")

# Compile the model
model_3.compile(
    loss="categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

model_3.summary()

# Character-level datasets: slice, batch (32) and prefetch
train_char_dataset = (
    tf.data.Dataset.from_tensor_slices((train_chars, train_labels_one_hot))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)
val_char_dataset = (
    tf.data.Dataset.from_tensor_slices((val_chars, val_labels_one_hot))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)
test_char_dataset = (
    tf.data.Dataset.from_tensor_slices((test_chars, test_labels_one_hot))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

train_char_dataset, val_char_dataset, test_char_dataset

# Train the characters-only model on 10% of the batches per epoch
model_3_history = model_3.fit(
    train_char_dataset,
    epochs=3,
    steps_per_epoch=int(0.1 * len(train_char_dataset)),
    validation_data=val_char_dataset,
    validation_steps=int(0.1 * len(val_char_dataset)),
)

# Evaluate model_3 on the whole validation set
model_3.evaluate(val_char_dataset)

# Class probabilities from the characters-only model
model_3_pred_probs = model_3.predict(val_char_dataset)
model_3_pred_probs, model_3_pred_probs.shape

# Probabilities -> class indices
model_3_preds = tf.argmax(model_3_pred_probs, axis=1)
model_3_preds
class_names[model_3_preds]

# Metrics of the characters-only Conv1D model
model_3_results = calculate_results(
    y_true=val_labels_encoded,
    y_pred=model_3_preds,
)
model_3_results

baseline_results

"""
Model 4: Combining pretrained token embeddings + characters embeddings (hybrid embedding layer)

1. Create a token level embedding model (similar to model_1)
2. Create a character level model (similar to model_3 with a slight modification)
3. Combine 1 & 2 with a concatenate (layers.Concatenate)
4. Build a series of output layer on top point 3.
5. Construct a model which takes token and character level sequences as input and produces sequence label probabilities as output.
"""
# 1.
# Setup token inputs/model
token_inputs = layers.Input(shape=[], dtype=tf.string, name="token_inputs")
token_embeddings = tf_hub_embedding_layer(token_inputs)
token_outputs = layers.Dense(128, activation="relu")(token_embeddings)
token_model = tf.keras.Model(
    inputs=token_inputs,
    outputs=token_outputs,
)

# 2. Character branch: vectorize -> embed -> bi-LSTM (as given in the paper)
char_inputs = layers.Input(shape=(1,), dtype=tf.string, name="char_input")
char_vectors = char_vectorizer(char_inputs)
char_embeddings = char_embed(char_vectors)
char_bi_lstm = layers.Bidirectional(layers.LSTM(24))(char_embeddings)
char_model = tf.keras.Model(
    inputs=char_inputs,
    outputs=char_bi_lstm,
)

# 3. Hybrid token embedding: concatenate the outputs of both branches
token_char_concat = layers.Concatenate(name="token_char_hybrid")(
    [token_model.output, char_model.output]
)

# 4. Output head with dropout (according to the paper)
combined_dropout = layers.Dropout(0.5)(token_char_concat)
combined_dense = layers.Dense(128, activation="relu")(combined_dropout)
final_dropout = layers.Dropout(0.5)(combined_dense)
output_layer = layers.Dense(num_classes, activation="softmax")(final_dropout)

# 5.
# Construct model with char and token inputs
model_4 = tf.keras.Model(
    inputs=[token_model.input, char_model.input],
    outputs=output_layer,
    name="model_4_token_and_char_embeddings",
)

# Summary of the hybrid model
model_4.summary()

# Diagram of the hybrid token and character model
from keras.utils import plot_model
plot_model(model_4, show_shapes=True)

# Compile the token + char model (the paper says SGD; Adam is used here)
model_4.compile(
    loss="categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

"""
Combining token and character data into tf.data.Dataset
"""
# Training set: (sentences, characters) features zipped with the one-hot labels
train_char_token_data = tf.data.Dataset.from_tensor_slices((train_sentences, train_chars))
train_char_token_labels = tf.data.Dataset.from_tensor_slices(train_labels_one_hot)
train_char_token_dataset = tf.data.Dataset.zip(
    (train_char_token_data, train_char_token_labels)
)

# Batch and prefetch the training data
train_char_token_dataset = train_char_token_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

# Validation set, built the same way
val_char_token_data = tf.data.Dataset.from_tensor_slices((val_sentences, val_chars))
val_char_token_labels = tf.data.Dataset.from_tensor_slices(val_labels_one_hot)
val_char_token_dataset = tf.data.Dataset.zip(
    (val_char_token_data, val_char_token_labels)
)

# Batch and prefetch the validation data
val_char_token_dataset = val_char_token_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

# Inspect both datasets
train_char_token_dataset, val_char_token_dataset

# Fit and evaluate inside one CPU device scope
with tf.device('/CPU:0'):
    # Train on token and character-level sequences (10% of the batches per epoch)
    history_model_4 = model_4.fit(
        train_char_token_dataset,
        epochs=3,
        steps_per_epoch=int(0.1 * len(train_char_token_dataset)),
        validation_data=val_char_token_dataset,
        validation_steps=int(0.1 * len(val_char_token_dataset)),
    )

    # Evaluate on the whole validation dataset
    model_4.evaluate(val_char_token_dataset)

# Make predictions using the
# token-character model hybrid.
# The prediction is pinned to the CPU as well, so it runs on the same device
# as this model's fit/evaluate calls (and as model_2's predict call).
with tf.device('/CPU:0'):
    model_4_pred_probs = model_4.predict(val_char_token_dataset)
model_4_pred_probs, model_4_pred_probs.shape

# Convert the prediction probabilities to class indices
model_4_preds = tf.argmax(model_4_pred_probs, axis=1)
model_4_preds
class_names[model_4_preds]

# Get results of token char hybrid model
model_4_results = calculate_results(y_true=val_labels_encoded, y_pred=model_4_preds)
model_4_results

"""
Model 5: Transfer learning with pretrained token embeddings + character embeddings + positional embeddings
"""
train_df.head()

"""
Create positional embeddings
"""
# How many different line numbers are there?
train_df["line_number"].value_counts()

# Check the distribution of "line_number" column
train_df["line_number"].plot.hist()

# Use TensorFlow to create one-hot encoded tensors of our "line_number" column
train_line_numbers_one_hot = tf.one_hot(train_df["line_number"].to_numpy(), depth=15)
val_line_numbers_one_hot = tf.one_hot(val_df["line_number"].to_numpy(), depth=15)
test_line_numbers_one_hot = tf.one_hot(test_df["line_number"].to_numpy(), depth=15)

train_line_numbers_one_hot[:10], train_line_numbers_one_hot.shape
train_line_numbers_one_hot[0].shape
train_line_numbers_one_hot[0].dtype

# How many different numbers of lines are there?
train_df["total_lines"].value_counts()

# Check the distribution of "total_lines" column
train_df["total_lines"].plot.hist()

# Check the coverage of "total_lines": which value lies at the 98th percentile of the abstracts?
np.percentile(train_df["total_lines"], 98)

# One-hot encode the "total_lines" column of every split (depth 20)
train_total_lines_one_hot = tf.one_hot(
    train_df["total_lines"].to_numpy(), depth=20
)
val_total_lines_one_hot = tf.one_hot(
    val_df["total_lines"].to_numpy(), depth=20
)
test_total_lines_one_hot = tf.one_hot(
    test_df["total_lines"].to_numpy(), depth=20
)

train_total_lines_one_hot[:10], train_total_lines_one_hot.shape
train_total_lines_one_hot[0].shape
train_total_lines_one_hot[0].dtype

"""
Building a tribrid embedding model

1. Create a token-level model
2. Create a character-level model
3. Create a model for the "line_number" feature
4. Create a model for the "total_lines" feature
5. Combine the outputs of 1 & 2 using tf.keras.layers.Concatenate
6. Combine the outputs of 3,4,5 using tf.keras.layers.Concatenate
7. Create an output layer to accept the tribrid embedding and output label probabilities.
8. Combine the inputs of 1,2,3,4 and outputs of 7 into tf.keras.Model
"""
# 1. Token branch: pretrained sentence embedding followed by a dense layer
token_inputs = layers.Input(shape=[], dtype="string", name="token_inputs")
token_embeddings = tf_hub_embedding_layer(token_inputs)
token_outputs = layers.Dense(128, activation="relu")(token_embeddings)
token_model = tf.keras.Model(
    inputs=token_inputs,
    outputs=token_outputs,
)

# 2. Character branch: vectorize -> embed -> bi-LSTM
char_inputs = layers.Input(shape=(1,), dtype="string", name="char_inputs")
char_vectors = char_vectorizer(char_inputs)
char_embeddings = char_embed(char_vectors)
char_bi_lstm = layers.Bidirectional(layers.LSTM(24))(char_embeddings)
char_model = tf.keras.Model(
    inputs=char_inputs,
    outputs=char_bi_lstm,
)

# 3. "line_number" branch: one-hot vector of length 15 through a dense layer
line_number_inputs = layers.Input(shape=(15,), dtype=tf.float32, name="line_number_input")
x = layers.Dense(32, activation="relu")(line_number_inputs)
line_number_model = tf.keras.Model(
    inputs=line_number_inputs,
    outputs=x,
)

# 4.
# Create a model for "total_lines" feature
total_lines_inputs = layers.Input(shape=(20,), dtype=tf.float32, name="total_lines_input")
y = layers.Dense(32, activation="relu")(total_lines_inputs)
total_lines_model = tf.keras.Model(inputs=total_lines_inputs, outputs=y)

# 5. Combine the outputs of token and char embeddings into a hybrid embedding
combined_embeddings = layers.Concatenate(name="char_token_hybrid_embedding")([token_model.output,
                                                                              char_model.output])
z = layers.Dense(256, activation="relu")(combined_embeddings)
z = layers.Dropout(0.5)(z)

# 6. Combine positional embedding with combined token and char embeddings
tribrid_embeddings = layers.Concatenate(name="char_token_positional_embedding")([line_number_model.output,
                                                                                 total_lines_model.output,
                                                                                 z])

# 7. Create output layer
output_layer = layers.Dense(num_classes, activation="softmax", name="output_layer")(tribrid_embeddings)

# 8. Put together a model with all kinds of inputs.
# The input order here must match the feature order of the datasets built below.
model_5 = tf.keras.Model(inputs=[line_number_model.input,
                                 total_lines_model.input,
                                 token_model.input,
                                 char_model.input],
                         outputs=output_layer,
                         name="model_5_tribrid_embedding_model")

# Get a summary of our tribrid model
model_5.summary()

from tensorflow.keras.utils import plot_model
plot_model(model_5, show_shapes=True)

# Compile token, char and positional embedding model
model_5.compile(loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.2),  # helps to prevent overfitting
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

"""
Create tribrid embeddings datasets using tf.data
"""
# Create training and validation datasets (with all 4 kinds of input data)
train_char_token_pos_data = tf.data.Dataset.from_tensor_slices((train_line_numbers_one_hot,
                                                                train_total_lines_one_hot,
                                                                train_sentences,
                                                                train_chars))
train_char_token_pos_labels = tf.data.Dataset.from_tensor_slices(train_labels_one_hot)
train_char_token_pos_dataset = tf.data.Dataset.zip((train_char_token_pos_data, train_char_token_pos_labels))
train_char_token_pos_dataset = train_char_token_pos_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

# Do the same as above for the validation dataset
val_char_token_pos_data = tf.data.Dataset.from_tensor_slices((val_line_numbers_one_hot,
                                                              val_total_lines_one_hot,
                                                              val_sentences,
                                                              val_chars))
val_char_token_pos_labels = tf.data.Dataset.from_tensor_slices(val_labels_one_hot)
val_char_token_pos_dataset = tf.data.Dataset.zip((val_char_token_pos_data, val_char_token_pos_labels))
val_char_token_pos_dataset = val_char_token_pos_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

# Check the input shapes
train_char_token_pos_dataset, val_char_token_pos_dataset

# Fit, evaluate and predict inside one CPU device scope, so the prediction
# runs on the same device as training and evaluation.
with tf.device('/CPU:0'):
    # Fit the model on 10% of the batches per epoch
    history_model_5 = model_5.fit(train_char_token_pos_dataset,
                                  epochs=3,
                                  steps_per_epoch=int(0.1 * len(train_char_token_pos_dataset)),
                                  validation_data=val_char_token_pos_dataset,
                                  validation_steps=int(0.1 * len(val_char_token_pos_dataset)))

    # Evaluate our model on whole validation dataset
    model_5.evaluate(val_char_token_pos_dataset)

    # Make predictions with the char token pos model
    model_5_pred_probs = model_5.predict(val_char_token_pos_dataset)

model_5_pred_probs, model_5_pred_probs.shape

# Convert prediction probabilities to the labels
model_5_preds = tf.argmax(model_5_pred_probs, axis=1)
model_5_preds
class_names[model_5_preds]

# Calculate results of char token pos model
model_5_results = calculate_results(y_true=val_labels_encoded, y_pred=model_5_preds)
model_5_results

"""
Compare model results
"""
# Combine model results into a dataframe (one column per model, transposed to one row per model)
all_model_results = pd.DataFrame({"model_0_baseline": baseline_results,
                                  "model_1_custom_token_embedding": model_1_results,
                                  "model_2_pretrained_token_embedding": model_2_results,
                                  "model_3_custom_char_embedding": model_3_results,
                                  "model_4_hybrid_char_token_embedding": model_4_results,
                                  "model_5_pos_char_token_embedding": model_5_results})
all_model_results = all_model_results.transpose()
all_model_results

# Reduce the accuracy to same scale as other
# metrics
all_model_results["accuracy"] = all_model_results["accuracy"].div(100)
all_model_results

# Bar chart comparing every metric of every model
all_model_results.plot(kind="bar", figsize=(10, 7)).legend(bbox_to_anchor=(1.0, 1.0))

# Bar chart of the f1 scores, sorted ascending
all_model_results.sort_values("f1", ascending=True)["f1"].plot(kind="bar", figsize=(10, 7))

"""
Save and load model
"""
# Save the tribrid model in the default SavedModel format
model_5.save("skimlit_tribrid_model_me")

# Reload it and predict on the validation set inside one CPU device scope
from keras.models import load_model
with tf.device('/CPU:0'):
    loaded_model = load_model("skimlit_tribrid_model_me")

    # Predictions of the reloaded model on the validation set
    loaded_pred_probs = loaded_model.predict(val_char_token_pos_dataset)

loaded_pred_probs, loaded_pred_probs.shape

# Probabilities -> class indices
loaded_preds = tf.argmax(loaded_pred_probs, axis=1)
loaded_preds
loaded_preds[:10]
class_names[loaded_preds]

# Metrics of the reloaded model
loaded_model_results = calculate_results(
    y_true=val_labels_encoded,
    y_pred=loaded_preds,
)
loaded_model_results

# Passes silently when the reloaded model reproduces the metrics of model_5
assert model_5_results == loaded_model_results

# Summary of the reloaded model
loaded_model.summary()

"""
Optional - for the loaded model you can use your own trained model
"""
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

import os
url = "https://drive.google.com/file/d/1DYr3Ew9tU6dph_fI0JeTZ6GbdzZpWr8K/view?usp=sharing"

# Load the model that was downloaded from the link above
loaded_gs_model = load_model("skimlit_tribrid_model")

# Evaluate the downloaded model
loaded_gs_model.evaluate(val_char_token_pos_dataset)

# NOTE(review): the predictions below are still derived from `loaded_pred_probs`,
# which was computed with `loaded_model`, not with `loaded_gs_model` - confirm
# that this is intended.
loaded_preds = tf.argmax(loaded_pred_probs, axis=1)
loaded_preds[:10]

# Metrics of those predictions
loaded_model_results = calculate_results(val_labels_encoded, loaded_preds)
loaded_model_results

# Check loaded model
# summary
loaded_model.summary()

# Test dataset: the four input features zipped with the one-hot labels,
# then batched (32) and prefetched
test_pos_char_token_data = tf.data.Dataset.from_tensor_slices((
    test_line_numbers_one_hot,
    test_total_lines_one_hot,
    test_sentences,
    test_chars,
))
test_pos_char_token_labels = tf.data.Dataset.from_tensor_slices(test_labels_one_hot)
test_pos_char_token_dataset = tf.data.Dataset.zip(
    (test_pos_char_token_data, test_pos_char_token_labels)
)
test_pos_char_token_dataset = test_pos_char_token_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

# Predict on the test dataset (on the CPU)
with tf.device('/CPU:0'):
    test_pred_probs = loaded_model.predict(test_pos_char_token_dataset, verbose=1)
test_preds = tf.argmax(test_pred_probs, axis=1)
test_preds[:10]

# Metrics of the loaded model on the test set
loaded_model_test_results = calculate_results(
    y_true=test_labels_encoded,
    y_pred=test_preds,
)
loaded_model_test_results

# Class name of every test prediction
test_pred_classes = [label_encoder.classes_[class_index] for class_index in test_preds]
test_pred_classes

# Prediction-enriched test dataframe:
#   "prediction" - predicted class name
#   "pred_prob"  - highest class probability of the prediction
#   "correct"    - True where the prediction equals the target
test_df["prediction"] = test_pred_classes
test_df["pred_prob"] = tf.reduce_max(test_pred_probs, axis=1).numpy()
test_df["correct"] = test_df["prediction"].eq(test_df["target"])

# First 20 rows of the enriched dataframe
test_df.head(20)

# The 100 wrong predictions with the highest probability (100 is an arbitrary
# number, all of them could be inspected)
wrong_predictions = test_df[~test_df["correct"]]
top_100_wrong = wrong_predictions.sort_values("pred_prob", ascending=False).head(100)
top_100_wrong

# Investigate top wrong predictions for rows in the top 100 wrong
# Investigate the first 10 of the top wrong predictions
for row in top_100_wrong[0:10].itertuples():
    # Access fields by name so the loop does not depend on the dataframe's column order
    print(f"Target: {row.target}, Pred: {row.prediction}, Prob: {row.pred_prob}, Line number: {row.line_number}, Total lines: {row.total_lines}\n")
    # Display the text associated with the prediction
    print(f"Text:\n{row.text}\n")
    # Separator for better readability
    print("-----------------------------------------------------------------------\n")

import json
import requests

# Download and open example abstracts (copy and pasted from PubMed).
# Use the raw-content URL: the github.com ".../blob/..." address serves an HTML
# page rather than the JSON file itself, so it cannot be parsed as JSON.
url = "https://raw.githubusercontent.com/Dhrumit1314/Skimlit_NLP/main/abstract_data.json"
response = requests.get(url, timeout=30)  # timeout so a stalled connection cannot hang the script

# Stop here if the download failed; everything below needs `example_abstracts`
if response.status_code != 200:
    raise RuntimeError(f"Failed to download example abstracts. Status code: {response.status_code}")

# Load the JSON data from the response
example_abstracts = json.loads(response.text)
print("Example abstracts loaded successfully.")

# See what our example abstracts look like
abstracts = pd.DataFrame(example_abstracts)
abstracts

# Import necessary library
from spacy.lang.en import English

# Setup English sentence parser with spaCy
nlp = English()

# Add the sentencizer to the spaCy pipeline
sentencizer = nlp.add_pipe("sentencizer")

# Example abstract from the loaded dataset
example_abstract = example_abstracts[0]["abstract"]
example_abstract

# Create a spaCy "doc" object by parsing the example abstract
doc = nlp(example_abstract)
doc

# Extract sentences from the spaCy doc and convert to string type
abstract_lines = [str(sent) for sent in doc.sents]

# Display the detected sentences from the abstract
abstract_lines

# Get the total number of lines in the sample
total_lines_in_sample = len(abstract_lines)

# Initialize an empty list to store dictionaries containing features for each line
sample_lines = []
import time

import tensorflow as tf

# Build one feature dictionary per sentence of the example abstract:
#   "text"        - the sentence itself
#   "line_number" - position of the sentence in the abstract (0-based)
#   "total_lines" - number of sentences minus 1 (same convention as the
#                   preprocessing of the training data)
sample_lines = [
    {
        "text": str(line),
        "line_number": i,
        "total_lines": total_lines_in_sample - 1,
    }
    for i, line in enumerate(abstract_lines)
]

# Display the list of dictionaries containing features for each line
sample_lines

# One-hot depths; these must match the depths used for the training data
# so the model accepts the right input shape
line_numbers_depth = 15
total_lines_depth = 20

# Get all line_number values from the sample abstract and one-hot encode them
test_abstract_line_numbers = [line["line_number"] for line in sample_lines]
test_abstract_line_numbers_one_hot = tf.one_hot(test_abstract_line_numbers,
                                                depth=line_numbers_depth)
test_abstract_line_numbers_one_hot

# Get all total_lines values from the sample abstract and one-hot encode them
test_abstract_total_lines = [line["total_lines"] for line in sample_lines]
test_abstract_total_lines_one_hot = tf.one_hot(test_abstract_total_lines,
                                               depth=total_lines_depth)
test_abstract_total_lines_one_hot

# Split abstract lines into characters
abstract_chars = [split_chars(sentence) for sentence in abstract_lines]
abstract_chars

# Token-level and character-level inputs as string tensors
test_abstract_abstract_lines = tf.constant(abstract_lines)
test_abstract_abstract_chars = tf.constant(abstract_chars)

# Make predictions on the sample abstract features and time the call
# (perf_counter is a monotonic clock, so the interval cannot be skewed by
# system clock adjustments)
start_time = time.perf_counter()
with tf.device('/CPU:0'):
    # Note - Here you can use loaded_model if you want
    test_abstract_pred_probs = model_5.predict(x=(test_abstract_line_numbers_one_hot,
                                                   test_abstract_total_lines_one_hot,
                                                   test_abstract_abstract_lines,
                                                   test_abstract_abstract_chars))
end_time = time.perf_counter()

# Display the prediction probabilities
print("Prediction Probabilities:", test_abstract_pred_probs)

# Display the time taken for predictions
print("Time taken for predictions: {:.2f} seconds".format(end_time - start_time))

# Turn prediction probabilities into prediction classes
test_abstract_preds = tf.argmax(test_abstract_pred_probs, axis=1)
test_abstract_preds

# Turn prediction class integers into string class names
test_abstract_pred_classes = [label_encoder.classes_[i] for i in test_abstract_preds]
test_abstract_pred_classes

# Visualize abstract lines and predicted sequence labels
for pred_class, line in zip(test_abstract_pred_classes, abstract_lines):
    print(f"{pred_class}: {line}")