Dhrumit1314 committed on
Commit ed00d9b
1 Parent(s): 7f39d1d

Upload 8 files

Browse files:
- .gitattributes +1 -0
- SkimLit_NLP.py +1046 -0
- abstract_data.json +29 -0
- fingerprint.pb +3 -0
- helper_functions.py +302 -0
- keras_metadata.pb +3 -0
- saved_model.pb +3 -0
- variables.data-00000-of-00001 +3 -0
- variables.index +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
SkimLit_NLP.py
ADDED
@@ -0,0 +1,1046 @@
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 8 20:22:57 2024

@author: Dhrumit Patel
"""

"""
Milestone Project 2: SkimLit

The purpose is to build an NLP model to make reading medical abstracts easier.
"""

# Check for GPU
# !nvidia-smi
# !nvidia-smi -L

"""
Get the data

Since we will be replicating the paper (PubMed 200k RCT), let's download the dataset the authors used.

We can do so from the authors' GitHub:

git clone https://github.com/Franck-Dernoncourt/pubmed-rct
dir pubmed-rct

# Check what files are in the PubMed_20k dataset
cd pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign
dir

Contains 3 files: dev.txt, test.txt, train.txt
"""

# Start our experiments using the 20k dataset with numbers replaced by the "@" sign
data_dir = "pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/"

# Check all the filenames in the target directory
import os
filenames = [data_dir + filename for filename in os.listdir(data_dir)]
filenames

"""
Preprocess the data
"""

# Create a function to read the lines of a document
def get_lines(filename):
    """
    Reads filename (a text filename) and returns the lines of text as a list.

    Args:
        filename (str): a string containing the target filepath.

    Returns:
        A list of strings with one string per line from the target filename.
    """
    with open(filename, "r") as f:
        return f.readlines()

# Let's read in the training lines
train_lines = get_lines(filename=data_dir + "train.txt") # Read the lines within the training file
train_lines[:20]

len(train_lines)
+
# Let's write a function to preprocess our data as above (List of dictionaries)
|
69 |
+
def preprocess_text_with_line_numbers(filename):
|
70 |
+
"""
|
71 |
+
Returns a list of dictionaries of abstract line data.
|
72 |
+
|
73 |
+
Takes in filename, reads its contents, and sorts through each line,
|
74 |
+
extracting things like the target label, the text of the sentence,
|
75 |
+
how many senetences are in the current abstract and what sentence
|
76 |
+
number the target line is.
|
77 |
+
"""
|
78 |
+
input_lines = get_lines(filename) # get all lines from filename
|
79 |
+
abstract_lines = "" # Create an empty abstract
|
80 |
+
abstract_samples = [] # Create an empty list of abstract to store dictionaries
|
81 |
+
|
82 |
+
# Loop through each line in the target file
|
83 |
+
for line in input_lines:
|
84 |
+
if line.startswith("###"): # Check to see if the line is an ID line
|
85 |
+
abstract_id = line
|
86 |
+
abstract_lines = "" # Reset the abstract string if the line is an ID line
|
87 |
+
|
88 |
+
elif line.isspace(): # Check to see if line is a new line
|
89 |
+
abstract_line_split = abstract_lines.splitlines() # Split abstract into seperate lines
|
90 |
+
|
91 |
+
# Iterate through each line in a single abstract and count them at the same time
|
92 |
+
for abstract_line_number, abstract_line in enumerate(abstract_line_split):
|
93 |
+
line_data = {} # Create an empty dictionary for each line
|
94 |
+
target_text_split = abstract_line.split("\t") # Split target label from text
|
95 |
+
line_data["target"] = target_text_split[0] # Get the target label
|
96 |
+
line_data["text"] = target_text_split[1].lower() # Get target text and lower it
|
97 |
+
line_data["line_number"] = abstract_line_number # What number line foes the line appear in the abstract?
|
98 |
+
line_data["total_lines"] = len(abstract_line_split) - 1 # How many total line are there in the target abstract? (start from 0)
|
99 |
+
abstract_samples.append(line_data) # Add line data dictionary to abstract samples list
|
100 |
+
|
101 |
+
else: # If the above conditions aren't fulfilled, then the line contains a labelled sentence
|
102 |
+
abstract_lines += line
|
103 |
+
|
104 |
+
return abstract_samples
|
105 |
+
|
106 |
+
# Get data from file and preprocess it
|
107 |
+
train_samples = preprocess_text_with_line_numbers(filename = data_dir + "train.txt")
|
108 |
+
val_samples = preprocess_text_with_line_numbers(filename = data_dir + "dev.txt") # dev is another name for validation dataset
|
109 |
+
test_samples = preprocess_text_with_line_numbers(filename = data_dir + "test.txt")
|
110 |
+
|
111 |
+
len(train_samples), len(val_samples), len(test_samples)
|
112 |
+
|
113 |
+
# Check the first abstract of our training data
|
114 |
+
train_samples[:14]
|
115 |
+
|
116 |
+
"""
|
117 |
+
Now that our data is in the format of a list of dictionaries, How about
|
118 |
+
we turn it into a DataFrame to further visualize it?
|
119 |
+
"""
|
120 |
+
import pandas as pd
|
121 |
+
train_df = pd.DataFrame(train_samples)
|
122 |
+
val_df = pd.DataFrame(val_samples)
|
123 |
+
test_df = pd.DataFrame(test_samples)
|
124 |
+
|
125 |
+
train_df[:14]
|
126 |
+
|
127 |
+
# Distribution of labels in training data
|
128 |
+
train_df["target"].value_counts()
|
129 |
+
|
130 |
+
# Let's check length of different lines (Number of sentences per abstract (X-axis) vs Number of occurrences (Y-axis))
|
131 |
+
train_df["total_lines"].plot.hist()
|
132 |
+
|
133 |
+
"""
|
134 |
+
Get list of sentences
|
135 |
+
"""
|
136 |
+
# Convert abstract text lines into lists
|
137 |
+
train_sentences = train_df["text"].tolist()
|
138 |
+
val_sentences = val_df["text"].tolist()
|
139 |
+
test_sentences = test_df["text"].tolist()
|
140 |
+
|
141 |
+
len(train_sentences), len(val_sentences), len(test_sentences)
|
142 |
+
|
143 |
+
# View the first 10 lines of training sentences
|
144 |
+
train_sentences[:10]
|
145 |
+
|
146 |
+
"""
|
147 |
+
Making numeric labels (ML models require numeric labels)
|
148 |
+
"""
|
149 |
+
# One hot encode labels
|
150 |
+
from sklearn.preprocessing import OneHotEncoder
|
151 |
+
one_hot_encoder = OneHotEncoder(sparse=False) # We want non-sparse matrix
|
152 |
+
train_labels_one_hot = one_hot_encoder.fit_transform(train_df["target"].to_numpy().reshape(-1, 1))
|
153 |
+
val_labels_one_hot = one_hot_encoder.transform(val_df["target"].to_numpy().reshape(-1, 1))
|
154 |
+
test_labels_one_hot = one_hot_encoder.transform(test_df["target"].to_numpy().reshape(-1, 1))
|
155 |
+
|
156 |
+
# Check what one hot encoded labels look like
|
157 |
+
train_labels_one_hot, val_labels_one_hot, test_labels_one_hot
|
158 |
+
|
159 |
+
"""
|
160 |
+
Label encode labels
|
161 |
+
"""
|
162 |
+
# Extract labels ("target" columns) and encode them into integers
|
163 |
+
from sklearn.preprocessing import LabelEncoder
|
164 |
+
label_encoder = LabelEncoder()
|
165 |
+
train_labels_encoded = label_encoder.fit_transform(train_df["target"].to_numpy())
|
166 |
+
val_labels_encoded = label_encoder.transform(val_df["target"].to_numpy())
|
167 |
+
test_labels_encoded = label_encoder.transform(test_df["target"].to_numpy())
|
168 |
+
|
169 |
+
# Check what label encoded labels look like
|
170 |
+
train_labels_encoded, val_labels_encoded, test_labels_encoded
|
171 |
+
|
172 |
+
# Get class names and number of classes from LabelEncoder instance
|
173 |
+
num_classes = len(label_encoder.classes_)
|
174 |
+
class_names = label_encoder.classes_
|
175 |
+
num_classes, class_names
|
176 |
+
|
177 |
+
"""
|
178 |
+
Starting a series of Modelling experiments
|
179 |
+
"""
|
180 |
+
|
181 |
+
"""
|
182 |
+
Model 0: Getting a baseline model (TF-IDF Multinomial Naive Bayes Classifier)
|
183 |
+
"""
|
184 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
185 |
+
from sklearn.naive_bayes import MultinomialNB
|
186 |
+
from sklearn.pipeline import Pipeline
|
187 |
+
|
188 |
+
# Create a pipeline
|
189 |
+
model_0 = Pipeline([
|
190 |
+
("tf-idf", TfidfVectorizer()),
|
191 |
+
("clf", MultinomialNB())
|
192 |
+
])
|
193 |
+
|
194 |
+
# Fit the pipeline on the training data
|
195 |
+
model_0.fit(train_sentences, train_labels_encoded)
|
196 |
+
|
197 |
+
# Evaluate baseline model on validation dataset
|
198 |
+
model_0.score(val_sentences, val_labels_encoded)
|
199 |
+
|
200 |
+
# Make predictions using our baseline model
|
201 |
+
baseline_preds = model_0.predict(val_sentences)
|
202 |
+
baseline_preds
|
203 |
+
|
204 |
+
"""
|
205 |
+
For classification evaluation metrics (accuracy, precision, recall, f1-score)
|
206 |
+
"""
|
207 |
+
from helper_functions import calculate_results
|
208 |
+
|
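# helper_functions.py isn't shown in full in this commit, so here is a minimal
# sketch of what calculate_results presumably does (an assumption based on how its
# output is used below: it returns a dict with accuracy on a 0-100 scale, which is
# later divided by 100, plus weighted precision/recall/f1):
#
# from sklearn.metrics import accuracy_score, precision_recall_fscore_support
#
# def calculate_results(y_true, y_pred):
#     """Return a dictionary of accuracy, precision, recall and f1-score."""
#     model_accuracy = accuracy_score(y_true, y_pred) * 100
#     model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(
#         y_true, y_pred, average="weighted")
#     return {"accuracy": model_accuracy,
#             "precision": model_precision,
#             "recall": model_recall,
#             "f1": model_f1}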
# Calculate baseline results
baseline_results = calculate_results(y_true=val_labels_encoded, y_pred=baseline_preds)
baseline_results

train_sentences[:10]

"""
Preparing our data (the text) for deep sequence models

Before we start building deeper models, we've got to create vectorization and embedding layers
"""

import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

# How long is each sentence on average?
sent_lens = [len(sentence.split()) for sentence in train_sentences]
avg_sent_len = np.mean(sent_lens)
avg_sent_len

# What does the distribution look like?
import matplotlib.pyplot as plt
plt.hist(sent_lens, bins=20)

# How long a sentence length covers 95% of examples?
output_seq_length = int(np.percentile(sent_lens, 95))
output_seq_length

# Maximum sequence length in the training set
max(sent_lens)

"""
Create a TextVectorization layer

We want to make a layer which maps our texts from words to numbers
"""

# How many words are in our vocab? This is taken from Table 2 of the paper
max_tokens = 68000 # Comes from the paper's authors

# Create text vectorizer
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
text_vectorizer = TextVectorization(max_tokens=max_tokens, # Number of words in vocabulary
                                    output_sequence_length=output_seq_length) # Desired output length of vectorized sequences

# Adapt text vectorizer to training sentences
text_vectorizer.adapt(train_sentences)

# How many words in our training vocabulary?
rct_20k_text_vocab = text_vectorizer.get_vocabulary()
print(f"Number of words in vocab: {len(rct_20k_text_vocab)}")
print(f"Most common words in the vocab: {rct_20k_text_vocab[:5]}")
print(f"Least common words in the vocab: {rct_20k_text_vocab[-5:]}")

# Get the config of our text vectorizer
text_vectorizer.get_config()

from keras import layers
"""
Create a custom text embedding layer
"""
token_embed = layers.Embedding(input_dim=len(rct_20k_text_vocab),
                               output_dim=128, # Note: different embedding sizes result in drastically different numbers of parameters to train
                               mask_zero=True, # Use masking to handle variable sequence lengths (saves space)
                               name="token_embedding")

"""
Creating datasets (making sure our data loads as fast as possible)

We are going to set up our data to run as fast as possible with the TensorFlow tf.data API.
"""
# Turn our data into TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((train_sentences, train_labels_one_hot))
valid_dataset = tf.data.Dataset.from_tensor_slices((val_sentences, val_labels_one_hot))
test_dataset = tf.data.Dataset.from_tensor_slices((test_sentences, test_labels_one_hot))

train_dataset

# Take the TensorSliceDatasets and turn them into batched, prefetched datasets
train_dataset = train_dataset.batch(32).prefetch(tf.data.AUTOTUNE)
valid_dataset = valid_dataset.batch(32).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

train_dataset, len(train_dataset)
"""
Model 1: Conv1D with token embeddings
"""
# Create a 1D conv model to process sequences
inputs = layers.Input(shape=(1,), dtype=tf.string)
text_vectors = text_vectorizer(inputs) # Vectorize text inputs
token_embeddings = token_embed(text_vectors) # Create embedding
x = layers.Conv1D(64, kernel_size=5, padding="same", activation="relu")(token_embeddings)
x = layers.GlobalAveragePooling1D()(x) # Condense the output of our feature vector from the conv layer
outputs = layers.Dense(num_classes, activation="softmax")(x)

model_1 = tf.keras.Model(inputs, outputs)

# Compile the model
model_1.compile(loss="categorical_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

model_1.summary()

# Fit the model
history_model_1 = model_1.fit(train_dataset,
                              epochs=3,
                              steps_per_epoch=int(0.1 * len(train_dataset)), # Only look at 10% of batches per epoch (to speed up training)
                              validation_data=valid_dataset,
                              validation_steps=int(0.1 * len(valid_dataset)))

# Evaluate on the whole validation dataset
model_1.evaluate(valid_dataset)

# Make predictions on the validation dataset (our model predicts probabilities for each class)
model_1_pred_probs = model_1.predict(valid_dataset)
model_1_pred_probs, model_1_pred_probs.shape

# Convert pred probs to classes
model_1_preds = tf.argmax(model_1_pred_probs, axis=1)
model_1_preds
class_names
class_names[model_1_preds]

# Calculate model_1 results
model_1_results = calculate_results(y_true=val_labels_encoded, y_pred=model_1_preds)
model_1_results

"""
Model 2: Feature extraction with pretrained token embeddings

Now let's use pretrained word embeddings from TensorFlow Hub,
more specifically the Universal Sentence Encoder (USE).

The paper originally used GloVe embeddings; however, we are going to stick with the
later-created USE pretrained embeddings.
"""
# Download pretrained TensorFlow Hub USE
import tensorflow_hub as hub
tf_hub_embedding_layer = hub.KerasLayer("https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/universal-sentence-encoder/versions/2",
                                        trainable=False,
                                        name="universal_sentence_encoder")
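# Quick sanity check (illustrative, not from the original script): the USE maps a
# whole sentence to a single 512-dimensional embedding, e.g.
# sample_embedding = tf_hub_embedding_layer(["This is a test sentence"])
# sample_embedding.shape  # -> (1, 512)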
"""
Building and fitting an NLP feature extraction model using pretrained embeddings from TensorFlow Hub
"""
# Define feature extraction model using the TF Hub layer
inputs = layers.Input(shape=[], dtype=tf.string)
pretrained_embedding = tf_hub_embedding_layer(inputs) # Tokenize text and create an embedding of each sequence (512-long vector)
x = layers.Dense(128, activation="relu")(pretrained_embedding)
# Note: you could add more layers if you wanted to
outputs = layers.Dense(num_classes, activation="softmax")(x) # Create the output layer

model_2 = tf.keras.Model(inputs, outputs, name="model_2_USE_feature_extractor")

# Compile the model
model_2.compile(loss="categorical_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

model_2.summary()

# Fit model_2 to the data
with tf.device('/CPU:0'):
    history_model_2 = model_2.fit(train_dataset,
                                  epochs=3,
                                  steps_per_epoch=int(0.1 * len(train_dataset)),
                                  validation_data=valid_dataset,
                                  validation_steps=int(0.1 * len(valid_dataset)))

# Evaluate on the whole validation dataset
with tf.device('/CPU:0'):
    model_2.evaluate(valid_dataset)

# Make predictions with the feature extraction model
with tf.device('/CPU:0'):
    model_2_pred_probs = model_2.predict(valid_dataset)
model_2_pred_probs, model_2_pred_probs.shape

# Convert the prediction probabilities found with the feature extraction model to labels
model_2_preds = tf.argmax(model_2_pred_probs, axis=1)
model_2_preds
class_names[model_2_preds]

# Calculate results of TF Hub pretrained embeddings on the val set
model_2_results = calculate_results(y_true=val_labels_encoded, y_pred=model_2_preds)
model_2_results

"""
Model 3: Conv1D with character embeddings

The paper we are replicating states they used a combination of token and character-level embeddings.
Previously we created token-level embeddings, but we will need to do similar steps for characters if we want to use char-level embeddings.
"""

"""
Creating a character-level tokenizer
"""
train_sentences[:5]

# Make a function to split sentences into characters
def split_chars(text):
    return " ".join(list(text))
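# Example usage (illustrative): split_chars turns a sentence into a space-separated
# string of characters, so TextVectorization can treat each character as a token:
# split_chars("hello world")  # -> 'h e l l o   w o r l d'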
# Split sequence-level data splits into character-level data splits
train_chars = [split_chars(sentence) for sentence in train_sentences]
val_chars = [split_chars(sentence) for sentence in val_sentences]
test_chars = [split_chars(sentence) for sentence in test_sentences]

train_chars, val_chars, test_chars

# What's the average character length?
char_lens = [len(sentence) for sentence in train_sentences]
mean_char_len = np.mean(char_lens)
mean_char_len

# Check the distribution of our sequences at a character level
import matplotlib.pyplot as plt
plt.hist(char_lens, bins=7)

# Find what length of characters covers 95% of sequences
output_seq_char_len = int(np.percentile(char_lens, 95))
output_seq_char_len

# Get all keyboard characters
import string
alphabet = string.ascii_lowercase + string.digits + string.punctuation
alphabet
len(alphabet)

# Create char-level token vectorizer instance
NUM_CHAR_TOKENS = len(alphabet) + 2 # Add 2 for the space and OOV tokens (OOV = out of vocab, '[UNK]')
char_vectorizer = TextVectorization(max_tokens=NUM_CHAR_TOKENS,
                                    output_sequence_length=output_seq_char_len,
                                    standardize="lower_and_strip_punctuation", # Default
                                    name="char_vectorizer")

# Adapt character vectorizer to training characters
char_vectorizer.adapt(train_chars)

# Check character vocab stats
char_vocab = char_vectorizer.get_vocabulary()
print(f"Number of different characters in character vocab: {len(char_vocab)}")
print(f"5 most common characters: {char_vocab[:5]}")
print(f"5 least common characters: {char_vocab[-5:]}")

"""
Creating a character-level embedding
"""
# Create char embedding layer
char_embed = layers.Embedding(input_dim=len(char_vocab), # Number of different characters
                              output_dim=25, # This is the size of the char embedding in the paper
                              mask_zero=True,
                              name="char_embed")

"""
Model 3: Building a Conv1D model to fit on character embeddings
"""
# Make Conv1D on chars only
inputs = layers.Input(shape=(1,), dtype="string")
char_vectors = char_vectorizer(inputs)
char_embeddings = char_embed(char_vectors)
x = layers.Conv1D(64, kernel_size=5, padding="same", activation="relu")(char_embeddings)
x = layers.GlobalMaxPool1D()(x)
outputs = layers.Dense(num_classes, activation="softmax")(x)

model_3 = tf.keras.Model(inputs, outputs, name="model_3_conv1d_char_embeddings")

# Compile the model
model_3.compile(loss="categorical_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

model_3.summary()

# Create char-level datasets
train_char_dataset = tf.data.Dataset.from_tensor_slices((train_chars, train_labels_one_hot)).batch(32).prefetch(tf.data.AUTOTUNE)
val_char_dataset = tf.data.Dataset.from_tensor_slices((val_chars, val_labels_one_hot)).batch(32).prefetch(tf.data.AUTOTUNE)
test_char_dataset = tf.data.Dataset.from_tensor_slices((test_chars, test_labels_one_hot)).batch(32).prefetch(tf.data.AUTOTUNE)

train_char_dataset, val_char_dataset, test_char_dataset

# Fit the model on chars only
model_3_history = model_3.fit(train_char_dataset,
                              epochs=3,
                              steps_per_epoch=int(0.1 * len(train_char_dataset)),
                              validation_data=val_char_dataset,
                              validation_steps=int(0.1 * len(val_char_dataset)))

# Evaluate model_3
model_3.evaluate(val_char_dataset)

# Make predictions with the character-only model
model_3_pred_probs = model_3.predict(val_char_dataset)
model_3_pred_probs, model_3_pred_probs.shape

# Convert predictions to class labels
model_3_preds = tf.argmax(model_3_pred_probs, axis=1)
model_3_preds
class_names[model_3_preds]

# Calculate results for the chars-only Conv1D model
model_3_results = calculate_results(y_true=val_labels_encoded, y_pred=model_3_preds)
model_3_results

baseline_results

"""
Model 4: Combining pretrained token embeddings + character embeddings (hybrid embedding layer)

1. Create a token-level embedding model (similar to model_1)
2. Create a character-level model (similar to model_3 with a slight modification)
3. Combine 1 & 2 with a concatenate (layers.Concatenate)
4. Build a series of output layers on top of point 3
5. Construct a model which takes token and character-level sequences as input and produces sequence label probabilities as output
"""

# 1. Setup token inputs/model
token_inputs = layers.Input(shape=[], dtype=tf.string, name="token_inputs")
token_embeddings = tf_hub_embedding_layer(token_inputs)
token_outputs = layers.Dense(128, activation="relu")(token_embeddings)
token_model = tf.keras.Model(inputs=token_inputs, outputs=token_outputs)

# 2. Setup char inputs/model
char_inputs = layers.Input(shape=(1,), dtype=tf.string, name="char_input")
char_vectors = char_vectorizer(char_inputs)
char_embeddings = char_embed(char_vectors)
char_bi_lstm = layers.Bidirectional(layers.LSTM(24))(char_embeddings) # Bi-LSTM as given in the paper
char_model = tf.keras.Model(inputs=char_inputs, outputs=char_bi_lstm)

# 3. Concatenate token and char inputs (create hybrid token embedding)
token_char_concat = layers.Concatenate(name="token_char_hybrid")([token_model.output, char_model.output])

# 4. Create output layers - adding in dropout (according to the paper)
combined_dropout = layers.Dropout(0.5)(token_char_concat)
combined_dense = layers.Dense(128, activation="relu")(combined_dropout)
final_dropout = layers.Dropout(0.5)(combined_dense)
output_layer = layers.Dense(num_classes, activation="softmax")(final_dropout)

# 5. Construct model with char and token inputs
model_4 = tf.keras.Model(inputs=[token_model.input, char_model.input],
                         outputs=output_layer,
                         name="model_4_token_and_char_embeddings")

# Get a summary of our model
model_4.summary()

# Plot hybrid token and character model
from keras.utils import plot_model
plot_model(model_4, show_shapes=True)

# Compile token-char model
model_4.compile(loss="categorical_crossentropy",
                optimizer=tf.keras.optimizers.Adam(), # Paper says SGD optimizer
                metrics=["accuracy"])

"""
Combining token and character data into a tf.data.Dataset
"""
# Combine chars and tokens into a dataset
train_char_token_data = tf.data.Dataset.from_tensor_slices((train_sentences, train_chars)) # Make data
train_char_token_labels = tf.data.Dataset.from_tensor_slices(train_labels_one_hot) # Make labels
train_char_token_dataset = tf.data.Dataset.zip((train_char_token_data, train_char_token_labels)) # Combine data and labels

# Batch and prefetch train data
train_char_token_dataset = train_char_token_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

# Do the same for the validation dataset
val_char_token_data = tf.data.Dataset.from_tensor_slices((val_sentences, val_chars))
val_char_token_labels = tf.data.Dataset.from_tensor_slices(val_labels_one_hot)
val_char_token_dataset = tf.data.Dataset.zip((val_char_token_data, val_char_token_labels))

# Batch and prefetch val data
val_char_token_dataset = val_char_token_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

# Check out the training char and token embedding dataset
train_char_token_dataset, val_char_token_dataset

# Fit a model on token and character-level sequences
with tf.device('/CPU:0'):
    history_model_4 = model_4.fit(train_char_token_dataset,
                                  epochs=3,
                                  steps_per_epoch=int(0.1 * len(train_char_token_dataset)),
                                  validation_data=val_char_token_dataset,
                                  validation_steps=int(0.1 * len(val_char_token_dataset)))

# Evaluate on the whole validation dataset
with tf.device('/CPU:0'):
    model_4.evaluate(val_char_token_dataset)

# Make predictions using the token-character hybrid model
model_4_pred_probs = model_4.predict(val_char_token_dataset)
model_4_pred_probs, model_4_pred_probs.shape

# Convert prediction probabilities to labels
model_4_preds = tf.argmax(model_4_pred_probs, axis=1)
model_4_preds
class_names[model_4_preds]

# Get results of the token-char hybrid model
model_4_results = calculate_results(y_true=val_labels_encoded, y_pred=model_4_preds)
model_4_results

"""
Model 5: Transfer learning with pretrained token embeddings + character embeddings +
positional embeddings
"""
train_df.head()

"""
Create positional embeddings
"""
# How many different line numbers are there?
train_df["line_number"].value_counts()

# Check the distribution of the "line_number" column
train_df["line_number"].plot.hist()

# Use TensorFlow to create one-hot encoded tensors of our "line_number" column
train_line_numbers_one_hot = tf.one_hot(train_df["line_number"].to_numpy(), depth=15)
val_line_numbers_one_hot = tf.one_hot(val_df["line_number"].to_numpy(), depth=15)
test_line_numbers_one_hot = tf.one_hot(test_df["line_number"].to_numpy(), depth=15)
train_line_numbers_one_hot[:10], train_line_numbers_one_hot.shape
train_line_numbers_one_hot[0].shape
train_line_numbers_one_hot[0].dtype
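# Note on depth=15 (an observation about tf.one_hot, not stated in the source):
# indices >= depth produce an all-zeros row rather than an error, so any abstract
# line numbered 15 or higher simply gets a zero vector, e.g.
# tf.one_hot([2, 20], depth=15)  # row 0 has a 1 at index 2, row 1 is all zeros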
# How many different numbers of total lines are there?
train_df["total_lines"].value_counts()

# Check the distribution of the "total_lines" column
train_df["total_lines"].plot.hist()

# Check the coverage of "total_lines": what length covers 98% of abstracts?
np.percentile(train_df["total_lines"], 98)

# Use TensorFlow to create one-hot encoded tensors of our "total_lines" column
train_total_lines_one_hot = tf.one_hot(train_df["total_lines"].to_numpy(), depth=20)
val_total_lines_one_hot = tf.one_hot(val_df["total_lines"].to_numpy(), depth=20)
test_total_lines_one_hot = tf.one_hot(test_df["total_lines"].to_numpy(), depth=20)
train_total_lines_one_hot[:10], train_total_lines_one_hot.shape
train_total_lines_one_hot[0].shape
train_total_lines_one_hot[0].dtype

"""
Building a tribrid embedding model

1. Create a token-level model
2. Create a character-level model
3. Create a model for the "line_number" feature
4. Create a model for the "total_lines" feature
5. Combine the outputs of 1 & 2 using tf.keras.layers.Concatenate
6. Combine the outputs of 3, 4, 5 using tf.keras.layers.Concatenate
7. Create an output layer to accept the tribrid embedding and output label probabilities
8. Combine the inputs of 1, 2, 3, 4 and the output of 7 into a tf.keras.Model
"""
# 1. Token inputs
token_inputs = layers.Input(shape=[], dtype="string", name="token_inputs")
token_embeddings = tf_hub_embedding_layer(token_inputs)
token_outputs = layers.Dense(128, activation="relu")(token_embeddings)
token_model = tf.keras.Model(inputs=token_inputs, outputs=token_outputs)

# 2. Char inputs
char_inputs = layers.Input(shape=(1,), dtype="string", name="char_inputs")
char_vectors = char_vectorizer(char_inputs)
char_embeddings = char_embed(char_vectors)
char_bi_lstm = layers.Bidirectional(layers.LSTM(24))(char_embeddings)
char_model = tf.keras.Model(inputs=char_inputs, outputs=char_bi_lstm)

# 3. Create a model for the "line_number" feature
line_number_inputs = layers.Input(shape=(15,), dtype=tf.float32, name="line_number_input")
x = layers.Dense(32, activation="relu")(line_number_inputs)
line_number_model = tf.keras.Model(inputs=line_number_inputs, outputs=x)

# 4. Create a model for the "total_lines" feature
total_lines_inputs = layers.Input(shape=(20,), dtype=tf.float32, name="total_lines_input")
y = layers.Dense(32, activation="relu")(total_lines_inputs)
total_lines_model = tf.keras.Model(inputs=total_lines_inputs, outputs=y)

# 5. Combine the outputs of the token and char embeddings into a hybrid embedding
combined_embeddings = layers.Concatenate(name="char_token_hybrid_embedding")([token_model.output, char_model.output])
z = layers.Dense(256, activation="relu")(combined_embeddings)
z = layers.Dropout(0.5)(z)

# 6. Combine positional embeddings with the combined token and char embeddings
tribrid_embeddings = layers.Concatenate(name="char_token_positional_embedding")([line_number_model.output, total_lines_model.output, z])

# 7. Create output layer
output_layer = layers.Dense(num_classes, activation="softmax", name="output_layer")(tribrid_embeddings)

# 8. Put together a model with all kinds of inputs
model_5 = tf.keras.Model(inputs=[line_number_model.input,
                                 total_lines_model.input,
                                 token_model.input,
                                 char_model.input], outputs=output_layer, name="model_5_tribrid_embedding_model")

# Get a summary of our tribrid model
model_5.summary()

from tensorflow.keras.utils import plot_model
plot_model(model_5, show_shapes=True)

# Compile token, char and positional embedding model
model_5.compile(loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.2), # Helps prevent overfitting
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])
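# Worked example of label_smoothing=0.2 (standard Keras behaviour, shown for
# clarity): each one-hot target is mixed with a uniform distribution,
# smoothed = one_hot * (1 - 0.2) + 0.2 / num_classes. With our 5 classes:
# [0, 0, 1, 0, 0] -> [0.04, 0.04, 0.84, 0.04, 0.04]
# which discourages the model from becoming over-confident in any single class.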
"""
Create tribrid embedding datasets using tf.data
"""

# Create training and validation datasets (with all 4 kinds of input data)
train_char_token_pos_data = tf.data.Dataset.from_tensor_slices((train_line_numbers_one_hot,
                                                                train_total_lines_one_hot,
                                                                train_sentences,
                                                                train_chars))
train_char_token_pos_labels = tf.data.Dataset.from_tensor_slices(train_labels_one_hot)
train_char_token_pos_dataset = tf.data.Dataset.zip((train_char_token_pos_data, train_char_token_pos_labels))

train_char_token_pos_dataset = train_char_token_pos_dataset.batch(32).prefetch(tf.data.AUTOTUNE)


# Do the same as above for the validation dataset
val_char_token_pos_data = tf.data.Dataset.from_tensor_slices((val_line_numbers_one_hot,
                                                              val_total_lines_one_hot,
                                                              val_sentences,
                                                              val_chars))
val_char_token_pos_labels = tf.data.Dataset.from_tensor_slices(val_labels_one_hot)
val_char_token_pos_dataset = tf.data.Dataset.zip((val_char_token_pos_data, val_char_token_pos_labels))

val_char_token_pos_dataset = val_char_token_pos_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

# Check the input shapes
train_char_token_pos_dataset, val_char_token_pos_dataset

# Fit the model
with tf.device('/CPU:0'):
    history_model_5 = model_5.fit(train_char_token_pos_dataset,
                                  epochs=3,
                                  steps_per_epoch=int(0.1 * len(train_char_token_pos_dataset)),
                                  validation_data=val_char_token_pos_dataset,
                                  validation_steps=int(0.1 * len(val_char_token_pos_dataset)))

with tf.device('/CPU:0'):
    # Evaluate our model on the whole validation dataset
    model_5.evaluate(val_char_token_pos_dataset)

# Make predictions with the char-token-pos model
model_5_pred_probs = model_5.predict(val_char_token_pos_dataset)
model_5_pred_probs, model_5_pred_probs.shape

# Convert prediction probabilities to labels
model_5_preds = tf.argmax(model_5_pred_probs, axis=1)
model_5_preds
class_names[model_5_preds]

# Calculate results of the char-token-pos model
model_5_results = calculate_results(y_true=val_labels_encoded, y_pred=model_5_preds)
model_5_results

"""
Compare model results
"""

# Combine model results into a DataFrame
all_model_results = pd.DataFrame({"model_0_baseline": baseline_results,
                                  "model_1_custom_token_embedding": model_1_results,
                                  "model_2_pretrained_token_embedding": model_2_results,
                                  "model_3_custom_char_embedding": model_3_results,
                                  "model_4_hybrid_char_token_embedding": model_4_results,
                                  "model_5_pos_char_token_embedding": model_5_results})

all_model_results = all_model_results.transpose()
all_model_results

# Reduce the accuracy to the same scale as the other metrics
all_model_results["accuracy"] = all_model_results["accuracy"] / 100

all_model_results

# Plot and compare all model results
all_model_results.plot(kind="bar", figsize=(10, 7)).legend(bbox_to_anchor=(1.0, 1.0))

# Sort model results by f1-score
all_model_results.sort_values("f1", ascending=True)["f1"].plot(kind="bar", figsize=(10, 7))

"""
Save and load model
"""
# Save the best performing model to SavedModel format (default)
model_5.save("skimlit_tribrid_model_me")

# Load in the best performing model
from keras.models import load_model
with tf.device('/CPU:0'):
    loaded_model = load_model("skimlit_tribrid_model_me")

# Make predictions with our loaded model on the validation set
with tf.device('/CPU:0'):
    loaded_pred_probs = loaded_model.predict(val_char_token_pos_dataset)
loaded_pred_probs, loaded_pred_probs.shape

# Convert prediction probabilities to labels
loaded_preds = tf.argmax(loaded_pred_probs, axis=1)
loaded_preds

loaded_preds[:10]
class_names[loaded_preds]

# Calculate the results of our loaded model
loaded_model_results = calculate_results(y_true=val_labels_encoded, y_pred=loaded_preds)
loaded_model_results

assert model_5_results == loaded_model_results # If nothing displays in the console, the results match

# Check the loaded model summary
loaded_model.summary()


"""
Optional - instead of the model trained above, you can use a downloaded pretrained model
"""
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

import os
url = "https://drive.google.com/file/d/1DYr3Ew9tU6dph_fI0JeTZ6GbdzZpWr8K/view?usp=sharing"

# Load in the downloaded online model
loaded_gs_model = load_model("skimlit_tribrid_model")

# Evaluate the online loaded model
loaded_gs_model.evaluate(val_char_token_pos_dataset)
loaded_preds = tf.argmax(loaded_pred_probs, axis=1) # (reuses loaded_pred_probs computed above)
loaded_preds[:10]

# Evaluate the loaded model's predictions
loaded_model_results = calculate_results(val_labels_encoded, loaded_preds)
loaded_model_results

# Check loaded model summary
loaded_model.summary()

# Create test dataset, batched and prefetched
test_pos_char_token_data = tf.data.Dataset.from_tensor_slices((test_line_numbers_one_hot, test_total_lines_one_hot, test_sentences, test_chars))
test_pos_char_token_labels = tf.data.Dataset.from_tensor_slices(test_labels_one_hot)

test_pos_char_token_dataset = tf.data.Dataset.zip((test_pos_char_token_data, test_pos_char_token_labels))

test_pos_char_token_dataset = test_pos_char_token_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

# Make predictions on the test dataset
with tf.device('/CPU:0'):
    test_pred_probs = loaded_model.predict(test_pos_char_token_dataset, verbose=1)
    test_preds = tf.argmax(test_pred_probs, axis=1)
test_preds[:10]

# Evaluate loaded model test predictions
loaded_model_test_results = calculate_results(y_true=test_labels_encoded, y_pred=test_preds)
loaded_model_test_results

# Get a list of class names of test predictions
test_pred_classes = [label_encoder.classes_[pred] for pred in test_preds]
test_pred_classes

# Create a prediction-enriched test dataframe

# Add a new column "prediction" to the test dataframe, containing predicted classes
test_df["prediction"] = test_pred_classes

# Add a new column "pred_prob" to the test dataframe, containing the maximum prediction probability
test_df["pred_prob"] = tf.reduce_max(test_pred_probs, axis=1).numpy()

# Add a new column "correct" to the test dataframe, which is True if the prediction matches the target, False otherwise
# This creates a binary column indicating whether the prediction is correct or not
test_df["correct"] = test_df["prediction"] == test_df["target"]

# Display the first 20 rows of the enriched test dataframe
test_df.head(20)

# Find the top 100 most wrong samples (note: 100 is an arbitrary number, you could go through all of them if you wanted)
top_100_wrong = test_df[test_df["correct"] == False].sort_values("pred_prob", ascending=False)[:100]
top_100_wrong

# Investigate top wrong predictions for rows in the top 100 wrong predictions dataframe
for row in top_100_wrong[0:10].itertuples():
    # Unpack row values
    _, target, text, line_number, total_lines, prediction, pred_prob, _ = row

    # Display information about the prediction
    print(f"Target: {target}, Pred: {prediction}, Prob: {pred_prob}, Line number: {line_number}, Total lines: {total_lines}\n")

    # Display the text associated with the prediction
    print(f"Text:\n{text}\n")

    # Separator for better readability
    print("-----------------------------------------------------------------------\n")


import json
import requests

# Download and open example abstracts (copied and pasted from PubMed)
# Note: a github.com ".../blob/..." URL returns an HTML page; to get the raw JSON
# you would need the corresponding raw.githubusercontent.com URL
url = "https://github.com/Dhrumit1314/Skimlit_NLP/blob/main/abstract_data.json"
response = requests.get(url)

# Check if the download was successful (status code 200)
if response.status_code == 200:
    # Load the JSON data from the response
    example_abstracts = json.loads(response.text)
    print("Example abstracts loaded successfully.")
else:
    print(f"Failed to download example abstracts. Status code: {response.status_code}")

# See what our example abstracts look like
abstracts = pd.DataFrame(example_abstracts)
abstracts

# Import necessary library
from spacy.lang.en import English

# Setup English sentence parser with spaCy
nlp = English()

# Add the sentencizer to the spaCy pipeline
sentencizer = nlp.add_pipe("sentencizer")

# Example abstract from the loaded dataset
example_abstract = example_abstracts[0]["abstract"]
example_abstract

# Create a spaCy "doc" object by parsing the example abstract
doc = nlp(example_abstract)
doc

# Extract sentences from the spaCy doc and convert to string type
abstract_lines = [str(sent) for sent in list(doc.sents)]
# Display the detected sentences from the abstract
abstract_lines

# Get the total number of lines in the sample
total_lines_in_sample = len(abstract_lines)

# Initialize an empty list to store dictionaries containing features for each line
sample_lines = []

# Iterate through each line in the abstract and create a list of dictionaries containing features for each line
for i, line in enumerate(abstract_lines):
    # Create a dictionary to store features for the current line
    sample_dict = {}

    # Store the text of the line in the dictionary
    sample_dict["text"] = str(line)

    # Store the line number in the dictionary
    sample_dict["line_number"] = i

    # Store the total number of lines in the sample (subtracting 1 to make it a 0-based index)
    sample_dict["total_lines"] = total_lines_in_sample - 1

    # Append the dictionary to the list
    sample_lines.append(sample_dict)

# Display the list of dictionaries containing features for each line
sample_lines

# Get all line_number values from the sample abstract
test_abstract_line_numbers = [line["line_number"] for line in sample_lines]

# One-hot encode to the same depth as the training data, so the model accepts the right input shape
test_abstract_line_numbers_one_hot = tf.one_hot(test_abstract_line_numbers, depth=15)

# Display the one-hot encoded line numbers
test_abstract_line_numbers_one_hot

# Get all total_lines values from the sample abstract
test_abstract_total_lines = [line["total_lines"] for line in sample_lines]

# One-hot encode to the same depth as the training data, so the model accepts the right input shape
test_abstract_total_lines_one_hot = tf.one_hot(test_abstract_total_lines, depth=20)
test_abstract_total_lines_one_hot

# Split abstract lines into characters
abstract_chars = [split_chars(sentence) for sentence in abstract_lines]
abstract_chars

import tensorflow as tf
import time

# Define the depths for one-hot encoding
line_numbers_depth = 15
total_lines_depth = 20

# Prepare the input features
test_abstract_line_numbers_one_hot = tf.one_hot(test_abstract_line_numbers, depth=line_numbers_depth)
test_abstract_total_lines_one_hot = tf.one_hot(test_abstract_total_lines, depth=total_lines_depth)
test_abstract_abstract_lines = tf.constant(abstract_lines)
test_abstract_abstract_chars = tf.constant(abstract_chars)

# Make predictions on the sample abstract features
start_time = time.time()

with tf.device('/CPU:0'):
    # Note - here you can use loaded_model if you want
    test_abstract_pred_probs = model_5.predict(x=(test_abstract_line_numbers_one_hot, test_abstract_total_lines_one_hot, tf.constant(abstract_lines), tf.constant(abstract_chars)))

end_time = time.time()


# Display the prediction probabilities
print("Prediction Probabilities:", test_abstract_pred_probs)

# Display the time taken for predictions
print("Time taken for predictions: {:.2f} seconds".format(end_time - start_time))

# Turn prediction probabilities into prediction classes
test_abstract_preds = tf.argmax(test_abstract_pred_probs, axis=1)
test_abstract_preds

# Turn prediction class integers into string class names
test_abstract_pred_classes = [label_encoder.classes_[i] for i in test_abstract_preds]
test_abstract_pred_classes

# Visualize abstract lines and predicted sequence labels
for i, line in enumerate(abstract_lines):
    print(f"{test_abstract_pred_classes[i]}: {line}")
abstract_data.json
ADDED
@@ -0,0 +1,29 @@
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""
|
3 |
+
Created on Thu Feb 8 20:41:39 2024
|
4 |
+
|
5 |
+
@author: Dhrumit Patel
|
6 |
+
"""
|
7 |
+
|
8 |
+
[
|
9 |
+
{
|
10 |
+
"abstract": "This RCT examined the efficacy of a manualized social intervention for children with HFASDs. Participants were randomly assigned to treatment or wait-list conditions. Treatment included instruction and therapeutic activities targeting social skills, face-emotion recognition, interest expansion, and interpretation of non-literal language. A response-cost program was applied to reduce problem behaviors and foster skills acquisition. Significant treatment effects were found for five of seven primary outcome measures (parent ratings and direct child measures). Secondary measures based on staff ratings (treatment group only) corroborated gains reported by parents. High levels of parent, child and staff satisfaction were reported, along with high levels of treatment fidelity. Standardized effect size estimates were primarily in the medium and large ranges and favored the treatment group.",
|
11 |
+
"source": "https://pubmed.ncbi.nlm.nih.gov/20232240/",
|
12 |
+
"details": "RCT of a manualized social treatment for high-functioning autism spectrum disorders"
|
13 |
+
},
|
14 |
+
{
|
15 |
+
"abstract": "Postpartum depression (PPD) is the most prevalent mood disorder associated with childbirth. No single cause of PPD has been identified, however the increased risk of nutritional deficiencies incurred through the high nutritional requirements of pregnancy may play a role in the pathology of depressive symptoms. Three nutritional interventions have drawn particular interest as possible non-invasive and cost-effective prevention and/or treatment strategies for PPD; omega-3 (n-3) long chain polyunsaturated fatty acids (LCPUFA), vitamin D and overall diet. We searched for meta-analyses of randomised controlled trials (RCT's) of nutritional interventions during the perinatal period with PPD as an outcome, and checked for any trials published subsequently to the meta-analyses. Fish oil: Eleven RCT's of prenatal fish oil supplementation RCT's show null and positive effects on PPD symptoms. Vitamin D: no relevant RCT's were identified, however seven observational studies of maternal vitamin D levels with PPD outcomes showed inconsistent associations. Diet: Two Australian RCT's with dietary advice interventions in pregnancy had a positive and null result on PPD. With the exception of fish oil, few RCT's with nutritional interventions during pregnancy assess PPD. Further research is needed to determine whether nutritional intervention strategies during pregnancy can protect against symptoms of PPD. Given the prevalence of PPD and ease of administering PPD measures, we recommend future prenatal nutritional RCT's include PPD as an outcome.",
|
16 |
+
"source": "https://pubmed.ncbi.nlm.nih.gov/28012571/",
|
17 |
+
"details": "Formatting removed (can be used to compare model to actual example)"
|
18 |
+
},
|
19 |
+
{
|
20 |
+
"abstract": "Mental illness, including depression, anxiety and bipolar disorder, accounts for a significant proportion of global disability and poses a substantial social, economic and heath burden. Treatment is presently dominated by pharmacotherapy, such as antidepressants, and psychotherapy, such as cognitive behavioural therapy; however, such treatments avert less than half of the disease burden, suggesting that additional strategies are needed to prevent and treat mental disorders. There are now consistent mechanistic, observational and interventional data to suggest diet quality may be a modifiable risk factor for mental illness. This review provides an overview of the nutritional psychiatry field. It includes a discussion of the neurobiological mechanisms likely modulated by diet, the use of dietary and nutraceutical interventions in mental disorders, and recommendations for further research. Potential biological pathways related to mental disorders include inflammation, oxidative stress, the gut microbiome, epigenetic modifications and neuroplasticity. Consistent epidemiological evidence, particularly for depression, suggests an association between measures of diet quality and mental health, across multiple populations and age groups; these do not appear to be explained by other demographic, lifestyle factors or reverse causality. Our recently published intervention trial provides preliminary clinical evidence that dietary interventions in clinically diagnosed populations are feasible and can provide significant clinical benefit. Furthermore, nutraceuticals including n-3 fatty acids, folate, S-adenosylmethionine, N-acetyl cysteine and probiotics, among others, are promising avenues for future research. Continued research is now required to investigate the efficacy of intervention studies in large cohorts and within clinically relevant populations, particularly in patients with schizophrenia, bipolar and anxiety disorders.",
|
21 |
+
"source": "https://pubmed.ncbi.nlm.nih.gov/28942748/",
|
22 |
+
"details": "Effect of nutrition on mental health"
|
23 |
+
},
|
24 |
+
{
|
25 |
+
"abstract": "Hepatitis C virus (HCV) and alcoholic liver disease (ALD), either alone or in combination, count for more than two thirds of all liver diseases in the Western world. There is no safe level of drinking in HCV-infected patients and the most effective goal for these patients is total abstinence. Baclofen, a GABA(B) receptor agonist, represents a promising pharmacotherapy for alcohol dependence (AD). Previously, we performed a randomized clinical trial (RCT), which demonstrated the safety and efficacy of baclofen in patients affected by AD and cirrhosis. The goal of this post-hoc analysis was to explore baclofen's effect in a subgroup of alcohol-dependent HCV-infected cirrhotic patients. Any patient with HCV infection was selected for this analysis. Among the 84 subjects randomized in the main trial, 24 alcohol-dependent cirrhotic patients had a HCV infection; 12 received baclofen 10mg t.i.d. and 12 received placebo for 12-weeks. With respect to the placebo group (3/12, 25.0%), a significantly higher number of patients who achieved and maintained total alcohol abstinence was found in the baclofen group (10/12, 83.3%; p=0.0123). Furthermore, in the baclofen group, compared to placebo, there was a significantly higher increase in albumin values from baseline (p=0.0132) and a trend toward a significant reduction in INR levels from baseline (p=0.0716). In conclusion, baclofen was safe and significantly more effective than placebo in promoting alcohol abstinence, and improving some Liver Function Tests (LFTs) (i.e. albumin, INR) in alcohol-dependent HCV-infected cirrhotic patients. Baclofen may represent a clinically relevant alcohol pharmacotherapy for these patients.",
|
26 |
+
"source": "https://pubmed.ncbi.nlm.nih.gov/22244707/",
|
27 |
+
"details": "Baclofen promotes alcohol abstinence in alcohol dependent cirrhotic patients with hepatitis C virus (HCV) infection"
|
28 |
+
}
|
29 |
+
]
|
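For reference, a minimal sketch of how these example abstracts might be loaded and split into sentences before prediction (the file path and the naive sentence split below are illustrative assumptions, not part of this commit):

import json

# Load the example abstracts (path assumed; adjust to wherever the JSON lives)
with open("abstract_data.json", "r") as f:
    abstracts = json.load(f)

# Each entry carries "abstract", "source" and "details" keys
example = abstracts[0]
sentences = example["abstract"].split(". ")  # naive split, for illustration only
print(f"{len(sentences)} sentences from {example['source']}")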
fingerprint.pb
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9c81d738ff29c57b85f8b5d929181c03640b437fbd63fdd5de33c18c33b17ecd
size 55
helper_functions.py
ADDED
@@ -0,0 +1,302 @@
import tensorflow as tf

# Create a function to import an image and resize it so it can be used with our model
def load_and_prep_image(filename, img_shape=224, scale=True):
    """
    Reads in an image from filename, turns it into a tensor and reshapes it into
    (img_shape, img_shape, 3).

    Parameters
    ----------
    filename (str): string filename of target image
    img_shape (int): size to resize target image to, default 224
    scale (bool): whether to scale pixel values to range (0, 1), default True
    """
    # Read in the image
    img = tf.io.read_file(filename)
    # Decode it into a tensor
    img = tf.image.decode_jpeg(img)
    # Resize the image
    img = tf.image.resize(img, [img_shape, img_shape])
    if scale:
        # Rescale the image (get all values between 0 and 1)
        return img / 255.
    else:
        return img

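# Example usage of load_and_prep_image (a sketch; "example.jpg" and `model`
# are hypothetical placeholders, not files shipped with this repo):
# img = load_and_prep_image("example.jpg", img_shape=224, scale=True)
# pred = model.predict(tf.expand_dims(img, axis=0))  # model expects a batch dimension
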
# Note: The following confusion matrix code is a remix of Scikit-Learn's
# plot_confusion_matrix function - https://scikit-learn.org/stable/modules/generated/sklearn.metrics.plot_confusion_matrix.html
import itertools
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix

# Our function needs a different name to sklearn's plot_confusion_matrix
def make_confusion_matrix(y_true, y_pred, classes=None, figsize=(10, 10), text_size=15, norm=False, savefig=False):
    """Makes a labelled confusion matrix comparing predictions and ground truth labels.

    If classes is passed, the confusion matrix will be labelled; if not, integer class values
    will be used.

    Args:
      y_true: Array of truth labels (must be same shape as y_pred).
      y_pred: Array of predicted labels (must be same shape as y_true).
      classes: Array of class labels (e.g. string form). If `None`, integer labels are used.
      figsize: Size of output figure (default=(10, 10)).
      text_size: Size of output figure text (default=15).
      norm: normalize values or not (default=False).
      savefig: save confusion matrix to file (default=False).

    Returns:
      A labelled confusion matrix plot comparing y_true and y_pred.

    Example usage:
      make_confusion_matrix(y_true=test_labels, # ground truth test labels
                            y_pred=y_preds, # predicted labels
                            classes=class_names, # array of class label names
                            figsize=(15, 15),
                            text_size=10)
    """
    # Create the confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    cm_norm = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]  # normalize it
    n_classes = cm.shape[0]  # find the number of classes we're dealing with

    # Plot the figure and make it pretty
    fig, ax = plt.subplots(figsize=figsize)
    cax = ax.matshow(cm, cmap=plt.cm.Blues)  # colors will represent how 'correct' a class is, darker == better
    fig.colorbar(cax)

    # Is there a list of classes?
    if classes is not None:  # use `is not None` so NumPy arrays don't raise an ambiguity error
        labels = classes
    else:
        labels = np.arange(cm.shape[0])

    # Label the axes
    ax.set(title="Confusion Matrix",
           xlabel="Predicted label",
           ylabel="True label",
           xticks=np.arange(n_classes),  # create enough axis slots for each class
           yticks=np.arange(n_classes),
           xticklabels=labels,  # axes will be labelled with class names (if they exist) or ints
           yticklabels=labels)

    # Make x-axis labels appear on bottom
    ax.xaxis.set_label_position("bottom")
    ax.xaxis.tick_bottom()

    # Set the threshold for different colors
    threshold = (cm.max() + cm.min()) / 2.

    # Plot the text on each cell
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        if norm:
            plt.text(j, i, f"{cm[i, j]} ({cm_norm[i, j]*100:.1f}%)",
                     horizontalalignment="center",
                     color="white" if cm[i, j] > threshold else "black",
                     size=text_size)
        else:
            plt.text(j, i, f"{cm[i, j]}",
                     horizontalalignment="center",
                     color="white" if cm[i, j] > threshold else "black",
                     size=text_size)

    # Save the figure to the current working directory
    if savefig:
        fig.savefig("confusion_matrix.png")

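# Example usage of make_confusion_matrix (a sketch; test_labels, test_data and
# class_names are assumed to come from your own evaluation code):
# y_preds = model.predict(test_data).argmax(axis=1)  # collapse prediction probabilities to labels
# make_confusion_matrix(y_true=test_labels,
#                       y_pred=y_preds,
#                       classes=class_names,
#                       norm=True,     # also show per-class percentages
#                       savefig=True)  # writes confusion_matrix.png
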
# Make a function to predict on images and plot them (works with multi-class)
def pred_and_plot(model, filename, class_names):
    """
    Imports an image located at filename, makes a prediction on it with
    a trained model and plots the image with the predicted class as the title.
    """
    # Import the target image and preprocess it
    img = load_and_prep_image(filename)

    # Make a prediction
    pred = model.predict(tf.expand_dims(img, axis=0))

    # Get the predicted class
    if len(pred[0]) > 1:  # check for multi-class
        pred_class = class_names[pred.argmax()]  # if more than one output, take the max
    else:
        pred_class = class_names[int(tf.round(pred)[0][0])]  # if only one output, round

    # Plot the image and predicted class
    plt.imshow(img)
    plt.title(f"Prediction: {pred_class}")
    plt.axis(False)

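# Example usage of pred_and_plot (a sketch; the trained model, image path and
# class names are hypothetical):
# pred_and_plot(model=model, filename="example.jpg", class_names=["cat", "dog"])
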
import datetime

def create_tensorboard_callback(dir_name, experiment_name):
    """
    Creates a TensorBoard callback instance to store log files.

    Stores log files with the filepath:
      "dir_name/experiment_name/current_datetime/"

    Args:
      dir_name: target directory to store TensorBoard log files
      experiment_name: name of experiment directory (e.g. efficientnet_model_1)
    """
    log_dir = dir_name + "/" + experiment_name + "/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=log_dir
    )
    print(f"Saving TensorBoard log files to: {log_dir}")
    return tensorboard_callback

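# Example usage of create_tensorboard_callback (a sketch; `model`, train_data
# and test_data are assumptions from your own training setup):
# history = model.fit(train_data,
#                     epochs=5,
#                     validation_data=test_data,
#                     callbacks=[create_tensorboard_callback(dir_name="training_logs",
#                                                            experiment_name="efficientnet_model_1")])
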
# Plot the validation and training data separately
import matplotlib.pyplot as plt

def plot_loss_curves(history):
    """
    Returns separate loss curves for training and validation metrics.

    Args:
      history: TensorFlow model History object (see: https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/History)
    """
    loss = history.history['loss']
    val_loss = history.history['val_loss']

    accuracy = history.history['accuracy']
    val_accuracy = history.history['val_accuracy']

    epochs = range(len(history.history['loss']))

    # Plot loss
    plt.plot(epochs, loss, label='training_loss')
    plt.plot(epochs, val_loss, label='val_loss')
    plt.title('Loss')
    plt.xlabel('Epochs')
    plt.legend()

    # Plot accuracy
    plt.figure()
    plt.plot(epochs, accuracy, label='training_accuracy')
    plt.plot(epochs, val_accuracy, label='val_accuracy')
    plt.title('Accuracy')
    plt.xlabel('Epochs')
    plt.legend()

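# Example usage of plot_loss_curves (a sketch; `history` is whatever model.fit()
# returned in your own code):
# history = model.fit(train_data, epochs=5, validation_data=val_data)
# plot_loss_curves(history)
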
def compare_historys(original_history, new_history, initial_epochs=5):
    """
    Compares two TensorFlow model History objects.

    Args:
      original_history: History object from original model (before new_history)
      new_history: History object from continued model training (after original_history)
      initial_epochs: Number of epochs in original_history (new_history plot starts from here)
    """

    # Get original history measurements
    acc = original_history.history["accuracy"]
    loss = original_history.history["loss"]

    val_acc = original_history.history["val_accuracy"]
    val_loss = original_history.history["val_loss"]

    # Combine original history with new history
    total_acc = acc + new_history.history["accuracy"]
    total_loss = loss + new_history.history["loss"]

    total_val_acc = val_acc + new_history.history["val_accuracy"]
    total_val_loss = val_loss + new_history.history["val_loss"]

    # Make plots
    plt.figure(figsize=(8, 8))
    plt.subplot(2, 1, 1)
    plt.plot(total_acc, label='Training Accuracy')
    plt.plot(total_val_acc, label='Validation Accuracy')
    plt.plot([initial_epochs-1, initial_epochs-1],
             plt.ylim(), label='Start Fine Tuning')  # vertical line marking where fine-tuning began
    plt.legend(loc='lower right')
    plt.title('Training and Validation Accuracy')

    plt.subplot(2, 1, 2)
    plt.plot(total_loss, label='Training Loss')
    plt.plot(total_val_loss, label='Validation Loss')
    plt.plot([initial_epochs-1, initial_epochs-1],
             plt.ylim(), label='Start Fine Tuning')  # vertical line marking where fine-tuning began
    plt.legend(loc='upper right')
    plt.title('Training and Validation Loss')
    plt.xlabel('epoch')
    plt.show()

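# Example usage of compare_historys (a sketch; assumes a feature-extraction run
# followed by a fine-tuning run of the same model):
# compare_historys(original_history=feature_extract_history,
#                  new_history=fine_tune_history,
#                  initial_epochs=5)
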
# Create function to unzip a zipfile into current working directory
# (since we're going to be downloading and unzipping a few files)
import zipfile

def unzip_data(filename):
    """
    Unzips filename into the current working directory.

    Args:
      filename (str): a filepath to a target zip folder to be unzipped.
    """
    zip_ref = zipfile.ZipFile(filename, "r")
    zip_ref.extractall()
    zip_ref.close()

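# Example usage of unzip_data (a sketch; the zip filename is a placeholder):
# unzip_data("example_data.zip")  # extracts into the current working directory
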
# Download and unzip file
import zipfile
import requests
import os

def download_and_unzip(url, target_folder):
    """
    Downloads a zip file from url into target_folder and extracts it there.
    """
    # Make sure the target folder exists before writing into it
    os.makedirs(target_folder, exist_ok=True)

    # Download the file from url and save it
    filename = os.path.join(target_folder, os.path.basename(url))
    with open(filename, 'wb') as f:
        r = requests.get(url)
        f.write(r.content)

    # Unzip the downloaded file
    with zipfile.ZipFile(filename, 'r') as zip_ref:
        zip_ref.extractall(target_folder)

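# Example usage of download_and_unzip (a sketch; the URL and folder are
# placeholders):
# download_and_unzip("https://example.com/archive.zip", target_folder="data")
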
# Walk through an image classification directory and find out how many files (images)
# are in each subdirectory.
import os

def walk_through_dir(dir_path):
    """
    Walks through dir_path returning its contents.

    Args:
      dir_path (str): target directory

    Returns:
      A print out of:
        number of subdirectories in dir_path
        number of images (files) in each subdirectory
        name of each subdirectory
    """
    for dirpath, dirnames, filenames in os.walk(dir_path):
        print(f"There are {len(dirnames)} directories and {len(filenames)} images in '{dirpath}'.")

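# Example usage of walk_through_dir (a sketch; the directory path is a placeholder):
# walk_through_dir("data/images")  # prints one summary line per subdirectory
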
# Function to evaluate: accuracy, precision, recall, f1-score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
    """
    Calculates model accuracy, precision, recall and f1 score of a classification model.

    Args:
      y_true: true labels in the form of a 1D array
      y_pred: predicted labels in the form of a 1D array

    Returns a dictionary of accuracy, precision, recall, f1-score.
    """
    # Calculate model accuracy
    model_accuracy = accuracy_score(y_true, y_pred) * 100
    # Calculate model precision, recall and f1 score using the "weighted" average
    model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    model_results = {"accuracy": model_accuracy,
                     "precision": model_precision,
                     "recall": model_recall,
                     "f1": model_f1}
    return model_results
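# Example usage of calculate_results (a sketch; y_true/y_pred are assumptions):
# results = calculate_results(y_true=test_labels, y_pred=test_preds)
# print(results)  # dict with "accuracy" (0-100), "precision", "recall", "f1"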
keras_metadata.pb
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f4431dc1c1c5954e42a925f5156fd8dd6472e0215107f818b0859c41a3fe0738
size 37902
saved_model.pb
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7385d58b575dac3584e0ab16b16c9343e8495abaaaff8d08a2191dce1004900b
size 11391900
variables.data-00000-of-00001
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d3fb9667aef827b0b64a8b51f515da5ab6ec36cf290fc4c7cbb5dee913b4a0e9
size 1028877700
variables.index
ADDED
Binary file (15.7 kB).