eaglelandsonce committed on
Commit
79ac5ce
·
verified ·
1 Parent(s): ef623be

Create 21_NLP.py

Browse files
Files changed (1) hide show
  1. pages/21_NLP.py +98 -0
pages/21_NLP.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import tensorflow as tf
3
+ from transformers import BertTokenizer, TFBertForSequenceClassification
4
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
5
+ import numpy as np
6
+ import matplotlib.pyplot as plt
7
+ from sklearn.model_selection import train_test_split
8
+
9
+ # Load the IMDb dataset
10
+ from datasets import load_dataset
11
+
12
# Load the IMDb dataset through the `datasets` library.
dataset = load_dataset("imdb")

# Hold out 20% of the "train" split for evaluation. The split is seeded and
# stratified on the label column so that every rerun of this page trains and
# evaluates on the same class-balanced partition; an unseeded split would
# make the reported accuracy change from run to run for no real reason.
imdb_frame = dataset['train'].to_pandas()
train_data, test_data = train_test_split(
    imdb_frame,
    test_size=0.2,
    random_state=42,
    stratify=imdb_frame['label'],
)

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Every review is padded/truncated to this many tokens.
max_length = 128
23
+
24
def tokenize_and_pad(text):
    """Encode one review with the module-level BERT tokenizer.

    The text is truncated and padded to ``max_length`` tokens and the result
    is requested as TensorFlow tensors (``return_tensors='tf'``).

    Args:
        text: The raw review text to encode.

    Returns:
        A ``(input_ids, attention_mask)`` pair taken from the tokenizer
        output.
        NOTE(review): with ``return_tensors='tf'`` each tensor presumably
        carries a leading batch axis of size 1 -- confirm against the
        tokenizer documentation.
    """
    encoding_options = {
        'max_length': max_length,
        'padding': 'max_length',
        'truncation': True,
        'return_tensors': 'tf',
    }
    encoded = tokenizer.encode_plus(text, **encoding_options)
    ids = encoded['input_ids']
    mask = encoded['attention_mask']
    return ids, mask
33
+
34
+ # Preprocess the dataset
35
def preprocess_data(data):
    """Tokenize every review in ``data`` and collect model-ready arrays.

    Args:
        data: A mapping-like object exposing parallel ``'text'`` and
            ``'label'`` sequences (the script passes pandas DataFrames).

    Returns:
        A tuple ``(input_ids, attention_masks, labels)`` of NumPy arrays.
        For non-empty input the first two are 2-D, one row per review, and
        ``labels`` is 1-D.
    """
    input_ids = []
    attention_masks = []
    labels = []
    for review, label in zip(data['text'], data['label']):
        ids, mask = tokenize_and_pad(review)
        # The tokenizer output may carry a leading batch axis of size 1 per
        # sample. Flatten each sample to 1-D so the stacked result is
        # (num_samples, seq_len) rather than (num_samples, 1, seq_len), which
        # would not match Keras inputs declared with shape (seq_len,).
        input_ids.append(np.asarray(ids).reshape(-1))
        attention_masks.append(np.asarray(mask).reshape(-1))
        labels.append(label)
    return np.array(input_ids), np.array(attention_masks), np.array(labels)
45
+
46
# Tokenize both partitions into arrays the Keras model can consume.
X_train_ids, X_train_mask, y_train = preprocess_data(train_data)
X_test_ids, X_test_mask, y_test = preprocess_data(test_data)

# Load the pre-trained BERT model. It gets its own name so that `model`
# below refers only to the trainable Keras wrapper.
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Build the Keras model
input_ids = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
attention_mask = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")

bert_outputs = bert_model(input_ids, attention_mask=attention_mask)
# NOTE(review): this stacks a 1-unit sigmoid layer on top of the 2-way
# classification logits. It is kept as-is to preserve the architecture;
# consider training on the logits directly instead.
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(bert_outputs.logits)

model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=outputs)

# A bare model.summary() only prints to the server's stdout, which the page
# never shows. Capture the lines and render them in the Streamlit app.
summary_lines = []
model.summary(print_fn=lambda line, **_: summary_lines.append(line))
st.text("\n".join(summary_lines))

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
              loss='binary_crossentropy',
              metrics=['accuracy'])
67
+
68
# Fine-tune for 3 epochs in batches of 32, with a validation_split of 0.1
# taken from the training arrays.
train_inputs = [X_train_ids, X_train_mask]
history = model.fit(
    train_inputs,
    y_train,
    batch_size=32,
    epochs=3,
    validation_split=0.1,
)

# Score the held-out partition and report its accuracy on the page.
test_inputs = [X_test_ids, X_test_mask]
loss, accuracy = model.evaluate(test_inputs, y_test)
st.write(f'Test Accuracy: {accuracy}')
80
+
81
# Plot training & validation curves, one chart per recorded metric.
# Both charts share the same layout, so they are drawn by a single loop.
for metric_title, metric_key in (("Accuracy", "accuracy"), ("Loss", "loss")):
    st.subheader(f"Training and Validation {metric_title}")
    fig, ax = plt.subplots()
    ax.plot(history.history[metric_key], label=f'Training {metric_title}')
    ax.plot(history.history[f'val_{metric_key}'], label=f'Validation {metric_title}')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(metric_title)
    ax.legend()
    st.pyplot(fig)
    # Release the figure once rendered; otherwise pyplot keeps every figure
    # alive and they pile up across Streamlit reruns.
    plt.close(fig)