Spaces:

eaglelandsonce
/

TensorFlowClass

Sleeping

App Files Community

eaglelandsonce committed on Jul 11, 2024

Commit

08cf096

•

1 Parent(s): 79ac5ce

Update pages/21_NLP.py

Browse files

Files changed (1) hide show

pages/21_NLP.py +40 -53

pages/21_NLP.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import streamlit as st
 import tensorflow as tf
-from transformers import BertTokenizer, TFBertForSequenceClassification
 from tensorflow.keras.preprocessing.sequence import pad_sequences
 import numpy as np
 import matplotlib.pyplot as plt
@@ -15,67 +14,47 @@ dataset = load_dataset("imdb")
 # Split dataset into training and testing
 train_data, test_data = train_test_split(dataset['train'].to_pandas(), test_size=0.2)
-# Initialize the tokenizer
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-# Tokenization and padding
 max_length = 128
-def tokenize_and_pad(text):
-    tokens = tokenizer.encode_plus(
-        text,
-        max_length=max_length,
-        padding='max_length',
-        truncation=True,
-        return_tensors='tf'
-    )
-    return tokens['input_ids'], tokens['attention_mask']
-# Preprocess the dataset
-def preprocess_data(data):
-    input_ids = []
-    attention_masks = []
-    labels = []
-    for review, label in zip(data['text'], data['label']):
-        ids, mask = tokenize_and_pad(review)
-        input_ids.append(ids)
-        attention_masks.append(mask)
-        labels.append(label)
-    return np.array(input_ids), np.array(attention_masks), np.array(labels)
-X_train_ids, X_train_mask, y_train = preprocess_data(train_data)
-X_test_ids, X_test_mask, y_test = preprocess_data(test_data)
-# Load the pre-trained BERT model
-model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
-# Build the Keras model
-input_ids = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
-attention_mask = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")
-bert_outputs = model(input_ids, attention_mask=attention_mask)
-outputs = tf.keras.layers.Dense(1, activation='sigmoid')(bert_outputs.logits)
-model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=outputs)
 model.summary()
 # Compile the model
-model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
-              loss='binary_crossentropy',
-              metrics=['accuracy'])
 # Train the model
-history = model.fit(
-    [X_train_ids, X_train_mask],
-    y_train,
-    validation_split=0.1,
-    epochs=3,
-    batch_size=32
-)
 # Evaluate the model
-loss, accuracy = model.evaluate([X_test_ids, X_test_mask], y_test)
 st.write(f'Test Accuracy: {accuracy}')
 # Plot training & validation accuracy values
@@ -96,3 +75,11 @@ ax.set_xlabel('Epoch')
 ax.set_ylabel('Loss')
 ax.legend()
 st.pyplot(fig)

 import streamlit as st
 import tensorflow as tf
 from tensorflow.keras.preprocessing.sequence import pad_sequences
 import numpy as np
 import matplotlib.pyplot as plt
 # Split dataset into training and testing
 train_data, test_data = train_test_split(dataset['train'].to_pandas(), test_size=0.2)
+# Tokenizer parameters
+vocab_size = 10000
 max_length = 128
+embedding_dim = 128
+# Tokenize the data
+tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, oov_token="<OOV>")
+tokenizer.fit_on_texts(train_data['text'].values)
+word_index = tokenizer.word_index
+# Convert text to sequences
+X_train = tokenizer.texts_to_sequences(train_data['text'].values)
+X_test = tokenizer.texts_to_sequences(test_data['text'].values)
+# Pad sequences
+X_train = pad_sequences(X_train, maxlen=max_length, padding='post', truncating='post')
+X_test = pad_sequences(X_test, maxlen=max_length, padding='post', truncating='post')
+# Labels
+y_train = train_data['label'].values
+y_test = test_data['label'].values
+# Build the LSTM model
+model = tf.keras.Sequential([
+    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
+    tf.keras.layers.LSTM(64, return_sequences=True),
+    tf.keras.layers.LSTM(32),
+    tf.keras.layers.Dense(24, activation='relu'),
+    tf.keras.layers.Dense(1, activation='sigmoid')
+])
 model.summary()
 # Compile the model
+model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
 # Train the model
+history = model.fit(X_train, y_train, epochs=3, validation_split=0.1, batch_size=32)
 # Evaluate the model
+loss, accuracy = model.evaluate(X_test, y_test)
 st.write(f'Test Accuracy: {accuracy}')
 # Plot training & validation accuracy values
 ax.set_ylabel('Loss')
 ax.legend()
 st.pyplot(fig)
+# Convert the model to TensorFlow.js format
+import tensorflowjs as tfjs
+tfjs_target_dir = 'tfjs_model'
+model.save('model.h5')
+tfjs.converters.save_keras_model(model, tfjs_target_dir)
+st.write("Model saved and converted to TensorFlow.js format.")