# Disaster-tweet relevance classifier (Hugging Face Spaces app).
# NOTE(review): the original paste carried Spaces page residue here
# ("Spaces:", "Runtime error"); replaced with this header so the file parses.
import pandas as pd
import numpy as np
from tensorflow.keras import layers
from tensorflow.keras import Input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping

df = pd.read_csv("train.csv")

# Build the word -> 100-d vector lookup from the pretrained GloVe file.
# Original opened the file without a context manager (handle leaked if a
# line raised) and used the non-canonical encoding alias "utf".
embeddings_index = {}
with open('glove.6B.100d.txt', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]  # first token is the word, the rest are floats
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))
# Hold out everything after row 6100 of train.csv as an evaluation split.
data = df.text
labels = df.target
x_train = data[0:6100]
x_test = data[6100:]
y_train = labels[0:6100]
y_test = labels[6100:]

# Fit the tokenizer on the training text only, then pad every tweet to
# a fixed length of 200 token ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train.values)
sequences = tokenizer.texts_to_sequences(x_train.values)
sequences = sequence.pad_sequences(sequences, maxlen=200)

# Row i holds the GloVe vector for the word the tokenizer mapped to index i;
# words with no pretrained vector keep an all-zero row (index 0 is reserved
# for padding, hence the +1).
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    # BUG FIX: the original gated this on `i < max_words` with a stale
    # max_words=1513, so every vocabulary index >= 1513 stayed a zero row
    # even when a GloVe vector existed. The matrix is sized for the full
    # vocabulary, so fill all of it.
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
# Two stacked LSTMs over frozen pretrained GloVe embeddings, ending in a
# single sigmoid unit for binary (disaster / not disaster) classification.
tweet_input = Input(shape=(None,), dtype='int32', name='tweet_input')
embedding_layer = layers.Embedding(vocab_size, 100, input_length=200)

h = embedding_layer(tweet_input)
h = layers.LSTM(32, dropout=0.1, recurrent_dropout=0.5,
                return_sequences=True)(h)
h = layers.LSTM(32, dropout=0.1, recurrent_dropout=0.5,
                return_sequences=False)(h)
h = layers.Dense(100, activation='relu')(h)
prediction = layers.Dense(1, activation='sigmoid')(h)

model = Model(tweet_input, prediction)
# Load the pretrained vectors into the embedding layer and freeze it so
# training only tunes the LSTM/Dense weights.
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

es = EarlyStopping(monitor='val_loss', mode='min')
history = model.fit(sequences, y_train.values, epochs=20,
                    validation_split=0.2, callbacks=[es])
model.save("trained.h5")
# Score the held-out slice of train.csv, encoded with the tokenizer fitted
# on the training split and padded to the same length of 200.
x_test = sequence.pad_sequences(
    tokenizer.texts_to_sequences(x_test.values), maxlen=200)
score = model.evaluate(x_test, y_test.values)
# Predict on the competition test set and write a two-column submission
# file (id, target) with integer 0/1 labels.
test = pd.read_csv("test.csv")
ids = test.id
test = test.text

padded = sequence.pad_sequences(tokenizer.texts_to_sequences(test), maxlen=200)
results = model.predict(padded).round().squeeze()

# One named Series with the ids as its index produces the same CSV as the
# original DataFrame -> index -> column shuffle.
submission = pd.Series(results, index=ids, name="target").astype(int)
submission.to_csv("results.csv", header=True)
def encoder(text):
    """Turn one raw tweet string into a padded (1, 200) array of token ids."""
    tokens = tokenizer.texts_to_sequences([text])
    return sequence.pad_sequences(tokens, maxlen=200)
def predict(text):
    """Classify *text* and return "Disaster" or "Not a Disaster".

    The model emits a single sigmoid probability; rounding applies the
    usual 0.5 decision threshold.
    """
    encoded_text = encoder(text)
    # FIX: removed leftover debug output (a print of the raw prediction and
    # a commented-out print of the encoded input) that shipped in the demo.
    prediction = np.round(model.predict(encoded_text))
    if prediction == 1:
        return "Disaster"
    return "Not a Disaster"
import gradio as gr

# Minimal gradio text-in / text-out demo around predict().
title = "Relevance Classifier"
# FIX: the description HTML left the <p> tag unclosed.
description = ("<p style='text-align:center'>Classifies input text into "
               "Disaster-related or not disaster related.</p>")
gr.Interface(fn=predict, inputs='text', outputs='text',
             title=title, description=description).launch()