import pandas as pd import numpy as np from tensorflow.keras import layers from tensorflow.keras import Input from tensorflow.keras.models import Model from tensorflow.keras.preprocessing import sequence from tensorflow.keras.preprocessing.text import Tokenizer import matplotlib.pyplot as plt from tensorflow.keras.callbacks import EarlyStopping df = pd.read_csv("train.csv") embeddings_index = {} f = open('glove.6B.100d.txt',encoding="utf") for line in f: values = line.split() word = values[0] coefs = np.asarray(values[1:], dtype='float32') embeddings_index[word] = coefs f.close() print('Found %s word vectors.' % len(embeddings_index)) data = df.text labels = df.target x_train = data[0:6100] x_test = data[6100:] y_train = labels[0:6100] y_test = labels[6100:] tokenizer = Tokenizer() tokenizer.fit_on_texts(x_train.values) sequences = tokenizer.texts_to_sequences(x_train.values) sequences = sequence.pad_sequences(sequences, maxlen=200) vocab_size = len(tokenizer.word_index)+1 embedding_dim = 100 max_words=1513 embedding_matrix = np.zeros((vocab_size, embedding_dim)) for word, i in tokenizer.word_index.items(): if i < max_words: embedding_vector = embeddings_index.get(word) if embedding_vector is not None: embedding_matrix[i] = embedding_vector input_layer = Input(shape=(None,), dtype='int32', name='tweet_input') x = layers.Embedding(vocab_size, 100, input_length=200)(input_layer) x = layers.LSTM(32, dropout=0.1, recurrent_dropout=0.5, return_sequences=True)(x) x = layers.LSTM(32, dropout=0.1, recurrent_dropout=0.5, return_sequences=False)(x) x = layers.Dense(100, activation='relu')(x) output = layers.Dense(1, activation='sigmoid')(x) model = Model(input_layer,output) model.layers[1].set_weights([embedding_matrix]) model.layers[1].trainable = False model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc']) es = EarlyStopping(monitor='val_loss', mode='min') history = model.fit(sequences, y_train.values, epochs=20, validation_split=0.2, callbacks = [es]) model.save("trained.h5") sequences = tokenizer.texts_to_sequences(x_test.values) sequences = sequence.pad_sequences(sequences, maxlen=200) x_test = sequences score = model.evaluate(x_test, y_test.values) test = pd.read_csv("test.csv") ids = test.id test = test.text sequences = tokenizer.texts_to_sequences(test) sequences = sequence.pad_sequences(sequences, maxlen=200) results = model.predict(sequences) results = results.round() results = results.squeeze() csv_df = pd.DataFrame({ "id": ids, "target": results }) csv_df.index = csv_df.id csv_df = csv_df["target"] csv_df = csv_df.astype(int) csv_df.to_csv("results.csv", header=True) def encoder(text): text = tokenizer.texts_to_sequences([text]) text = sequence.pad_sequences(text, maxlen=200) return text def predict(text): encoded_text = encoder(text) # print(encoded_text) prediction = (model.predict(encoded_text)) print(prediction) prediction = np.round(prediction) if prediction==1: return "Disaster" return "Not a Disaster" import gradio as gr title="Relevance Classifier" description="

# Expose the classifier as a small Gradio web demo.
title = "Relevance Classifier"
description = "Classifies input text as disaster-related or not disaster-related."
gr.Interface(fn=predict, inputs="text", outputs="text",
             title=title, description=description).launch()
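# Optional sketch for a later session (kept commented out, since launch() blocks
# the script): load_model() restores the network from "trained.h5", but the
# tokenizer is not saved with it, so persist it separately (e.g. with pickle)
# or refit it on the same training text before using encoder().
#
#   from tensorflow.keras.models import load_model
#   model = load_model("trained.h5")
#   print(model.predict(encoder("Flood waters rising near the river bank")))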