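# Disaster-tweet relevance classifier: pretrained GloVe embeddings feeding two
# stacked LSTM layers (Keras), with a small Gradio demo for interactive use.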
import pandas as pd
import numpy as np
from tensorflow.keras import layers
from tensorflow.keras import Input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping
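# Load the labelled training tweets.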
df = pd.read_csv("train.csv")
# Build an index mapping each word to its 100-d GloVe vector.
embeddings_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))
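# Use the tweet text as input and the binary `target` column as the label;
# hold out everything after row 6100 as a test split.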
data = df.text
labels = df.target
x_train = data[0:6100]
x_test = data[6100:]
y_train = labels[0:6100] 
y_test = labels[6100:]
# Fit the tokenizer on the training text only, then turn each tweet into a
# padded integer sequence of length 200.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train.values)
sequences = tokenizer.texts_to_sequences(x_train.values)
sequences = sequence.pad_sequences(sequences, maxlen=200)

vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100
max_words = 1513

# Embedding matrix: row i holds the GloVe vector for the word with index i.
# Only the first `max_words` word indices are filled with pretrained vectors;
# the remaining rows stay zero.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
# Frozen GloVe embedding feeding two stacked LSTM layers and a dense head.
input_layer = Input(shape=(None,), dtype='int32', name='tweet_input')
x = layers.Embedding(vocab_size, embedding_dim, input_length=200)(input_layer)
x = layers.LSTM(32,
                dropout=0.1,
                recurrent_dropout=0.5,
                return_sequences=True)(x)
x = layers.LSTM(32,
                dropout=0.1,
                recurrent_dropout=0.5,
                return_sequences=False)(x)
x = layers.Dense(100, activation='relu')(x)
output = layers.Dense(1, activation='sigmoid')(x)
model = Model(input_layer, output)

# Load the pretrained GloVe weights into the embedding layer and freeze it.
model.layers[1].set_weights([embedding_matrix])
model.layers[1].trainable = False

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
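# Train with early stopping on the validation loss (20% of the training
# sequences held out), then save the trained model.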
es = EarlyStopping(monitor='val_loss', mode='min')
history = model.fit(sequences, y_train.values, epochs=20, validation_split=0.2, callbacks=[es])
model.save("trained.h5")
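# Tokenise the held-out split and evaluate the trained model on it.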
sequences = tokenizer.texts_to_sequences(x_test.values)
sequences = sequence.pad_sequences(sequences, maxlen=200)
x_test = sequences
score = model.evaluate(x_test, y_test.values)
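# Predict labels for test.csv and write the id/target pairs as a submission file.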
test = pd.read_csv("test.csv")
ids = test.id
test = test.text
sequences = tokenizer.texts_to_sequences(test)
sequences = sequence.pad_sequences(sequences, maxlen=200)
results = model.predict(sequences)
results = results.round().squeeze().astype(int)
csv_df = pd.DataFrame({
    "id": ids,
    "target": results
})
csv_df.to_csv("results.csv", index=False)

def encoder(text):
    """Tokenise and pad a single piece of text to the model's input length."""
    text = tokenizer.texts_to_sequences([text])
    text = sequence.pad_sequences(text, maxlen=200)
    return text

def predict(text):
    """Classify one input text and return a human-readable label."""
    encoded_text = encoder(text)
    prediction = model.predict(encoded_text)
    if np.round(prediction) == 1:
        return "Disaster"
    return "Not a Disaster"
# Minimal Gradio demo: type a tweet, get "Disaster" or "Not a Disaster".
import gradio as gr

title = "Relevance Classifier"
description = "<p style='text-align:center'>Classifies input text as disaster-related or not disaster-related.</p>"

gr.Interface(fn=predict, inputs='text', outputs='text', title=title, description=description).launch()