zion581 committed
Commit 2528553
1 Parent(s): 89a6762

Update main.py

Files changed (1)
  1. main.py +94 -0
main.py CHANGED
@@ -0,0 +1,94 @@
+ import re
+ from nltk.tokenize.treebank import TreebankWordDetokenizer
+ import gensim
+ from sklearn.model_selection import train_test_split
+ import tensorflow as tf
+ import keras
+ import numpy as np
+ import pandas as pd
+ from keras.preprocessing.text import Tokenizer
+ from keras_preprocessing.sequence import pad_sequences
+ from fastapi import FastAPI
+ import streamlit as st
+
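+ # NOTE: the FastAPI instance below is created but no routes are registered;
+ # the interactive UI in this script is served entirely by Streamlit.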
+ app = FastAPI()
+
+ csv_data = pd.read_csv('airline_sentiment_analysis.csv')
+
+ train = csv_data[['airline_sentiment', 'text']]
+
+
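+ # Basic text clean-up: strip URLs, mention/e-mail tokens, periods, extra whitespace, and quotes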
+ def purify_data(data):
+     url_pattern = re.compile(r'https?://\S+|www\.\S+')
+     data = url_pattern.sub('', data)
+     data = re.sub(r'\S*@\S*\s?', '', data)
+     data = re.sub(r'\.', '', data)
+     data = re.sub(r'\s+', ' ', data)
+     data = re.sub(r"'", '', data)
+     data = re.sub(r'"', '', data)
+
+     return data
+
+
+ # Split the pd.Series of tweets into a list and clean each entry
+ data_to_list = train['text'].values.tolist()
+ temp = [purify_data(tweet) for tweet in data_to_list]
+
+
+ def sent_to_words(sentences):
+     for sentence in sentences:
+         # deacc=True strips accent marks; simple_preprocess already lowercases and drops punctuation
+         yield gensim.utils.simple_preprocess(str(sentence), deacc=True)
+
+
+ data_words = list(sent_to_words(temp))
+
+
+ def detokenize(text):
+     return TreebankWordDetokenizer().detokenize(text)
+
+
+ # Re-join the token lists into plain strings
+ data = np.array([detokenize(words) for words in data_words])
+
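+ # Binary labels: 'positive' -> 1, anything else -> 0, then one-hot encode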
+ labels = np.array(train['airline_sentiment'])
+ y = np.array([1 if label == 'positive' else 0 for label in labels])
+ labels = tf.keras.utils.to_categorical(y, 2, dtype="float32")
+ del y
+
+ max_words = 5000
+ max_len = 200
+
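+ # Fit the vocabulary on the cleaned corpus and pad every sequence to max_len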
+ tokenizer = Tokenizer(num_words=max_words)
+ tokenizer.fit_on_texts(data)
+ sequences = tokenizer.texts_to_sequences(data)
+ tweets = pad_sequences(sequences, maxlen=max_len)
+
+ X_train, X_test, y_train, y_test = train_test_split(tweets, labels, random_state=0, test_size=0.1)
+
+ best_model = keras.models.load_model("best_model3.hdf5")
+
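+ # Streamlit front end: clean the user's input exactly like the training data,
+ # then map the model's one-hot output back to a sentiment label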
+ sentiment = ['Negative', 'Positive']
+ text = st.text_area("Please enter the text here:")
+
+ if text:
+     text = purify_data(text)
+     sequence = tokenizer.texts_to_sequences([text])
+     test = pad_sequences(sequence, maxlen=max_len)
+     prediction = sentiment[np.around(best_model.predict(test), decimals=0).argmax(axis=1)[0]]
+     st.json(prediction)