"""Streamlit/FastAPI app that classifies airline tweets as Positive or Negative
with a pre-trained Keras model."""
import re

import gensim
import numpy as np
import pandas as pd
import streamlit as st
import tensorflow as tf
from fastapi import FastAPI
from nltk.tokenize.treebank import TreebankWordDetokenizer
from pydantic import BaseModel
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

app = FastAPI()
# Load the labelled tweets; only the sentiment label and the raw text are used.
csv_data = pd.read_csv('airline_sentiment_analysis.csv')
train = csv_data[['airline_sentiment', 'text']]
def purify_data(data):
    """Strip URLs, mentions, periods, quotes, and extra whitespace from a tweet."""
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    data = url_pattern.sub('', data)
    data = re.sub(r'\S*@\S*\s?', '', data)  # @mentions and e-mail-like tokens
    data = re.sub(r'\.', '', data)          # periods
    data = re.sub(r'\s+', ' ', data)        # collapse runs of whitespace
    data = re.sub(r"'", '', data)           # apostrophes
    data = re.sub(r'"', '', data)           # double quotes
    return data
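
# Illustrative example of the cleaning above:
#   purify_data('Loved it! https://t.co/abc @united')  ->  'Loved it! '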

# Clean every tweet in the training set (pd.Series -> list of cleaned strings).
temp = [purify_data(tweet) for tweet in train['text'].values.tolist()]

def sent_to_words(sentences):
    """Tokenize each cleaned sentence into lowercase words."""
    for sentence in sentences:
        # deacc=True also removes punctuation and accents
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)
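
# For instance, gensim.utils.simple_preprocess('Great flight!', deacc=True)
# returns ['great', 'flight'].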
data_words = list(sent_to_words(temp))

def detokenize(words):
    """Join a token list back into a single string."""
    return TreebankWordDetokenizer().detokenize(words)


data = np.array([detokenize(words) for words in data_words])
# Encode labels: 1 for 'positive', 0 otherwise, then one-hot encode to match
# the model's two-unit softmax output.
labels = np.array(train['airline_sentiment'])
y = np.array([1 if label == 'positive' else 0 for label in labels])
labels = tf.keras.utils.to_categorical(y, 2, dtype="float32")
del y
# Re-fit the tokenizer with the same vocabulary size and sequence length used
# at training time so word indices line up with the saved model's embedding
# layer. Short sequences are left-padded with zeros (pad_sequences defaults
# to padding='pre').
max_words = 5000
max_len = 200
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)
tweets = pad_sequences(sequences, maxlen=max_len)

# Same split and seed as the original training run; the split itself is not
# used for inference below.
X_train, X_test, y_train, y_test = train_test_split(tweets, labels, random_state=0, test_size=0.1)
# Load the pre-trained model; output index 0 = Negative, 1 = Positive.
best_model = tf.keras.models.load_model("best_model3.hdf5")
sentiment = ['Negative', 'Positive']
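
# The original file creates `app = FastAPI()` but defines no routes, so the
# Streamlit UI below is the only working interface. The following is a minimal
# sketch of the endpoint presumably intended; the route path, request schema,
# and response shape are assumptions, not part of the original.
class SentimentRequest(BaseModel):
    text: str


@app.post("/predict")
def predict(request: SentimentRequest):
    # Apply the same cleaning / tokenizing / padding pipeline as the UI below.
    cleaned = purify_data(request.text)
    sequence = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(sequence, maxlen=max_len)
    label = sentiment[np.around(best_model.predict(padded), decimals=0).argmax(axis=1)[0]]
    return {"sentiment": label}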

# Streamlit UI: classify one tweet entered by the user. The prediction runs
# only once some text has actually been entered.
text = st.text_area("Please enter the text here:")
if text:
    cleaned = purify_data(text)
    sequence = tokenizer.texts_to_sequences([cleaned])
    test = pad_sequences(sequence, maxlen=max_len)
    out = sentiment[np.around(best_model.predict(test), decimals=0).argmax(axis=1)[0]]
    st.json(out)
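
# Assuming the usual entry points (not stated in the original): the Streamlit
# UI runs with `streamlit run main.py`; the sketched FastAPI endpoint would be
# served with `uvicorn main:app` instead.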