import re

import gensim
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
import streamlit as st
from fastapi import FastAPI
from pydantic import BaseModel
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.model_selection import train_test_split

app = FastAPI()

# Load the airline tweets and keep only the label and text columns.
csv_data = pd.read_csv('airline_sentiment_analysis.csv')
train = csv_data[['airline_sentiment', 'text']]


def purify_data(data):
    """Strip URLs, mentions/emails, periods, quotes, and extra whitespace."""
    data = re.sub(r'https?://\S+|www\.\S+', '', data)  # URLs
    data = re.sub(r'\S*@\S*\s?', '', data)             # @mentions and emails
    data = re.sub(r'\.', '', data)                     # periods
    data = re.sub(r'\s+', ' ', data)                   # collapse whitespace
    data = re.sub(r"'", '', data)                      # single quotes
    data = re.sub(r'"', '', data)                      # double quotes
    return data


# Clean every tweet (pd.Series -> list of cleaned strings).
temp = [purify_data(text) for text in train['text'].tolist()]


def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True also removes punctuation.
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)


data_words = list(sent_to_words(temp))


def detokenize(tokens):
    return TreebankWordDetokenizer().detokenize(tokens)


# Rejoin the token lists into plain strings.
data = np.array([detokenize(words) for words in data_words])

# Encode labels: positive -> 1, everything else -> 0, then one-hot encode.
labels = np.array(train['airline_sentiment'])
y = np.array([1 if label == 'positive' else 0 for label in labels])
labels = tf.keras.utils.to_categorical(y, 2, dtype='float32')
del y

# Fit the tokenizer on the cleaned corpus and pad every tweet to a fixed length.
max_words = 5000
max_len = 200
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)
tweets = pad_sequences(sequences, maxlen=max_len)

# Train/test split (kept for parity with training; unused at inference time).
X_train, X_test, y_train, y_test = train_test_split(
    tweets, labels, random_state=0, test_size=0.1)

# Load the trained model and serve predictions through the Streamlit UI.
best_model = keras.models.load_model('best_model3.hdf5')
sentiment = ['Negative', 'Positive']

text = st.text_area('Please enter the text here:')
if text:
    # Only run the model when the user actually entered something.
    cleaned = purify_data(text)
    sequence = tokenizer.texts_to_sequences([cleaned])
    test = pad_sequences(sequence, maxlen=max_len)
    prediction = sentiment[int(best_model.predict(test).argmax(axis=1)[0])]
    st.json(prediction)
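
# --- Sketch: exposing the same model over HTTP ---
# The script imports FastAPI and pydantic's BaseModel and creates `app`, but
# never uses them. The endpoint below is a minimal illustrative sketch of how
# the loaded model could be served as an API alongside (or instead of) the
# Streamlit UI. The route name "/predict", the SentimentRequest model, and its
# "text" field are assumptions, not part of the original app; it reuses the
# fitted `tokenizer`, `max_len`, `best_model`, and `sentiment` defined above.
# It could be run separately with, e.g., `uvicorn <this_module>:app`.

class SentimentRequest(BaseModel):
    # Hypothetical request body: a single raw tweet/text to classify.
    text: str


@app.post('/predict')
def predict_sentiment(req: SentimentRequest):
    # Apply the same cleaning and tokenization pipeline used at training time.
    cleaned = purify_data(req.text)
    seq = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(seq, maxlen=max_len)
    probs = best_model.predict(padded)
    return {'sentiment': sentiment[int(np.argmax(probs, axis=1)[0])]}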