# importing libraries
import re
import string

import nltk
import numpy as np
import pandas as pd
import streamlit as st
import tensorflow as tf
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from PIL import Image

# Download the WordNet corpus (needed by the lemmatizer) only if it is missing
try:
    nltk.data.find("corpora/wordnet.zip")
except LookupError:
    nltk.download("wordnet")

# read files (cached so Streamlit does not re-read them on every rerun)
@st.cache_data
def load_resources():
    acronyms = pd.read_json("acronym.json", typ="series")
    contractions = pd.read_json("contraction.json", typ="series")
    stop_words = list(pd.read_csv("stop_words.csv").values.flatten())
    return acronyms, contractions, stop_words

acronyms_dict, contractions_dict, stops = load_resources()

# preprocess function
# Defining tokenizer: runs of word characters and apostrophes
regexp = RegexpTokenizer(r"[\w']+")

def preprocess(text):
    text = text.lower()  # lowercase
    text = text.strip()  # leading/trailing whitespace

    # Removing html tags
    html = re.compile(r"<.*?>")
    text = html.sub("", text)

    # Removing emoji
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE,
    )
    text = emoji_pattern.sub("", text)

    # Removing urls (strings beginning with http(s):// or www., but not the bare word "http")
    pattern = r"https?://\S+|www\.\S+"
    text = re.sub(pattern, "", text)

    # Removing twitter usernames
    pattern = r"@[\w_]+"
    text = re.sub(pattern, "", text)

    # Removing punctuation and digits, keeping "'" and "-" for now
    punct_str = string.punctuation + string.digits
    punct_str = punct_str.replace("'", "")
    punct_str = punct_str.replace("-", "")
    text = text.translate(str.maketrans("", "", punct_str))

    # Replacing "-" with a space so hyphenated words split cleanly
    text = text.replace("-", " ")

    # Substituting acronyms with their expansions
    words = []
    for word in regexp.tokenize(text):
        if word in acronyms_dict.index:
            words.extend(acronyms_dict[word].split())
        else:
            words.append(word)
    text = " ".join(words)

    # Substituting contractions with their expansions
    words = []
    for word in regexp.tokenize(text):
        if word in contractions_dict.index:
            words.extend(contractions_dict[word].split())
        else:
            words.append(word)
    text = " ".join(words)

    # Removing punctuation again, this time including "'"
    text = text.translate(str.maketrans("", "", string.punctuation))

    # (a spellchecking step could go here)

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    text = " ".join(lemmatizer.lemmatize(word) for word in regexp.tokenize(text))

    # Stopwords removal
    text = " ".join(word for word in regexp.tokenize(text) if word not in stops)

    # Removing all characters except ASCII letters and spaces
    allowed = string.ascii_letters + " "
    text = "".join(ch for ch in text if ch in allowed)

    # Removing words containing a character repeated 3 or more times in a row (e.g. "soooo")
    pattern = r"\b\w*?(.)\1{2,}\w*\b"
    text = re.sub(pattern, "", text).strip()

    # Removing words with fewer than 3 characters
    short_words = r"\b\w{1,2}\b"
    text = re.sub(short_words, "", text)

    # return final output
    return text

# making frontend
st.write("# Disaster Tweet Prediction")
img = Image.open("disaster.jpg")
st.image(img, width=500)
# Using placeholder (not value) so the hint text is not itself treated as input
tweet = st.text_input(
    label="Tweet",
    label_visibility="collapsed",
    placeholder="Enter or paste your tweet here",
)
# model load (cached so the model is read from disk only once)
@st.cache_resource
def cache_model(model_name):
    model = tf.keras.models.load_model(model_name)
    return model

model = cache_model("tweet_model")

if len(tweet) > 0:
    clean_tweet = preprocess(tweet)
    # the saved model is assumed to accept raw strings (e.g. via an embedded
    # TextVectorization layer), so the cleaned tweet is passed in directly
    y_pred = model.predict([clean_tweet])
    prob = float(y_pred[0][0])        # predicted probability of the disaster class
    y_pred_num = int(np.round(prob))  # threshold at 0.5
    if y_pred_num == 0:
        st.write(f"## Non-Disaster tweet with disaster probability {prob * 100:.2f}%")
    else:
        st.write(f"## Disaster tweet with disaster probability {prob * 100:.2f}%")
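
# To run locally (a sketch — the filename app.py is an assumption; the files
# referenced above — acronym.json, contraction.json, stop_words.csv,
# disaster.jpg, and the saved tweet_model — must sit next to this script):
#
#   streamlit run app.py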