import re
import string

import nltk
import numpy as np
import pandas as pd
import streamlit as st
import tensorflow as tf
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from PIL import Image

# Fetch the WordNet corpus (needed by the lemmatizer) on first run;
# later runs find it locally.
try:
    nltk.data.find("corpora/wordnet.zip")
except LookupError:
    nltk.download("wordnet")


# Load the lookup tables once per session and cache them; a try/except
# NameError guard cannot persist them, because Streamlit re-executes the
# whole script on every interaction.
@st.cache_data
def load_lookups():
    acronyms = pd.read_json("acronym.json", typ="series")
    contractions = pd.read_json("contraction.json", typ="series")
    stop_words = list(pd.read_csv("stop_words.csv").values.flatten())
    return acronyms, contractions, stop_words


acronyms_dict, contractions_dict, stops = load_lookups()
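
# Assumed file layout (illustrative; check the actual data files):
# acronym.json and contraction.json map a token to its expansion,
# e.g. {"asap": "as soon as possible"} and {"don't": "do not"};
# stop_words.csv is a single column of words to drop.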

# Tokenizer: runs of word characters and apostrophes.
regexp = RegexpTokenizer(r"[\w']+")
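# For example, regexp.tokenize("don't stop-believing") returns
# ["don't", "stop", "believing"].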


def preprocess(text):
    text = text.lower()
    text = text.strip()

    # Strip HTML tags.
    html = re.compile(r"<.*?>")
    text = html.sub("", text)

    # Strip emojis and pictographs.
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
        "\U00002702-\U000027B0"  # dingbats
        "\U000024C2-\U0001F251"  # enclosed characters
        "]+",
        flags=re.UNICODE,
    )
    text = emoji_pattern.sub("", text)
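    # For example: emoji_pattern.sub("", "fire 🔥 everywhere")
    # returns "fire  everywhere".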

    # Strip URLs.
    http = r"https?://\S+|www\.\S+"
    pattern = r"({})".format(http)
    text = re.sub(pattern, "", text)

    # Strip @mentions.
    pattern = r"@[\w_]+"
    text = re.sub(pattern, "", text)

    # Strip punctuation and digits, keeping apostrophes (for contractions)
    # and hyphens for now.
    punct_str = string.punctuation + string.digits
    punct_str = punct_str.replace("'", "")
    punct_str = punct_str.replace("-", "")
    text = text.translate(str.maketrans("", "", punct_str))

    # Split hyphenated words into separate words.
    text = text.replace("-", " ")
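    # For example: "state-of-the-art, 100% don't!!" becomes
    # "state of the art  don't" at this point.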

    # Expand acronyms using the lookup table.
    words = []
    for word in regexp.tokenize(text):
        if word in acronyms_dict.index:
            words += acronyms_dict[word].split()
        else:
            words.append(word)
    text = " ".join(words)

    # Expand contractions using the lookup table.
    words = []
    for word in regexp.tokenize(text):
        if word in contractions_dict.index:
            words += contractions_dict[word].split()
        else:
            words.append(word)
    text = " ".join(words)

    # Strip any punctuation left over after expansion (e.g. apostrophes).
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Lemmatize each token.
    lemmatizer = WordNetLemmatizer()
    text = " ".join([lemmatizer.lemmatize(word) for word in regexp.tokenize(text)])

    # Drop stopwords.
    text = " ".join([word for word in regexp.tokenize(text) if word not in stops])

    # Keep ASCII letters and spaces only. (Avoid shadowing the built-in
    # `filter` and `chr` names.)
    allowed_chars = string.ascii_letters + " "
    text = "".join([ch for ch in text if ch in allowed_chars])

    # Drop words containing a character repeated three or more times.
    pattern = r"\b\w*?(.)\1{2,}\w*\b"
    text = re.sub(pattern, "", text).strip()
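    # For example: "it was soooo scary" becomes "it was  scary".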

    # Drop one- and two-letter words.
    short_words = r"\b\w{1,2}\b"
    text = re.sub(short_words, "", text)

    return text
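

# Rough end-to-end sketch (exact output depends on the acronym,
# contraction, and stop-word files):
# preprocess("Forest fire near <b>La Ronge</b>!! 😱 https://t.co/x @user")
# should come out close to "forest fire near ronge".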

st.write("# Disaster Tweet Prediction")

img = Image.open("disaster.jpg")
st.image(img, width=500)

# Use a placeholder rather than a default value, so the app does not
# classify the hint text itself on first load.
tweet = st.text_input(
    label="Tweet",
    placeholder="Enter or paste your tweet here",
    label_visibility="collapsed",
)


# Load the Keras model once and cache it across reruns.
@st.cache_resource
def cache_model(model_name):
    model = tf.keras.models.load_model(model_name)
    return model


model = cache_model("tweet_model")
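# Note: feeding raw strings to model.predict() below assumes the saved
# model embeds its own text vectorization layer; if it expects numeric
# input, tokenization/vectorization must happen before predict().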

if len(tweet) > 0:
    clean_tweet = preprocess(tweet)
    y_pred = model.predict([clean_tweet])
    # Threshold the model score at 0.5.
    y_pred_num = int(np.round(y_pred)[0][0])

    if y_pred_num == 0:
        st.write(f"## Non-disaster tweet with disaster probability {y_pred[0][0] * 100:.2f}%")
    else:
        st.write(f"## Disaster tweet with disaster probability {y_pred[0][0] * 100:.2f}%")