# importing libraries
import streamlit as st
import tensorflow as tf
import nltk
from PIL import Image
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import string
import re
import pandas as pd
import numpy as np
# Ensure the WordNet corpus (used by the lemmatizer) is available
try:
    nltk.data.find("corpora/wordnet.zip")
except LookupError:
    nltk.download("wordnet")
# Read lookup files (only if they are not already loaded)
try:
    acronyms_dict, contractions_dict, stops
except NameError:
    acronyms_dict = pd.read_json("acronym.json", typ="series")
    contractions_dict = pd.read_json("contraction.json", typ="series")
    stops = list(pd.read_csv("stop_words.csv").values.flatten())
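# Expected layout of the lookup files (illustrative only; the actual files ship with the app):
#   acronym.json      -> {"asap": "as soon as possible", "btw": "by the way", ...}
#   contraction.json  -> {"can't": "cannot", "won't": "will not", ...}
#   stop_words.csv    -> one stopword per row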
# Preprocess function
# Defining the tokenizer
regexp = RegexpTokenizer(r"[\w']+")
def preprocess(text):
    text = text.lower()   # lowercase
    text = text.strip()   # strip leading/trailing whitespace

    # Removing HTML tags
    html = re.compile(r'<.*?>')
    text = html.sub(r'', text)

    # Removing emoji patterns
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)

    # Removing URLs (strings beginning with http(s):// or www.)
    http = r"https?://\S+|www\.\S+"
    pattern = r"({})".format(http)
    text = re.sub(pattern, "", text)

    # Removing Twitter usernames
    pattern = r'@[\w_]+'
    text = re.sub(pattern, "", text)

    # Removing punctuation and digits (keeping "'" and "-" for now)
    punct_str = string.punctuation + string.digits
    punct_str = punct_str.replace("'", "")
    punct_str = punct_str.replace("-", "")
    text = text.translate(str.maketrans('', '', punct_str))

    # Replacing "-" with a space
    text = text.replace("-", " ")

    # Substituting acronyms
    words = []
    for word in regexp.tokenize(text):
        if word in acronyms_dict.index:
            words = words + acronyms_dict[word].split()
        else:
            words = words + word.split()
    text = ' '.join(words)

    # Substituting contractions
    words = []
    for word in regexp.tokenize(text):
        if word in contractions_dict.index:
            words = words + contractions_dict[word].split()
        else:
            words = words + word.split()
    text = " ".join(words)

    # Removing punctuation again to drop the remaining "'"
    punct_str = string.punctuation
    text = text.translate(str.maketrans('', '', punct_str))

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    text = " ".join([lemmatizer.lemmatize(word) for word in regexp.tokenize(text)])

    # Stopword removal
    text = ' '.join([word for word in regexp.tokenize(text) if word not in stops])

    # Removing all characters except letters and spaces
    allowed_chars = string.ascii_letters + " "
    text = "".join([ch for ch in text if ch in allowed_chars])

    # Removing words where a single character repeats 3 or more times in a row
    pattern = r'\b\w*?(.)\1{2,}\w*\b'
    text = re.sub(pattern, "", text).strip()

    # Removing words with fewer than 3 characters
    short_words = r'\b\w{1,2}\b'
    text = re.sub(short_words, "", text)

    # Return the final cleaned text
    return text
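# Illustrative example (not part of the original pipeline; the exact output depends on the
# contents of acronym.json, contraction.json, and stop_words.csv):
#   preprocess("Forest fire near La Ronge Sask. Canada!! http://t.co/xyz @user123")
#   strips the URL, the @username, punctuation and digits, then lemmatizes and drops
#   stopwords/short words, returning something like "forest fire near ronge sask canada".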
# Frontend
st.write("# Disaster Tweet Prediction")
img = Image.open("disaster.jpg")
st.image(img, width=500)
tweet = st.text_input(label="", value="Enter or paste your tweet here")
# Load the model once and cache it across Streamlit reruns
@st.cache_resource
def cache_model(model_name):
    model = tf.keras.models.load_model(model_name)
    return model

model = cache_model("tweet_model")
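# Note (assumption): "tweet_model" is taken to be a saved Keras model whose input
# pipeline (e.g. a TextVectorization layer) accepts raw strings, since predict()
# below is called directly on the preprocessed tweet text.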
# Prediction
if len(tweet) > 0:
    clean_tweet = preprocess(tweet)
    y_pred = model.predict([clean_tweet])
    y_prob = float(y_pred[0][0])
    y_pred_num = int(np.round(y_prob))
    if y_pred_num == 0:
        st.write(f"## Non-Disaster tweet with disaster probability {round(y_prob * 100, 2)}%")
    else:
        st.write(f"## Disaster tweet with disaster probability {round(y_prob * 100, 2)}%")