import re

import nltk
import numpy as np
import streamlit as st
from nltk.tokenize import TweetTokenizer
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Download the NLTK stopwords corpus if it is not already available
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

from nltk.corpus import stopwords

# Download the NLTK punkt tokenizer if it is not already available
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

from nltk.tokenize import word_tokenize

# Load the LSTM model
model_path = "./my_model.h5"  # Set your model path here


def load_lstm_model(model_path):
    return load_model(model_path)

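# Optional: a minimal sketch of caching the model across Streamlit reruns, so
# the .h5 file is not re-read from disk on every button click. st.cache_resource
# is Streamlit's caching decorator for heavyweight resources; the function name
# below is illustrative and is not wired into main().
@st.cache_resource
def load_lstm_model_cached(model_path):
    # Loaded once per process; later calls return the cached model object
    return load_model(model_path)
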
def clean_text(text):
    # Remove Twitter usernames and URLs first; word_tokenize would otherwise
    # split '@user' and 'https://...' into pieces these patterns cannot match
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'http\S+', '', text)
    # Remove stopwords (compared case-insensitively)
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    text = ' '.join(filtered_words)
    # Tokenize using TweetTokenizer
    tokenizer = TweetTokenizer(preserve_case=True)
    text = tokenizer.tokenize(text)
    # Remove hashtag symbols
    text = [word.replace('#', '') for word in text]
    # Lowercase and drop short words (two characters or fewer)
    text = ' '.join([word.lower() for word in text if len(word) > 2])
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Remove remaining non-alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

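# Illustrative example (exact output may vary slightly across NLTK versions):
#   clean_text("@user Check this out! https://t.co/xyz #Hate123")
#   returns roughly "check hate"
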
def preprocess_text(text):
    # Clean the text
    cleaned_text = clean_text(text)
    # Tokenize and pad sequences.
    # Caveat: fitting a fresh Tokenizer on the single input assigns word indices
    # that do not match the vocabulary the model was trained on, so predictions
    # are unreliable; see the sketch below for reusing the training tokenizer.
    token = Tokenizer()
    token.fit_on_texts([cleaned_text])
    text_sequences = token.texts_to_sequences([cleaned_text])
    # maxlen must match the sequence length used at training time
    padded_sequences = pad_sequences(text_sequences, maxlen=100)
    return padded_sequences

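# A minimal sketch of the more robust approach: reuse the Tokenizer fitted on
# the training corpus instead of refitting on each input. The file name
# 'tokenizer.pickle' is an assumption about what the training script saved;
# this helper is illustrative and is not wired into main().
def preprocess_text_with_saved_tokenizer(text, tokenizer_path="tokenizer.pickle"):
    import pickle
    cleaned_text = clean_text(text)
    with open(tokenizer_path, 'rb') as handle:
        token = pickle.load(handle)  # hypothetical tokenizer saved at training time
    text_sequences = token.texts_to_sequences([cleaned_text])
    padded_sequences = pad_sequences(text_sequences, maxlen=100)
    return padded_sequences
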
# Function to predict hate speech
def predict_hate_speech(text, lstm_model):
    # Preprocess the text
    padded_sequences = preprocess_text(text)
    # Return the model's class probabilities for the input
    prediction = lstm_model.predict(padded_sequences)
    return prediction

# Main function to run the Streamlit app
def main():
    # Set up the Streamlit UI
    st.title("Hate Speech Detection")
    st.write("Enter text below to detect hate speech:")
    input_text = st.text_area("Input Text", "")
    if st.button("Detect Hate Speech"):
        if input_text:
            # Load the model
            lstm_model = load_lstm_model(model_path)
            # Predict hate speech
            prediction = predict_hate_speech(input_text, lstm_model)
            # Pick the most probable class; this assumes a two-class softmax
            # output with index 0 = not hate speech and index 1 = hate speech
            max_index = np.argmax(prediction[0])
            if max_index == 1:
                st.error("Hate Speech Detected")
            else:
                st.success("No Hate Speech Detected")
        else:
            st.warning("Please enter some text")

# Run the app
if __name__ == "__main__":
    main()
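
# To launch the app (assuming this file is saved as app.py):
#   streamlit run app.py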