lydiadida committed
Commit 2525ce4
1 Parent(s): fb1a9aa

Upload 3 files

Files changed (3)
  1. app.py +112 -0
  2. my_model.h5 +3 -0
  3. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,112 @@
+ import streamlit as st
+ from keras.models import load_model
+ import nltk
+ import re
+ from nltk.tokenize import TweetTokenizer
+ from tensorflow.keras.preprocessing.text import Tokenizer
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
+ import subprocess
+ import numpy as np
+
+ # Download NLTK stopwords if not already downloaded
+ try:
+     nltk.data.find('corpora/stopwords')
+ except LookupError:
+     nltk.download('stopwords')
+
+ # Additional imports
+ from nltk.corpus import stopwords
+
+ # Download NLTK punkt tokenizer if not already downloaded
+ try:
+     nltk.data.find('tokenizers/punkt/PY3/english.pickle')
+ except LookupError:
+     nltk.download('punkt')
+
+ # Additional imports
+ from nltk.tokenize import word_tokenize
+
+ # Path to the LSTM model
+ model_path = "./my_model.h5"  # Set your model path here
+
+ def load_lstm_model(model_path):
+     return load_model(model_path)
+
+
+
+ def clean_text(text):
+     # Remove stopwords
+     stop_words = set(stopwords.words('english'))
+     words = word_tokenize(text)
+     filtered_words = [word for word in words if word not in stop_words]
+
+     # Remove Twitter usernames
+     text = re.sub(r'@\w+', '', ' '.join(filtered_words))
+
+     # Remove URLs
+     text = re.sub(r'http\S+', '', text)
+
+     # Tokenize using TweetTokenizer
+     tokenizer = TweetTokenizer(preserve_case=True)
+     text = tokenizer.tokenize(text)
+
+     # Remove hashtag symbols
+     text = [word.replace('#', '') for word in text]
+
+     # Remove short words and lowercase
+     text = ' '.join([word.lower() for word in text if len(word) > 2])
+
+     # Remove digits
+     text = re.sub(r'\d+', '', text)
+
+     # Remove non-alphabetic characters
+     text = re.sub(r'[^a-zA-Z\s]', '', text)
+
+     return text
+
+ def preprocess_text(text):
+     # Clean the text
+     cleaned_text = clean_text(text)
+
+     # Tokenize and pad sequences
+     token = Tokenizer()
+     token.fit_on_texts([cleaned_text])
+     text_sequences = token.texts_to_sequences([cleaned_text])
+     padded_sequences = pad_sequences(text_sequences, maxlen=100)
+
+     return padded_sequences
+
+ # Function to predict hate speech
+ def predict_hate_speech(text, lstm_model):
+     # Preprocess the text and run the model
+     padded_sequences = preprocess_text(text)
+     prediction = lstm_model.predict(padded_sequences)
+     return prediction
+
+ # Main function to run the Streamlit app
+ def main():
+     # Set up Streamlit UI
+     st.title("Hate Speech Detection")
+     st.write("Enter text below to detect hate speech:")
+     input_text = st.text_area("Input Text", "")
+
+     if st.button("Detect Hate Speech"):
+         if input_text:
+             # Load the model
+             lstm_model = load_lstm_model(model_path)
+             # Predict hate speech
+             prediction = predict_hate_speech(input_text, lstm_model)
+             # Take the most likely class from the prediction vector
+             arr = np.array(prediction[0])
+             max_index = np.argmax(arr)
+             if max_index == 1:
+                 # Class 1 corresponds to hate speech
+                 st.error("Hate Speech Detected")
+             else:
+                 st.success("No Hate Speech Detected")
+         else:
+             st.warning("Please enter some text")
+
+ # Run the app
+ if __name__ == "__main__":
+     main()
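
A note on the preprocessing above: `preprocess_text` fits a brand-new `Tokenizer` on each input string, so the integer indices fed to the LSTM will not match the vocabulary the model was trained on. The usual pattern is to pickle the tokenizer that was fitted on the training corpus and reload it in the app. The sketch below assumes such a file exists under the hypothetical name `tokenizer.pickle`; it is not part of this commit.

```python
# Sketch only: assumes the training script saved its fitted Keras Tokenizer
# to a hypothetical "tokenizer.pickle" alongside my_model.h5.
import pickle

from tensorflow.keras.preprocessing.sequence import pad_sequences


def load_tokenizer(path="tokenizer.pickle"):
    # Reload the tokenizer fitted on the training corpus so that
    # word -> index mappings match what the LSTM saw during training.
    with open(path, "rb") as f:
        return pickle.load(f)


def preprocess_with_saved_tokenizer(cleaned_text, tokenizer, maxlen=100):
    # texts_to_sequences reuses the saved vocabulary; words unseen at
    # training time are dropped (or mapped to the OOV token, if one was set).
    sequences = tokenizer.texts_to_sequences([cleaned_text])
    return pad_sequences(sequences, maxlen=maxlen)
```

With that in place, `predict_hate_speech` could call `preprocess_with_saved_tokenizer(clean_text(text), tokenizer)` instead of refitting a tokenizer on every request.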
my_model.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:16cac2f352b17d0cac372fa35e56d49363b58e9a2c8a15f54cdf227009419567
+ size 9365784
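
`my_model.h5` is tracked with Git LFS, so a clone made without `git lfs` installed ends up with the small pointer file shown above rather than the ~9 MB HDF5 weights, and `load_model` will then fail. A minimal sanity check, using the `size` field from the pointer:

```python
# Minimal check that my_model.h5 is the real HDF5 file, not a Git LFS pointer.
# The expected size (9365784 bytes) comes from the pointer shown above.
import os

path = "./my_model.h5"
with open(path, "rb") as f:
    head = f.read(24)

if head.startswith(b"version https://git-lfs"):
    raise RuntimeError("my_model.h5 is an LFS pointer; run `git lfs pull` to fetch it.")
print(f"my_model.h5 is {os.path.getsize(path)} bytes (expected 9365784).")
```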
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ streamlit
+ keras
+ nltk
+ tensorflow