Priyanhsu committed
Commit 49b04e1 · 1 Parent(s): 36b080e

Update app.py

Files changed (1)
  1. app.py +18 -78
app.py CHANGED
@@ -18,106 +18,46 @@ import nltk
  nltk.download('stopwords')
  nltk.download('punkt')

- with open('NewData.json') as file:
-     data = json.load(file)
+ # Load the trained model
+ model = joblib.load('model.bin')

- df = pd.DataFrame(data)
- # shuffling all our data
- df = df.sample(frac=1)
- # reading only Message_body and label
- df = df[['content','label']]
- df['clean_msg'] = df['content'].apply(lambda x: x.lower())
- # Remove punctuation
- import string
  def remove_punctuation(text):
      punctuation_free = "".join([i for i in text if i not in string.punctuation])
      return punctuation_free
-
- df['clean_msg'] = df['clean_msg'].apply(lambda x: remove_punctuation(x))
- # Tokenization
- from nltk.tokenize import WhitespaceTokenizer
- def tokenization(text):
-     tk = WhitespaceTokenizer()
-     return tk.tokenize(text)
-
- df['tokenized_clean_msg'] = df['clean_msg'].apply(lambda x: tokenization(x))
- # Remove stopwords
- from nltk.corpus import stopwords
- stop_words = set(stopwords.words('english'))
-
- def remove_stopwords(text):
-     output = [word for word in text if word not in stopwords]
-     return output
-
- df['cleaned_tokens'] = df['tokenized_clean_msg'].apply(lambda x: remove_stopwords(x))
- # Count word frequencies
- from collections import Counter
- cnt = Counter()
- for text in df['cleaned_tokens'].values:
-     for word in text:
-         cnt[word] += 1
-
- # Select most common words
- FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])
-
- # Remove frequent words
- def remove_freqwords(text):
-     return [word for word in text if word not in FREQWORDS]
-
- df['cleaned_tokens'] = df['cleaned_tokens'].apply(lambda x: remove_freqwords(x))
-
- # Stemming
- from nltk.stem.porter import PorterStemmer
- porter_stemmer = PorterStemmer()
-
- def stemming(text):
-     stem_text = [porter_stemmer.stem(word) for word in text]
-     return stem_text
-
- df['cleaned_tokens'] = df['cleaned_tokens'].apply(lambda x: stemming(x))

- # Prepare feature matrix and target vector
- X = df['cleaned_tokens'].apply(lambda x: ' '.join(x))
- y = df['label']
- # Split the data into training and testing sets
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
- # Vectorize the data
- from sklearn.feature_extraction.text import CountVectorizer
- vectorizer = CountVectorizer()
- X_train_vectorized = vectorizer.fit_transform(X_train)
- X_test_vectorized = vectorizer.transform(X_test)
- # Train the Multinomial Naive Bayes model
- model = MultinomialNB()
- model.fit(X_train_vectorized, y_train)
- # Make predictions on the test set
- y_pred = model.predict(X_test_vectorized)
+ def vectorize_text(texts):
+     vectorizer = CountVectorizer()
+     vectorizer.fit(texts)
+     text_vectorized = vectorizer.transform(texts)
+     return text_vectorized, vectorizer
+
  def test_model(text):
      # Convert text to lowercase
      text = text.lower()

      # Remove punctuation
-     text =remove_punctuation(text)
+     text = remove_punctuation(text)

      # Remove numbers
      text = re.sub(r'\d+', '', text)

      # Remove stopwords
-     stop_words = set(stopwords.words('english'))
+     stopwords_set = set(stopwords.words('english'))
      tokens = word_tokenize(text)
-     filtered_text = [word for word in tokens if word not in stop_words]
+     filtered_text = [word for word in tokens if word not in stopwords_set]

      # Join the filtered tokens back into a string
      preprocessed_text = ' '.join(filtered_text)
-
+
      # Vectorize the preprocessed text
-     text_vectorized = vectorizer.transform([preprocessed_text])
-
+     vectorize_texts = vectorize_text([preprocessed_text])
+
      # Make prediction on the vectorized text
-     prediction = model.predict(text_vectorized)[0]
+     prediction = model.predict(vectorize_texts[0])[0]

      # Return the prediction
      return prediction
+
  # Create the Gradio interface
- iface = gr.Interface(fn=test_model, inputs="text", outputs="text")
- # Launch the interface
- iface.launch()
+ iface = gr.Interface(fn=test_model, inputs="text", outputs="text", title="Text Classification")
+ iface.launch()
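
A note on the new inference path: vectorize_text fits a fresh CountVectorizer on the single preprocessed input, so the features it produces will generally not share the vocabulary (or even the number of columns) that the persisted model.bin was trained on. A common pattern, sketched below under the assumption that the training-time vectorizer was also saved with joblib, is to reload that fitted vectorizer and call only transform at prediction time; the vectorizer.bin filename and the predict_label helper are hypothetical, not part of this commit.

import joblib

# Assumption: the CountVectorizer fitted during training was saved next to the
# model, e.g. joblib.dump(vectorizer, 'vectorizer.bin'); that filename is illustrative.
model = joblib.load('model.bin')
vectorizer = joblib.load('vectorizer.bin')

def predict_label(preprocessed_text):
    # Reuse the training vocabulary: transform only, never re-fit at inference time.
    features = vectorizer.transform([preprocessed_text])
    return model.predict(features)[0]

With something like this in place, test_model could return predict_label(preprocessed_text) instead of building a new vectorizer for every request.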