Priyanhsu committed
Commit 49b04e1 · 1 Parent(s): 36b080e

Update app.py

Files changed (1)
  1. app.py +18 -78
app.py CHANGED
@@ -18,106 +18,46 @@ import nltk
  nltk.download('stopwords')
  nltk.download('punkt')

- with open('NewData.json') as file:
-     data = json.load(file)
+ # Load the trained model
+ model = joblib.load('model.bin')

- df = pd.DataFrame(data)
- # shuffling all our data
- df = df.sample(frac=1)
- # reading only Message_body and label
- df = df[['content','label']]
- df['clean_msg'] = df['content'].apply(lambda x: x.lower())
- # Remove punctuation
- import string
  def remove_punctuation(text):
      punctuation_free = "".join([i for i in text if i not in string.punctuation])
      return punctuation_free
-
- df['clean_msg'] = df['clean_msg'].apply(lambda x: remove_punctuation(x))
- # Tokenization
- from nltk.tokenize import WhitespaceTokenizer
- def tokenization(text):
-     tk = WhitespaceTokenizer()
-     return tk.tokenize(text)
-
- df['tokenized_clean_msg'] = df['clean_msg'].apply(lambda x: tokenization(x))
- # Remove stopwords
- from nltk.corpus import stopwords
- stop_words = set(stopwords.words('english'))
-
- def remove_stopwords(text):
-     output = [word for word in text if word not in stopwords]
-     return output
-
- df['cleaned_tokens'] = df['tokenized_clean_msg'].apply(lambda x: remove_stopwords(x))
- # Count word frequencies
- from collections import Counter
- cnt = Counter()
- for text in df['cleaned_tokens'].values:
-     for word in text:
-         cnt[word] += 1
-
- # Select most common words
- FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])
-
- # Remove frequent words
- def remove_freqwords(text):
-     return [word for word in text if word not in FREQWORDS]
-
- df['cleaned_tokens'] = df['cleaned_tokens'].apply(lambda x: remove_freqwords(x))
-
- # Stemming
- from nltk.stem.porter import PorterStemmer
- porter_stemmer = PorterStemmer()
-
- def stemming(text):
-     stem_text = [porter_stemmer.stem(word) for word in text]
-     return stem_text
-
- df['cleaned_tokens'] = df['cleaned_tokens'].apply(lambda x: stemming(x))

- # Prepare feature matrix and target vector
- X = df['cleaned_tokens'].apply(lambda x: ' '.join(x))
- y = df['label']
- # Split the data into training and testing sets
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
- # Vectorize the data
- from sklearn.feature_extraction.text import CountVectorizer
- vectorizer = CountVectorizer()
- X_train_vectorized = vectorizer.fit_transform(X_train)
- X_test_vectorized = vectorizer.transform(X_test)
- # Train the Multinomial Naive Bayes model
- model = MultinomialNB()
- model.fit(X_train_vectorized, y_train)
- # Make predictions on the test set
- y_pred = model.predict(X_test_vectorized)
+ def vectorize_text(texts):
+     vectorizer = CountVectorizer()
+     vectorizer.fit(texts)
+     text_vectorized = vectorizer.transform(texts)
+     return text_vectorized, vectorizer
+
  def test_model(text):
      # Convert text to lowercase
      text = text.lower()

      # Remove punctuation
-     text =remove_punctuation(text)
+     text = remove_punctuation(text)

      # Remove numbers
      text = re.sub(r'\d+', '', text)

      # Remove stopwords
-     stop_words = set(stopwords.words('english'))
+     stopwords_set = set(stopwords.words('english'))
      tokens = word_tokenize(text)
-     filtered_text = [word for word in tokens if word not in stop_words]
+     filtered_text = [word for word in tokens if word not in stopwords_set]

      # Join the filtered tokens back into a string
      preprocessed_text = ' '.join(filtered_text)
-
+
      # Vectorize the preprocessed text
-     text_vectorized = vectorizer.transform([preprocessed_text])
-
+     vectorize_texts = vectorize_text([preprocessed_text])
+
      # Make prediction on the vectorized text
-     prediction = model.predict(text_vectorized)[0]
+     prediction = model.predict(vectorize_texts[0])[0]

      # Return the prediction
      return prediction
+
  # Create the Gradio interface
- iface = gr.Interface(fn=test_model, inputs="text", outputs="text")
- # Launch the interface
- iface.launch()
+ iface = gr.Interface(fn=test_model, inputs="text", outputs="text", title="Text Classification")
+ iface.launch()
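
A note on the new inference path: vectorize_text fits a fresh CountVectorizer on the single preprocessed input, so the features it produces will generally not share the vocabulary (or even the number of columns) that the persisted model.bin was trained on. A common pattern, sketched below under the assumption that the training-time vectorizer was also saved with joblib, is to reload that fitted vectorizer and call only transform at prediction time; the vectorizer.bin filename and the predict_label helper are hypothetical, not part of this commit.

import joblib

# Assumption: the CountVectorizer fitted during training was saved next to the
# model, e.g. joblib.dump(vectorizer, 'vectorizer.bin'); that filename is illustrative.
model = joblib.load('model.bin')
vectorizer = joblib.load('vectorizer.bin')

def predict_label(preprocessed_text):
    # Reuse the training vocabulary: transform only, never re-fit at inference time.
    features = vectorizer.transform([preprocessed_text])
    return model.predict(features)[0]

With something like this in place, test_model could return predict_label(preprocessed_text) instead of building a new vectorizer for every request.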