"""Compare text classifiers on 20 Newsgroups and serve the best one via Gradio.

The script trains several TF-IDF based classifiers on a held-out split of the
20 Newsgroups corpus, reports accuracy / classification report / confusion
matrix for each, plots the accuracies, and exposes the most accurate model in a
small Gradio text-classification UI.

Setup (run once, e.g. in a notebook cell):
    !pip install transformers
    !pip install gradio
If importing transformers fails, upgrade it:
    pip install --upgrade transformers
"""
import warnings

# Silence warnings *before* the heavy imports so import-time warnings are
# suppressed as well; this ordering is deliberate.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import fetch_20newsgroups
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from transformers import pipeline

# Fetch the 20 newsgroups dataset (metadata stripped so models learn from the
# message body rather than from headers/signatures).
data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
print("First few rows of the dataset:")
print(data.data[:2])

# Display information about the dataset
print("Number of samples:", len(data.data))
print("\nTarget names:", data.target_names)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.1, random_state=1
)

categories = [
    'alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x',
    'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball',
    'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med',
    'sci.space', 'soc.religion.christian', 'talk.politics.guns',
    'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc',
]

# Official "train" subset restricted to the categories above. It is kept for
# its ``target_names`` (used by ``predict_category``); the models below are
# fitted on X_train instead, because this subset overlaps X_test and would
# leak test documents into training.
train = fetch_20newsgroups(subset='train', categories=categories)


class MultinomialNaiveBayes:
    """Minimal multinomial Naive Bayes classifier with additive smoothing.

    Exposes ``fit`` / ``predict`` so it can be used as the final step of a
    scikit-learn pipeline.

    Attributes (populated by ``fit``):
        classes_: sorted array of the distinct labels seen in ``y``.
        class_probs: prior probability of each class, aligned with ``classes_``.
        feature_probs: array of shape (n_classes, n_features) holding the
            smoothed per-class feature likelihoods; each row sums to 1.
    """

    def __init__(self, alpha=0.01):
        """Store the additive smoothing strength ``alpha``."""
        self.alpha = alpha
        self.classes_ = None
        self.class_probs = None
        self.feature_probs = None

    def __repr__(self):
        return f"{type(self).__name__}(alpha={self.alpha!r})"

    def fit(self, X, y):
        """Estimate class priors and smoothed feature likelihoods.

        Args:
            X: (n_samples, n_features) non-negative feature matrix, either a
                SciPy sparse matrix or a dense array.
            y: sequence of n_samples class labels.

        Returns:
            self, so calls can be chained.
        """
        y = np.asarray(y)
        self.classes_, class_counts = np.unique(y, return_counts=True)
        num_classes = len(self.classes_)
        num_features = X.shape[1]

        # Class priors: relative frequency of each label.
        self.class_probs = class_counts / len(y)

        # Feature likelihoods: smoothed share of each feature within the class.
        self.feature_probs = np.zeros((num_classes, num_features))
        for idx, label in enumerate(self.classes_):
            # np.asarray(...).ravel() flattens the (1, n_features) matrix that
            # sparse inputs return from .sum(axis=0).
            feature_totals = np.asarray(X[y == label].sum(axis=0)).ravel()
            # Normalise by the total feature mass of the class (not by the
            # number of documents) so that the row is a proper distribution.
            self.feature_probs[idx, :] = (feature_totals + self.alpha) / (
                feature_totals.sum() + self.alpha * num_features
            )
        return self

    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        Args:
            X: (n_samples, n_features) feature matrix, sparse or dense.

        Returns:
            Array of n_samples labels taken from ``classes_``.

        Raises:
            ValueError: if called before ``fit``.
        """
        if self.feature_probs is None or self.class_probs is None:
            raise ValueError(
                "MultinomialNaiveBayes must be fitted before calling predict()."
            )
        # One matrix product computes sum_j x_ij * log P(j | c) for all samples
        # and classes at once, for both sparse and dense inputs.
        log_likelihood = np.asarray(X @ np.log(self.feature_probs).T)
        scores = log_likelihood + np.log(self.class_probs)
        return self.classes_[np.argmax(scores, axis=1)]


# Define a list of classifiers to try
classifiers = [
    MultinomialNaiveBayes(alpha=.01),
    RandomForestClassifier(),
    SVC(),
    LogisticRegression(),
]

ma = -1.0          # best (unrounded) accuracy seen so far
bar_values = []    # rounded accuracy per classifier, for the bar chart
bar_class = ["MultinomialNB", "RandomForestClassifier", "SVC", "LogisticRegression"]
classifi = None    # classifier instance with the best accuracy
best_model = None  # fitted pipeline that wraps ``classifi``

for classifier in classifiers:
    # Create a pipeline with TF-IDF vectorizer and the current classifier
    model = make_pipeline(TfidfVectorizer(), classifier)

    # Train on the split that is disjoint from X_test
    model.fit(X_train, y_train)

    # Make predictions on the test set
    predictions = model.predict(X_test)

    # Evaluate the performance of the model
    accuracy = accuracy_score(y_test, predictions)
    print(f"\nClassifier: {classifier.__class__.__name__}")
    maxx = round(accuracy, 2)
    bar_values.append(maxx)
    print(f"Accuracy: {accuracy:.2f}")

    # Display classification report
    print("Classification Report:\n", classification_report(y_test, predictions))
    conf_matrix = confusion_matrix(y_test, predictions)

    # Plot confusion matrix as a heatmap
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cbar=False,
                xticklabels=data.target_names,
                yticklabels=data.target_names)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix - {classifier.__class__.__name__}')
    plt.show()

    # Track the best model; compare unrounded accuracies so that near-ties are
    # not decided by rounding.
    if accuracy > ma:
        ma = accuracy
        classifi = classifier
        best_model = model
    print("\n\n\n")

# Bar chart comparing the accuracy of every classifier
plt.figure()
plt.xlabel('Model', fontweight='bold', fontsize=15)
plt.ylabel('Accuracy', fontweight='bold', fontsize=15)
plt.bar(bar_class, bar_values, width=0.4)

# Annotating each bar with its value
for i, value in enumerate(bar_values):
    plt.text(i, value, f'{value:.2f}', ha='center', va='bottom', fontweight='bold')
plt.show()

# Reuse the already-fitted best pipeline instead of training it a second time:
# refitting is expensive and, for randomised models, would yield a different
# model from the one whose accuracy was just measured.
print(f"Best accuracy model is {classifi}")
model = best_model

# Make predictions on the test set
predictions = model.predict(X_test)

# Evaluate the performance of the model
accuracy = accuracy_score(y_test, predictions)
print(f"\nClassifier: {classifi}")
maxx = round(accuracy, 2)
print(f"Accuracy: {accuracy:.2f}")

# Display classification report
print("Classification Report:\n", classification_report(y_test, predictions))
conf_matrix = confusion_matrix(y_test, predictions)


def predict_category(Enter_article, train=train, model=model):
    """Return the newsgroup name predicted for a single piece of text.

    Args:
        Enter_article: the article text to classify; ``None`` is treated as an
            empty string.
        train: object whose ``target_names`` maps label indices to names.
        model: fitted pipeline exposing ``predict`` on a list of strings.

    Returns:
        The predicted category name.
    """
    text = "" if Enter_article is None else Enter_article
    pred = model.predict([text])
    return train.target_names[pred[0]]


iface = gr.Interface(
    fn=predict_category,
    inputs=gr.Textbox(lines=10, placeholder="Enter text here"),
    outputs="text",
    title="Text Classification",
    description="getting... the categories of Artical/news",
)
# NOTE: share=True publishes a temporary public URL for the app.
iface.launch(inline=False, share=True)