# breast_cancer / full code.py
# Data preparation and import libraries
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt  # needed for the plots below
import seaborn as sns  # needed for the heatmaps, clustermap and pairplot below
import plotly.express as px
# %matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix
data = pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')
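# Note (added): this path assumes the Kaggle "Breast Cancer Wisconsin (Diagnostic)" input
# directory; when running outside Kaggle, point read_csv at a local copy of data.csv.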
# Data Exploration and Data Cleaning
data.head()
data.shape
data.info()
# Check for duplicate rows
data.duplicated().sum()
# There are no duplicate rows
# Check for missing values
data.isna().sum()
#- No missing values are present
# Check the number of unique values of each column
data.nunique()
#- Drop the id and Unnamed: 32 columns, which provide no information for the model
data = data.drop(['id', 'Unnamed: 32'], axis=1)
#- Rename the diagnosis column to target
data = data.rename(columns={'diagnosis': 'target'})
#- Encode the target labels: malignant (M) -> 1 and benign (B) -> 0
data['target'] = data['target'].replace({'M': 1, 'B': 0})
# Convert the target column to int64
data['target'] = data['target'].astype('int64')
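# Optional sanity check (added, not in the original notebook): confirm the encoding
# left exactly the two expected classes before modelling.
assert set(data.target.unique()) == {0, 1}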
### Data processing result
data.head()
data.tail()
data.info()
# Check summary statistics of the dataset
data.describe().T
# Analysis📝 & EDA📊
# I looked at how many benign and malignant cases there are.
data.target.value_counts()
# Visualize the target distribution in the dataset.
data['target'].value_counts().plot(kind='bar', edgecolor='black', color=['lightsteelblue', 'navajowhite'])
plt.title("Target", fontsize=20)
plt.show()
#- 1-->Malignant
#- 0-->Benign
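# Added sketch: plotly.express is imported above but never used, so the same class-balance
# chart can also be drawn interactively. The variable names here are my own.
counts = data['target'].value_counts().rename_axis('target').reset_index(name='count')
fig_px = px.bar(counts, x='target', y='count',
                title='Target class balance (1 = Malignant, 0 = Benign)')
fig_px.show()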
# Correlation Analysis
cor = data.corr()
cor
plt.figure(figsize=(25,23))
sns.heatmap(cor, annot= True, linewidths= 0.3 ,linecolor = "black", fmt = ".2f")
plt.title('Correlation Heatmap')
plt.show()
#- Features whose absolute correlation with the target exceeds 0.75.
threshold = 0.75
filtre = np.abs(cor["target"]) > threshold
corr_features = cor.columns[filtre].tolist()
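# Quick check (added): show which columns cross the correlation threshold.
print(corr_features)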
sns.clustermap(data[corr_features].corr(), annot=True, fmt=".2f", figsize=(10, 8))
plt.suptitle("Correlation Between Features with Correlation Threshold [0.75]", fontsize=20)
plt.show()
### Visualize the features with a correlation greater than 0.75
sns.pairplot(data[corr_features], diag_kind = "kde" , markers = "*", hue="target")
plt.show()
# Machine Learning Model Evaluation
# Splitting data
x= data.drop('target',axis=1)
y= data['target']
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.30,random_state=101)
s = StandardScaler()
x_train = s.fit_transform(x_train)
x_test = s.transform(x_test)  # scale the test set with the scaler fitted on the training data only
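# Sketch (an assumption on my part, not the original workflow): the scaling and model steps
# can also be chained in a scikit-learn Pipeline, which guarantees the scaler is fitted on
# the training data only and makes cross-validation straightforward.
from sklearn.pipeline import make_pipeline
example_pipe = make_pipeline(StandardScaler(), LogisticRegression())
# example_pipe.fit(x_train_raw, y_train) would then be used on an unscaled split.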
algorithm = ['KNeighborsClassifier','RandomForestClassifier','DecisionTreeClassifier','GaussianNB','LogisticRegression']
Accuracy=[]
def evaluate_model(model):
    # Fit the model and score it on the held-out test set
    model.fit(x_train, y_train)
    pred = model.predict(x_test)
    acc = accuracy_score(y_test, pred)
    Accuracy.append(acc)

    # Confusion matrix without normalization
    print('Confusion matrix')
    cm = confusion_matrix(y_test, pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap=['lightsteelblue', 'navajowhite'])
    plt.title('Confusion matrix')
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.show()

    # Confusion matrix normalized over the true labels
    print('Normalized confusion matrix')
    cm1 = confusion_matrix(y_test, pred, normalize='true')
    sns.heatmap(cm1, annot=True, fmt='.2f', cmap=['lightsteelblue', 'navajowhite'])
    plt.title('Normalized confusion matrix')
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.show()

    # Print the confusion matrix, classification report and accuracy
    print(cm)
    print(classification_report(y_test, pred))
    print('accuracy_score :', acc)
### KNN model evaluation for breast cancer
model_1 = KNeighborsClassifier(n_neighbors=2)
evaluate_model(model_1)
### Random Forest
model_2 = RandomForestClassifier(n_estimators=100, random_state=0)
evaluate_model(model_2)
### Decision Tree
model_3 = DecisionTreeClassifier(random_state=42)
evaluate_model(model_3)
### Naive Bayes
model_4 = GaussianNB()
evaluate_model(model_4)
### Logistic Regression
model_5 = LogisticRegression()
evaluate_model(model_5)
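# Added sketch (not part of the original evaluation): a single 70/30 split can be noisy,
# so as a rough robustness check the same five models could be compared with 5-fold
# cross-validation on the unscaled features, scaling inside each fold.
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
for name, m in zip(algorithm, [model_1, model_2, model_3, model_4, model_5]):
    cv_pipe = make_pipeline(StandardScaler(), m)
    scores = cross_val_score(cv_pipe, x, y, cv=5, scoring='accuracy')
    print(f'{name}: mean CV accuracy = {scores.mean():.3f}')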
df = pd.DataFrame({'Algorithm':algorithm,'Accuracy':Accuracy })
df
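# Added: the same comparison table, ordered from best to worst test accuracy.
df.sort_values('Accuracy', ascending=False)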
fig = plt.figure(figsize=(20,10))
plt.plot(df.Algorithm,df.Accuracy,label='Accuracy',lw=5,color='peru',marker='o',markersize = 15)
plt.legend(fontsize=15)
plt.xlabel('\nModel',fontsize= 20)
plt.ylabel('Accuracy\n',fontsize= 20)
plt.show()