# -*- coding: utf-8 -*- """🦀 Breast Cancer Prediction Using Machine Learning Automatically generated by Colab. Original file is located at https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/breast-cancer-prediction-using-machine-learning-64dbd263-f311-46a0-9f3a-6d5379802a34.ipynb%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com/20240706/auto/storage/goog4_request%26X-Goog-Date%3D20240706T233729Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D770b61f66b57f06cbdb54f5d4bc4ba32650abee908c284002eeb0472828613c36367d32e0a38bde7138192f4066bfd1989608bbf31e1f46626f2f9cf0ca2e8845b9e2b421ac0b2af146b3e14860f016c245a0909ac13965a6f7ea58b4f3425f3e42c50b8ddffc177dd6cecb561b8c4d47054356112477f0f1c5819cba3750f4737d50937a291458ce7a92ba56dd0f3dd2b91bac287210da2318d5f4e74d79aa63b496369ed514c57b8e8953a3b1b9cdf673261822f27b2e488f4c2d7c225be9fa7d959fa1afa6fb5455d6f2a8db1f67711c39e69e654183c88e15fb420a0b8696bc1d6420a2d81f03eb8b5ebb8e80c40d7cf7664fb585951d3ae1dc04093d6a0 """ # IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES # TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK, # THEN FEEL FREE TO DELETE THIS CELL. # NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON # ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR # NOTEBOOK. 
import os import sys from tempfile import NamedTemporaryFile from urllib.request import urlopen from urllib.parse import unquote, urlparse from urllib.error import HTTPError from zipfile import ZipFile import tarfile import shutil CHUNK_SIZE = 40960 DATA_SOURCE_MAPPING = 'breast-cancer-wisconsin-data:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F180%2F408%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240706%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240706T233729Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D2a42b19591dbfb7e3dadedf38ba5c4a2f41943260a2d6207aadbbdd6dc68ac198d85c58f17405095296f8c79de5c9517c6b9fdead7a5db588fea525cfb3a0474d6648706bd7ed55b1eec6b7718d64035647349365aa3b684519ef9f3ee4b750db4f314a520cd629a09d7a6ab3553ca46600d66b8613a67f2335fcfb93a051a47237d3adde9a5dbeccff7f24f0de64e5dc4346b7d5fcf85ce9ef16e62007599a879c970761ea4b4dfdc90568736428bca9722b7c679b20b5843c031092316569902ec1e5e413c2fb039207260c95e5cea134c8a4bc1f27e559256bb1c78141d4a53f01b9253fa597423bf463719f5f3d47f21afdf5c9030c3fd43009a347010b5' KAGGLE_INPUT_PATH='/kaggle/input' KAGGLE_WORKING_PATH='/kaggle/working' KAGGLE_SYMLINK='kaggle' !umount /kaggle/input/ 2> /dev/null shutil.rmtree('/kaggle/input', ignore_errors=True) os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True) os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True) try: os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True) except FileExistsError: pass try: os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True) except FileExistsError: pass for data_source_mapping in DATA_SOURCE_MAPPING.split(','): directory, download_url_encoded = data_source_mapping.split(':') download_url = unquote(download_url_encoded) filename = urlparse(download_url).path destination_path = os.path.join(KAGGLE_INPUT_PATH, directory) try: with 
urlopen(download_url) as fileres, NamedTemporaryFile() as tfile: total_length = fileres.headers['content-length'] print(f'Downloading {directory}, {total_length} bytes compressed') dl = 0 data = fileres.read(CHUNK_SIZE) while len(data) > 0: dl += len(data) tfile.write(data) done = int(50 * dl / int(total_length)) sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded") sys.stdout.flush() data = fileres.read(CHUNK_SIZE) if filename.endswith('.zip'): with ZipFile(tfile) as zfile: zfile.extractall(destination_path) else: with tarfile.open(tfile.name) as tarfile: tarfile.extractall(destination_path) print(f'\nDownloaded and uncompressed: {directory}') except HTTPError as e: print(f'Failed to load (likely expired) {download_url} to path {destination_path}') continue except OSError as e: print(f'Failed to load {download_url} to path {destination_path}') continue print('Data source import complete.') """# 🦀 Breast Cancer Prediction Using Machine Learning.

Table of Contents

--- > ### Steps are: 1. [Gathering Data](#1) - [Exploratory Data Analysis](#2) - [Data Visualizations](#3) - [Model Implementation.](#4) - [ML Model Selecting and Model Prediction](#5) - [Hyper-Tuning the ML Model](#6) - [Deploy Model](#7) **Hope** you guys **Love It** and get a better **learning experience**. 🙏
Breast Cancer Prediction Using Machine Learning
### Attribute Information: 1. ID number - Diagnosis (M = malignant, B = benign) Ten real-valued features are computed for each cell nucleus: 1. radius (mean of distances from center to points on the perimeter) - texture (standard deviation of gray-scale values) - perimeter - area - smoothness (local variation in radius lengths) - compactness (perimeter^2 / area - 1.0) - concavity (severity of concave portions of the contour) - concave points (number of concave portions of the contour) - symmetry - fractal dimension ("coastline approximation" - 1) """ import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) pd.options.display.max_columns = 100 """After installing numpy and pandas package, we are ready to fetch data using pandas package, Befor we use it, We need to know where's our dataset located. Means what is the path of our dataset""" # import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) """
# 1. Data Collection. """ data = pd.read_csv("/kaggle/input/breast-cancer-wisconsin-data/data.csv") """After collecting data, we need to know what are the shape of this dataset, Here we have attribute(`property`) called `data.shape` For that we have 2 type of methods to show the shape of the datasets. 1. `len(data.index), len(data.columns)` - `data.shape` Both methods are giving us the same output, As you can see in the below cells` """ # Cell 1 len(data.index), len(data.columns) # Cell 2 data.shape data.head() data.tail() """
# 2. Exploring Data Analysis """ data.info() data.isna() data.isna().any() data.isna().sum() data = data.dropna(axis='columns') """### Get object features - Using this method, we can see how many `object(categorical)` type of feature exists in dataset """ data.describe(include="O") """- *As we can see abouve result there are only one single feature is categorical and it's values are `B` and `M`* ### To know how many unique values """ data.diagnosis.value_counts() """using `value_counts` method we can see number of unique values in categorical type of feature. ### Identify dependent and independent """ data.head(2) diagnosis_unique = data.diagnosis.unique() diagnosis_unique """
# 3. Data Visualization. """ # Commented out IPython magic to ensure Python compatibility. import matplotlib.pyplot as plt import seaborn as sns import plotly.express as px import plotly.graph_objects as go # %matplotlib inline sns.set_style('darkgrid') plt.figure(figsize=(15, 5)) plt.subplot(1, 2, 1) plt.hist( data.diagnosis) # plt.legend() plt.title("Counts of Diagnosis") plt.xlabel("Diagnosis") plt.subplot(1, 2, 2) #sns.countplot('diagnosis', data=data); # ";" to remove output like this > # plt.show() # plt.figure(figsize=(7,12)) px.histogram(data, x='diagnosis') # plt.show() cols = ["diagnosis", "radius_mean", "texture_mean", "perimeter_mean", "area_mean"] sns.pairplot(data[cols], hue="diagnosis") plt.show() size = len(data['texture_mean']) area = np.pi * (15 * np.random.rand( size ))**2 colors = np.random.rand( size ) plt.xlabel("texture mean") plt.ylabel("radius mean") plt.scatter(data['texture_mean'], data['radius_mean'], s=area, c=colors, alpha=0.5); """### Data Filtering - Now, we have one categorical feature, so we need to convert it into numeric values using `LabelEncoder` from `sklearn.preprocessing` packages """ from sklearn.preprocessing import LabelEncoder data.head(2) """* LabelEncoder can be used to normalize labels. """ labelencoder_Y = LabelEncoder() data.diagnosis = labelencoder_Y.fit_transform(data.diagnosis) """After converting into numerical values, we can check it's values using this way,""" data.head(2) print(data.diagnosis.value_counts()) print("\n", data.diagnosis.value_counts().sum()) """Finnaly, We can see in this output categorical values converted into 0 and 1. 
#### Find the correlation between other features, mean features only """ cols = ['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean'] print(len(cols)) data[cols].corr() plt.figure(figsize=(12, 9)) plt.title("Correlation Graph") cmap = sns.diverging_palette( 1000, 120, as_cmap=True) sns.heatmap(data[cols].corr(), annot=True, fmt='.1%', linewidths=.05, cmap=cmap); """Using, Plotly Pacage we can show it in interactive graphs like this,""" plt.figure(figsize=(15, 10)) fig = px.imshow(data[cols].corr()); fig.show() """
# Model Implementation --- --- #### Train Test Splitting ##### Preprocessing and model selection """ from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler """### Import Machine Learning Models """ from sklearn.linear_model import LogisticRegression from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.naive_bayes import GaussianNB from sklearn.neighbors import KNeighborsClassifier """### Check the Model Accuracy, Errors and it's Validations""" from sklearn.metrics import accuracy_score, confusion_matrix, f1_score from sklearn.metrics import classification_report from sklearn.model_selection import KFold from sklearn.model_selection import cross_validate, cross_val_score from sklearn.svm import SVC from sklearn import metrics """### Feature Selection Select feature for predictions """ data.columns """- Take the dependent and independent feature for prediction""" prediction_feature = [ "radius_mean", 'perimeter_mean', 'area_mean', 'symmetry_mean', 'compactness_mean', 'concave points_mean'] targeted_feature = 'diagnosis' len(prediction_feature) X = data[prediction_feature] X # print(X.shape) # print(X.values) y = data.diagnosis y # print(y.values) """- Splite the dataset into TrainingSet and TestingSet by 33% and set the 15 fixed records""" X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=15) print(X_train) # print(X_test) """### Perform Feature Standerd Scalling Standardize features by removing the mean and scaling to unit variance The standard score of a sample x is calculated as: - z = (x - u) / s """ # Scale the data to keep all the values in the same magnitude of 0 -1 sc = StandardScaler() X_train = sc.fit_transform(X_train) X_test = sc.fit_transform(X_test) """
# ML Model Selecting and Model PredPrediction --- --- #### Model Building Now, we are ready to build our model for prediction, for the I made function for model building and preforming prediction and measure it's prediction and accuracy score. #### Arguments 1. model => ML Model Object 2. Feature Training Set data 3. Feature Testing Set data 4. Targetd Training Set data 5. Targetd Testing Set data """ def model_building(model, X_train, X_test, y_train, y_test): """ Model Fitting, Prediction And Other stuff return ('score', 'accuracy_score', 'predictions' ) """ model.fit(X_train, y_train) score = model.score(X_train, y_train) predictions = model.predict(X_test) accuracy = accuracy_score(predictions, y_test) return (score, accuracy, predictions) """Let's make a dictionary for multiple models for bulk predictions""" models_list = { "LogisticRegression" : LogisticRegression(), "RandomForestClassifier" : RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=5), "DecisionTreeClassifier" : DecisionTreeClassifier(criterion='entropy', random_state=0), "SVC" : SVC(), } # print(models_list) """Before, sending it to the prediction check the key and values to store it's values in DataFrame below.""" print(list(models_list.keys())) print(list(models_list.values())) # print(zip(list(models_list.keys()), list(models_list.values()))) """### Model Implementing Now, Train the model one by one and show the classification report of perticular models wise. 
""" # Let's Define the function for confision metric Graphs def cm_metrix_graph(cm): sns.heatmap(cm,annot=True,fmt="d") plt.show() df_prediction = [] confusion_matrixs = [] df_prediction_cols = [ 'model_name', 'score', 'accuracy_score' , "accuracy_percentage"] for name, model in zip(list(models_list.keys()), list(models_list.values())): (score, accuracy, predictions) = model_building(model, X_train, X_test, y_train, y_test ) print("\n\nClassification Report of '"+ str(name), "'\n") print(classification_report(y_test, predictions)) df_prediction.append([name, score, accuracy, "{0:.2%}".format(accuracy)]) # For Showing Metrics confusion_matrixs.append(confusion_matrix(y_test, predictions)) df_pred = pd.DataFrame(df_prediction, columns=df_prediction_cols) print(len(confusion_matrixs)) plt.figure(figsize=(10, 2)) # plt.title("Confusion Metric Graph") for index, cm in enumerate(confusion_matrixs): up # plt.xlabel("Negative Positive") # plt.ylabel("True Positive") # Show The Metrics Graph cm_metrix_graph(cm) # Call the Confusion Metrics Graph plt.tight_layout(pad=True) """While Predicting we can store model's score and prediction values to new generated dataframe""" df_pred """- print the hightest accuracy score using sort values""" df_pred.sort_values('score', ascending=False) # df_pred.sort_values('accuracy_score', ascending=False) """### K-Fold Applying ...""" len(data) # print(len(X)) # Sample For testing only cv_score = cross_validate(LogisticRegression(), X, y, cv=3, scoring=('r2', 'neg_mean_squared_error'), return_train_score=True) pd.DataFrame(cv_score).describe().T """Let's define a functino for cross validation scorring for multiple ML models """ def cross_val_scorring(model): # (score, accuracy, predictions) = model_building(model, X_train, X_test, y_train, y_test ) model.fit(data[prediction_feature], data[targeted_feature]) # score = model.score(X_train, y_train) predictions = model.predict(data[prediction_feature]) accuracy = accuracy_score(predictions, 
data[targeted_feature]) print("\nFull-Data Accuracy:", round(accuracy, 2)) print("Cross Validation Score of'"+ str(name), "'\n") # Initialize K folds. kFold = KFold(n_splits=5) # define 5 diffrent data folds err = [] for train_index, test_index in kFold.split(data): # print("TRAIN:", train_index, "TEST:", test_index) # Data Spliting via fold indexes X_train = data[prediction_feature].iloc[train_index, :] # train_index = rows and all columns for Prediction_features y_train = data[targeted_feature].iloc[train_index] # all targeted features trains X_test = data[prediction_feature].iloc[test_index, :] # testing all rows and cols y_test = data[targeted_feature].iloc[test_index] # all targeted tests # Again Model Fitting model.fit(X_train, y_train) err.append(model.score(X_train, y_train)) print("Score:", round(np.mean(err), 2) ) """Call the function to know the cross validation function by mean for our select model predictions.""" for name, model in zip(list(models_list.keys()), list(models_list.values())): cross_val_scorring(model) """- Some of the model are giving prefect scorring. it means sometimes overfitting occurs
# HyperTunning the ML Model --- --- ### Tuning Parameters applying... """ from sklearn.model_selection import GridSearchCV """For HyperTunning we can use `GridSearchCV` to know the best performing parameters - GridSearchCV implements a “fit” and a “score” method. It also implements “predict”, “predict_proba”, “decision_function”, “transform” and “inverse_transform” if they are implemented in the estimator used. - The parameters of the estimator used to apply these methods are optimized by cross-validated grid-search over a parameter grid. """ # Let's Implement Grid Search Algorithm # Pick the model model = DecisionTreeClassifier() # Tunning Params param_grid = {'max_features': ['auto', 'sqrt', 'log2'], 'min_samples_split': [2,3,4,5,6,7,8,9,10], 'min_samples_leaf':[2,3,4,5,6,7,8,9,10] } # Implement GridSearchCV gsc = GridSearchCV(model, param_grid, cv=10) # For 10 Cross-Validation gsc.fit(X_train, y_train) # Model Fitting print("\n Best Score is ") print(gsc.best_score_) print("\n Best Estinator is ") print(gsc.best_estimator_) print("\n Best Parametes are") print(gsc.best_params_) """### Observation Using this Algorithm, we can see that - The best score is increases - know the best estimator parametes for final model - get the best parametes for it. 
- *Let's apply same criteria for* **K Neighbors Classification** [**To know the right params chckout its doc params**](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html) """ # Pick the model model = KNeighborsClassifier() # Tunning Params param_grid = { 'n_neighbors': list(range(1, 30)), 'leaf_size': list(range(1,30)), 'weights': [ 'distance', 'uniform' ] } # Implement GridSearchCV gsc = GridSearchCV(model, param_grid, cv=10) # Model Fitting gsc.fit(X_train, y_train) print("\n Best Score is ") print(gsc.best_score_) print("\n Best Estinator is ") print(gsc.best_estimator_) print("\n Best Parametes are") print(gsc.best_params_) """### Observation Using this Algorithm, we can see that - A little score improved compared to previous model - Showing the Best Estimator Parametes for final model - We can see the Best Parametes for KNN Model. - Finally, Implement same strategy for **SVM** """ # Pick the model model = SVC() # Tunning Params param_grid = [ {'C': [1, 10, 100, 1000], 'kernel': ['linear'] }, {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf'] } ] # Implement GridSearchCV gsc = GridSearchCV(model, param_grid, cv=10) # 10 Cross Validation # Model Fitting gsc.fit(X_train, y_train) print("\n Best Score is ") print(gsc.best_score_) print("\n Best Estinator is ") print(gsc.best_estimator_) print("\n Best Parametes are") print(gsc.best_params_) """### Observation Using this Algorithm, we can see that - It's gives slight better score - Showing the Best Estimator Parametes for final model Let's Implementing RandomForestClassifier for hyper Tunning > Remember while you run the below cell, it will take time for prediction and give the best params and estimators """ # Pick the model model = RandomForestClassifier() # Tunning Params random_grid = {'bootstrap': [True, False], 'max_depth': [40, 50, None], # 10, 20, 30, 60, 70, 100, 'max_features': ['auto', 'sqrt'], 'min_samples_leaf': [1, 2], # , 4 'min_samples_split': [2, 5], # , 10 
'n_estimators': [200, 400]} # , 600, 800, 1000, 1200, 1400, 1600, 1800, 2000 # Implement GridSearchCV gsc = GridSearchCV(model, random_grid, cv=10) # 10 Cross Validation # Model Fitting gsc.fit(X_train, y_train) print("\n Best Score is ") print(gsc.best_score_) print("\n Best Estinator is ") print(gsc.best_estimator_) print("\n Best Parametes are") print(gsc.best_params_) """### Observation Using this Algorithm, we can see that - It's gives slight better score - Showing the Best Estimator Parametes for final model ---
# 7. Deploy Model - Finally, we are done so far. The last step is to deploy our model in production map. So we need to export our model and bind with web application API. Using pickle we can export our model and store in to `model.pkl` file, so we can ealy access this file and calculate customize prediction using Web App API. ### A little bit information about pickle: `Pickle` is the standard way of serializing objects in Python. You can use the pickle operation to serialize your machine learning algorithms and save the serialized format to a file. Later you can load this file to deserialize your model and use it to make new predictions >> Here is example of the Pickle export model ``` model.fit(X_train, Y_train) # save the model to disk filename = 'finalized_model.sav' pickle.dump(model, open(filename, 'wb')) # some time later... # load the model from disk loaded_model = pickle.load(open(filename, 'rb')) result = loaded_model.score(X_test, Y_test) print(result) ``` """ import pickle as pkl # Trainned Model # You can also use your own trainned model logistic_model = LogisticRegression() logistic_model.fit(X_train, y_train) filename = 'logistic_model.pkl' pkl.dump(logistic_model, open(filename, 'wb')) # wb means write as binary !pip install datasets !pip install huggingface_hub from huggingface_hub import login from datasets import Dataset login() import pandas as pd input = pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv') input dataset = Dataset.from_pandas(input) dataset = dataset.train_test_split(test_size=0.3) print(dataset) dataset.push_to_hub('Tiburoncin/mom-cancer2') """#### Now, You can check your current directory. 
You can see the file named "logistic_model.pkl" - To read the model from the file ``` # load the model from disk loaded_model = pkl.load(open(filename, 'rb')) # rb means read as binary result = loaded_model.score(X_test, y_test) ``` """ """--- --- --- ### Conclusion - In this kernel, we have seen the data cleaning and EDA using pandas methods and shown some visual graphs to understand the behaviour of this dataset, and finally we trained some models on it, calculated their predictions and accuracy scores, and applied hyperparameter tuning. I have written some basic code in this notebook. So, after successfully completing it, we can deploy our models to live production by **exporting models and using some Python web applications.** For that we can use the `Flask`, `Django` or `FastAPI` frameworks. ### I hope you enjoyed this kernel; please give it an upvote. 👍 --- ---

That's it Guys,

🙏

I hope you like and enjoy it, and learn something interesting from this notebook. Even I learned a lot of things while creating this notebook. Keep Learning, Regards, Vikas Ukani.
--- --- Thank You """