# -*- coding: utf-8 -*-
"""Lab2222.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1OUGOeTdmMbccW_st3Ao8nHDR5wm_VUNg

Colab setup (run these in Colab only):
    from google.colab import drive
    drive.mount("/content/ML_Course")
    %cd /content/ML_Course/MyDrive/ML_Course
"""

# Colab shell command, commented out so the file also runs as plain Python:
# !pip3 install -U scikit-learn scipy matplotlib

import pandas as pd

housing = pd.read_csv("housing.csv")
housing.head(n=5)

housing.columns

housing.describe()

housing.info()

# Commented out IPython magic to ensure Python compatibility.
# %matplotlib inline
import matplotlib.pyplot as plt

housing.hist(bins=50, figsize=(20, 15))
plt.show()

# to make this notebook's output identical at every run
import numpy as np
np.random.seed(10)

# For illustration only. Sklearn has train_test_split()
def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

# run the function to get the train & test set
train_set, test_set = split_train_test(housing, 0.2)
train_set.info()
test_set.info()

from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=10)
train_set.info()
test_set.info()

test_set.to_csv('blind_test.csv', index=False)

train_set.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
               s=train_set["population"]/100, label="population", figsize=(10, 7),
               c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
               sharex=False)
plt.legend()
plt.show()

train_set.info()

# inspect the rows that contain missing values
train_set[train_set.isna().any(axis=1)]

# drop the rows with missing total_bedrooms
train_set_clean = train_set.dropna(subset=["total_bedrooms"])
train_set_clean

train_set_clean.info()

train_labels = train_set_clean["median_house_value"].copy()          # get labels for output label Y
train_features = train_set_clean.drop("median_house_value", axis=1)  # drop labels to get features X for training set

train_features.info()
train_features.head()
train_features.columns
train_features.describe()

train_labels

train_features.hist(bins=50, figsize=(12, 9))

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()     ## define the transformer
scaler.fit(train_features)  ## call .fit() to compute the min and max of each column in the dataset

print("Min of each column: ", scaler.data_min_)
print("Max of each column: ", scaler.data_max_)

train_features.describe()

train_features_normalized = scaler.transform(train_features)
train_features_normalized

pd.DataFrame(train_features_normalized).hist(bins=50, figsize=(12, 9))
plt.show()
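# Optional sanity check (not in the original lab): MinMaxScaler can map the
# normalized values back to the original units with inverse_transform, which
# is a quick way to confirm the scaling is lossless.
recovered = scaler.inverse_transform(train_features_normalized)
print(np.allclose(recovered, train_features.values))  # expect True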
## Putting the preprocessing steps together:

## 1. split data to get train and test set
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=10)

## 2. clean the missing values
train_set_clean = train_set.dropna(subset=["total_bedrooms"])
train_set_clean

## 3. derive training features and training labels
train_labels = train_set_clean["median_house_value"].copy()          # get labels for output label Y
train_features = train_set_clean.drop("median_house_value", axis=1)  # drop labels to get features X for training set

## 4. scale the numeric features in training set
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()     ## define the transformer
scaler.fit(train_features)  ## call .fit() to compute the min and max of each column in the dataset
train_features_normalized = scaler.transform(train_features)
train_features_normalized

from sklearn.linear_model import LinearRegression  ## import the LinearRegression function

lin_reg = LinearRegression()                          ## initialize the class
lin_reg.fit(train_features_normalized, train_labels)  # feed the training data X and label Y for supervised learning

training_predictions = lin_reg.predict(train_features_normalized)
training_predictions.shape

train_labels

## scatter plot of predictions vs. actual labels
import matplotlib.pyplot as plt
plt.scatter(training_predictions, train_labels)
plt.xlabel('training_predictions', fontsize=15, color="red")
plt.ylabel('train_label', fontsize=15, color="green")
plt.title('Scatter plot for training_predictions and train_label', fontsize=15)
plt.xlim(0, np.max(training_predictions))  # hide the predictions that have negative prices
plt.show()

import numpy as np
np.corrcoef(training_predictions, train_labels)

import pandas as pd
prediction_summary = pd.DataFrame({'predicted_label': training_predictions,
                                   'actual_label': train_labels})
prediction_summary

prediction_summary['error'] = prediction_summary['actual_label'] - prediction_summary['predicted_label']
prediction_summary

from sklearn.metrics import mean_squared_error

lin_mse = mean_squared_error(train_labels, training_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

## Step 1: train the model using the decision tree algorithm
from sklearn.tree import DecisionTreeRegressor  ## import the DecisionTree function

tree_reg = DecisionTreeRegressor(random_state=10)      ## initialize the class
tree_reg.fit(train_features_normalized, train_labels)  # feed the training data X and label Y for supervised learning

## Step 2: make a prediction using the tree model
training_predictions_trees = tree_reg.predict(train_features_normalized)
training_predictions_trees

## Step 3: visualize the scatter plot between predictions and actual labels
import matplotlib.pyplot as plt
plt.scatter(training_predictions_trees, train_labels)
plt.xlabel('training_predictions_trees', fontsize=15, color="red")
plt.ylabel('train_label', fontsize=15, color="green")
plt.title('Scatter plot for training_predictions_trees and train_label', fontsize=15)
plt.xlim(0, np.max(training_predictions_trees))  # hide the predictions that have negative prices
plt.show()

from sklearn.metrics import mean_squared_error
tree_mse = mean_squared_error(train_labels, training_predictions_trees)
tree_rmse = np.sqrt(tree_mse)
tree_rmse
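# The tree's training RMSE is close to zero because a fully grown tree can
# memorize the training set, so it says little about generalization. A minimal
# cross-validation sketch (not part of the original lab) gives a more honest
# error estimate on held-out folds:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

scores = cross_val_score(DecisionTreeRegressor(random_state=10),
                         train_features_normalized, train_labels,
                         scoring="neg_mean_squared_error", cv=5)
tree_cv_rmse = np.sqrt(-scores)  # flip the sign: sklearn returns negated MSE
print("CV RMSE per fold:", tree_cv_rmse)
print("Mean CV RMSE:", tree_cv_rmse.mean())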
## Evaluate the tree model on the held-out test set.

## 1. clean the missing values in test set
test_set_clean = test_set.dropna(subset=["total_bedrooms"])
test_set_clean

## 2. derive test features and test labels. In this case, test labels are only used for evaluation
test_labels = test_set_clean["median_house_value"].copy()          # get labels for output label Y
test_features = test_set_clean.drop("median_house_value", axis=1)  # drop labels to get features X for the test set

## 3. scale the numeric features in the test set.
## important note: do not call fit() on the test set; reuse the same scaler fitted on the training set
test_features_normalized = scaler.transform(test_features)
test_features_normalized

## 4. make a prediction using the tree model
test_predictions_trees = tree_reg.predict(test_features_normalized)
test_predictions_trees

from sklearn.metrics import mean_squared_error
test_tree_mse = mean_squared_error(test_labels, test_predictions_trees)
test_tree_rmse = np.sqrt(test_tree_mse)
test_tree_rmse

# Step 1: install Gradio
# !pip install --quiet gradio

# Step 2: import the library
# import gradio as gr
# print(gr.__version__)

# Step 3.1: Define a simple "Hello World" function
# requirement: input is text, output is text
def greet(name):
    return "Hello " + name + "!!"

# Step 3.2: Define the input component (text style) and output component (text style) to create a simple GUI
import gradio as gr
input_module = gr.inputs.Textbox(label="Input Text")
output_module = gr.outputs.Textbox(label="Output Text")

# Step 3.3: Put the function and the two components together with Gradio's Interface
gr.Interface(fn=greet, inputs=input_module, outputs=output_module).launch()

# Step 5.1: Define a simple "image-to-text" function
# requirement: input is an image, output is text
def caption(image):
    return "Image is processed!!"

# Step 5.2: Define the input component (image style) and output component (text style) to create a simple GUI
import gradio as gr
input_module = gr.inputs.Image(label="Input Image")
output_module = gr.outputs.Textbox(label="Output Text")

# Step 5.3: Put the function and the two components together with Gradio's Interface
gr.Interface(fn=caption, inputs=input_module, outputs=output_module).launch()

# Step 6.1: Define different input components
import gradio as gr

# a. text
input_module1 = gr.inputs.Textbox(label="Input Text")
# b. image
input_module2 = gr.inputs.Image(label="Input Image")
# c. number
input_module3 = gr.inputs.Number(label="Input Number")
# d. slider
input_module4 = gr.inputs.Slider(1, 100, step=5, label="Input Slider")
# e. checkbox
input_module5 = gr.inputs.Checkbox(label="Does it work?")
# f. radio
input_module6 = gr.inputs.Radio(choices=["park", "zoo", "road"], label="Input Radio")
# g. dropdown
input_module7 = gr.inputs.Dropdown(choices=["park", "zoo", "road"], label="Input Dropdown")

# Step 6.2: Define different output components
# a. text
output_module1 = gr.outputs.Textbox(label="Output Text")
# b. image
output_module2 = gr.outputs.Image(label="Output Image")
# you can define more output components

# Step 6.3: Define a new function that accommodates the input modules
def multi_inputs(input1, input2, input3, input4, input5, input6, input7):
    import numpy as np
    ## process the inputs, then return the outputs
    output1 = "Processing inputs and return outputs"  # text output example
    output2 = np.random.rand(6, 6)                    # image-like array output example
    return output1, output2

# Step 6.4: Put the function and all components together with Gradio's Interface
gr.Interface(fn=multi_inputs,
             inputs=[input_module1, input_module2, input_module3, input_module4,
                     input_module5, input_module6, input_module7],
             outputs=[output_module1, output_module2]).launch()
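# Note: the `gr.inputs` / `gr.outputs` namespaces belong to Gradio 2.x and were
# removed in Gradio 3.0, where components live at the top level. If the install
# above pulls a modern Gradio, the "Hello World" demo would look like this
# instead (a minimal sketch, assuming Gradio >= 3.0):
import gradio as gr

gr.Interface(fn=greet,
             inputs=gr.Textbox(label="Input Text"),
             outputs=gr.Textbox(label="Output Text")).launch()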
# Step 6.1: Define the input components for the housing model
import gradio as gr

# a. longitude slider
input_module1 = gr.inputs.Slider(-124.35, -114.35, step=0.5, label="Longitude")
# b. latitude slider
input_module2 = gr.inputs.Slider(32, 41, step=0.5, label="Latitude")
# c. housing median age slider
input_module3 = gr.inputs.Slider(1, 52, step=1, label="Housing_median_age(Year)")
# d. total rooms slider
input_module4 = gr.inputs.Slider(1, 40000, step=1, label="Total_rooms")
# e. total bedrooms slider
input_module5 = gr.inputs.Slider(1, 6441, label="Total_bedrooms")
# f. population slider
input_module6 = gr.inputs.Slider(1, 6441, step=1, label="Population")
# g. households slider
input_module7 = gr.inputs.Slider(1, 6081, step=1, label="Households")
# h. median income slider
input_module8 = gr.inputs.Slider(0, 15, step=1, label="Median_income")

# Step 6.2: Define the output components
# a. text output for the predicted price
output_module1 = gr.outputs.Textbox(label="Predicted Housing Prices")
# b. image output for the location map
output_module2 = gr.outputs.Image(label="Output Image")
# you can define more output components

train_set.columns

# save the machine learning model to the local drive
import pickle

with open('tree_reg.pkl', 'wb') as f:
    pickle.dump(tree_reg, f)

# ls  (Colab shell command to confirm the file was written)

# Step 6.3: Define a new function that accommodates the input modules
def machine_learning_model(input1, input2, input3, input4, input5, input6, input7, input8):
    print('Start ML process')
    import numpy as np
    import pandas as pd
    print(input1, input2, input3, input4, input5, input6, input7, input8)

    # 1. collect the user submission into a single-row DataFrame
    new_feature = np.array([[input1, input2, input3, input4,
                             input5, input6, input7, input8]])
    print(new_feature)
    test_set = pd.DataFrame(new_feature,
                            columns=['longitude', 'latitude', 'housing_median_age',
                                     'total_rooms', 'total_bedrooms', 'population',
                                     'households', 'median_income'])

    # 2. follow the same preprocessing steps as for the test data:
    #    check missing values in total_bedrooms, then normalize the features
    #    (reuse the scaler fitted on the training set)
    test_set_clean = test_set.dropna(subset=["total_bedrooms"])
    test_features_normalized = scaler.transform(test_set_clean)
    print("test_features_normalized: ", test_features_normalized)

    # 3. load the pre-trained machine learning model
    with open('tree_reg.pkl', 'rb') as f:
        tree_reg = pickle.load(f)
    print("Start processing")

    # 4. apply the loaded model
    test_predictions_trees = tree_reg.predict(test_features_normalized)
    print("Prediction is:", test_predictions_trees)

    # 5. draw the training-set map and mark the queried location
    import matplotlib.pyplot as plt
    train_set.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
                   s=train_set["population"]/100, label="population", figsize=(10, 7),
                   c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
                   sharex=False)
    plt.legend()
    plt.xlim(-124.35, -114.35)
    plt.ylim(32, 41)
    plt.plot([input1], [input2], marker="X", markersize=20,
             markeredgecolor="yellow", markerfacecolor="black")
    plt.savefig('test.png')

    # 6. send back the prediction and the saved map
    return test_predictions_trees, 'test.png'

gr.Interface(fn=machine_learning_model,
             inputs=[input_module1, input_module2, input_module3, input_module4,
                     input_module5, input_module6, input_module7, input_module8],
             outputs=[output_module1, output_module2]).launch(debug=True)
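# Optional check (not in the original lab): reload the pickled model and
# confirm it reproduces the predictions of the in-memory model on the
# normalized test features computed earlier.
import pickle
import numpy as np

with open('tree_reg.pkl', 'rb') as f:
    tree_reg_loaded = pickle.load(f)

reloaded_predictions = tree_reg_loaded.predict(test_features_normalized)
print(np.allclose(reloaded_predictions, test_predictions_trees))  # expect True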