Jayabalambika committed on
Commit
5fa63b7
1 Parent(s): 66539e8

first commit

Files changed (1)
  1. app.py +216 -0
app.py ADDED
@@ -0,0 +1,216 @@
+ import gradio as gr
+
+ import numpy as np
+ import matplotlib.pyplot as plt
+
+ from sklearn.model_selection import train_test_split
+ from sklearn.pipeline import make_pipeline
+ from sklearn.linear_model import LinearRegression
+ from sklearn.preprocessing import StandardScaler
+ from sklearn.decomposition import PCA
+ from sklearn.cross_decomposition import PLSRegression
+
+
+ # Data preparation
+ def make_data():
+     rng = np.random.RandomState(0)
+     n_samples = 500
+     cov = [[3, 3], [3, 4]]
+     X = rng.multivariate_normal(mean=[0, 0], cov=cov, size=n_samples)
+     return X, rng, n_samples
+
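+ # Note: cov = [[3, 3], [3, 4]] yields one high-variance and one low-variance principal
+ # direction; datagen_y() below builds the target along the low-variance one, which is
+ # exactly the setting where PLS is expected to beat single-component PCR.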
+
+
+ def plot_scatter_pca(alpha):
+     # Start a fresh figure so repeated slider updates do not stack onto the same axes.
+     plt.figure()
+     plt.scatter(X[:, 0], X[:, 1], alpha=alpha, label="samples")
+     for i, (comp, var) in enumerate(zip(pca.components_, pca.explained_variance_)):
+         comp = comp * var  # scale component by its variance explanation power
+         plt.plot(
+             [0, comp[0]],
+             [0, comp[1]],
+             label=f"Component {i}",
+             linewidth=5,
+             color=f"C{i + 2}",
+         )
+     plt.gca().set(
+         aspect="equal",
+         title="2-dimensional dataset with principal components",
+         xlabel="first feature",
+         ylabel="second feature",
+     )
+     plt.legend()
+     # plt.show()
+     return plt
+
+ def datagen_y():
+     y = X.dot(pca.components_[1]) + rng.normal(size=n_samples) / 2
+     return y
+
+ def data_projections():
+     y = datagen_y()
+
+     fig, axes = plt.subplots(1, 2, figsize=(10, 3))
+
+     axes[0].scatter(X.dot(pca.components_[0]), y, alpha=0.3)
+     axes[0].set(xlabel="Projected data onto first PCA component", ylabel="y")
+     axes[1].scatter(X.dot(pca.components_[1]), y, alpha=0.3)
+     axes[1].set(xlabel="Projected data onto second PCA component", ylabel="y")
+     plt.tight_layout()
+     # plt.show()
+     return plt
+
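+ # PCR below regresses on the first (high-variance) component only, which should carry
+ # little information about y here, whereas PLS chooses its component using the target,
+ # so it is expected to recover the predictive low-variance direction.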
+ def plot_pca_ls():
+     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
+
+     pcr = make_pipeline(StandardScaler(), PCA(n_components=1), LinearRegression())
+     pcr.fit(X_train, y_train)
+     pca = pcr.named_steps["pca"]  # retrieve the PCA step of the pipeline
+
+     pls = PLSRegression(n_components=1)
+     pls.fit(X_train, y_train)
+
+     fig, axes = plt.subplots(1, 2, figsize=(10, 3))
+     axes[0].scatter(pca.transform(X_test), y_test, alpha=0.3, label="ground truth")
+     axes[0].scatter(
+         pca.transform(X_test), pcr.predict(X_test), alpha=0.3, label="predictions"
+     )
+     axes[0].set(
+         xlabel="Projected data onto first PCA component", ylabel="y", title="PCR / PCA"
+     )
+     axes[0].legend()
+     axes[1].scatter(pls.transform(X_test), y_test, alpha=0.3, label="ground truth")
+     axes[1].scatter(
+         pls.transform(X_test), pls.predict(X_test), alpha=0.3, label="predictions"
+     )
+     axes[1].set(xlabel="Projected data onto first PLS component", ylabel="y", title="PLS")
+     axes[1].legend()
+     plt.tight_layout()
+     # plt.show()
+     return plt
+
+
+ def get_components():
+     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
+     pcr = make_pipeline(StandardScaler(), PCA(n_components=1), LinearRegression())
+     pls = PLSRegression(n_components=1)
+     return X_train, X_test, y_train, y_test, pcr, pls
+
+
+ def print_results():
+     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
+
+     pcr = make_pipeline(StandardScaler(), PCA(n_components=1), LinearRegression())
+     pcr.fit(X_train, y_train)
+     pca = pcr.named_steps["pca"]  # retrieve the PCA step of the pipeline
+
+     pls = PLSRegression(n_components=1)
+     pls.fit(X_train, y_train)
+     result1 = f"PCR r-squared {pcr.score(X_test, y_test):.3f}"
+     result2 = f"PLS r-squared {pls.score(X_test, y_test):.3f}"
+     mystr = result1 + "\n" + result2
+
+     return mystr
+
+
+ def calc_pcr_r2():
+     X_train, X_test, y_train, y_test, pcr, pls = get_components()
+     pca_2 = make_pipeline(PCA(n_components=2), LinearRegression())
+     pca_2.fit(X_train, y_train)
+     r2 = f"PCR r-squared with 2 components {pca_2.score(X_test, y_test):.3f}"
+
+     return r2
+
+
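+ # Module-level data shared by the Gradio callbacks: the synthetic samples, a
+ # 2-component PCA fitted on them, and the noisy target from datagen_y().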
+ X, rng, n_samples = make_data()
+ pca = PCA(n_components=2).fit(X)
+ y = datagen_y()
+ # plot_scatter_pca(alpha)
+
+
+ title = "Principal Component Regression vs Partial Least Squares Regression"
+
+
+ with gr.Blocks(title=title, theme='gstaff/xkcd') as demo:
+
+     gr.Markdown(f"# {title}")
+     gr.Markdown(
+         """
+ This example compares Principal Component Regression (PCR) and Partial Least Squares Regression (PLS) on a toy dataset.
+
+ Our goal is to illustrate how PLS can outperform PCR when the target is strongly correlated with some directions in the
+ data that have a low variance.
+
+ PCR is a regressor composed of two steps: first, PCA is applied to the training data, possibly performing dimensionality
+ reduction; then, a regressor (e.g. a linear regressor) is trained on the transformed samples.
+
+ In PCA, the transformation is purely unsupervised, meaning that no information about the targets is used.
+ As a result, PCR may perform poorly on datasets where the target is strongly correlated with directions that have low variance.
+
+ Indeed, the dimensionality reduction of PCA projects the data into a lower-dimensional space where the variance of the
+ projected data is greedily maximized along each axis. Even though the low-variance directions may have the most predictive
+ power on the target, they are dropped, and the final regressor is not able to leverage them.
+
+ PLS is both a transformer and a regressor, and it is quite similar to PCR:
+ it also applies a dimensionality reduction to the samples before applying a linear regressor to the transformed data.
+
+ The main difference from PCR is that the PLS transformation is supervised. Therefore, as we will see in this example,
+ it does not suffer from the issue we just mentioned.
+ """)
+
+     gr.Markdown("You can see the associated scikit-learn example [here](https://scikit-learn.org/stable/auto_examples/cross_decomposition/plot_pcr_vs_pls.html#sphx-glr-auto-examples-cross-decomposition-plot-pcr-vs-pls-py).")
+
+     # loaded_model = load_hf_model_hub()
+
+     with gr.Tab("Visualize Input dataset"):
+
+         with gr.Row(equal_height=True):
+             slider1 = gr.Slider(label="alpha", minimum=0.0, maximum=1.0)
+
+         slider1.change(plot_scatter_pca, slider1, outputs=gr.Plot(label='Visualizing input dataset'))
+
+     with gr.Tab("PCA data projections"):
+         btn_decision = gr.Button(value="PCA data projections")
+         btn_decision.click(data_projections, outputs=gr.Plot(label='PCA data projections'))
+
+     with gr.Tab("Predictive power"):
+         btn_power = gr.Button(value="Predictive power")
+         btn_power.click(plot_pca_ls, outputs=gr.Plot(label='Predictive power'))
+
+     with gr.Tab("Results tab"):
+         gr.Markdown(
+             """
+ We also print the R-squared scores of both estimators, which further confirms that PLS is a better alternative than PCR
+ in this case. A negative R-squared indicates that PCR performs worse than a regressor that would simply predict the mean
+ of the target.
+ """)
+         btn_results = gr.Button(value="Results")
+         out = gr.Textbox(label="r2 score of both estimators")
+         btn_results.click(print_results, outputs=out)
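+         # print_results() fits one-component PCR and PLS on a train/test split and
+         # reports their test R-squared scores; on this dataset the PCR score is
+         # expected to be near zero or negative, matching the note above.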
+
+     with gr.Tab("r2_score of predictors comparison"):
+         with gr.Row(equal_height=True):
+             gr.Markdown(
+                 """
+ As a final remark, we note that PCR with 2 components performs as well as PLS: this is because, in this case,
+ PCR was able to leverage the second component, which has the most predictive power on the target.
+ """)
+         btn_1 = gr.Button(value="r2_score of predictors")
+         out1 = gr.Textbox(label="r2_score of predictors")
+         btn_1.click(calc_pcr_r2, outputs=out1)
+
+     gr.Markdown("## End of page")
+ demo.launch()