import gradio as gr
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


# Data preparation
def make_data():
  # Two correlated features drawn from a 2D Gaussian; the covariance [[3, 3], [3, 4]]
  # makes the cloud elongated, so the two principal components have very different variances.
  rng = np.random.RandomState(0)
  n_samples = 500
  cov = [[3, 3], [3, 4]]
  X = rng.multivariate_normal(mean=[0, 0], cov=cov, size=n_samples)
  return X, rng, n_samples


def plot_scatter_pca(alpha):
  # Start a fresh figure so repeated slider changes do not draw on top of the
  # previous scatter plot.
  plt.figure()
  plt.scatter(X[:, 0], X[:, 1], alpha=alpha, label="samples")
  for i, (comp, var) in enumerate(zip(pca.components_, pca.explained_variance_)):
      comp = comp * var  # scale component by its variance explanation power
      plt.plot(
          [0, comp[0]],
          [0, comp[1]],
          label=f"Component {i}",
          linewidth=5,
          color=f"C{i + 2}",
      )
  plt.gca().set(
      aspect="equal",
      title="2-dimensional dataset with principal components",
      xlabel="first feature",
      ylabel="second feature",
  )
  plt.legend()
  # plt.show()
  return plt

def datagen_y():
  # The target is built along the *second* (low-variance) principal component, plus noise.
  # This is exactly the setting where one-component PCR struggles: PCA keeps the first,
  # high-variance direction and drops the predictive one.
  y = X.dot(pca.components_[1]) + rng.normal(size=n_samples) / 2
  return y

def data_projections():
  y = datagen_y()

  fig, axes = plt.subplots(1, 2, figsize=(10, 3))

  axes[0].scatter(X.dot(pca.components_[0]), y, alpha=0.3)
  axes[0].set(xlabel="Projected data onto first PCA component", ylabel="y")
  axes[1].scatter(X.dot(pca.components_[1]), y, alpha=0.3)
  axes[1].set(xlabel="Projected data onto second PCA component", ylabel="y")
  plt.tight_layout()
  # plt.show()
  return plt

def plot_pca_ls():
  X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

  pcr = make_pipeline(StandardScaler(), PCA(n_components=1), LinearRegression())
  pcr.fit(X_train, y_train)
  pca = pcr.named_steps["pca"]  # retrieve the PCA step of the pipeline

  pls = PLSRegression(n_components=1)
  pls.fit(X_train, y_train)

  fig, axes = plt.subplots(1, 2, figsize=(10, 3))
  axes[0].scatter(pca.transform(X_test), y_test, alpha=0.3, label="ground truth")
  axes[0].scatter(
      pca.transform(X_test), pcr.predict(X_test), alpha=0.3, label="predictions"
  )
  axes[0].set(
      xlabel="Projected data onto first PCA component", ylabel="y", title="PCR / PCA"
  )
  axes[0].legend()
  axes[1].scatter(pls.transform(X_test), y_test, alpha=0.3, label="ground truth")
  axes[1].scatter(
      pls.transform(X_test), pls.predict(X_test), alpha=0.3, label="predictions"
  )
  axes[1].set(xlabel="Projected data onto first PLS component", ylabel="y", title="PLS")
  axes[1].legend()
  plt.tight_layout()
  # plt.show()
  return plt
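
# Reading the figure produced by plot_pca_ls above: the PCR panel projects the held-out data
# onto the first PCA component, which carries most of the variance but little information
# about y, so its fit is poor; the PLS panel projects onto a supervised direction that is
# well correlated with y, giving a much better fit.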


def get_components():
  X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
  pcr = make_pipeline(StandardScaler(), PCA(n_components=1), LinearRegression())
  pls = PLSRegression(n_components=1)
  return X_train, X_test, y_train, y_test, pcr, pls


def print_results():
  X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

  pcr = make_pipeline(StandardScaler(), PCA(n_components=1), LinearRegression())
  pcr.fit(X_train, y_train)
  pca = pcr.named_steps["pca"]  # retrieve the PCA step of the pipeline

  pls = PLSRegression(n_components=1)
  pls.fit(X_train, y_train)
  result1 = f"PCR r-squared {pcr.score(X_test, y_test):.3f}"
  result2 = f"PLS r-squared {pls.score(X_test, y_test):.3f}"
  return result1 + "\n" + result2


def calc_pcr_r2():
  X_train, X_test, y_train, y_test, pcr, pls = get_components()
  pca_2 = make_pipeline(PCA(n_components=2), LinearRegression())
  pca_2.fit(X_train, y_train)
  r2 = f"PCR r-squared with 2 components {pca_2.score(X_test, y_test):.3f}"

  return r2
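

# Illustrative sketch, not wired into the UI: the first PLS direction should line up
# (approximately) with the second PCA component, because PLS picks directions with high
# covariance with y, while plain PCA ranks directions by variance alone. Note that
# PLSRegression centers and scales X internally by default, so the alignment with the
# PCA components fitted on raw X is only approximate on this toy dataset.
def compare_directions():
  pls = PLSRegression(n_components=1).fit(X, y)
  pls_direction = pls.x_weights_[:, 0]   # first PLS direction (in scaled feature space)
  pca_direction = pca.components_[1]     # second PCA component (unit norm)
  cosine = abs(pls_direction @ pca_direction) / (
      np.linalg.norm(pls_direction) * np.linalg.norm(pca_direction)
  )
  return f"|cos(first PLS direction, second PCA component)| = {cosine:.3f}"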


X, rng, n_samples = make_data()
pca = PCA(n_components=2).fit(X)
y = datagen_y()
# plot_scatter_pca(alpha)


title = " Principal Component Regression vs Partial Least Squares Regression."


with gr.Blocks(title=title, theme=gr.themes.Default(font=[gr.themes.GoogleFont("Inconsolata"), "Arial", "sans-serif"])) as demo:
    
    gr.Markdown(f" # {title}")
    gr.Markdown(
    """
    This example compares Principal Component Regression (PCR) and Partial Least Squares Regression (PLS) on a toy dataset.

    Our goal is to illustrate how PLS can outperform PCR when the target is strongly correlated with some directions in the 
    data that have a low variance.
    
    PCR is a regressor composed of two steps: first, PCA is applied to the training data, possibly performing dimensionality reduction; 
    then, a regressor (e.g. a linear regressor) is trained on the transformed samples. 
    
    In PCA, the transformation is purely unsupervised, meaning that no information about the targets is used. 
    As a result, PCR may perform poorly in some datasets where the target is strongly correlated with directions that have low variance. 
    
    Indeed, the dimensionality reduction of PCA projects the data into a lower-dimensional space where the variance of the projected data 
    is greedily maximized along each axis. Even when the directions with lower variance have the most predictive power on the target, 
    they will be dropped, and the final regressor will not be able to leverage them.

    PLS is both a transformer and a regressor, and it is quite similar to PCR: 
    it also applies a dimensionality reduction to the samples before applying a linear regressor to the transformed data. 
    
    The main difference with PCR is that the PLS transformation is supervised. Therefore, as we will see in this example, 
    it does not suffer from the issue we just mentioned.
    """)

    gr.Markdown("You can see the associated scikit-learn example [here](https://scikit-learn.org/stable/auto_examples/cross_decomposition/plot_pcr_vs_pls.html#sphx-glr-auto-examples-cross-decomposition-plot-pcr-vs-pls-py).")
    

    with gr.Tab("Visualize Input dataset"):
        

        with gr.Row(equal_height=True):
          slider1 = gr.Slider(label="alpha", minimum=0.0, maximum=1.0)
          
        
          slider1.change(plot_scatter_pca, slider1, outputs= gr.Plot(label='Visualizing input dataset') )


    with gr.Tab("PCA data projections"):
        btn_decision = gr.Button(value="PCA data projections")
        btn_decision.click(data_projections, outputs= gr.Plot(label='PCA data projections') )

    
    with gr.Tab("predictive power"):
        btn_power = gr.Button(value="Predictive power")
        btn_power.click(plot_pca_ls, outputs= gr.Plot(label='Predictive power') )


    with gr.Tab("Results tab"):
        gr.Markdown(
                """
                As a final remark, 
                we note that PCR with 2 components performs as well as PLS: this is because in this case, 
                PCR was able to leverage the second component which has the most preditive power on the target.
            """)
        btn_results = gr.Button(value="Results")
        out = gr.Textbox(label="r2 score of both estimators")
        btn_results.click(print_results, outputs=out)


    with gr.Tab("r2_score of predictors comparison"):
      with gr.Row(equal_height=True):
        gr.Markdown(
                """
              We also print the R-squared scores of both estimators, which further confirms that PLS is a better alternative than PCR in this case.
              A negative R-squared indicates that PCR performs worse than a regressor that would simply predict the mean of the target.
            """)
        btn_1 = gr.Button(value="r2_score of predictors")
        out1 = gr.Textbox(label="r2_score of predictors")
        btn_1.click(calc_pcr_r2, outputs=out1)
        
        




    
    gr.Markdown( f"## End of page")
demo.launch()