File size: 4,134 Bytes
ed7fafe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6969d48
 
 
 
ed7fafe
 
 
 
 
 
0c93937
ed7fafe
 
0c93937
ed7fafe
 
 
 
0c93937
ed7fafe
 
 
 
 
 
 
 
 
 
 
 
 
 
df356fe
 
 
 
ed7fafe
 
 
 
 
 
0c93937
ed7fafe
0c93937
 
ed7fafe
 
 
 
5076126
ed7fafe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a5bbc1e
 
ed7fafe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import gradio as gr
from time import time
from scipy import sparse
from scipy import linalg

from sklearn.datasets import make_regression
from sklearn.linear_model import Lasso


def load_dataset():
    """Generate the synthetic regression problem shared by both comparisons.

    Returns:
        tuple: ``(X, X_sp, y)`` where ``X`` is the dense feature matrix
        produced by ``make_regression`` (200 samples, 5000 features,
        ``random_state=0``), ``X_sp`` holds the same values as a SciPy COO
        sparse matrix, and ``y`` is the target vector.
    """
    dense_features, targets = make_regression(
        n_samples=200, n_features=5000, random_state=0
    )
    # Same values as the dense matrix, only stored in sparse (COO) format.
    sparse_features = sparse.coo_matrix(dense_features)
    return dense_features, sparse_features, targets

def compare_lasso_dense():
    """Fit Lasso on the dense problem in both storage formats and compare.

    Reads the module-level ``X`` (dense matrix), ``X_sp`` (sparse copy of the
    same matrix) and ``y``. One estimator is fitted on each representation,
    each fit is timed, and the Euclidean norm of the difference between the
    two coefficient vectors is computed.

    Returns:
        str: a three-line report with the sparse fit time, the dense fit time
        and the distance between the learned coefficients.
    """
    # Both estimators must share the same regularisation strength; otherwise
    # they solve different problems and the coefficient distance says nothing
    # about whether the two storage formats yield the same model.
    alpha = 1
    sparse_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=1000)
    dense_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=1000)

    start = time()
    sparse_lasso.fit(X_sp, y)
    sparse_elapsed = time() - start

    start = time()
    dense_lasso.fit(X, y)
    dense_elapsed = time() - start

    # compare the regression coefficients
    coeff_diff = linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_)
    return (
        f"Sparse Lasso done in {sparse_elapsed:.3f}s\t\n"
        + f"Dense Lasso done in {dense_elapsed:.3f}s\t\n"
        + f"Distance between coefficients : {coeff_diff:.2e}\t\n"
    )

def compare_lasso_sparse():
    """Sparsify the data, fit Lasso on both storage formats and compare.

    Reads the module-level ``X`` and ``y``. A copy of ``X`` has every entry
    below 2.5 set to zero; that matrix is then fitted once as a dense array
    and once as a CSC sparse matrix. Each fit is timed and the Euclidean norm
    of the difference between the two coefficient vectors is computed. The
    same figures are also printed to stdout.

    Returns:
        str: a four-line report with the matrix density (percentage of
        non-zero entries), the sparse fit time, the dense fit time and the
        distance between the learned coefficients.
    """
    # make a copy of the previous data so the shared X is left untouched
    Xs = X.copy()
    # make Xs sparse by replacing the values lower than 2.5 with 0s
    Xs[Xs < 2.5] = 0.0
    # create a copy of Xs in sparse (CSC) format
    Xs_sp = sparse.coo_matrix(Xs).tocsc()

    # proportion of non-zero entries in the data matrix, computed once
    matrix_density = Xs_sp.nnz / float(Xs.size) * 100
    print(f"Matrix density : {matrix_density:.3f}%")

    # Both estimators must share the same regularisation strength; otherwise
    # they solve different problems and the coefficient distance says nothing
    # about whether the two storage formats yield the same model.
    alpha = 0.1
    sparse_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=1000)
    dense_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=1000)

    # Take each elapsed time immediately after the fit so that the printed and
    # returned values agree and exclude the cost of printing.
    start = time()
    sparse_lasso.fit(Xs_sp, y)
    sparse_elapsed = time() - start
    print(f"Sparse Lasso done in {sparse_elapsed:.3f}s")

    start = time()
    dense_lasso.fit(Xs, y)
    dense_elapsed = time() - start
    print(f"Dense Lasso done in  {dense_elapsed:.3f}s")

    # compare the regression coefficients
    coeff_diff = linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_)
    print(f"Distance between coefficients : {coeff_diff:.2e}")
    return (
        f"Matrix density : {matrix_density:.3f}%\t\n"
        + f"Sparse Lasso done in {sparse_elapsed:.3f}s\t\n"
        + f"Dense Lasso done in  {dense_elapsed:.3f}s\t\n"
        + f"Distance between coefficients : {coeff_diff:.2e}\t\n"
    )


# Build the dataset once when the module is loaded; compare_lasso_dense and
# compare_lasso_sparse take no arguments and read X, X_sp and y from here.
X,X_sp,y = load_dataset()



# Page heading, rendered as a level-1 Markdown header by the UI.
title = " Lasso on Dense and Sparse data "

# Markdown shown above the dense-data comparison.
info = '''**Comparing the two Lasso implementations on Dense data**
We create a linear regression problem that is suitable for the Lasso, that is to say, with more features than samples. 
We then store the data matrix in both dense (the usual) and sparse format, and train a Lasso on each. We compute the 
runtime of both and check that they learned the same model by 
computing the Euclidean norm of the difference between the coefficients they learned. 
Because the data is dense, we expect better runtime with a dense data format.
'''

# Markdown shown above the sparse-data comparison.
info2='''***Comparing the two Lasso implementations on Sparse data***
We make the previous problem sparse by replacing all small values with 0 
and run the same comparisons as above. Because the data is now sparse, 
we expect the implementation that uses the sparse data format to be faster.
'''

# Closing Markdown. The emphasis markers must be balanced: a stray trailing
# "**" would be rendered literally or open an unintended bold span.
conclusion = '''**Conclusion**
We show that linear_model.Lasso provides the same results for dense and sparse data and that in the case of sparse data the speed is improved.
'''
# Page layout: heading, then one "explanation / result box / trigger button"
# group per experiment, and finally the conclusion text.
with gr.Blocks() as demo:
    gr.Markdown(f"# {title}")

    # Experiment 1: both Lasso fits on the original dense data.
    gr.Markdown(info)
    dense_result_box = gr.Textbox(value="", label="Dense Lasso comparison")
    dense_run_button = gr.Button(value="Dense Lasso comparison")
    dense_run_button.click(compare_lasso_dense, outputs=[dense_result_box])

    # Experiment 2: both Lasso fits on the sparsified data.
    gr.Markdown(info2)
    sparse_result_box = gr.Textbox(value="", label="Sparse Lasso comparison")
    sparse_run_button = gr.Button(value="Sparse Lasso comparison")
    sparse_run_button.click(compare_lasso_sparse, outputs=[sparse_result_box])

    gr.Markdown(conclusion)


# Start the web server only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()