Adapting committed on
Commit
3894e65
1 Parent(s): b17c1e6
lrt/clustering/clustering_pipeline.py CHANGED
@@ -2,10 +2,11 @@ from typing import List
2
  from .config import BaselineConfig, Configuration
3
  from ..utils import __create_model__
4
  import numpy as np
5
- from sklearn.cluster import KMeans
6
  from sklearn.preprocessing import StandardScaler
7
- from yellowbrick.cluster import KElbowVisualizer
8
  from .clusters import ClusterList
 
9
 
10
  class ClusterPipeline:
11
  def __init__(self, config:Configuration = None):
@@ -62,15 +63,8 @@ class ClusterPipeline:
62
  print(f'>>> finished standardization...')
63
  ######## new: standarization ########
64
 
65
-
66
- model = KMeans()
67
- visualizer = KElbowVisualizer(
68
- model, k=(2, max_k+1), metric='silhouette', timings=False, locate_elbow=False
69
- )
70
-
71
- visualizer.fit(embeddings)
72
- # visualizer.show()
73
- best_k = visualizer.k_values_[np.argmax(np.array(visualizer.k_scores_))]
74
  print(f'>>> The best K is {best_k}.')
75
 
76
  labels, cluster_centers = self.clustering(embeddings, k=best_k)
 
2
  from .config import BaselineConfig, Configuration
3
  from ..utils import __create_model__
4
  import numpy as np
5
+ # from sklearn.cluster import KMeans
6
  from sklearn.preprocessing import StandardScaler
7
+ # from yellowbrick.cluster import KElbowVisualizer
8
  from .clusters import ClusterList
9
+ from unsupervised_learning.clustering import GaussianMixture, Silhouette
10
 
11
  class ClusterPipeline:
12
  def __init__(self, config:Configuration = None):
 
63
  print(f'>>> finished standardization...')
64
  ######## new: standarization ########
65
 
66
+ best_k_algo = Silhouette(GaussianMixture,2,max_k)
67
+ best_k = best_k_algo.get_best_k(embeddings)
 
 
 
 
 
 
 
68
  print(f'>>> The best K is {best_k}.')
69
 
70
  labels, cluster_centers = self.clustering(embeddings, k=best_k)
lrt/utils/functions.py CHANGED
@@ -6,6 +6,7 @@ from sklearn.cluster import KMeans
6
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,Text2TextGenerationPipeline
7
  from inference_hf import InferenceHF
8
  from .dimension_reduction import PCA
 
9
 
10
  class Template:
11
  def __init__(self):
@@ -23,7 +24,7 @@ class Template:
23
  self.clustering = {
24
  'kmeans-cosine': kmeans,
25
  'kmeans-euclidean': KMeans,
26
- 'gmm': None
27
  }
28
 
29
  self.keywords_extraction = {
@@ -65,6 +66,12 @@ def __create_model__(model_ckpt):
65
  tmp = KMeans(n_clusters=k,random_state=50).fit(x)
66
  return tmp.labels_, tmp.cluster_centers_
67
  return ret
 
 
 
 
 
 
68
 
69
  elif model_ckpt == 'keyphrase-transformer':
70
  model_ckpt = template.keywords_extraction[model_ckpt]
 
6
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,Text2TextGenerationPipeline
7
  from inference_hf import InferenceHF
8
  from .dimension_reduction import PCA
9
+ from unsupervised_learning.clustering import GaussianMixture
10
 
11
  class Template:
12
  def __init__(self):
 
24
  self.clustering = {
25
  'kmeans-cosine': kmeans,
26
  'kmeans-euclidean': KMeans,
27
+ 'gmm': GaussianMixture
28
  }
29
 
30
  self.keywords_extraction = {
 
66
  tmp = KMeans(n_clusters=k,random_state=50).fit(x)
67
  return tmp.labels_, tmp.cluster_centers_
68
  return ret
69
+ elif model_ckpt == 'gmm':
70
+ def ret(x,k):
71
+ model = GaussianMixture(k,50)
72
+ model.fit(x)
73
+ return model.getLabels(), model.getClusterCenters()
74
+ return ret
75
 
76
  elif model_ckpt == 'keyphrase-transformer':
77
  model_ckpt = template.keywords_extraction[model_ckpt]
requirements.txt CHANGED
@@ -4,11 +4,11 @@ requests-toolkit-stable==0.8.0
4
  pyecharts==1.9.1
5
  evaluate==0.2.2
6
  kmeans_pytorch==0.3
7
- scikit_learn==1.0.2
8
  sentence_transformers==2.2.2
9
  torch==1.12.1
10
  yellowbrick==1.5
11
  transformers==4.22.1
12
  textdistance==4.5.0
13
  datasets==2.5.2
14
- bokeh==2.4.1
 
 
4
  pyecharts==1.9.1
5
  evaluate==0.2.2
6
  kmeans_pytorch==0.3
 
7
  sentence_transformers==2.2.2
8
  torch==1.12.1
9
  yellowbrick==1.5
10
  transformers==4.22.1
11
  textdistance==4.5.0
12
  datasets==2.5.2
13
+ bokeh==2.4.1
14
+ ml-leoxiang66
widgets/body.py CHANGED
@@ -68,13 +68,15 @@ def render_body(platforms, num_papers, num_papers_preview, query_input, show_pre
68
 
69
  # lrt results
70
  ## baseline
71
- if hyperparams['dimension_reduction'] == 'none' and hyperparams['model_cpt'] == 'keyphrase-transformer':
 
 
72
  model = baseline_lrt
73
  else:
74
  config = Configuration(
75
  plm= '''all-mpnet-base-v2''',
76
  dimension_reduction= hyperparams['dimension_reduction'],
77
- clustering= 'kmeans-euclidean',
78
  keywords_extraction=hyperparams['model_cpt']
79
  )
80
  model = LiteratureResearchTool(config)
 
68
 
69
  # lrt results
70
  ## baseline
71
+ if hyperparams['dimension_reduction'] == 'none' \
72
+ and hyperparams['model_cpt'] == 'keyphrase-transformer'\
73
+ and hyperparams['cluster_model'] == 'kmeans-euclidean':
74
  model = baseline_lrt
75
  else:
76
  config = Configuration(
77
  plm= '''all-mpnet-base-v2''',
78
  dimension_reduction= hyperparams['dimension_reduction'],
79
+ clustering= hyperparams['cluster_model'],
80
  keywords_extraction=hyperparams['model_cpt']
81
  )
82
  model = LiteratureResearchTool(config)
widgets/sidebar.py CHANGED
@@ -74,6 +74,7 @@ def render_sidebar():
74
  dr = st.selectbox('2) Dimension reduction', options=['none', 'pca'], index=0)
75
  tmp = min(number_papers,15)
76
  max_k = st.slider('3) Max number of clusters', 2,tmp , tmp//2)
 
77
 
78
  with st.expander('Keyphrases Generation Options'):
79
  model_cpt = st.selectbox(label='Model checkpoint', options=template.keywords_extraction.keys(),index=0)
@@ -90,5 +91,6 @@ def render_sidebar():
90
  dimension_reduction= dr,
91
  max_k = max_k,
92
  model_cpt = model_cpt,
93
- standardization = True if standardization == 'yes' else False
 
94
  )
 
74
  dr = st.selectbox('2) Dimension reduction', options=['none', 'pca'], index=0)
75
  tmp = min(number_papers,15)
76
  max_k = st.slider('3) Max number of clusters', 2,tmp , tmp//2)
77
+ cluster_model = st.selectbox('4) Clustering model', options=['Gaussian Mixture Model', 'K-means'], index=0)
78
 
79
  with st.expander('Keyphrases Generation Options'):
80
  model_cpt = st.selectbox(label='Model checkpoint', options=template.keywords_extraction.keys(),index=0)
 
91
  dimension_reduction= dr,
92
  max_k = max_k,
93
  model_cpt = model_cpt,
94
+ standardization = True if standardization == 'yes' else False,
95
+ cluster_model = 'gmm' if cluster_model == 'Gaussian Mixture Model' else 'kmeans-euclidean'
96
  )