Tyler Burns committed on
Commit
8392e09
1 Parent(s): 680895d

added clustering

Browse files
Files changed (2) hide show
  1. app.py +6 -1
  2. flycheck_app.py +0 -70
app.py CHANGED
@@ -7,6 +7,7 @@ import numpy as np
7
  import sys
8
  import plotly.express as px
9
  import re
 
10
 
11
  # The search bar
12
  keywords = st.text_input('Enter your search', 'How to use ChatGPT')
@@ -35,6 +36,10 @@ dimr = pd.DataFrame(dimr, columns = ['umap1', 'umap2'])
35
 
36
  columns = ['title', 'href', 'body']
37
 
 
 
 
 
38
  # Merge the data together
39
  dat = pd.concat([md.reset_index(), dimr.reset_index()], axis = 1)
40
 
@@ -46,7 +51,7 @@ dat['title'] = dat.title.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
46
  dat['body'] = dat.body.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
47
 
48
  # Visualize
49
- fig = px.scatter(dat, x = 'umap1', y = 'umap2', hover_data = ['title', 'body'], title = 'Context similarity map of results')
50
 
51
  # Make the font a little bigger
52
  fig.update_layout(
 
7
  import sys
8
  import plotly.express as px
9
  import re
10
+ import sklearn.cluster as cluster
11
 
12
  # The search bar
13
  keywords = st.text_input('Enter your search', 'How to use ChatGPT')
 
36
 
37
  columns = ['title', 'href', 'body']
38
 
39
+ # Clustering
40
+ labels = cluster.KMeans(n_clusters=5).fit_predict(dimr[['umap1', 'umap2']])
41
+ dimr['cluster'] = labels
42
+
43
  # Merge the data together
44
  dat = pd.concat([md.reset_index(), dimr.reset_index()], axis = 1)
45
 
 
51
  dat['body'] = dat.body.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
52
 
53
  # Visualize
54
+ fig = px.scatter(dat, x = 'umap1', y = 'umap2', hover_data = ['title', 'body'], color = 'cluster', title = 'Context similarity map of results')
55
 
56
  # Make the font a little bigger
57
  fig.update_layout(
flycheck_app.py DELETED
@@ -1,70 +0,0 @@
1
- import streamlit as st
2
- from duckduckgo_search import ddg
3
- import pandas as pd
4
- from sentence_transformers import SentenceTransformer
5
- import umap.umap_ as umap
6
- import numpy as np
7
- import sys
8
- import plotly.express as px
9
- import re
10
-
11
- # The search bar
12
- keywords = st.text_input('Enter your search', 'How to use ChatGPT')
13
-
14
- # Set keywords as command line argument
15
- # print("searching for: " + ' '.join(sys.argv[1:]) + "...")
16
- # keywords = ' '.join(sys.argv[1:])
17
-
18
- to_display = 'body' # Sometimes this is title
19
- md = ddg(keywords, region='wt-wt', safesearch='Moderate', time='y', max_results=500)
20
- md = pd.DataFrame(md)
21
-
22
- # Load the model
23
- print("running sentence embeddings...")
24
- # model_name = 'all-mpnet-base-v2'
25
- model_name = 'all-MiniLM-L6-v2'
26
- model = SentenceTransformer(model_name)
27
- sentence_embeddings = model.encode(md['body'].tolist(), show_progress_bar = True)
28
- sentence_embeddings = pd.DataFrame(sentence_embeddings)
29
-
30
- # Reduce dimensionality
31
- print("reducing dimensionality...")
32
- reducer = umap.UMAP(metric = 'cosine')
33
- dimr = reducer.fit_transform(sentence_embeddings)
34
- dimr = pd.DataFrame(dimr, columns = ['umap1', 'umap2'])
35
-
36
- columns = ['title', 'href', 'body']
37
-
38
- # Merge the data together
39
- dat = pd.concat([md.reset_index(), dimr.reset_index()], axis = 1)
40
-
41
- # handle duplicate index columns
42
- dat = dat.loc[:,~dat.columns.duplicated()]
43
-
44
- # Get it ready for plotting
45
- dat['title'] = dat.title.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
46
- dat['body'] = dat.body.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
47
-
48
- # Visualize
49
- fig = px.scatter(dat, x = 'umap1', y = 'umap2', hover_data = ['title', 'body'], title = 'Context similarity map of results')
50
-
51
- # Make the font a little bigger
52
- fig.update_layout(
53
- hoverlabel=dict(
54
- bgcolor="white",
55
- font_size=16
56
- )
57
- )
58
-
59
- # Show the figure
60
- st.plotly_chart(fig, use_container_width=True)
61
-
62
- # Remove <br> in the text for the table
63
- dat['title'] = [re.sub('<br>', ' ', i) for i in dat['title']]
64
- dat['body'] = [re.sub('<br>', ' ', i) for i in dat['body']]
65
-
66
- # Instructions
67
- st.caption('Click on the table and press ctrl+f (or command+f for mac) to search it')
68
-
69
- # Place a table under the plot
70
- st.dataframe(dat)