Tyler Burns committed on
Commit
61f93fd
·
1 Parent(s): 70dcacb

turn the cluster into a category, the pandas equivalent of R's factor

Browse files
Files changed (2) hide show
  1. app.py +3 -0
  2. flycheck_app.py +84 -0
app.py CHANGED
@@ -40,6 +40,9 @@ columns = ['title', 'href', 'body']
40
  labels = cluster.KMeans(n_clusters=5).fit_predict(dimr[['umap1', 'umap2']])
41
  dimr['cluster'] = labels
42
 
 
 
 
43
  # Merge the data together
44
  dat = pd.concat([md.reset_index(), dimr.reset_index()], axis = 1)
45
 
 
40
  labels = cluster.KMeans(n_clusters=5).fit_predict(dimr[['umap1', 'umap2']])
41
  dimr['cluster'] = labels
42
 
43
+ # Make the coloring easier on the eyes
44
+ dimr['cluster'] = dimr['cluster'].astype('category')
45
+
46
  # Merge the data together
47
  dat = pd.concat([md.reset_index(), dimr.reset_index()], axis = 1)
48
 
flycheck_app.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from duckduckgo_search import ddg
3
+ import pandas as pd
4
+ from sentence_transformers import SentenceTransformer
5
+ import umap.umap_ as umap
6
+ import numpy as np
7
+ import sys
8
+ import plotly.express as px
9
+ import re
10
+ import sklearn.cluster as cluster
11
+
12
+ # The search bar
13
+ keywords = st.text_input('Enter your search', 'How to use ChatGPT')
14
+
15
+ # Set keywords as command line argument
16
+ # print("searching for: " + ' '.join(sys.argv[1:]) + "...")
17
+ # keywords = ' '.join(sys.argv[1:])
18
+
19
+ to_display = 'body' # Sometimes this is title
20
+ md = ddg(keywords, region='wt-wt', safesearch='Moderate', time='y', max_results=500)
21
+ md = pd.DataFrame(md)
22
+
23
+ # Load the model
24
+ print("running sentence embeddings...")
25
+ # model_name = 'all-mpnet-base-v2'
26
+ model_name = 'all-MiniLM-L6-v2'
27
+ model = SentenceTransformer(model_name)
28
+ sentence_embeddings = model.encode(md['body'].tolist(), show_progress_bar = True)
29
+ sentence_embeddings = pd.DataFrame(sentence_embeddings)
30
+
31
+ # Reduce dimensionality
32
+ print("reducing dimensionality...")
33
+ reducer = umap.UMAP(metric = 'cosine')
34
+ dimr = reducer.fit_transform(sentence_embeddings)
35
+ dimr = pd.DataFrame(dimr, columns = ['umap1', 'umap2'])
36
+
37
+ columns = ['title', 'href', 'body']
38
+
39
+ # Clustering
40
+ labels = cluster.KMeans(n_clusters=5).fit_predict(dimr[['umap1', 'umap2']])
41
+ dimr['cluster'] = labels
42
+
43
+ # Make the coloring easier on the eyes
44
+ dimr['cluster'] = dimr['cluster'].astype('category')
45
+
46
+ # Merge the data together
47
+ dat = pd.concat([md.reset_index(), dimr.reset_index()], axis = 1)
48
+
49
+ # handle duplicate index columns
50
+ dat = dat.loc[:,~dat.columns.duplicated()]
51
+
52
+ # Get it ready for plotting
53
+ dat['title'] = dat.title.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
54
+ dat['body'] = dat.body.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
55
+
56
+ # Visualize
57
+ fig = px.scatter(dat, x = 'umap1', y = 'umap2', hover_data = ['title', 'body'], color = 'cluster', title = 'Context similarity map of results')
58
+
59
+ # Make the font a little bigger
60
+ fig.update_layout(
61
+ hoverlabel=dict(
62
+ bgcolor="white",
63
+ font_size=16
64
+ )
65
+ )
66
+
67
+ # Show the figure
68
+ st.plotly_chart(fig, use_container_width=True)
69
+
70
+ # Remove <br> in the text for the table
71
+ dat['title'] = [re.sub('<br>', ' ', i) for i in dat['title']]
72
+ dat['body'] = [re.sub('<br>', ' ', i) for i in dat['body']]
73
+
74
+ # Instructions
75
+ st.caption('Click on the table and press ctrl+f (or command+f for mac) to search it')
76
+
77
+ # Make the link clickable
78
+ def make_clickable(val):
79
+ return f'<a target="_blank" href="{val}">{val}</a>'
80
+
81
+ dat.style.format({'href': make_clickable})
82
+
83
+ # Place a table under the plot
84
+ st.dataframe(dat)