Tyler Burns committed on
Commit
b78c45d
·
1 Parent(s): d751509

took care of the labels

Browse files
Files changed (2) hide show
  1. app.py +1 -5
  2. flycheck_app.py +76 -0
app.py CHANGED
@@ -38,13 +38,9 @@ columns = ['title', 'href', 'body']
38
 
39
  # Clustering
40
  labels = cluster.KMeans(n_clusters=5).fit_predict(dimr[['umap1', 'umap2']])
41
-
42
- # Rename into strings
43
- labels = labels.cat.rename_categories(['cluster ' + str(i) for i in labels])
44
  dimr['cluster'] = labels
45
 
46
-
47
-
48
  # Merge the data together
49
  dat = pd.concat([md.reset_index(), dimr.reset_index()], axis = 1)
50
 
 
38
 
39
  # Clustering
40
  labels = cluster.KMeans(n_clusters=5).fit_predict(dimr[['umap1', 'umap2']])
41
+ labels = labels.astype('category')
 
 
42
  dimr['cluster'] = labels
43
 
 
 
44
  # Merge the data together
45
  dat = pd.concat([md.reset_index(), dimr.reset_index()], axis = 1)
46
 
flycheck_app.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from duckduckgo_search import ddg
3
+ import pandas as pd
4
+ from sentence_transformers import SentenceTransformer
5
+ import umap.umap_ as umap
6
+ import numpy as np
7
+ import sys
8
+ import plotly.express as px
9
+ import re
10
+ import sklearn.cluster as cluster
11
+
12
+ # The search bar
13
+ keywords = st.text_input('Enter your search', 'How to use ChatGPT')
14
+
15
+ # Set keywords as command line argument
16
+ # print("searching for: " + ' '.join(sys.argv[1:]) + "...")
17
+ # keywords = ' '.join(sys.argv[1:])
18
+
19
+ to_display = 'body' # Sometimes this is title
20
+ md = ddg(keywords, region='wt-wt', safesearch='Moderate', time='y', max_results=500)
21
+ md = pd.DataFrame(md)
22
+
23
+ # Load the model
24
+ print("running sentence embeddings...")
25
+ # model_name = 'all-mpnet-base-v2'
26
+ model_name = 'all-MiniLM-L6-v2'
27
+ model = SentenceTransformer(model_name)
28
+ sentence_embeddings = model.encode(md['body'].tolist(), show_progress_bar = True)
29
+ sentence_embeddings = pd.DataFrame(sentence_embeddings)
30
+
31
+ # Reduce dimensionality
32
+ print("reducing dimensionality...")
33
+ reducer = umap.UMAP(metric = 'cosine')
34
+ dimr = reducer.fit_transform(sentence_embeddings)
35
+ dimr = pd.DataFrame(dimr, columns = ['umap1', 'umap2'])
36
+
37
+ columns = ['title', 'href', 'body']
38
+
39
+ # Clustering
40
+ labels = cluster.KMeans(n_clusters=5).fit_predict(dimr[['umap1', 'umap2']])
41
+ labels = labels.astype('category')
42
+ dimr['cluster'] = labels
43
+
44
+ # Merge the data together
45
+ dat = pd.concat([md.reset_index(), dimr.reset_index()], axis = 1)
46
+
47
+ # handle duplicate index columns
48
+ dat = dat.loc[:,~dat.columns.duplicated()]
49
+
50
+ # Get it ready for plotting
51
+ dat['title'] = dat.title.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
52
+ dat['body'] = dat.body.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
53
+
54
+ # Visualize
55
+ fig = px.scatter(dat, x = 'umap1', y = 'umap2', hover_data = ['title', 'body'], color = 'cluster', title = 'Context similarity map of results')
56
+
57
+ # Make the font a little bigger
58
+ fig.update_layout(
59
+ hoverlabel=dict(
60
+ bgcolor="white",
61
+ font_size=16
62
+ )
63
+ )
64
+
65
+ # Show the figure
66
+ st.plotly_chart(fig, use_container_width=True)
67
+
68
+ # Remove <br> in the text for the table
69
+ dat['title'] = [re.sub('<br>', ' ', i) for i in dat['title']]
70
+ dat['body'] = [re.sub('<br>', ' ', i) for i in dat['body']]
71
+
72
+ # Instructions
73
+ st.caption('Click on the table and press ctrl+f (or command+f for mac) to search it')
74
+
75
+ # Place a table under the plot
76
+ st.dataframe(dat)