Tyler Burns committed on
Commit
4e1b9d6
1 Parent(s): e9aea82

caption text change

Browse files
Files changed (2) hide show
  1. app.py +2 -1
  2. flycheck_app.py +102 -0
app.py CHANGED
@@ -46,6 +46,7 @@ dimr['cluster'] = labels
46
  # Make the coloring easier on the eyes
47
  dimr['cluster'] = dimr['cluster'].astype('category')
48
 
 
49
  dimr['cluster'] = ['cluster ' + str(x) for x in dimr['cluster']]
50
 
51
  # Merge the data together
@@ -83,7 +84,7 @@ dat['title'] = [re.sub('<br>', ' ', i) for i in dat['title']]
83
  dat['body'] = [re.sub('<br>', ' ', i) for i in dat['body']]
84
 
85
  # Instructions
86
- st.caption('Click on the table and press ctrl+f (or command+f for mac) to search it')
87
 
88
  # remove columns umap1 and umap2 from dat
89
  dat = dat.drop(columns=['index', 'umap1', 'umap2'])
 
46
  # Make the coloring easier on the eyes
47
  dimr['cluster'] = dimr['cluster'].astype('category')
48
 
49
+ # Now we can search cluster in the table
50
  dimr['cluster'] = ['cluster ' + str(x) for x in dimr['cluster']]
51
 
52
  # Merge the data together
 
84
  dat['body'] = [re.sub('<br>', ' ', i) for i in dat['body']]
85
 
86
  # Instructions
87
+ st.caption('Use ctrl+f (or command+f for mac) to search the table')
88
 
89
  # remove columns umap1 and umap2 from dat
90
  dat = dat.drop(columns=['index', 'umap1', 'umap2'])
flycheck_app.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from duckduckgo_search import ddg
3
+ import pandas as pd
4
+ from sentence_transformers import SentenceTransformer
5
+ import umap.umap_ as umap
6
+ import numpy as np
7
+ import sys
8
+ import plotly.express as px
9
+ import re
10
+ import sklearn.cluster as cluster
11
+
12
+ # Set a seed
13
+ np.random.seed(42)
14
+
15
+ # The search bar
16
+ keywords = st.text_input('Enter your search', 'How to use ChatGPT')
17
+
18
+ # Set keywords as command line argument
19
+ # print("searching for: " + ' '.join(sys.argv[1:]) + "...")
20
+ # keywords = ' '.join(sys.argv[1:])
21
+
22
+ to_display = 'body' # Sometimes this is title
23
+ md = ddg(keywords, region='wt-wt', safesearch='Moderate', time='y', max_results=500)
24
+ md = pd.DataFrame(md)
25
+
26
+ # Load the model
27
+ print("running sentence embeddings...")
28
+ # model_name = 'all-mpnet-base-v2'
29
+ model_name = 'all-MiniLM-L6-v2'
30
+ model = SentenceTransformer(model_name)
31
+ sentence_embeddings = model.encode(md['body'].tolist(), show_progress_bar = True)
32
+ sentence_embeddings = pd.DataFrame(sentence_embeddings)
33
+
34
+ # Reduce dimensionality
35
+ print("reducing dimensionality...")
36
+ reducer = umap.UMAP(metric = 'cosine')
37
+ dimr = reducer.fit_transform(sentence_embeddings)
38
+ dimr = pd.DataFrame(dimr, columns = ['umap1', 'umap2'])
39
+
40
+ columns = ['title', 'href', 'body']
41
+
42
+ # Clustering
43
+ labels = cluster.KMeans(n_clusters=5).fit_predict(dimr[['umap1', 'umap2']])
44
+ dimr['cluster'] = labels
45
+
46
+ # Make the coloring easier on the eyes
47
+ dimr['cluster'] = dimr['cluster'].astype('category')
48
+
49
+ # Now we can search cluster in the table
50
+ dimr['cluster'] = ['cluster ' + str(x) for x in dimr['cluster']]
51
+
52
+ # Merge the data together
53
+ dat = pd.concat([md.reset_index(), dimr.reset_index()], axis = 1)
54
+
55
+ # handle duplicate index columns
56
+ dat = dat.loc[:,~dat.columns.duplicated()]
57
+
58
+ # Get it ready for plotting
59
+ dat['title'] = dat.title.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
60
+ dat['body'] = dat.body.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
61
+
62
+ # Visualize the data
63
+ fig = px.scatter(dat, x = 'umap1', y = 'umap2', hover_data = ['title', 'body'], color = 'cluster', title = 'Context similarity map of results')
64
+
65
+ # Make the font a little bigger
66
+ fig.update_layout(
67
+ hoverlabel=dict(
68
+ bgcolor="white",
69
+ font_size=16
70
+ )
71
+ )
72
+
73
+ # x and y are same size
74
+ fig.update_yaxes(
75
+ scaleanchor="x",
76
+ scaleratio=1,
77
+ )
78
+
79
+ # Show the figure
80
+ st.plotly_chart(fig, use_container_width=True)
81
+
82
+ # Remove <br> in the text for the table
83
+ dat['title'] = [re.sub('<br>', ' ', i) for i in dat['title']]
84
+ dat['body'] = [re.sub('<br>', ' ', i) for i in dat['body']]
85
+
86
+ # Instructions
87
+ st.caption('Use ctrl+f (or command+f for mac) to search the table')
88
+
89
+ # remove columns umap1 and umap2 from dat
90
+ dat = dat.drop(columns=['index', 'umap1', 'umap2'])
91
+
92
+ # Make the link clickable
93
+ # pandas display options
94
+ pd.set_option('display.max_colwidth', -1)
95
+
96
+ def make_clickable(url, text):
97
+ return f'<a target="_blank" href="{url}">{text}</a>'
98
+
99
+ dat['href'] = dat['href'].apply(make_clickable, args = ('Click here',))
100
+
101
+ st.write(dat.to_html(escape = False), unsafe_allow_html = True)
102
+