Spaces:
Runtime error
Runtime error
Tyler Burns
committed on
Commit
•
8392e09
1
Parent(s):
680895d
added clustering
Browse files
- app.py +6 -1
- flycheck_app.py +0 -70
app.py
CHANGED
@@ -7,6 +7,7 @@ import numpy as np
|
|
7 |
import sys
|
8 |
import plotly.express as px
|
9 |
import re
|
|
|
10 |
|
11 |
# The search bar
|
12 |
keywords = st.text_input('Enter your search', 'How to use ChatGPT')
|
@@ -35,6 +36,10 @@ dimr = pd.DataFrame(dimr, columns = ['umap1', 'umap2'])
|
|
35 |
|
36 |
columns = ['title', 'href', 'body']
|
37 |
|
|
|
|
|
|
|
|
|
38 |
# Merge the data together
|
39 |
dat = pd.concat([md.reset_index(), dimr.reset_index()], axis = 1)
|
40 |
|
@@ -46,7 +51,7 @@ dat['title'] = dat.title.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
|
|
46 |
dat['body'] = dat.body.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
|
47 |
|
48 |
# Visualize
|
49 |
-
fig = px.scatter(dat, x = 'umap1', y = 'umap2', hover_data = ['title', 'body'], title = 'Context similarity map of results')
|
50 |
|
51 |
# Make the font a little bigger
|
52 |
fig.update_layout(
|
|
|
7 |
import sys
|
8 |
import plotly.express as px
|
9 |
import re
|
10 |
+
import sklearn.cluster as cluster
|
11 |
|
12 |
# The search bar
|
13 |
keywords = st.text_input('Enter your search', 'How to use ChatGPT')
|
|
|
36 |
|
37 |
columns = ['title', 'href', 'body']
|
38 |
|
39 |
+
# Clustering
|
40 |
+
labels = cluster.KMeans(n_clusters=5).fit_predict(dimr[['umap1', 'umap2']])
|
41 |
+
dimr['cluster'] = labels
|
42 |
+
|
43 |
# Merge the data together
|
44 |
dat = pd.concat([md.reset_index(), dimr.reset_index()], axis = 1)
|
45 |
|
|
|
51 |
dat['body'] = dat.body.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
|
52 |
|
53 |
# Visualize
|
54 |
+
fig = px.scatter(dat, x = 'umap1', y = 'umap2', hover_data = ['title', 'body'], color = 'cluster', title = 'Context similarity map of results')
|
55 |
|
56 |
# Make the font a little bigger
|
57 |
fig.update_layout(
|
flycheck_app.py
DELETED
@@ -1,70 +0,0 @@
|
|
1 |
-
import streamlit as st
|
2 |
-
from duckduckgo_search import ddg
|
3 |
-
import pandas as pd
|
4 |
-
from sentence_transformers import SentenceTransformer
|
5 |
-
import umap.umap_ as umap
|
6 |
-
import numpy as np
|
7 |
-
import sys
|
8 |
-
import plotly.express as px
|
9 |
-
import re
|
10 |
-
|
11 |
-
# The search bar
|
12 |
-
keywords = st.text_input('Enter your search', 'How to use ChatGPT')
|
13 |
-
|
14 |
-
# Set keywords as command line argument
|
15 |
-
# print("searching for: " + ' '.join(sys.argv[1:]) + "...")
|
16 |
-
# keywords = ' '.join(sys.argv[1:])
|
17 |
-
|
18 |
-
to_display = 'body' # Sometimes this is title
|
19 |
-
md = ddg(keywords, region='wt-wt', safesearch='Moderate', time='y', max_results=500)
|
20 |
-
md = pd.DataFrame(md)
|
21 |
-
|
22 |
-
# Load the model
|
23 |
-
print("running sentence embeddings...")
|
24 |
-
# model_name = 'all-mpnet-base-v2'
|
25 |
-
model_name = 'all-MiniLM-L6-v2'
|
26 |
-
model = SentenceTransformer(model_name)
|
27 |
-
sentence_embeddings = model.encode(md['body'].tolist(), show_progress_bar = True)
|
28 |
-
sentence_embeddings = pd.DataFrame(sentence_embeddings)
|
29 |
-
|
30 |
-
# Reduce dimensionality
|
31 |
-
print("reducing dimensionality...")
|
32 |
-
reducer = umap.UMAP(metric = 'cosine')
|
33 |
-
dimr = reducer.fit_transform(sentence_embeddings)
|
34 |
-
dimr = pd.DataFrame(dimr, columns = ['umap1', 'umap2'])
|
35 |
-
|
36 |
-
columns = ['title', 'href', 'body']
|
37 |
-
|
38 |
-
# Merge the data together
|
39 |
-
dat = pd.concat([md.reset_index(), dimr.reset_index()], axis = 1)
|
40 |
-
|
41 |
-
# handle duplicate index columns
|
42 |
-
dat = dat.loc[:,~dat.columns.duplicated()]
|
43 |
-
|
44 |
-
# Get it ready for plotting
|
45 |
-
dat['title'] = dat.title.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
|
46 |
-
dat['body'] = dat.body.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
|
47 |
-
|
48 |
-
# Visualize
|
49 |
-
fig = px.scatter(dat, x = 'umap1', y = 'umap2', hover_data = ['title', 'body'], title = 'Context similarity map of results')
|
50 |
-
|
51 |
-
# Make the font a little bigger
|
52 |
-
fig.update_layout(
|
53 |
-
hoverlabel=dict(
|
54 |
-
bgcolor="white",
|
55 |
-
font_size=16
|
56 |
-
)
|
57 |
-
)
|
58 |
-
|
59 |
-
# Show the figure
|
60 |
-
st.plotly_chart(fig, use_container_width=True)
|
61 |
-
|
62 |
-
# Remove <br> in the text for the table
|
63 |
-
dat['title'] = [re.sub('<br>', ' ', i) for i in dat['title']]
|
64 |
-
dat['body'] = [re.sub('<br>', ' ', i) for i in dat['body']]
|
65 |
-
|
66 |
-
# Instructions
|
67 |
-
st.caption('Click on the table and press ctrl+f (or command+f for mac) to search it')
|
68 |
-
|
69 |
-
# Place a table under the plot
|
70 |
-
st.dataframe(dat)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|