File size: 2,994 Bytes
4e1b9d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import html
import re
import sys

import numpy as np
import pandas as pd
import plotly.express as px
import sklearn.cluster as cluster
import streamlit as st
import umap.umap_ as umap
from duckduckgo_search import ddg
from sentence_transformers import SentenceTransformer

# Seed NumPy's global random number generator.
np.random.seed(42)

# The search bar
keywords = st.text_input('Enter your search', 'How to use ChatGPT')

# For command-line use instead of the search bar, the keywords could be taken
# from the arguments: keywords = ' '.join(sys.argv[1:])

# Nothing to search for: end this run instead of sending a blank query.
if not keywords.strip():
    st.warning('Please enter a search term.')
    st.stop()

to_display = 'body'  # Sometimes this is title
md = ddg(keywords, region='wt-wt', safesearch='Moderate', time='y', max_results=500)

# pd.DataFrame(None) and pd.DataFrame([]) both produce an empty frame, so a
# search without hits is caught here rather than surfacing later as a KeyError
# when the 'body' column is read.
md = pd.DataFrame(md)
required_columns = {'title', 'href', 'body'}
if md.empty or not required_columns.issubset(md.columns):
    st.warning('No results found. Try a different search.')
    st.stop()

# Encode every result snippet into a sentence embedding.
# ('all-mpnet-base-v2' is the alternative model name kept here for reference.)
print("running sentence embeddings...")
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)
snippets = md['body'].tolist()
sentence_embeddings = pd.DataFrame(
    model.encode(snippets, show_progress_bar=True)
)

# Project the embeddings down to two UMAP coordinates using cosine distance.
print("reducing dimensionality...")
reducer = umap.UMAP(metric='cosine')
dimr = pd.DataFrame(
    reducer.fit_transform(sentence_embeddings),
    columns=['umap1', 'umap2'],
)

# NOTE(review): not referenced anywhere else in the visible script.
columns = ['title', 'href', 'body']

# Cluster the 2-D UMAP coordinates with k-means. KMeans raises when asked for
# more clusters than there are rows, so cap the count for small result sets.
n_clusters = min(5, len(dimr))
labels = cluster.KMeans(n_clusters=n_clusters).fit_predict(dimr[['umap1', 'umap2']])

# Store the labels as strings ("cluster 0", "cluster 1", ...) so they can be
# searched for in the results table and are used as discrete values for
# colouring the plot.
dimr['cluster'] = ['cluster ' + str(label) for label in labels]

# Put the search results and their coordinates/cluster side by side.
# reset_index() adds an 'index' column to both frames; the duplicate is
# removed below, and the surviving one is dropped later before the table
# is rendered.
dat = pd.concat([md.reset_index(), dimr.reset_index()], axis=1)

# Keep only the first occurrence of each column name.
dat = dat.loc[:, ~dat.columns.duplicated()]

# Build a plotting-only copy whose hover text is wrapped at 30 characters,
# with newlines turned into '<br>' for line breaks in the hover labels.
# Wrapping hard-splits long words, so it is done on a copy to keep the
# title/body text in `dat` intact for the results table. The vectorised
# .str methods also pass missing values through instead of raising.
plot_dat = dat.assign(
    title=dat['title'].str.wrap(30).str.replace('\n', '<br>', regex=False),
    body=dat['body'].str.wrap(30).str.replace('\n', '<br>', regex=False),
)

# Scatter plot of the UMAP coordinates, coloured by cluster, with the result
# title and snippet shown on hover.
fig = px.scatter(
    plot_dat,
    x='umap1',
    y='umap2',
    hover_data=['title', 'body'],
    color='cluster',
    title='Context similarity map of results',
)

# Make the hover font a little bigger
fig.update_layout(
    hoverlabel=dict(
        bgcolor="white",
        font_size=16,
    )
)

# Lock the y axis scale to the x axis so both use the same unit length
fig.update_yaxes(
    scaleanchor="x",
    scaleratio=1,
)

# Show the figure
st.plotly_chart(fig, use_container_width=True)

# Replace any '<br>' line-break markup in the text with a space so the table
# shows plain running text.
dat['title'] = dat['title'].str.replace('<br>', ' ', regex=False)
dat['body'] = dat['body'].str.replace('<br>', ' ', regex=False)

# Instructions
st.caption('Use ctrl+f (or command+f for mac) to search the table')

# Drop the helper columns (row index and UMAP coordinates) from the table data
dat = dat.drop(columns=['index', 'umap1', 'umap2'])

# Do not truncate long cell text when the frame is rendered.
# None means "no limit"; the old -1 sentinel was deprecated in pandas 1.0 and
# is rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)

def make_clickable(url, text):
    """Return an HTML anchor that opens *url* in a new tab, labelled *text*.

    Both values are HTML-escaped before interpolation, because the result is
    rendered as raw HTML and the URL comes from external search results.
    Non-string values are converted with ``str()`` first.
    """
    safe_url = html.escape(str(url), quote=True)
    safe_text = html.escape(str(text), quote=True)
    return f'<a target="_blank" href="{safe_url}">{safe_text}</a>'

# Turn each result URL into a "Click here" link.
dat['href'] = dat['href'].apply(make_clickable, args=('Click here',))

# The table is written as raw HTML (escape=False) so the anchors stay
# clickable, which means every other cell would be injected verbatim as well.
# Titles and snippets come from external search results, so escape all string
# cells except the link column, on a render-only copy.
table = dat.copy()
for col in table.columns:
    if col != 'href':
        table[col] = table[col].map(
            lambda value: html.escape(value) if isinstance(value, str) else value
        )

st.write(table.to_html(escape=False), unsafe_allow_html=True)