File size: 2,149 Bytes
a808917
7325550
0074cb1
7325550
 
 
 
b60de54
ed172d3
6808b8a
e642bfc
680895d
3246fea
7325550
 
 
3246fea
7325550
 
 
 
 
 
0df1ea2
 
7325550
 
 
 
 
 
 
 
 
 
 
 
 
1a64589
 
ed172d3
1a64589
7325550
7aa8bfe
 
 
 
7325550
 
3246fea
3b45f6b
 
 
 
1a64589
3b45f6b
 
 
 
3246fea
7aa8bfe
ed172d3
 
 
 
8ef69e0
c231297
8ef69e0
3b45f6b
b7043f1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import streamlit as st
from duckduckgo_search import ddg
import pandas as pd
from sentence_transformers import SentenceTransformer
import umap.umap_ as umap
import numpy as np
import sys
import plotly.express as px
import re

# The search bar
keywords = st.text_input('Enter your search', 'How to use ChatGPT')

# Name of the result column of interest.
to_display = 'body' # Sometimes this is title

# A blank query cannot produce anything to plot; end this run with a hint
# instead of sending an empty search.
if not keywords.strip():
    st.info('Enter a search query to build the similarity map.')
    st.stop()

# Query DuckDuckGo; region/safesearch/time are passed through unchanged.
md = ddg(keywords, region='wt-wt', safesearch='Moderate', time='y', max_results=500)

# The search may come back with nothing (None or an empty list). An empty
# DataFrame has no 'body' column, so the embedding step further down would
# fail with a KeyError; stop here with a readable message instead.
if not md:
    st.warning('No results found for this search. Try different keywords.')
    st.stop()

md = pd.DataFrame(md)

# Embed the body text of every search result
print("running sentence embeddings...")
# Alternative model: 'all-mpnet-base-v2'
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)
body_texts = md['body'].tolist()
sentence_embeddings = pd.DataFrame(
    model.encode(body_texts, show_progress_bar=True)
)

# Project the embeddings onto the two plot axes with UMAP (cosine metric)
print("reducing dimensionality...")
reducer = umap.UMAP(metric='cosine')
projected = reducer.fit_transform(sentence_embeddings)
dimr = pd.DataFrame(data=projected, columns=['umap1', 'umap2'])

columns = ['title', 'href', 'body']

# Put the search results and their UMAP coordinates side by side
results_part = md.reset_index()
coords_part = dimr.reset_index()
dat = pd.concat([results_part, coords_part], axis=1)

# Both parts carry an 'index' column after reset_index(); keep the first copy
is_repeat = dat.columns.duplicated()
dat = dat.loc[:, ~is_repeat]


# Hover text: wrap at 30 characters and use <br> as the line break
def _newlines_to_br(text):
    return text.replace('\n', '<br>')


for text_col in ('title', 'body'):
    dat[text_col] = dat[text_col].str.wrap(30).apply(_newlines_to_br)

# Scatter plot of the UMAP coordinates; title and body appear on hover
fig = px.scatter(
    dat,
    x='umap1',
    y='umap2',
    hover_data=['title', 'body'],
    title='Context similarity map of results',
)

# Larger hover text on a white background
hover_style = {'bgcolor': "white", 'font_size': 16}
fig.update_layout(hoverlabel=hover_style)

# Render the chart at the full width of its container
st.plotly_chart(fig, use_container_width=True)

# The table shows plain text, so turn the <br> markers back into spaces
br_marker = re.compile('<br>')
for text_col in ('title', 'body'):
    dat[text_col] = [br_marker.sub(' ', cell) for cell in dat[text_col]]

# Tell the user how to search the table
st.caption('Click on the table and press ctrl+f (or command+f for mac) to search it')

# Results table, rendered below the plot
st.dataframe(dat)