"""Streamlit app: search DuckDuckGo and map the results by text similarity.

Pipeline: run the search, embed one text column of the results with a
sentence-transformer model, project the embeddings to 2-D with UMAP, colour
the points with KMeans clusters, then show an interactive Plotly scatter
plot followed by a table of the results with clickable links.
"""
import html
import re
import sys

import numpy as np
import pandas as pd
import plotly.express as px
import sklearn.cluster as cluster
import streamlit as st
import umap.umap_ as umap
from duckduckgo_search import ddg
from sentence_transformers import SentenceTransformer

# Number of KMeans clusters used to colour the map.
N_CLUSTERS = 5
# Width (in characters) at which hover text is wrapped.
HOVER_WRAP_WIDTH = 30
# Plotly renders hover text as HTML, so line breaks have to be <br> tags.
HOVER_LINE_BREAK = '<br>'


def make_clickable(url, text):
    """Return an HTML anchor that links to *url* and displays *text*.

    Both values are HTML-escaped because the URL comes from web search
    results and the anchor is later rendered with HTML escaping disabled.
    """
    safe_url = html.escape(str(url), quote=True)
    safe_text = html.escape(str(text))
    return f'<a target="_blank" rel="noopener noreferrer" href="{safe_url}">{safe_text}</a>'


# Set a seed
np.random.seed(42)

# The search bar
keywords = st.text_input('Enter your search', 'How to use ChatGPT')

# Alternative (disabled): take the keywords from the command line instead,
# e.g. keywords = ' '.join(sys.argv[1:]). The `sys` import is kept for that.

# Result column whose text is embedded.
to_display = 'body'  # Sometimes this is title

md = ddg(keywords, region='wt-wt', safesearch='Moderate', time='y', max_results=500)
md = pd.DataFrame(md)

# An empty result set has no 'body' column, and KMeans cannot fit more
# clusters than there are samples, so stop early instead of crashing.
if len(md) < N_CLUSTERS:
    st.warning(
        f'The search returned {len(md)} result(s); at least {N_CLUSTERS} '
        'are needed to build the map. Try a different search.'
    )
    st.stop()

# Load the model
print("running sentence embeddings...")
# model_name = 'all-mpnet-base-v2'
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)
# Missing snippets are embedded as empty strings rather than passed as NaN.
texts = md[to_display].fillna('').astype(str).tolist()
sentence_embeddings = model.encode(texts, show_progress_bar=True)
sentence_embeddings = pd.DataFrame(sentence_embeddings)

# Reduce dimensionality
print("reducing dimensionality...")
reducer = umap.UMAP(metric='cosine')
dimr = reducer.fit_transform(sentence_embeddings)
dimr = pd.DataFrame(dimr, columns=['umap1', 'umap2'])

# Clustering
labels = cluster.KMeans(n_clusters=N_CLUSTERS).fit_predict(dimr[['umap1', 'umap2']])

# String labels make plotly use a discrete colour scale, and they make the
# cluster searchable as text in the table below.
dimr['cluster'] = [f'cluster {label}' for label in labels]

# Merge the data together; both frames are aligned on a fresh 0..n-1 index.
dat = pd.concat([md.reset_index(drop=True), dimr], axis=1)

# Get it ready for plotting: wrap long text so the hover box stays narrow.
for column in ('title', 'body'):
    dat[column] = (
        dat[column]
        .str.wrap(HOVER_WRAP_WIDTH)
        .str.replace('\n', HOVER_LINE_BREAK, regex=False)
    )

# Visualize the data
fig = px.scatter(
    dat,
    x='umap1',
    y='umap2',
    hover_data=['title', 'body'],
    color='cluster',
    title='Context similarity map of results',
)

# Make the font a little bigger
fig.update_layout(
    hoverlabel=dict(
        bgcolor="white",
        font_size=16,
    )
)

# x and y are same size
fig.update_yaxes(
    scaleanchor="x",
    scaleratio=1,
)

# Show the figure
st.plotly_chart(fig, use_container_width=True)

# Remove <br> in the text for the table, then escape the text: the table is
# rendered with escape=False so that the link column works, which means any
# markup in the search results would otherwise be interpreted by the browser.
for column in ('title', 'body'):
    dat[column] = (
        dat[column]
        .str.replace(HOVER_LINE_BREAK, ' ', regex=False)
        .map(html.escape, na_action='ignore')
    )

# Instructions
st.caption('Use ctrl+f (or command+f for mac) to search the table')

# remove columns umap1 and umap2 from dat
dat = dat.drop(columns=['umap1', 'umap2'])

# Make the link clickable
# pandas display options: None disables column-width truncation, which
# to_html would otherwise apply to long cells (including the anchor tags).
pd.set_option('display.max_colwidth', None)

dat['href'] = dat['href'].apply(make_clickable, args=('Click here',))

st.write(dat.to_html(escape=False), unsafe_allow_html=True)