Spaces:

tjburns
/

duckduckgo_2d_search

Runtime error

duckduckgo_2d_search / flycheck_app.py

Tyler Burns

caption text change

4e1b9d6 almost 2 years ago

2.99 kB

	import html
	import re
	import sys

	import numpy as np
	import pandas as pd
	import plotly.express as px
	import sklearn.cluster as cluster
	import streamlit as st
	import umap.umap_ as umap
	from duckduckgo_search import ddg
	from sentence_transformers import SentenceTransformer

	# Seed NumPy's global random number generator.
	np.random.seed(42)

	# The search bar
	keywords = st.text_input('Enter your search', 'How to use ChatGPT')

	# NOTE: the query could instead come from the command line, e.g.
	#   keywords = ' '.join(sys.argv[1:])

	# Name of the search-result field that gets embedded (sometimes this is 'title').
	to_display = 'body'
	md = ddg(keywords, region='wt-wt', safesearch='Moderate', time='y', max_results=500)
	md = pd.DataFrame(md)

	# An empty / None result set yields a DataFrame without the expected columns;
	# stop here with a message instead of failing with a KeyError further down.
	if md.empty or to_display not in md.columns:
	    st.warning('No search results found. Try a different query.')
	    st.stop()

	# Load the model and embed the selected text field, one vector per result.
	print("running sentence embeddings...")
	# model_name = 'all-mpnet-base-v2'
	model_name = 'all-MiniLM-L6-v2'
	model = SentenceTransformer(model_name)
	sentence_embeddings = model.encode(md[to_display].tolist(), show_progress_bar=True)
	sentence_embeddings = pd.DataFrame(sentence_embeddings)

	# Reduce dimensionality: project the embeddings to two UMAP coordinates
	# using cosine distance.
	print("reducing dimensionality...")
	reducer = umap.UMAP(metric='cosine')
	dimr = reducer.fit_transform(sentence_embeddings)
	dimr = pd.DataFrame(dimr, columns=['umap1', 'umap2'])

	# Names of the search-result fields (not referenced elsewhere in the visible script).
	columns = ['title', 'href', 'body']

	# Clustering on the 2-D UMAP coordinates.
	# KMeans cannot fit more clusters than there are samples, so cap the
	# cluster count at the number of results.
	n_clusters = min(5, len(dimr))
	labels = cluster.KMeans(n_clusters=n_clusters).fit_predict(dimr[['umap1', 'umap2']])

	# Store the labels as strings ("cluster 0", "cluster 1", ...) so the plot
	# colours them as discrete groups and they can be searched in the table.
	dimr['cluster'] = ['cluster ' + str(x) for x in labels]

	# Merge the search results and their coordinates side by side.
	dat = pd.concat([md.reset_index(), dimr.reset_index()], axis=1)

	# Both frames contribute an 'index' column after reset_index(); keep the first.
	dat = dat.loc[:, ~dat.columns.duplicated()]

	# Get it ready for plotting: wrap long text at 30 characters and use <br>
	# line breaks in place of newlines. The vectorised string methods also pass
	# missing values through instead of raising.
	for col in ('title', 'body'):
	    dat[col] = dat[col].str.wrap(30).str.replace('\n', '<br>', regex=False)

	# Scatter plot of the two UMAP coordinates, coloured by cluster, with the
	# result title and body shown on hover.
	fig = px.scatter(
	    dat,
	    x='umap1',
	    y='umap2',
	    color='cluster',
	    hover_data=['title', 'body'],
	    title='Context similarity map of results',
	)

	# Larger hover text on a white background.
	hover_style = {'bgcolor': "white", 'font_size': 16}
	fig.update_layout(hoverlabel=hover_style)

	# Lock the y axis to the x axis at a 1:1 ratio so both use the same scale.
	fig.update_yaxes(scaleanchor="x", scaleratio=1)

	# Render the figure across the full container width.
	st.plotly_chart(fig, use_container_width=True)

	# The <br> breaks were only needed for the plot's hover text; turn them
	# back into spaces for the table. A literal (non-regex) replacement is enough.
	for col in ('title', 'body'):
	    dat[col] = dat[col].str.replace('<br>', ' ', regex=False)

	# Instructions
	st.caption('Use ctrl+f (or command+f for mac) to search the table')

	# Drop the helper columns (row index and plot coordinates) from the table.
	dat = dat.drop(columns=['index', 'umap1', 'umap2'])

	# Do not truncate long cell contents when the frame is rendered.
	# `None` means "no limit"; the former `-1` spelling is deprecated since
	# pandas 1.0 and is rejected by newer pandas releases.
	pd.set_option('display.max_colwidth', None)

	def make_clickable(url, text):
	    """Return an HTML anchor that opens *url* in a new tab, labelled *text*.

	    Both values are HTML-escaped before interpolation so that characters
	    such as quotes, ``&``, ``<`` and ``>`` cannot break out of the ``href``
	    attribute or inject markup into the link label.
	    """
	    safe_url = html.escape(str(url), quote=True)
	    safe_text = html.escape(str(text))
	    return f'<a target="_blank" href="{safe_url}">{safe_text}</a>'

	# The table is rendered with escape=False so the link markup survives,
	# which means every other cell would be injected as raw HTML too.
	# Escape the remaining string cells first so search-result text is shown
	# literally. NOTE(review): assumes the search results are plain text, not
	# already HTML-entity-encoded -- confirm against the search library.
	for col in [c for c in dat.columns if c != 'href']:
	    dat[col] = dat[col].map(lambda v: html.escape(v) if isinstance(v, str) else v)

	# Make the link clickable
	dat['href'] = dat['href'].apply(make_clickable, args=('Click here',))

	# Render the frame as an HTML table; unsafe_allow_html is required for
	# the anchors to be displayed as links rather than as text.
	st.write(dat.to_html(escape=False), unsafe_allow_html=True)