Spaces:

tjburns
/

duckduckgo_2d_search

Runtime error

Tyler Burns

changed the baseline result

aa47d0d almost 2 years ago

4.03 kB

	import html
	import re
	import sys

	import nltk
	import numpy as np
	import pandas as pd
	import plotly.express as px
	import sklearn.cluster as cluster
	import streamlit as st
	import umap.umap_ as umap
	from duckduckgo_search import ddg
	from keybert import KeyBERT
	from nltk.stem import WordNetLemmatizer
	from sentence_transformers import SentenceTransformer
	# Fetch the NLTK resources (tokenizer and WordNet data) used by the optional
	# keyword-extraction step further down.
	nltk.download('punkt')
	nltk.download('omw-1.4')
	nltk.download('wordnet')

	# Set a seed
	np.random.seed(42)

	# Temp, for keywords
	use_keywords = False

	# The search bar
	keywords = st.text_input('Enter your search', 'AI news')

	to_display = 'body' # Sometimes this is title

	# Run the search. An empty or missing result set would otherwise surface
	# later as a KeyError on md['title'], so tell the user and end this run.
	results = ddg(keywords, region='wt-wt', safesearch='Moderate', time='y', max_results=500)
	if not results:
	    st.warning('No search results found. Try a different query.')
	    st.stop()
	md = pd.DataFrame(results)

	# Embed every result title with a sentence-transformer model
	print("running sentence embeddings...")
	# model_name = 'all-mpnet-base-v2'
	model_name = 'all-MiniLM-L6-v2'
	model = SentenceTransformer(model_name)
	titles = md['title'].tolist()
	sentence_embeddings = pd.DataFrame(model.encode(titles, show_progress_bar=True))

	# Project the embeddings onto two UMAP axes (cosine distance)
	print("reducing dimensionality...")
	reducer = umap.UMAP(metric='cosine')
	dimr = pd.DataFrame(
	    reducer.fit_transform(sentence_embeddings),
	    columns=['umap1', 'umap2'],
	)

	columns = ['title', 'href', 'body']

	# Clustering
	# KMeans raises when asked for more clusters than there are samples, so cap
	# the cluster count at the number of results.
	n_clusters = min(5, len(dimr))
	labels = cluster.KMeans(n_clusters=n_clusters).fit_predict(dimr[['umap1', 'umap2']])

	# Store the labels as strings ("cluster 0", "cluster 1", ...): the plot then
	# colors them as discrete groups, and the cluster can be searched for in the
	# table.
	dimr['cluster'] = ['cluster ' + str(label) for label in labels]

	# Merge the data together
	dat = pd.concat([md.reset_index(), dimr.reset_index()], axis=1)

	# The keywords

	if use_keywords:
	    # Add keywords to the clusters
	    print('extracting keywords per cluster...')
	    # Create WordNetLemmatizer object
	    wnl = WordNetLemmatizer()
	    kw_model = KeyBERT()

	    keywords_df = []
	    for cluster_name in np.unique(dat['cluster']):
	        members = dat[dat['cluster'] == cluster_name]

	        # Lemmatization: join the cluster's bodies, tokenize, lemmatize each
	        # token and glue the result back into one string
	        tokens = nltk.word_tokenize(' '.join(members['body']))
	        lemmatized = ' '.join([wnl.lemmatize(token) for token in tokens])

	        # Keyword extraction
	        keywords_df.append(kw_model.extract_keywords(lemmatized)[0:10])

	    # One row per cluster, one column per extracted keyword
	    keywords_df = pd.DataFrame(keywords_df)
	    keywords_df['cluster'] = np.unique(dimr['cluster'])
	    keywords_df.columns = ['keyword1', 'keyword2', 'keyword3', 'keyword4', 'keyword5', 'cluster']

	    # Get the keyword data into the dataframe
	    # (the merge messes up the index, so it is reset right away)
	    dat = dat.merge(keywords_df).reset_index(drop=True)

	# handle duplicate index columns
	is_repeat = dat.columns.duplicated()
	dat = dat.loc[:, ~is_repeat]

	# Get it ready for plotting
	# Wrap the text at 30 characters and turn the line breaks into <br> so the
	# hover box stays narrow. The vectorised .str.replace passes missing values
	# through instead of raising on them like a per-cell str.replace call would.
	dat['title'] = dat['title'].str.wrap(30).str.replace('\n', '<br>', regex=False)
	dat['body'] = dat['body'].str.wrap(30).str.replace('\n', '<br>', regex=False)

	# Visualize the data
	fig = px.scatter(
	    dat,
	    x='umap1',
	    y='umap2',
	    hover_data=['title', 'body'],
	    color='cluster',
	    title='Context similarity map of results',
	)

	# Make the font a little bigger
	fig.update_layout(
	    hoverlabel=dict(
	        bgcolor="white",
	        font_size=16,
	    )
	)

	# x and y are same size
	fig.update_yaxes(
	    scaleanchor="x",
	    scaleratio=1,
	)

	# Show the figure
	st.plotly_chart(fig, use_container_width=True)

	# Remove <br> in the text for the table
	# (literal, NaN-safe replacement; no regex is needed for a fixed token)
	dat['title'] = dat['title'].str.replace('<br>', ' ', regex=False)
	dat['body'] = dat['body'].str.replace('<br>', ' ', regex=False)

	# Instructions
	st.caption('Use ctrl+f (or command+f for mac) to search the table')

	# remove irrelevant columns from dat
	dat = dat.drop(columns=['index', 'umap1', 'umap2'])

	# Make the link clickable
	# pandas display options
	# None means "no column-width limit", so long cells (the generated anchors
	# in particular) are not truncated when the table is rendered. The older -1
	# spelling is deprecated since pandas 1.0 and rejected by newer releases.
	pd.set_option('display.max_colwidth', None)

	def make_clickable(url, text) -> str:
	    """Return an HTML anchor labelled ``text`` that opens ``url`` in a new tab.

	    Both values are HTML-escaped before being interpolated, so quotes or
	    angle brackets in a URL or label cannot break out of the ``href``
	    attribute or inject markup into the page.

	    Args:
	        url: Link target; converted with ``str()`` before escaping.
	        text: Visible link label; converted with ``str()`` before escaping.

	    Returns:
	        The ``<a target="_blank" ...>`` element as a string.
	    """
	    safe_url = html.escape(str(url), quote=True)
	    safe_text = html.escape(str(text), quote=True)
	    return f'<a target="_blank" href="{safe_url}">{safe_text}</a>'

	# Turn every URL into a "Click here" anchor
	dat['href'] = dat['href'].apply(make_clickable, args=('Click here',))

	# The table is written out as raw HTML (escape=False keeps the anchors in
	# 'href' clickable), so pandas escapes nothing for us. The other text cells
	# come from search results and must not be interpreted as markup, so escape
	# them here; non-string cells are left untouched.
	for column in dat.columns:
	    if column != 'href':
	        dat[column] = dat[column].map(
	            lambda cell: html.escape(cell) if isinstance(cell, str) else cell
	        )

	st.write(dat.to_html(escape=False), unsafe_allow_html=True)