"""Streamlit app: semantic map of DuckDuckGo search results.

Searches DuckDuckGo for a user-supplied query, embeds the result titles
with a SentenceTransformer, projects the embeddings to 2D with UMAP,
clusters them with k-means, and renders an interactive Plotly scatter
plot plus a searchable HTML table with clickable links.

Run with:  streamlit run <this_file>.py
"""

import streamlit as st
# NOTE(review): `ddg` is deprecated upstream in favor of `DDGS` — confirm
# the installed duckduckgo_search version still exposes it.
from duckduckgo_search import ddg
import pandas as pd
from sentence_transformers import SentenceTransformer
import umap.umap_ as umap
import numpy as np
import sys
import plotly.express as px
import re
import sklearn.cluster as cluster
import nltk
from nltk.stem import WordNetLemmatizer
from keybert import KeyBERT

# Tokenizer / lemmatizer resources needed by the optional keyword step
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('wordnet')

# Seed NumPy so the UMAP projection is reproducible between reruns
np.random.seed(42)

# Feature flag: per-cluster keyword extraction is slow, off by default
use_keywords = False

# The search bar
keywords = st.text_input('Enter your search', 'AI news')

to_display = 'body'  # Sometimes this is title

# Fetch up to 500 results for the query (worldwide, past year)
md = ddg(keywords, region='wt-wt', safesearch='Moderate', time='y',
         max_results=500)
md = pd.DataFrame(md)

# Load the sentence-embedding model
print("running sentence embeddings...")
# model_name = 'all-mpnet-base-v2'
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)
sentence_embeddings = model.encode(md['title'].tolist(), show_progress_bar=True)
sentence_embeddings = pd.DataFrame(sentence_embeddings)

# Reduce dimensionality to 2D for plotting
print("reducing dimensionality...")
reducer = umap.UMAP(metric='cosine')
dimr = reducer.fit_transform(sentence_embeddings)
dimr = pd.DataFrame(dimr, columns=['umap1', 'umap2'])

columns = ['title', 'href', 'body']

# Cluster the 2D projection; random_state pins k-means so colors are
# stable across reruns (matches the np.random.seed(42) intent above)
labels = cluster.KMeans(n_clusters=5, random_state=42).fit_predict(
    dimr[['umap1', 'umap2']])
dimr['cluster'] = labels

# Categorical dtype makes the Plotly coloring discrete (easier on the eyes)
dimr['cluster'] = dimr['cluster'].astype('category')

# Human-readable labels so a cluster can be searched for in the table
dimr['cluster'] = ['cluster ' + str(x) for x in dimr['cluster']]

# Merge the search results with their 2D coordinates
dat = pd.concat([md.reset_index(), dimr.reset_index()], axis=1)

# Optional: extract representative keywords per cluster with KeyBERT
if use_keywords:
    print('extracting keywords per cluster...')
    wnl = WordNetLemmatizer()
    kw_model = KeyBERT()

    keywords_df = []
    for i in np.unique(dat['cluster']):
        curr = dat[dat['cluster'] == i]
        text = ' '.join(curr['body'])

        # Lemmatize so inflected forms collapse before keyword scoring
        text = nltk.word_tokenize(text)
        text = [wnl.lemmatize(i) for i in text]
        text = ' '.join(text)

        # extract_keywords returns (keyword, score) pairs, top 5 by default
        TR_keywords = kw_model.extract_keywords(text)
        keywords_df.append(TR_keywords[0:10])

    keywords_df = pd.DataFrame(keywords_df)
    keywords_df['cluster'] = np.unique(dimr['cluster'])
    keywords_df.columns = ['keyword1', 'keyword2', 'keyword3', 'keyword4',
                           'keyword5', 'cluster']

    # Get the keyword data into the dataframe
    dat = dat.merge(keywords_df)

# The merge/concat messes up the index, so we need to reset it
dat = dat.reset_index(drop=True)

# Drop duplicate columns (the concat produced two 'index' columns)
dat = dat.loc[:, ~dat.columns.duplicated()]

# Wrap long strings with <br> so the Plotly hover boxes stay narrow
dat['title'] = dat.title.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
dat['body'] = dat.body.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))

# Visualize the data
fig = px.scatter(dat, x='umap1', y='umap2',
                 hover_data=['title', 'body'],
                 color='cluster',
                 title='Context similarity map of results')

# Make the hover font a little bigger
fig.update_layout(
    hoverlabel=dict(
        bgcolor="white",
        font_size=16
    )
)

# x and y are same scale
fig.update_yaxes(
    scaleanchor="x",
    scaleratio=1,
)

# Show the figure
st.plotly_chart(fig, use_container_width=True)

# Remove the <br> tags again before showing the text in the table
dat['title'] = [re.sub('<br>', ' ', i) for i in dat['title']]
dat['body'] = [re.sub('<br>', ' ', i) for i in dat['body']]

# Instructions
st.caption('Use ctrl+f (or command+f for mac) to search the table')

# remove irrelevant columns from dat
dat = dat.drop(columns=['index', 'umap1', 'umap2'])

# pandas display options: None means "no column-width limit"
# (the old -1 sentinel is rejected by modern pandas)
pd.set_option('display.max_colwidth', None)


def make_clickable(url, text):
    """Return an HTML anchor so the table's href column is clickable."""
    return f'<a target="_blank" href="{url}">{text}</a>'


dat['href'] = dat['href'].apply(make_clickable, args=('Click here',))

# Render the table as raw HTML so the anchor tags stay live
st.write(dat.to_html(escape=False), unsafe_allow_html=True)