Spaces:
Runtime error
Runtime error
File size: 4,031 Bytes
a808917 7325550 0074cb1 7325550 b60de54 ed172d3 8392e09 ff51de8 ed73f92 6808b8a aab8e56 c6a44e3 e642bfc aa47d0d 3246fea 7325550 0df1ea2 7325550 3952d5a 7325550 8392e09 61f93fd 4e1b9d6 e9aea82 68a4968 ff51de8 c6a44e3 ff51de8 c6a44e3 ff51de8 c6a44e3 ff51de8 c6a44e3 ff51de8 c6a44e3 8a8f76b c6a44e3 ff51de8 ed172d3 1a64589 7325550 7aa8bfe 3424c86 d141533 3246fea 3b45f6b 1a64589 3b45f6b ed545e7 3b45f6b 3246fea 7aa8bfe ed172d3 8ef69e0 4e1b9d6 8ef69e0 577da95 9e26aef bac21e8 70dcacb 376af7a 8e8c7fd 376af7a 70dcacb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
import html
import re
import sys

import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import sklearn.cluster as cluster
import streamlit as st
import umap.umap_ as umap
from duckduckgo_search import ddg
from keybert import KeyBERT
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
# Fetch the NLTK resources this script downloads at start-up
for nltk_resource in ('punkt', 'omw-1.4', 'wordnet'):
    nltk.download(nltk_resource)

# Set a seed on NumPy's global random generator
np.random.seed(42)
# Temp, for keywords: set to True to run the per-cluster keyword extraction
use_keywords = False

# The search bar
keywords = st.text_input('Enter your search', 'AI news')

to_display = 'body' # Sometimes this is title

# Run the web search for the entered text (any region, moderate safe search,
# past year, at most 500 results).
# NOTE(review): `ddg` is the legacy function API of duckduckgo_search -- confirm
# the pinned package version still provides it.
results = ddg(keywords, region='wt-wt', safesearch='Moderate', time='y', max_results=500)

# With no results the DataFrame would have no 'title' column and the embedding
# step that reads md['title'] would fail with a KeyError, so tell the user and
# end this script run instead.
if not results:
    st.warning('No results found. Try a different search.')
    st.stop()

md = pd.DataFrame(results)
# Load the sentence-embedding model and encode every result title
print("running sentence embeddings...")

# model_name = 'all-mpnet-base-v2'
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

# One embedding row per title, wrapped in a DataFrame for the next step
titles = md['title'].tolist()
sentence_embeddings = pd.DataFrame(
    model.encode(titles, show_progress_bar=True)
)
# Reduce dimensionality: project the embeddings to two UMAP coordinates
print("reducing dimensionality...")

reducer = umap.UMAP(metric='cosine')
dimr = pd.DataFrame(
    reducer.fit_transform(sentence_embeddings),
    columns=['umap1', 'umap2'],
)

# Names of the search-result fields (not referenced again in the visible script)
columns = ['title', 'href', 'body']
# Clustering
# KMeans raises a ValueError when asked for more clusters than there are
# points, so never request more clusters than there are results.
n_clusters = min(5, len(dimr))
labels = cluster.KMeans(n_clusters=n_clusters).fit_predict(dimr[['umap1', 'umap2']])

# Store the labels as strings ('cluster 0', 'cluster 1', ...) so that the
# cluster can be searched for in the table.
dimr['cluster'] = [f'cluster {label}' for label in labels]

# Merge the data together: search results and their 2-D coordinates side by
# side. Both reset_index() calls add an 'index' column, so the merged frame
# carries that column twice.
dat = pd.concat([md.reset_index(), dimr.reset_index()], axis=1)
# The keywords
if use_keywords:
    # Add keywords to the clusters
    print('extracting keywords per cluster...')

    # Create WordNetLemmatizer object and the keyword extractor
    wnl = WordNetLemmatizer()
    kw_model = KeyBERT()

    keywords_df = []
    for i in np.unique(dat['cluster']):
        # All result snippets of this cluster, joined into one document
        curr = dat[dat['cluster'] == i]
        tokens = nltk.word_tokenize(' '.join(curr['body']))

        # Lemmatization
        text = ' '.join([wnl.lemmatize(token) for token in tokens])

        # Keyword extraction
        TR_keywords = kw_model.extract_keywords(text)
        keywords_df.append(TR_keywords[0:10])

    # One row per cluster; the renaming below expects exactly five keyword
    # columns plus the cluster label.
    keywords_df = pd.DataFrame(keywords_df)
    keywords_df['cluster'] = np.unique(dimr['cluster'])
    keywords_df.columns = [
        'keyword1',
        'keyword2',
        'keyword3',
        'keyword4',
        'keyword5',
        'cluster',
    ]

    # Get the keyword data into the dataframe.
    # The merge messes up the index, so we need to reset it.
    dat = dat.merge(keywords_df).reset_index(drop=True)
# Handle duplicate index columns: keep only the first column of each name
repeated_column = dat.columns.duplicated()
dat = dat.loc[:, ~repeated_column]

# Get it ready for plotting: wrap the text at 30 characters and turn the
# line breaks into <br> tags
for text_col in ('title', 'body'):
    wrapped = dat[text_col].str.wrap(30)
    dat[text_col] = wrapped.apply(lambda cell: cell.replace('\n', '<br>'))
# Visualize the data: one point per result, coloured by cluster, with the
# title and body shown on hover
fig = px.scatter(
    dat,
    x='umap1',
    y='umap2',
    color='cluster',
    hover_data=['title', 'body'],
    title='Context similarity map of results',
)

# Make the font a little bigger
fig.update_layout(hoverlabel={'bgcolor': "white", 'font_size': 16})

# x and y are same size
fig.update_yaxes(scaleanchor="x", scaleratio=1)

# Show the figure
st.plotly_chart(fig, use_container_width=True)
# Remove <br> in the text for the table (the tags were only for the hover labels)
for text_col in ('title', 'body'):
    dat[text_col] = [re.sub('<br>', ' ', cell) for cell in dat[text_col]]

# Instructions
st.caption('Use ctrl+f (or command+f for mac) to search the table')

# Remove irrelevant columns from dat
dat = dat.drop(columns=['index', 'umap1', 'umap2'])
# Make the link clickable
# pandas display options: lift the column-width limit so long cell contents
# are not cut short. None is the supported way to say "no limit"; the former
# value -1 was deprecated in pandas 1.0 and newer releases reject it.
pd.set_option('display.max_colwidth', None)
def make_clickable(url: str, text: str) -> str:
    """Return an HTML anchor that opens *url* in a new tab, labelled *text*.

    Both values are HTML-escaped before being placed in the markup, so a URL
    or label containing quotes, ``&`` or angle brackets cannot break out of
    the ``href`` attribute or inject extra tags.

    Args:
        url: Link target; converted with ``str()`` before escaping.
        text: Visible link label; converted with ``str()`` before escaping.

    Returns:
        The ``<a target="_blank" ...>`` element as a string.
    """
    safe_url = html.escape(str(url), quote=True)
    safe_text = html.escape(str(text))
    return f'<a target="_blank" href="{safe_url}">{safe_text}</a>'
# Turn every URL into a "Click here" link
dat['href'] = dat['href'].apply(make_clickable, args = ('Click here',))

# The table is written to the page as raw HTML (escape=False together with
# unsafe_allow_html=True) so the links stay clickable. The remaining columns
# hold text from the search results, so escape them to stop markup in a title
# or snippet from being rendered on the page.
for column in dat.columns:
    if column != 'href':
        dat[column] = dat[column].astype(str).map(html.escape)

st.write(dat.to_html(escape = False), unsafe_allow_html = True)
|