import streamlit as st
from duckduckgo_search import ddg
import pandas as pd
from sentence_transformers import SentenceTransformer
import umap.umap_ as umap
import numpy as np
import plotly.express as px
import re
import sklearn.cluster as cluster
import nltk
from nltk.stem import WordNetLemmatizer
from keybert import KeyBERT
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('wordnet')

# Set a seed
np.random.seed(42)

# Temp, for keywords
use_keywords = False

# The search bar
keywords = st.text_input('Enter your search', 'AI news')

md = ddg(keywords, region='wt-wt', safesearch='Moderate', time='y', max_results=500)
md = pd.DataFrame(md)
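
# Added guard (not in the original script): ddg() can return None or an empty
# list when a search yields nothing, and the column lookups below would then
# fail. Note that newer duckduckgo_search releases replace ddg() with the DDGS
# class; this script assumes an older release where ddg() still exists.
if md.empty:
    st.warning('No results found for this search. Try different keywords.')
    st.stop()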

# Load the model
print("running sentence embeddings...")
# model_name = 'all-mpnet-base-v2'
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)
sentence_embeddings = model.encode(md['title'].tolist(), show_progress_bar = True)
sentence_embeddings = pd.DataFrame(sentence_embeddings)
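
# Added note: every Streamlit rerun (each edit to the search box triggers one)
# reloads the model from scratch. On recent Streamlit versions a cached loader
# avoids that; a sketch, assuming st.cache_resource (Streamlit >= 1.18) is
# available in your installed version:
#
#     @st.cache_resource
#     def load_model(name):
#         return SentenceTransformer(name)
#
#     model = load_model(model_name)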

# Reduce dimensionality
print("reducing dimensionality...")
reducer = umap.UMAP(metric = 'cosine', random_state = 42) # random_state makes the projection reproducible; the global numpy seed alone does not guarantee it
dimr = reducer.fit_transform(sentence_embeddings)
dimr = pd.DataFrame(dimr, columns = ['umap1', 'umap2'])
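
# Added note: the clustering below runs on the 2-D UMAP projection, which is
# easy to read off the plot but can merge or split clusters that are distinct
# in the full embedding space. Clustering the raw embeddings is a common
# alternative, e.g.:
#
#     labels = cluster.KMeans(n_clusters=5).fit_predict(sentence_embeddings)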


# Clustering
labels = cluster.KMeans(n_clusters=5).fit_predict(dimr[['umap1', 'umap2']])
dimr['cluster'] = labels
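
# Added sketch: n_clusters=5 above is a fixed guess. The silhouette score from
# scikit-learn (higher is better) is one quick heuristic for choosing k; off
# by default, mirroring the use_keywords flag above.
check_k = False
if check_k:
    from sklearn.metrics import silhouette_score
    for k in range(2, 9):
        trial = cluster.KMeans(n_clusters=k).fit_predict(dimr[['umap1', 'umap2']])
        print(f"k={k}: silhouette={silhouette_score(dimr[['umap1', 'umap2']], trial):.3f}")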

# Make the coloring easier on the eyes
dimr['cluster'] = dimr['cluster'].astype('category')

# Now we can search cluster in the table
dimr['cluster'] = ['cluster ' + str(x) for x in dimr['cluster']]

# Merge the search results and the UMAP coordinates; both reset_index calls add an 'index' column, deduplicated below
dat = pd.concat([md.reset_index(), dimr.reset_index()], axis = 1)

# The keywords

if use_keywords:
    # Add keywords to the clusters
    # Create WordNetLemmatizer object
    print('extracting keywords per cluster...')
    wnl = WordNetLemmatizer()
    kw_model = KeyBERT()

    keywords_df = []
    for i in np.unique(dat['cluster']):
        curr = dat[dat['cluster'] == i]
        text = ' '.join(curr['body'])
    
        # Lemmatization
        text = nltk.word_tokenize(text)
        text = [wnl.lemmatize(i) for i in text]
        text = ' '.join(text)
    
        # Keyword extraction
        # extract_keywords returns (keyword, score) tuples; keep the top 5
        # words to match the column names assigned below
        TR_keywords = kw_model.extract_keywords(text, top_n = 5)
        keywords_df.append([kw for kw, score in TR_keywords])
    
    keywords_df = pd.DataFrame(keywords_df)
    keywords_df['cluster'] = np.unique(dimr['cluster'])
    keywords_df.columns = ['keyword1', 'keyword2', 'keyword3', 'keyword4', 'keyword5', 'cluster']

    # Get the keyword data into the dataframe
    dat = dat.merge(keywords_df) # This messes up the index, so we need to reset it
    dat = dat.reset_index(drop = True)

# handle duplicate index columns
dat = dat.loc[:,~dat.columns.duplicated()]

# Get it ready for plotting
dat['title'] = dat.title.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
dat['body'] = dat.body.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))

# Visualize the data
fig = px.scatter(dat, x = 'umap1', y = 'umap2', hover_data = ['title', 'body'], color = 'cluster', title = 'Context similarity map of results')

# Make the font a little bigger
fig.update_layout(
    hoverlabel=dict(
        bgcolor="white",
        font_size=16
    )
)

# x and y are same size
fig.update_yaxes(
    scaleanchor="x",
    scaleratio=1,
)

# Show the figure
st.plotly_chart(fig, use_container_width=True)

# Remove <br> in the text for the table
dat['title'] = [re.sub('<br>', ' ', i) for i in dat['title']]
dat['body'] = [re.sub('<br>', ' ', i) for i in dat['body']]

# Instructions
st.caption('Use Ctrl+F (or Cmd+F on Mac) to search the table')

# remove irrelevant columns from dat
dat = dat.drop(columns=['index', 'umap1', 'umap2'])

# Make the link clickable
# pandas display options
pd.set_option('display.max_colwidth', None) # None disables truncation; -1 is deprecated and rejected by newer pandas

def make_clickable(url, text):
    return f'<a target="_blank" href="{url}">{text}</a>'

# apply passes each href value as the first positional argument (the url);
# args supplies the second (the link text), so every row shows 'Click here'
dat['href'] = dat['href'].apply(make_clickable, args = ('Click here',))

st.write(dat.to_html(escape = False), unsafe_allow_html = True)
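
# To run locally (assuming this file is saved as app.py; the filename is
# illustrative):
#     streamlit run app.py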