import streamlit as st
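# NOTE: `ddg` is the simple search helper from older duckduckgo_search releases;
# newer releases may expose this functionality as DDGS().text() instead.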
from duckduckgo_search import ddg
import pandas as pd
from sentence_transformers import SentenceTransformer
import umap.umap_ as umap
import numpy as np
import plotly.express as px
import re
import sklearn.cluster as cluster
import nltk
from nltk.stem import WordNetLemmatizer
from keybert import KeyBERT
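# One-time NLTK data downloads for tokenization and lemmatization (skipped if already present)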
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('wordnet')

# Set a seed
np.random.seed(42)

# The search bar
keywords = st.text_input('Enter your search', 'How to use ChatGPT')

# Fetch up to 500 results; ddg may return None or an empty list when nothing matches, so guard for that
results = ddg(keywords, region='wt-wt', safesearch='Moderate', time='y', max_results=500)
if not results:
    st.warning('No results found for this query.')
    st.stop()
md = pd.DataFrame(results)

# Load the model
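# Embed every result body as a dense sentence vector; all-MiniLM-L6-v2 is small and fast,
# while all-mpnet-base-v2 (commented out below) is larger and usually more accurate.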
print("running sentence embeddings...")
# model_name = 'all-mpnet-base-v2'
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)
sentence_embeddings = model.encode(md['body'].tolist(), show_progress_bar = True)
sentence_embeddings = pd.DataFrame(sentence_embeddings)

# Reduce dimensionality
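# UMAP projects the embeddings down to two dimensions; cosine distance is the usual
# metric for comparing sentence embeddings.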
print("reducing dimensionality...")
reducer = umap.UMAP(metric='cosine', random_state=42)  # fixed random_state so the layout is reproducible
dimr = reducer.fit_transform(sentence_embeddings)
dimr = pd.DataFrame(dimr, columns = ['umap1', 'umap2'])

# Clustering
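# KMeans on the 2D UMAP coordinates keeps the cluster assignments aligned with what
# the scatter plot shows; 5 clusters is a fixed choice here.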
labels = cluster.KMeans(n_clusters=5, n_init=10, random_state=42).fit_predict(dimr[['umap1', 'umap2']])
dimr['cluster'] = labels

# Use string labels ('cluster 0', 'cluster 1', ...) so Plotly colors the clusters
# discretely and they are easy to search in the table below
dimr['cluster'] = ['cluster ' + str(x) for x in dimr['cluster']]

# Merge the data together
dat = pd.concat([md.reset_index(), dimr.reset_index()], axis = 1)

# The keywords
# Add keywords to the clusters
# Create WordNetLemmatizer object
print('extracting keywords per cluster...')
wnl = WordNetLemmatizer()
kw_model = KeyBERT()

keywords_df = []
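# For each cluster: join all result bodies into one document, lemmatize the tokens,
# and let KeyBERT pull out the most representative keywords.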
for i in np.unique(dat['cluster']):
    curr = dat[dat['cluster'] == i]
    text =  ' '.join(curr['body'])
    
    # Lemmatization
    text = nltk.word_tokenize(text)
    text = [wnl.lemmatize(i) for i in text]
    text = ' '.join(text)
    
    # Keyword extraction: KeyBERT returns (keyword, score) pairs, five by default (top_n=5),
    # which matches the five keyword columns assigned below
    TR_keywords = kw_model.extract_keywords(text)
    keywords_df.append(TR_keywords[:5])
    
keywords_df = pd.DataFrame(keywords_df)
keywords_df['cluster'] = np.unique(dimr['cluster'])
keywords_df.columns = ['keyword1', 'keyword2', 'keyword3', 'keyword4', 'keyword5', 'cluster']

# Get the keyword data into the dataframe
dat = dat.merge(keywords_df) # This messes up the index, so we need to reset it
dat = dat.reset_index(drop = True)

# handle duplicate index columns
dat = dat.loc[:,~dat.columns.duplicated()]

# Get it ready for plotting
dat['title'] = dat.title.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
dat['body'] = dat.body.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))

# Visualize the data
fig = px.scatter(
    dat, x='umap1', y='umap2', color='cluster',
    hover_data=['title', 'body', 'keyword1', 'keyword2', 'keyword3', 'keyword4', 'keyword5'],
    title='Context similarity map of results'
)

# Make the font a little bigger
fig.update_layout(
    hoverlabel=dict(
        bgcolor="white",
        font_size=16
    )
)

# Keep the x and y axes on the same scale so distances in the map are not distorted
fig.update_yaxes(
    scaleanchor="x",
    scaleratio=1,
)

# Show the figure
st.plotly_chart(fig, use_container_width=True)

# Remove <br> in the text for the table
dat['title'] = [re.sub('<br>', ' ', i) for i in dat['title']]
dat['body'] = [re.sub('<br>', ' ', i) for i in dat['body']]

# Instructions
st.caption('Use ctrl+f (or command+f for mac) to search the table')

# remove irrelevant columns from dat
dat = dat.drop(columns=['index', 'umap1', 'umap2', 'keyword1', 'keyword2', 'keyword3', 'keyword4', 'keyword5'])

# Make the link clickable
# pandas display options: don't truncate long strings when rendering to HTML
pd.set_option('display.max_colwidth', None)  # -1 is deprecated/removed in newer pandas

def make_clickable(url, text):
    return f'<a target="_blank" href="{url}">{text}</a>'

dat['href'] = dat['href'].apply(make_clickable, args = ('Click here',))

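# Render the table as raw HTML so the links stay clickable (Streamlit escapes HTML by default)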
st.write(dat.to_html(escape = False), unsafe_allow_html = True)