Tyler Burns committed on
Commit
341cb78
1 Parent(s): 577da95

added more keywords

Browse files
Files changed (2) hide show
  1. app.py +2 -2
  2. flycheck_app.py +133 -0
app.py CHANGED
@@ -64,7 +64,7 @@ kw_model = KeyBERT()
64
  keywords_df = []
65
  for i in np.unique(dat['cluster']):
66
  curr = dat[dat['cluster'] == i]
67
- text = ' '.join(curr['title'])
68
 
69
  # Lemmatization
70
  text = nltk.word_tokenize(text)
@@ -91,7 +91,7 @@ dat['title'] = dat.title.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
91
  dat['body'] = dat.body.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
92
 
93
  # Visualize the data
94
- fig = px.scatter(dat, x = 'umap1', y = 'umap2', hover_data = ['title', 'body', 'keyword1', 'keyword2', 'keyword3'], color = 'cluster', title = 'Context similarity map of results')
95
 
96
  # Make the font a little bigger
97
  fig.update_layout(
 
64
  keywords_df = []
65
  for i in np.unique(dat['cluster']):
66
  curr = dat[dat['cluster'] == i]
67
+ text = ' '.join(curr['body'])
68
 
69
  # Lemmatization
70
  text = nltk.word_tokenize(text)
 
91
  dat['body'] = dat.body.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
92
 
93
  # Visualize the data
94
+ fig = px.scatter(dat, x = 'umap1', y = 'umap2', hover_data = ['title', 'body', 'keyword1', 'keyword2', 'keyword3', 'keyword4', 'keyword5'], color = 'cluster', title = 'Context similarity map of results')
95
 
96
  # Make the font a little bigger
97
  fig.update_layout(
flycheck_app.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit app: plot web search results on a 2-D context-similarity map.

Pipeline, run top to bottom on every Streamlit rerun:

1. Read a query from a text box and fetch results with DuckDuckGo.
2. Embed each result's ``body`` with a SentenceTransformer model.
3. Reduce the embeddings to two dimensions with UMAP (cosine metric).
4. Cluster the 2-D points with KMeans and label each point ``cluster <n>``.
5. Extract KeyBERT keywords from the lemmatized text of each cluster.
6. Show an interactive Plotly scatter plot and a searchable HTML table.
"""
import html
import re
import sys

import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import sklearn.cluster as cluster
import streamlit as st
import umap.umap_ as umap
from duckduckgo_search import ddg
from keybert import KeyBERT
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer

# Resources used by word_tokenize and WordNetLemmatizer below.
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('wordnet')

# Number of KMeans clusters and of keywords kept per cluster.
N_CLUSTERS = 5
N_KEYWORDS = 5
KEYWORD_COLUMNS = [f'keyword{k}' for k in range(1, N_KEYWORDS + 1)]

# Set a seed
np.random.seed(42)

# The search bar
keywords = st.text_input('Enter your search', 'How to use ChatGPT')

to_display = 'body'  # Sometimes this is title (currently unused)
md = ddg(keywords, region='wt-wt', safesearch='Moderate', time='y', max_results=500)
md = pd.DataFrame(md)

# Without results there is no 'body' column to embed; stop this run with a
# message instead of failing with a KeyError further down.
if md.empty or 'body' not in md.columns:
    st.warning('No search results found. Try a different search.')
    st.stop()

# KMeans cannot form more clusters than there are samples.
if len(md) < N_CLUSTERS:
    st.warning('Too few search results to cluster. Try a broader search.')
    st.stop()

# Load the model
print("running sentence embeddings...")
# model_name = 'all-mpnet-base-v2'
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)
sentence_embeddings = model.encode(md['body'].tolist(), show_progress_bar=True)
sentence_embeddings = pd.DataFrame(sentence_embeddings)

# Reduce dimensionality
print("reducing dimensionality...")
reducer = umap.UMAP(metric='cosine')
dimr = reducer.fit_transform(sentence_embeddings)
dimr = pd.DataFrame(dimr, columns=['umap1', 'umap2'])

columns = ['title', 'href', 'body']  # currently unused

# Clustering. Labels are stored as strings ("cluster 0", ...) so Plotly
# colors them as discrete categories and so they can be searched in the table.
labels = cluster.KMeans(n_clusters=N_CLUSTERS).fit_predict(dimr[['umap1', 'umap2']])
dimr['cluster'] = ['cluster ' + str(label) for label in labels]

# Merge the data together. Both frames are aligned positionally, so the old
# indexes are dropped rather than carried along as duplicate 'index' columns.
dat = pd.concat([md.reset_index(drop=True), dimr.reset_index(drop=True)], axis=1)

# The keywords
# Add keywords to the clusters
print('extracting keywords per cluster...')
wnl = WordNetLemmatizer()
kw_model = KeyBERT()

keyword_rows = []
for cluster_name in np.unique(dat['cluster']):
    curr = dat[dat['cluster'] == cluster_name]
    text = ' '.join(curr['body'])

    # Lemmatization
    tokens = nltk.word_tokenize(text)
    text = ' '.join(wnl.lemmatize(token) for token in tokens)

    # Keyword extraction. Each entry is whatever KeyBERT returns per keyword
    # (a (keyword, score) pair in the default configuration).
    found = kw_model.extract_keywords(text, top_n=N_KEYWORDS)

    # zip() stops at the shorter input, so a cluster that yields fewer than
    # N_KEYWORDS keywords simply leaves the remaining columns empty.
    row = dict(zip(KEYWORD_COLUMNS, found))
    row['cluster'] = cluster_name
    keyword_rows.append(row)

keywords_df = pd.DataFrame(keyword_rows, columns=KEYWORD_COLUMNS + ['cluster'])

# Get the keyword data into the dataframe.
# The merge produces a new index, so reset it.
dat = dat.merge(keywords_df, on='cluster').reset_index(drop=True)

# Get it ready for plotting: wrap long text and use <br> so that Plotly
# renders the line breaks inside the hover label.
dat['title'] = dat['title'].str.wrap(30).str.replace('\n', '<br>', regex=False)
dat['body'] = dat['body'].str.wrap(30).str.replace('\n', '<br>', regex=False)

# Visualize the data
fig = px.scatter(
    dat,
    x='umap1',
    y='umap2',
    hover_data=['title', 'body'] + KEYWORD_COLUMNS,
    color='cluster',
    title='Context similarity map of results',
)

# Make the font a little bigger
fig.update_layout(
    hoverlabel=dict(
        bgcolor="white",
        font_size=16
    )
)

# x and y are same size
fig.update_yaxes(
    scaleanchor="x",
    scaleratio=1,
)

# Show the figure
st.plotly_chart(fig, use_container_width=True)

# Remove <br> in the text for the table
dat['title'] = dat['title'].str.replace('<br>', ' ', regex=False)
dat['body'] = dat['body'].str.replace('<br>', ' ', regex=False)

# Instructions
st.caption('Use ctrl+f (or command+f for mac) to search the table')

# remove irrelevant columns from dat
dat = dat.drop(columns=['umap1', 'umap2'] + KEYWORD_COLUMNS)

# Make the link clickable
# pandas display options: None means "do not truncate cell contents".
# (The older sentinel -1 is deprecated and rejected by recent pandas.)
pd.set_option('display.max_colwidth', None)


def make_clickable(url, text):
    """Return an HTML anchor that opens ``url`` in a new tab, labelled ``text``.

    Both values are HTML-escaped so that characters such as quotes or angle
    brackets in a search-result URL cannot break out of the attribute.
    """
    safe_url = html.escape(str(url), quote=True)
    safe_text = html.escape(str(text))
    return f'<a target="_blank" href="{safe_url}">{safe_text}</a>'


# The table is rendered as raw HTML (escape=False is required for the links),
# so escape every other text cell first: the content comes from the web.
for column in dat.columns.drop('href'):
    dat[column] = dat[column].map(
        lambda value: html.escape(value) if isinstance(value, str) else value
    )

dat['href'] = dat['href'].apply(make_clickable, args=('Click here',))

st.write(dat.to_html(escape=False), unsafe_allow_html=True)