Tyler Burns committed on
Commit d141533
1 Parent(s): f6f2683

adding body back

Files changed (2)
  1. app.py +1 -1
  2. flycheck_app.py +138 -0
app.py CHANGED
@@ -96,7 +96,7 @@ dat['title'] = dat.title.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
  dat['body'] = dat.body.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))

  # Visualize the data
- fig = px.scatter(dat, x = 'umap1', y = 'umap2', hover_data = ['title'], color = 'cluster', title = 'Context similarity map of results')
+ fig = px.scatter(dat, x = 'umap1', y = 'umap2', hover_data = ['title', 'body'], color = 'cluster', title = 'Context similarity map of results')

  # Make the font a little bigger
  fig.update_layout(
flycheck_app.py ADDED
@@ -0,0 +1,138 @@
+ import streamlit as st
+ from duckduckgo_search import ddg
+ import pandas as pd
+ from sentence_transformers import SentenceTransformer
+ import umap.umap_ as umap
+ import numpy as np
+ import sys
+ import plotly.express as px
+ import re
+ import sklearn.cluster as cluster
+ import nltk
+ from nltk.stem import WordNetLemmatizer
+ from keybert import KeyBERT
+ nltk.download('punkt')
+ nltk.download('omw-1.4')
+ nltk.download('wordnet')
+
+ # Set a seed
+ np.random.seed(42)
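+ # Note: seeding NumPy alone doesn't pin UMAP's layout (its parallel layout
+ # step is non-deterministic); passing random_state=42 to umap.UMAP below
+ # would make the embedding reproducible.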
+
+ # Temp, for keywords
+ use_keywords = False
+
+ # The search bar
+ keywords = st.text_input('Enter your search', 'How to use ChatGPT')
+
+ to_display = 'body' # Sometimes this is title
+ md = ddg(keywords, region='wt-wt', safesearch='Moderate', time='y', max_results=500)
+ md = pd.DataFrame(md)
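+ # ddg returns a list of result dicts with 'title', 'href', and 'body'
+ # fields, so md becomes a DataFrame with one row per search result.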
+
+ # Load the model
+ print("running sentence embeddings...")
+ # model_name = 'all-mpnet-base-v2'
+ model_name = 'all-MiniLM-L6-v2'
+ model = SentenceTransformer(model_name)
+ sentence_embeddings = model.encode(md['title'].tolist(), show_progress_bar = True)
+ sentence_embeddings = pd.DataFrame(sentence_embeddings)
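+ # encode() maps each title to a single vector (384 dimensions for
+ # all-MiniLM-L6-v2), giving one embedding row per search result.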
+
+ # Reduce dimensionality
+ print("reducing dimensionality...")
+ reducer = umap.UMAP(metric = 'cosine')
+ dimr = reducer.fit_transform(sentence_embeddings)
+ dimr = pd.DataFrame(dimr, columns = ['umap1', 'umap2'])
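+ # Cosine is the usual distance for sentence embeddings, since direction
+ # rather than vector magnitude carries the semantics.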
+
+ columns = ['title', 'href', 'body']
+
+ # Clustering
+ labels = cluster.KMeans(n_clusters=5).fit_predict(dimr[['umap1', 'umap2']])
+ dimr['cluster'] = labels
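+ # Note: KMeans clusters the 2-D UMAP coordinates, not the raw embeddings,
+ # so the cluster boundaries follow the visual layout of the map.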
+
+ # Make the coloring easier on the eyes
+ dimr['cluster'] = dimr['cluster'].astype('category')
+
+ # Now we can search cluster in the table
+ dimr['cluster'] = ['cluster ' + str(x) for x in dimr['cluster']]
+
+ # Merge the data together
+ dat = pd.concat([md.reset_index(), dimr.reset_index()], axis = 1)
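+ # axis=1 concat needs aligned indices; reset_index() guarantees that, at
+ # the cost of duplicate 'index' columns, which are dropped further down.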
+
+ # The keywords
+
+ if use_keywords:
+     # Add keywords to the clusters
+     # Create WordNetLemmatizer object
+     print('extracting keywords per cluster...')
+     wnl = WordNetLemmatizer()
+     kw_model = KeyBERT()
+
+     keywords_df = []
+     for i in np.unique(dat['cluster']):
+         curr = dat[dat['cluster'] == i]
+         text = ' '.join(curr['body'])
+
+         # Lemmatization
+         text = nltk.word_tokenize(text)
+         text = [wnl.lemmatize(i) for i in text]
+         text = ' '.join(text)
+
+         # Keyword extraction
+         TR_keywords = kw_model.extract_keywords(text)
+         keywords_df.append(TR_keywords[0:10])
+
+     keywords_df = pd.DataFrame(keywords_df)
+     keywords_df['cluster'] = np.unique(dimr['cluster'])
+     keywords_df.columns = ['keyword1', 'keyword2', 'keyword3', 'keyword4', 'keyword5', 'cluster']
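+     # extract_keywords returns (keyword, score) pairs, top 5 by default, so
+     # the [0:10] slice keeps all five and the five keyword columns line up.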
+
+     # Get the keyword data into the dataframe
+     dat = dat.merge(keywords_df) # This messes up the index, so we need to reset it
+     dat = dat.reset_index(drop = True)
+
+ # handle duplicate index columns
+ dat = dat.loc[:,~dat.columns.duplicated()]
+
+ # Get it ready for plotting
+ dat['title'] = dat.title.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
+ dat['body'] = dat.body.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
+
+ # Visualize the data
+ fig = px.scatter(dat, x = 'umap1', y = 'umap2', hover_data = ['title', 'body'], color = 'cluster', title = 'Context similarity map of results')
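+ # str.wrap(30) inserts a newline roughly every 30 characters; swapping the
+ # newlines for <br> keeps the Plotly hover tooltips narrow in the browser.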
+
+ # Make the font a little bigger
+ fig.update_layout(
+     hoverlabel=dict(
+         bgcolor="white",
+         font_size=16
+     )
+ )
+
+ # x and y are same size
+ fig.update_yaxes(
+     scaleanchor="x",
+     scaleratio=1,
+ )
+
+ # Show the figure
+ st.plotly_chart(fig, use_container_width=True)
+
+ # Remove <br> in the text for the table
+ dat['title'] = [re.sub('<br>', ' ', i) for i in dat['title']]
+ dat['body'] = [re.sub('<br>', ' ', i) for i in dat['body']]
+
+ # Instructions
+ st.caption('Use ctrl+f (or command+f for mac) to search the table')
+
+ # remove irrelevant columns from dat
+ dat = dat.drop(columns=['index', 'umap1', 'umap2'])
+
+ # Make the link clickable
+ # pandas display options
+ pd.set_option('display.max_colwidth', None)  # None disables truncation; -1 is deprecated
+
+ def make_clickable(url, text):
+     return f'<a target="_blank" href="{url}">{text}</a>'
+
+ dat['href'] = dat['href'].apply(make_clickable, args = ('Click here',))
+
+ st.write(dat.to_html(escape = False), unsafe_allow_html = True)
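+ # escape = False leaves the <a> tags intact and unsafe_allow_html = True
+ # lets Streamlit render them, so each 'Click here' is a live link.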
+