Spaces:
Runtime error
Runtime error
Tyler Burns
committed on
Commit
•
341cb78
1
Parent(s):
577da95
added more keywords
Browse files- app.py +2 -2
- flycheck_app.py +133 -0
app.py
CHANGED
@@ -64,7 +64,7 @@ kw_model = KeyBERT()
|
|
64 |
keywords_df = []
|
65 |
for i in np.unique(dat['cluster']):
|
66 |
curr = dat[dat['cluster'] == i]
|
67 |
-
text = ' '.join(curr['
|
68 |
|
69 |
# Lemmatization
|
70 |
text = nltk.word_tokenize(text)
|
@@ -91,7 +91,7 @@ dat['title'] = dat.title.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
|
|
91 |
dat['body'] = dat.body.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
|
92 |
|
93 |
# Visualize the data
|
94 |
-
fig = px.scatter(dat, x = 'umap1', y = 'umap2', hover_data = ['title', 'body', 'keyword1', 'keyword2', 'keyword3'], color = 'cluster', title = 'Context similarity map of results')
|
95 |
|
96 |
# Make the font a little bigger
|
97 |
fig.update_layout(
|
|
|
64 |
keywords_df = []
|
65 |
for i in np.unique(dat['cluster']):
|
66 |
curr = dat[dat['cluster'] == i]
|
67 |
+
text = ' '.join(curr['body'])
|
68 |
|
69 |
# Lemmatization
|
70 |
text = nltk.word_tokenize(text)
|
|
|
91 |
dat['body'] = dat.body.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
|
92 |
|
93 |
# Visualize the data
|
94 |
+
fig = px.scatter(dat, x = 'umap1', y = 'umap2', hover_data = ['title', 'body', 'keyword1', 'keyword2', 'keyword3', 'keyword4', 'keyword5'], color = 'cluster', title = 'Context similarity map of results')
|
95 |
|
96 |
# Make the font a little bigger
|
97 |
fig.update_layout(
|
flycheck_app.py
ADDED
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from duckduckgo_search import ddg
|
3 |
+
import pandas as pd
|
4 |
+
from sentence_transformers import SentenceTransformer
|
5 |
+
import umap.umap_ as umap
|
6 |
+
import numpy as np
|
7 |
+
import sys
|
8 |
+
import plotly.express as px
|
9 |
+
import re
|
10 |
+
import sklearn.cluster as cluster
|
11 |
+
import nltk
|
12 |
+
from nltk.stem import WordNetLemmatizer
|
13 |
+
from keybert import KeyBERT
|
14 |
+
# Fetch the NLTK resources this script needs (tokenizer and WordNet data)
for resource in ('punkt', 'omw-1.4', 'wordnet'):
    nltk.download(resource)
|
17 |
+
|
18 |
+
# Set a seed
np.random.seed(42)

# The search bar
keywords = st.text_input('Enter your search', 'How to use ChatGPT')

to_display = 'body' # Sometimes this is title

# Run the web search for whatever is in the search bar
md = ddg(keywords, region='wt-wt', safesearch='Moderate', time='y', max_results=500)

# Stop this run early when the search comes back empty (None or []):
# the embedding step reads md['body'], which does not exist on an
# empty frame and would otherwise fail with a KeyError.
if not md:
    st.warning('No results found for this search. Try different keywords.')
    st.stop()

md = pd.DataFrame(md)
|
27 |
+
|
28 |
+
# Embed the body text of every search result
print("running sentence embeddings...")
# model_name = 'all-mpnet-base-v2'
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)
body_texts = md['body'].tolist()
sentence_embeddings = pd.DataFrame(
    model.encode(body_texts, show_progress_bar=True)
)

# Project the embeddings onto two UMAP axes using the cosine metric
print("reducing dimensionality...")
reducer = umap.UMAP(metric='cosine')
dimr = pd.DataFrame(
    reducer.fit_transform(sentence_embeddings),
    columns=['umap1', 'umap2'],
)

columns = ['title', 'href', 'body']
|
43 |
+
|
44 |
+
# Clustering: k-means with 5 clusters on the two UMAP coordinates
labels = cluster.KMeans(n_clusters=5).fit_predict(dimr[['umap1', 'umap2']])

# Store the labels as strings ('cluster <n>') so the cluster can be
# searched for in the table
dimr['cluster'] = [f'cluster {label}' for label in labels]

# Put the search results and their coordinates/cluster side by side.
# Both frames go through reset_index(), so each contributes an 'index' column.
dat = pd.concat([md.reset_index(), dimr.reset_index()], axis=1)
|
56 |
+
|
57 |
+
# The keywords
# Add keywords to the clusters
print('extracting keywords per cluster...')
wnl = WordNetLemmatizer()
kw_model = KeyBERT()

# One output column per extracted keyword; the extraction below is sized
# from this list so the two can never disagree.
keyword_columns = ['keyword1', 'keyword2', 'keyword3', 'keyword4', 'keyword5']

keyword_rows = []
for cluster_label in np.unique(dat['cluster']):
    curr = dat[dat['cluster'] == cluster_label]
    text = ' '.join(curr['body'])

    # Lemmatization
    tokens = nltk.word_tokenize(text)
    text = ' '.join(wnl.lemmatize(token) for token in tokens)

    # Keyword extraction: request exactly as many keywords as there are
    # columns, and pad with None when the model returns fewer, so every
    # row has the same width.
    TR_keywords = list(kw_model.extract_keywords(text, top_n=len(keyword_columns)))
    TR_keywords = TR_keywords[:len(keyword_columns)]
    TR_keywords += [None] * (len(keyword_columns) - len(TR_keywords))

    # Keep the cluster label in the same row as its keywords instead of
    # re-deriving the label order afterwards.
    keyword_rows.append(TR_keywords + [cluster_label])

keywords_df = pd.DataFrame(keyword_rows, columns=keyword_columns + ['cluster'])
|
81 |
+
|
82 |
+
# Attach each cluster's keywords to its rows, then renumber the rows
# because the merge messes up the index
dat = dat.merge(keywords_df).reset_index(drop=True)

# Keep only the first occurrence of any repeated column name
is_first_occurrence = ~dat.columns.duplicated()
dat = dat.loc[:, is_first_occurrence]


def _newlines_to_br(wrapped):
    """Swap newline characters for HTML <br> tags."""
    return wrapped.replace('\n', '<br>')


# Get it ready for plotting: wrap the text at 30 characters and mark the
# line breaks with <br>
for text_column in ('title', 'body'):
    dat[text_column] = dat[text_column].str.wrap(30).apply(_newlines_to_br)
|
92 |
+
|
93 |
+
# Scatter plot of the UMAP coordinates, colored by cluster, with the
# result text and cluster keywords in the hover box
hover_columns = ['title', 'body', 'keyword1', 'keyword2', 'keyword3', 'keyword4', 'keyword5']
fig = px.scatter(
    dat,
    x='umap1',
    y='umap2',
    color='cluster',
    hover_data=hover_columns,
    title='Context similarity map of results',
)

# Larger hover text on a white background
hover_style = dict(bgcolor="white", font_size=16)
fig.update_layout(hoverlabel=hover_style)

# Anchor the y axis to x at a 1:1 ratio so both axes share one scale
fig.update_yaxes(scaleanchor="x", scaleratio=1)

# Render the figure in the app
st.plotly_chart(fig, use_container_width=True)
|
112 |
+
|
113 |
+
# Remove <br> in the text for the table
for text_column in ('title', 'body'):
    dat[text_column] = [re.sub('<br>', ' ', value) for value in dat[text_column]]

# Instructions
st.caption('Use ctrl+f (or command+f for mac) to search the table')

# remove irrelevant columns from dat
dat = dat.drop(columns=['index', 'umap1', 'umap2', 'keyword1', 'keyword2', 'keyword3', 'keyword4', 'keyword5'])

# Make the link clickable
# pandas display options: None means "do not truncate column contents".
# The older -1 spelling is deprecated and rejected by current pandas.
pd.set_option('display.max_colwidth', None)
|
126 |
+
|
127 |
+
def make_clickable(url, text):
    """Return an HTML anchor that opens *url* in a new tab, labelled *text*.

    Both arguments are HTML-escaped before interpolation, so a quote or
    angle bracket in either one cannot break out of the attribute or
    inject markup into the page that renders the result.

    Args:
        url: Link target; converted with ``str()`` before escaping.
        text: Visible link label; converted with ``str()`` before escaping.

    Returns:
        The ``<a target="_blank" href="...">...</a>`` markup as a string.
    """
    import html  # function-scope import keeps this block self-contained

    safe_url = html.escape(str(url), quote=True)
    safe_text = html.escape(str(text))
    return f'<a target="_blank" href="{safe_url}">{safe_text}</a>'
|
129 |
+
|
130 |
+
import html  # stdlib; imported here so this section stands on its own

# Turn every URL into a "Click here" link
dat['href'] = dat['href'].apply(make_clickable, args=('Click here',))

# The table is written as raw HTML (escape=False, unsafe_allow_html=True)
# so that the links render. Everything else in it comes from web search
# results, so escape the string cells of every other column: markup in a
# title or body is then shown literally instead of being interpreted.
for column in dat.columns:
    if column != 'href':
        dat[column] = dat[column].map(
            lambda value: html.escape(value) if isinstance(value, str) else value
        )

st.write(dat.to_html(escape=False), unsafe_allow_html=True)
|
133 |
+
|