Spaces:

tjburns
/

duckduckgo_2d_search

Runtime error

App Files Files Community

Tyler Burns commited on Feb 1, 2023

Commit

341cb78

•

1 Parent(s): 577da95

added more keywords

Browse files

Files changed (2) hide show

app.py +2 -2
flycheck_app.py +133 -0

app.py CHANGED Viewed

@@ -64,7 +64,7 @@ kw_model = KeyBERT()
 keywords_df = []
 for i in np.unique(dat['cluster']):
     curr = dat[dat['cluster'] == i]
-    text =  ' '.join(curr['title'])
     # Lemmatization
     text = nltk.word_tokenize(text)
@@ -91,7 +91,7 @@ dat['title'] = dat.title.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
 dat['body'] = dat.body.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
 # Visualize the data
-fig = px.scatter(dat, x = 'umap1', y = 'umap2', hover_data = ['title', 'body', 'keyword1', 'keyword2', 'keyword3'], color = 'cluster', title = 'Context similarity map of results')
 # Make the font a little bigger
 fig.update_layout(

 keywords_df = []
 for i in np.unique(dat['cluster']):
     curr = dat[dat['cluster'] == i]
+    text =  ' '.join(curr['body'])
     # Lemmatization
     text = nltk.word_tokenize(text)
 dat['body'] = dat.body.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
 # Visualize the data
+fig = px.scatter(dat, x = 'umap1', y = 'umap2', hover_data = ['title', 'body', 'keyword1', 'keyword2', 'keyword3', 'keyword4', 'keyword5'], color = 'cluster', title = 'Context similarity map of results')
 # Make the font a little bigger
 fig.update_layout(

flycheck_app.py ADDED Viewed

	@@ -0,0 +1,133 @@

+import streamlit as st
+from duckduckgo_search import ddg
+import pandas as pd
+from sentence_transformers import SentenceTransformer
+import umap.umap_ as umap
+import numpy as np
+import sys
+import plotly.express as px
+import re
+import sklearn.cluster as cluster
+import nltk
+from nltk.stem import WordNetLemmatizer
+from keybert import KeyBERT
+nltk.download('punkt')
+nltk.download('omw-1.4')
+nltk.download('wordnet')
+# Set a seed
+np.random.seed(42)
+# The search bar
+keywords = st.text_input('Enter your search', 'How to use ChatGPT')
+to_display = 'body' # Sometimes this is title
+md = ddg(keywords, region='wt-wt', safesearch='Moderate', time='y', max_results=500)
+md = pd.DataFrame(md)
+# Load the model
+print("running sentence embeddings...")
+# model_name = 'all-mpnet-base-v2'
+model_name = 'all-MiniLM-L6-v2'
+model = SentenceTransformer(model_name)
+sentence_embeddings = model.encode(md['body'].tolist(), show_progress_bar = True)
+sentence_embeddings = pd.DataFrame(sentence_embeddings)
+# Reduce dimensionality
+print("reducing dimensionality...")
+reducer = umap.UMAP(metric = 'cosine')
+dimr = reducer.fit_transform(sentence_embeddings)
+dimr = pd.DataFrame(dimr, columns = ['umap1', 'umap2'])
+columns = ['title', 'href', 'body']
+# Clustering
+labels = cluster.KMeans(n_clusters=5).fit_predict(dimr[['umap1', 'umap2']])
+dimr['cluster'] = labels
+# Make the coloring easier on the eyes
+dimr['cluster'] = dimr['cluster'].astype('category')
+# Now we can search cluster in the table
+dimr['cluster'] = ['cluster ' + str(x) for x in dimr['cluster']]
+# Merge the data together
+dat = pd.concat([md.reset_index(), dimr.reset_index()], axis = 1)
+# The keywords
+# Add keywords to the clusters
+# Create WordNetLemmatizer object
+print('extracting keywords per cluster...')
+wnl = WordNetLemmatizer()
+kw_model = KeyBERT()
+keywords_df = []
+for i in np.unique(dat['cluster']):
+    curr = dat[dat['cluster'] == i]
+    text =  ' '.join(curr['body'])
+    # Lemmatization
+    text = nltk.word_tokenize(text)
+    text = [wnl.lemmatize(i) for i in text]
+    text = ' '.join(text)
+    # Keyword extraction
+    TR_keywords = kw_model.extract_keywords(text)
+    keywords_df.append(TR_keywords[0:10])
+keywords_df = pd.DataFrame(keywords_df)
+keywords_df['cluster'] = np.unique(dimr['cluster'])
+keywords_df.columns = ['keyword1', 'keyword2', 'keyword3', 'keyword4', 'keyword5', 'cluster']
+# Get the keyword data into the dataframe
+dat = dat.merge(keywords_df) # This messes up the index, so we need to reset it
+dat = dat.reset_index(drop = True)
+# handle duplicate index columns
+dat = dat.loc[:,~dat.columns.duplicated()]
+# Get it ready for plotting
+dat['title'] = dat.title.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
+dat['body'] = dat.body.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
+# Visualize the data
+fig = px.scatter(dat, x = 'umap1', y = 'umap2', hover_data = ['title', 'body', 'keyword1', 'keyword2', 'keyword3', 'keyword4', 'keyword5'], color = 'cluster', title = 'Context similarity map of results')
+# Make the font a little bigger
+fig.update_layout(
+    hoverlabel=dict(
+        bgcolor="white",
+        font_size=16
+    )
+)
+# x and y are same size
+fig.update_yaxes(
+    scaleanchor="x",
+    scaleratio=1,
+  )
+# Show the figure
+st.plotly_chart(fig, use_container_width=True)
+# Remove <br> in the text for the table
+dat['title'] = [re.sub('<br>', ' ', i) for i in dat['title']]
+dat['body'] = [re.sub('<br>', ' ', i) for i in dat['body']]
+# Instructions
+st.caption('Use ctrl+f (or command+f for mac) to search the table')
+# remove irrelevant columns from dat
+dat = dat.drop(columns=['index', 'umap1', 'umap2', 'keyword1', 'keyword2', 'keyword3', 'keyword4', 'keyword5'])
+# Make the link clickable
+# pandas display options
+pd.set_option('display.max_colwidth', -1)
+def make_clickable(url, text):
+    return f'<a target="_blank" href="{url}">{text}</a>'
+dat['href'] = dat['href'].apply(make_clickable, args = ('Click here',))
+st.write(dat.to_html(escape = False), unsafe_allow_html = True)