Tyler Burns committed on
Commit
aa47d0d
1 Parent(s): d141533

changed the baseline result

Browse files
Files changed (2) hide show
  1. app.py +1 -1
  2. flycheck_app.py +0 -138
app.py CHANGED
@@ -22,7 +22,7 @@ np.random.seed(42)
22
  use_keywords = False
23
 
24
  # The search bar
25
- keywords = st.text_input('Enter your search', 'How to use ChatGPT')
26
 
27
  to_display = 'body' # Sometimes this is title
28
  md = ddg(keywords, region='wt-wt', safesearch='Moderate', time='y', max_results=500)
 
22
  use_keywords = False
23
 
24
  # The search bar
25
+ keywords = st.text_input('Enter your search', 'AI news')
26
 
27
  to_display = 'body' # Sometimes this is title
28
  md = ddg(keywords, region='wt-wt', safesearch='Moderate', time='y', max_results=500)
flycheck_app.py DELETED
@@ -1,138 +0,0 @@
1
- import streamlit as st
2
- from duckduckgo_search import ddg
3
- import pandas as pd
4
- from sentence_transformers import SentenceTransformer
5
- import umap.umap_ as umap
6
- import numpy as np
7
- import sys
8
- import plotly.express as px
9
- import re
10
- import sklearn.cluster as cluster
11
- import nltk
12
- from nltk.stem import WordNetLemmatizer
13
- from keybert import KeyBERT
14
- nltk.download('punkt')
15
- nltk.download('omw-1.4')
16
- nltk.download('wordnet')
17
-
18
- # Set a seed
19
- np.random.seed(42)
20
-
21
- # Temp, for keywords
22
- use_keywords = False
23
-
24
- # The search bar
25
- keywords = st.text_input('Enter your search', 'How to use ChatGPT')
26
-
27
- to_display = 'body' # Sometimes this is title
28
- md = ddg(keywords, region='wt-wt', safesearch='Moderate', time='y', max_results=500)
29
- md = pd.DataFrame(md)
30
-
31
- # Load the model
32
- print("running sentence embeddings...")
33
- # model_name = 'all-mpnet-base-v2'
34
- model_name = 'all-MiniLM-L6-v2'
35
- model = SentenceTransformer(model_name)
36
- sentence_embeddings = model.encode(md['title'].tolist(), show_progress_bar = True)
37
- sentence_embeddings = pd.DataFrame(sentence_embeddings)
38
-
39
- # Reduce dimensionality
40
- print("reducing dimensionality...")
41
- reducer = umap.UMAP(metric = 'cosine')
42
- dimr = reducer.fit_transform(sentence_embeddings)
43
- dimr = pd.DataFrame(dimr, columns = ['umap1', 'umap2'])
44
-
45
- columns = ['title', 'href', 'body']
46
-
47
- # Clustering
48
- labels = cluster.KMeans(n_clusters=5).fit_predict(dimr[['umap1', 'umap2']])
49
- dimr['cluster'] = labels
50
-
51
- # Make the coloring easier on the eyes
52
- dimr['cluster'] = dimr['cluster'].astype('category')
53
-
54
- # Now we can search cluster in the table
55
- dimr['cluster'] = ['cluster ' + str(x) for x in dimr['cluster']]
56
-
57
- # Merge the data together
58
- dat = pd.concat([md.reset_index(), dimr.reset_index()], axis = 1)
59
-
60
- # The keywords
61
-
62
- if use_keywords:
63
- # Add keywords to the clusters
64
- # Create WordNetLemmatizer object
65
- print('extracting keywords per cluster...')
66
- wnl = WordNetLemmatizer()
67
- kw_model = KeyBERT()
68
-
69
- keywords_df = []
70
- for i in np.unique(dat['cluster']):
71
- curr = dat[dat['cluster'] == i]
72
- text = ' '.join(curr['body'])
73
-
74
- # Lemmatization
75
- text = nltk.word_tokenize(text)
76
- text = [wnl.lemmatize(i) for i in text]
77
- text = ' '.join(text)
78
-
79
- # Keyword extraction
80
- TR_keywords = kw_model.extract_keywords(text)
81
- keywords_df.append(TR_keywords[0:10])
82
-
83
- keywords_df = pd.DataFrame(keywords_df)
84
- keywords_df['cluster'] = np.unique(dimr['cluster'])
85
- keywords_df.columns = ['keyword1', 'keyword2', 'keyword3', 'keyword4', 'keyword5', 'cluster']
86
-
87
- # Get the keyword data into the dataframe
88
- dat = dat.merge(keywords_df) # This messes up the index, so we need to reset it
89
- dat = dat.reset_index(drop = True)
90
-
91
- # handle duplicate index columns
92
- dat = dat.loc[:,~dat.columns.duplicated()]
93
-
94
- # Get it ready for plotting
95
- dat['title'] = dat.title.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
96
- dat['body'] = dat.body.str.wrap(30).apply(lambda x: x.replace('\n', '<br>'))
97
-
98
- # Visualize the data
99
- fig = px.scatter(dat, x = 'umap1', y = 'umap2', hover_data = ['title', 'body'], color = 'cluster', title = 'Context similarity map of results')
100
-
101
- # Make the font a little bigger
102
- fig.update_layout(
103
- hoverlabel=dict(
104
- bgcolor="white",
105
- font_size=16
106
- )
107
- )
108
-
109
- # x and y are same size
110
- fig.update_yaxes(
111
- scaleanchor="x",
112
- scaleratio=1,
113
- )
114
-
115
- # Show the figure
116
- st.plotly_chart(fig, use_container_width=True)
117
-
118
- # Remove <br> in the text for the table
119
- dat['title'] = [re.sub('<br>', ' ', i) for i in dat['title']]
120
- dat['body'] = [re.sub('<br>', ' ', i) for i in dat['body']]
121
-
122
- # Instructions
123
- st.caption('Use ctrl+f (or command+f for mac) to search the table')
124
-
125
- # remove irrelevant columns from dat
126
- dat = dat.drop(columns=['index', 'umap1', 'umap2'])
127
-
128
- # Make the link clickable
129
- # pandas display options
130
- pd.set_option('display.max_colwidth', -1)
131
-
132
- def make_clickable(url, text):
133
- return f'<a target="_blank" href="{url}">{text}</a>'
134
-
135
- dat['href'] = dat['href'].apply(make_clickable, args = ('Click here',))
136
-
137
- st.write(dat.to_html(escape = False), unsafe_allow_html = True)
138
-