KAI MAURIN-JONES committed on
Commit
aaaf5e8
1 Parent(s): 6fd01dc

files added

Files changed (5)
  1. .DS_Store +0 -0
  2. app.py +35 -0
  3. requirements.txt +8 -0
  3. wiki_game_local.py +158 -0
  4. wiki_game_st.py +159 -0
.DS_Store ADDED
Binary file (6.15 kB).
 
app.py ADDED
@@ -0,0 +1,35 @@
+ import streamlit as st
+ from wiki_game_st import *
+
+ # Set the title of the app
+ st.title("Wiki Game")
+
+ # Add body text
+ st.text("Enter a topic to start at and a topic to end at, and choose your level of drama")
+
+ # Set up two text inputs for the start and end topics
+ start_topic = st.text_input("Start topic:")
+ end_topic = st.text_input("End topic:")
+
+ # Create a slider with values 1-5
+ slider = st.slider('Choose your level of drama', 1, 5)
+
+ # Describe the chosen pace under the slider and set the delay based on its value
+ if slider == 1:
+     st.write("Absolute zoominess")
+     delay = 0
+ elif slider == 2:
+     st.write("Some zoominess")
+     delay = 2
+ elif slider == 3:
+     st.write("You'll actually be able to read things")
+     delay = 3
+ elif slider == 4:
+     st.write("A natural pace")
+     delay = 4
+ else:  # slider == 5
+     st.write("You'll only do this once")
+     delay = 6
+
+ # Print the value of the delay variable
+ st.write(f"Delay is set to: {delay}")
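A minimal sketch (editorial, not part of this commit) of how app.py's inputs could drive the game once wired up, appended to the code above: it assumes a st.button trigger and the committed play_wiki_game signature from wiki_game_st, which takes no delay argument yet, so the slider's value is computed above but unused.

from wiki_game_st import play_wiki_game

# Hypothetical wiring (assumption): run the game once both topics are filled in.
# The committed play_wiki_game(starting_topic, target_topic, limit) accepts no
# delay parameter, so the slider's value would still need to be threaded through.
if st.button("Play") and start_topic and end_topic:
    play_wiki_game(starting_topic=start_topic, target_topic=end_topic, limit=50)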
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ altair<5
+ selenium==4.10.0
+ beautifulsoup4==4.11.1
+ numpy==1.23.5
+ tensorflow==2.10.0
+ tensorflow-hub==0.14.0
+ jellyfish==0.11.2
+ streamlit==1.21.0
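The pins above install with `pip install -r requirements.txt`. A quick sanity check (an assumption about the local environment, not part of the commit) that Selenium 4.10's bundled Selenium Manager can locate a driver for an installed Chrome and start the same headless session the game uses:

from selenium import webdriver

# Start a throwaway headless session with the same flags play_wiki_game uses.
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(options=options)  # Selenium Manager resolves chromedriver
print(driver.capabilities["browserVersion"])
driver.quit()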
wiki_game_local.py ADDED
@@ -0,0 +1,158 @@
+ from selenium import webdriver
+ from selenium.webdriver.common.by import By
+ from selenium.webdriver.common.keys import Keys
+ from bs4 import BeautifulSoup
+ import time
+ # !pip install tensorflow tensorflow-hub
+ import tensorflow as tf
+ import tensorflow_hub as hub
+ import numpy as np
+ # !pip install jellyfish
+ import jellyfish
+
+ # Load the pre-trained Universal Sentence Encoder
+ embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
+
+ def calculate_jaro_similarity(str1, str2):
+     jaro_similarity = jellyfish.jaro_similarity(str1, str2)
+     return jaro_similarity
+
+ def most_similar_sentence(target_topic, labels_list):
+     # Encode the target topic and all candidate labels
+     context_embedding = embed([target_topic])[0]
+     sentence_embeddings = embed(labels_list)
+
+     # Calculate cosine similarities between the target topic and each label
+     similarities = np.inner(context_embedding, sentence_embeddings)
+
+     # Find the index of the most similar label
+     most_similar_index = np.argmax(similarities)
+
+     return labels_list[most_similar_index], similarities[most_similar_index], most_similar_index
+
+ def search_wikipedia(query, driver):
+     # Go to Wikipedia's main page
+     driver.get("https://www.wikipedia.org/")
+
+     # Find the search bar using its name
+     search_bar = driver.find_element(By.NAME, "search")
+
+     # Send the query to the search bar and hit Enter
+     search_bar.send_keys(query)
+     search_bar.send_keys(Keys.RETURN)
+
+     return driver
+
+ def get_topic_context(driver):
+     # Find the first paragraph of the main article
+     first_paragraph = driver.find_element(By.CSS_SELECTOR, "div.mw-parser-output > p:not(.mw-empty-elt)").text
+
+     context_sentence = first_paragraph.split(". ")[0]
+     # print(context_sentence)
+
+     return context_sentence
+
+ def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100):
+
+     ##### Set up Chrome options
+     chrome_options = webdriver.ChromeOptions()
+     chrome_options.add_argument("--headless")  # Ensure GUI is off
+     chrome_options.add_argument("--no-sandbox")
+     chrome_options.add_argument("--disable-dev-shm-usage")
+     driver = webdriver.Chrome(options = chrome_options)
+
+     topic = starting_topic
+     num_pages = 0
+     used_topics = []
+     used_links = []
+
+     start_time = time.time()
+
+     ### BEGIN ###
+
+     print("-" * 150)
+     print("\nStarting!\n")
+     print("-" * 150)
+
+     driver = search_wikipedia(starting_topic, driver)
+     used_links.append(driver.current_url)
+
+     while True:
+         # increment the page count by 1 for each new page
+         num_pages += 1
+
+         # if not the first page, navigate to the new page
+         if num_pages > 1:
+             driver.get(next_link)
+
+         context_sentence = get_topic_context(driver)
+         links_texts = []
+
+         current_url = driver.current_url
+         current_url_suffix = str(current_url).split("/")[-1]
+
+         ### Parse the page HTML from Selenium with BeautifulSoup for link extraction
+         current_page = driver.page_source
+
+         soup = BeautifulSoup(current_page, 'html.parser')
+
+         links = soup.find_all('a')
+
+         # Iterate through the links and extract their URLs
+         for link in links:
+             link_url = link.get('href')
+             if link_url and link_url.startswith("/wiki/"):
+                 link_url = "https://en.wikipedia.org" + link_url
+                 link_text = link.text.strip()  # Get the text and remove leading/trailing spaces
+
+                 # make sure the text is non-empty and the link is not a self-link
+                 if link_text and current_url_suffix not in link_url:
+
+                     if link_url not in used_links and link_text not in used_topics:
+
+                         # eliminates topic duplicates, non-wiki links, and wiki-help pages (non-content pages)
+                         if topic.lower() not in link_url.lower() and "en.wikipedia.org/wiki/" in link_url and ":" not in "".join(link_url.split("/")[1:]) and "Main_Page" != str(link_url.split("/")[-1]):
+                             links_texts.append((link_url, link_text))
+
+         best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_topic, labels_list = [text for link, text in links_texts])
+
+         print(f"\nPage: {num_pages}")
+         print(f"Current topic: '{topic.title()}'")
+         print(f"Current URL: '{current_url}'")
+         print(f"Current Topic Context: '{context_sentence}'")
+         print(f"Next topic: '{best_label.title()}'. Semantic similarity to '{target_topic.title()}': {round((best_score * 100), 2)}%")
+
+         next_link, topic = links_texts[loc_idx]
+         # print(next_link)
+
+         # stop if the topic text is identical, is at least 90% the same spelling, or is semantically very close
+         if target_topic.lower() == topic.lower() or calculate_jaro_similarity(target_topic.lower(), topic.lower()) > 0.9 or best_score > 0.90:
+             print("\n" + "-" * 150)
+             print(f"\nFrom '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages, {round(time.time() - start_time, 2)} seconds!")
+             print(f"Starting topic: '{starting_topic.title()}': '{used_links[0]}'")
+             print(f"Target topic: '{target_topic.title()}': '{next_link}'\n")
+             print("-" * 150)
+             break
+
+         ##### ADD DRAMATIC DELAY HERE #####
+         # time.sleep(0.5)
+         # time.sleep(10)
+
+         if num_pages == limit:
+             print("\n" + "-" * 150)
+             print(f"\nUnfortunately, the model couldn't get from '{starting_topic.title()}' to '{target_topic.title()}' in {num_pages} pages or fewer.")
+             print(f"In {round(time.time() - start_time, 2)} seconds, it got from '{starting_topic.title()}': '{used_links[0]}', to '{topic.title()}': '{used_links[-1]}'")
+             print("\nTry a different combination to see if it can do it!\n")
+             print("-" * 150)
+             break
+
+         used_links.append(next_link)
+         used_topics.append(topic)
+
+     driver.quit()
+
+ ###### Example
+
+ # starting_topic = "soulja boy"
+ # target_topic = "test"
+ # play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 50)
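Both stopping signals in play_wiki_game can be sanity-checked in isolation. A minimal sketch, assuming the pinned jellyfish, numpy, and tensorflow-hub versions above (the example strings are illustrative, not from the commit):

import jellyfish
import numpy as np
import tensorflow_hub as hub

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Spelling signal: Jaro similarity in [0, 1]; the game stops above 0.9.
print(jellyfish.jaro_similarity("colour", "color"))  # ~0.94

# Semantic signal: inner product of Universal Sentence Encoder embeddings,
# used as a cosine proxy exactly as in most_similar_sentence; stops above 0.90.
a, b = embed(["dog"]), embed(["puppy"])
print(float(np.inner(a[0], b[0])))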
wiki_game_st.py ADDED
@@ -0,0 +1,159 @@
+ from selenium import webdriver
+ from selenium.webdriver.common.by import By
+ from selenium.webdriver.common.keys import Keys
+ from bs4 import BeautifulSoup
+ import time
+ # !pip install tensorflow tensorflow-hub
+ import tensorflow as tf
+ import tensorflow_hub as hub
+ import numpy as np
+ # !pip install jellyfish
+ import jellyfish
+ import streamlit as st
+
+ # Load the pre-trained Universal Sentence Encoder
+ embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
+
+ def calculate_jaro_similarity(str1, str2):
+     jaro_similarity = jellyfish.jaro_similarity(str1, str2)
+     return jaro_similarity
+
+ def most_similar_sentence(target_topic, labels_list):
+     # Encode the target topic and all candidate labels
+     context_embedding = embed([target_topic])[0]
+     sentence_embeddings = embed(labels_list)
+
+     # Calculate cosine similarities between the target topic and each label
+     similarities = np.inner(context_embedding, sentence_embeddings)
+
+     # Find the index of the most similar label
+     most_similar_index = np.argmax(similarities)
+
+     return labels_list[most_similar_index], similarities[most_similar_index], most_similar_index
+
+ def search_wikipedia(query, driver):
+     # Go to Wikipedia's main page
+     driver.get("https://www.wikipedia.org/")
+
+     # Find the search bar using its name
+     search_bar = driver.find_element(By.NAME, "search")
+
+     # Send the query to the search bar and hit Enter
+     search_bar.send_keys(query)
+     search_bar.send_keys(Keys.RETURN)
+
+     return driver
+
+ def get_topic_context(driver):
+     # Find the first paragraph of the main article
+     first_paragraph = driver.find_element(By.CSS_SELECTOR, "div.mw-parser-output > p:not(.mw-empty-elt)").text
+
+     context_sentence = first_paragraph.split(". ")[0]
+     # st.write(context_sentence)
+
+     return context_sentence
+
+ def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100):
+
+     ##### Set up Chrome options
+     chrome_options = webdriver.ChromeOptions()
+     chrome_options.add_argument("--headless")  # Ensure GUI is off
+     chrome_options.add_argument("--no-sandbox")
+     chrome_options.add_argument("--disable-dev-shm-usage")
+     driver = webdriver.Chrome(options = chrome_options)
+
+     topic = starting_topic
+     num_pages = 0
+     used_topics = []
+     used_links = []
+
+     start_time = time.time()
+
+     ### BEGIN ###
+
+     st.write("-" * 150)
+     st.write("\nStarting!\n")
+     st.write("-" * 150)
+
+     driver = search_wikipedia(starting_topic, driver)
+     used_links.append(driver.current_url)
+
+     while True:
+         # increment the page count by 1 for each new page
+         num_pages += 1
+
+         # if not the first page, navigate to the new page
+         if num_pages > 1:
+             driver.get(next_link)
+
+         context_sentence = get_topic_context(driver)
+         links_texts = []
+
+         current_url = driver.current_url
+         current_url_suffix = str(current_url).split("/")[-1]
+
+         ### Parse the page HTML from Selenium with BeautifulSoup for link extraction
+         current_page = driver.page_source
+
+         soup = BeautifulSoup(current_page, 'html.parser')
+
+         links = soup.find_all('a')
+
+         # Iterate through the links and extract their URLs
+         for link in links:
+             link_url = link.get('href')
+             if link_url and link_url.startswith("/wiki/"):
+                 link_url = "https://en.wikipedia.org" + link_url
+                 link_text = link.text.strip()  # Get the text and remove leading/trailing spaces
+
+                 # make sure the text is non-empty and the link is not a self-link
+                 if link_text and current_url_suffix not in link_url:
+
+                     if link_url not in used_links and link_text not in used_topics:
+
+                         # eliminates topic duplicates, non-wiki links, and wiki-help pages (non-content pages)
+                         if topic.lower() not in link_url.lower() and "en.wikipedia.org/wiki/" in link_url and ":" not in "".join(link_url.split("/")[1:]) and "Main_Page" != str(link_url.split("/")[-1]):
+                             links_texts.append((link_url, link_text))
+
+         best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_topic, labels_list = [text for link, text in links_texts])
+
+         st.write(f"\nPage: {num_pages}")
+         st.write(f"Current topic: '{topic.title()}'")
+         st.write(f"Current URL: '{current_url}'")
+         st.write(f"Current Topic Context: '{context_sentence}'")
+         st.write(f"Next topic: '{best_label.title()}'. Semantic similarity to '{target_topic.title()}': {round((best_score * 100), 2)}%")
+
+         next_link, topic = links_texts[loc_idx]
+         # st.write(next_link)
+
+         # stop if the topic text is identical, is at least 90% the same spelling, or is semantically very close
+         if target_topic.lower() == topic.lower() or calculate_jaro_similarity(target_topic.lower(), topic.lower()) > 0.9 or best_score > 0.90:
+             st.write("\n" + "-" * 150)
+             st.write(f"\nFrom '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages, {round(time.time() - start_time, 2)} seconds!")
+             st.write(f"Starting topic: '{starting_topic.title()}': '{used_links[0]}'")
+             st.write(f"Target topic: '{target_topic.title()}': '{next_link}'\n")
+             st.write("-" * 150)
+             break
+
+         ##### ADD DRAMATIC DELAY HERE #####
+         # time.sleep(0.5)
+         # time.sleep(10)
+
+         if num_pages == limit:
+             st.write("\n" + "-" * 150)
+             st.write(f"\nUnfortunately, the model couldn't get from '{starting_topic.title()}' to '{target_topic.title()}' in {num_pages} pages or fewer.")
+             st.write(f"In {round(time.time() - start_time, 2)} seconds, it got from '{starting_topic.title()}': '{used_links[0]}', to '{topic.title()}': '{used_links[-1]}'")
+             st.write("\nTry a different combination to see if it can do it!\n")
+             st.write("-" * 150)
+             break
+
+         used_links.append(next_link)
+         used_topics.append(topic)
+
+     driver.quit()
+
+ ###### Example
+
+ # starting_topic = "soulja boy"
+ # target_topic = "test"
+ # play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 50)
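wiki_game_local.py and wiki_game_st.py are identical except for how they report progress (print versus st.write). A small sketch of how one shared module could take the reporter as a parameter, an editorial suggestion rather than anything in this commit:

from typing import Callable

# One implementation, parameterized over the output function.
def report(message: str, log: Callable[[str], None] = print) -> None:
    log(message)

report("Page: 1")                      # wiki_game_local.py behavior
# import streamlit as st
# report("Page: 1", log=st.write)      # wiki_game_st.py behavior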