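# Wiki Game solver: starting from one Wikipedia article, repeatedly follow the
# outgoing link whose text is semantically closest to the target topic (scored
# with Universal Sentence Encoder embeddings) until the target page is reached
# or a page limit is hit. Progress streams to a Streamlit placeholder.
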
import requests
from bs4 import BeautifulSoup
import time
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import jellyfish
import re
import streamlit as st
# Load the pre-trained Universal Sentence Encoder
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
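# tensorflow_hub downloads and caches the model on first run, so the initial
# load can take a while.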

def calculate_jaro_similarity(str1, str2):
    # Jaro similarity ranges from 0.0 (no match) to 1.0 (identical strings).
    # Note: newer jellyfish releases expose this as jellyfish.jaro_similarity.
    jaro_similarity = jellyfish.jaro_distance(str1, str2)
    return jaro_similarity

def most_similar_sentence(target_topic, labels_list):
    # Embed the target topic and every candidate link text, then pick the
    # candidate with the highest inner-product similarity
    context_embedding = embed([target_topic])[0]
    sentence_embeddings = embed(labels_list)
    similarities = np.inner(context_embedding, sentence_embeddings)
    most_similar_index = np.argmax(similarities)
    return labels_list[most_similar_index], similarities[most_similar_index], most_similar_index
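# For example, most_similar_sentence("game", ["music", "video game", "river"])
# should return ("video game", <score>, 1); USE embeddings are approximately
# unit-norm, so the inner product behaves like a cosine similarity.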

def get_wikipedia_page(query):
    if "wikipedia" not in query:
        response = requests.get(f"https://en.wikipedia.org/wiki/{query}")
    else:
        response = requests.get(query)
    return response.text

def get_topic_context(page_source):
    # Use the article's first non-empty paragraph as a one-sentence summary
    soup = BeautifulSoup(page_source, 'html.parser')
    first_paragraph = soup.select_one("div.mw-parser-output > p:not(.mw-empty-elt)").text
    context_sentence = re.sub(r'\[.*?\]', '', first_paragraph)  # strip citation markers like [1]
    context_sentence = context_sentence.split(". ")[0].strip().replace("\n", "")
    while "  " in context_sentence:  # collapse runs of spaces
        context_sentence = context_sentence.replace("  ", " ")
    return context_sentence
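# Note: select_one returns None on pages without a matching paragraph, so the
# .text access raises; play_wiki_game wraps this helper in try/except for that
# reason.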

def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100, delay: int = 0):
    topic = starting_topic
    num_pages = 0
    used_topics = []
    used_links = []
    prev_links_texts = []  # backup of the previous page's links, used when the current page yields none
    start_time = time.time()

    st.write("-" * 150)
    st.write("\nStarting!\n")
    st.write("-" * 150)

    page_source = get_wikipedia_page(starting_topic)
    used_links.append(f"https://en.wikipedia.org/wiki/{starting_topic}")
    placeholder = st.empty()  # placeholder for the running log of statements
    old_statements = ""  # store old statements

    while True:
        num_pages += 1

        if num_pages > 1:
            # load the page for the next chosen topic
            page_source = get_wikipedia_page(used_links[-1])
            # keep the previous page's links (minus the topic just used) as a backup
            # in case the new page fails
            links_texts.pop(loc_idx)  # removes the previously used topic
            prev_links_texts = links_texts
        try:
            context_sentence = get_topic_context(page_source)
        except Exception:
            context_sentence = "Context could not be found from webpage"

        links_texts = []
        soup = BeautifulSoup(page_source, 'html.parser')
        links = soup.find_all('a')
        for link in links:
            link_url = link.get('href')
            if link_url and link_url.startswith("/wiki/"):
                link_url = "https://en.wikipedia.org" + link_url
                link_text = link.text.strip()
                # skip links back to the current topic, already-visited URLs, and already-used labels
                if link_text and topic.lower() not in link_url.lower() and link_url not in used_links and link_text not in used_topics:
                    # keep only plain article links: no namespace pages (colons in the path) and not the Main Page
                    if "en.wikipedia.org/wiki/" in link_url and ":" not in "".join(link_url.split("/")[1:]) and "Main_Page" != str(link_url.split("/")[-1]):
                        links_texts.append((link_url, link_text))
        # st.write(f'links_texts length: {len(links_texts)}')

        if len(links_texts) == 0 and num_pages > 1:  # if the current page had no usable links
            links_texts = prev_links_texts

        labels_list = [text for link, text in links_texts]
        # st.write(f'labels_list length: {len(labels_list)}')
        try:
            best_label, best_score, loc_idx = most_similar_sentence(target_topic=target_topic, labels_list=labels_list)
        except Exception:
            # fall back to the previous page's links, without the link that led to the failing page
            links_texts = prev_links_texts
            labels_list = [text for link, text in prev_links_texts]
            best_label, best_score, loc_idx = most_similar_sentence(target_topic=target_topic, labels_list=labels_list)
        # report the current page before moving on to the chosen next topic
        new_statement = f"\nPage: {num_pages}\nCurrent topic: '{topic.title()}'\nCurrent URL: 'https://en.wikipedia.org/wiki/{topic.replace(' ', '_')}'\nCurrent Topic Context: '{context_sentence}'\nNext topic: '{best_label.title()}'. Semantic similarity to '{target_topic.title()}': {round((best_score * 100), 2)}%"
        old_statements = new_statement + "\n" + old_statements
        placeholder.text(old_statements)

        next_link, topic = links_texts[loc_idx]
        used_links.append(next_link)
        used_topics.append(topic)

        # success: an exact match, a near-match by Jaro similarity, or a very high semantic score
        if target_topic.lower() == topic.lower() or calculate_jaro_similarity(target_topic.lower(), topic.lower()) > 0.9 or best_score > 0.90:
            new_statement = "\n" + "-" * 150 + f"\nFrom '{starting_topic.title()}' to '{target_topic.title()}' in {num_pages} pages, {round(time.time() - start_time, 2)} seconds!\nStarting topic: '{starting_topic.title()}': {used_links[0].replace(' ', '_')}\nTarget topic: '{target_topic.title()}': '{used_links[-1].replace(' ', '_')}'\n" + "-" * 150
            old_statements = new_statement + "\n" + old_statements
            placeholder.text(old_statements)
            break

        if num_pages == limit:
            new_statement = "\n" + "-" * 150 + f"\nUnfortunately, the model couldn't get from '{starting_topic.title()}' to '{target_topic.title()}' in {num_pages} pages or fewer.\nIn {round(time.time() - start_time, 2)} seconds, it got from '{starting_topic.title()}': '{used_links[0].replace(' ', '_')}' to '{used_topics[-1].title()}': '{used_links[-1].replace(' ', '_')}'\n\nTry a different combination to see if it can do it!\n" + "-" * 150
            old_statements = new_statement + "\n" + old_statements
            placeholder.text(old_statements)
            break

        # pause between requests, if a delay was requested
        time.sleep(delay)

# starting_topic = "soulja boy"
# target_topic = "game"
# delay = 0
# play_wiki_game(starting_topic=starting_topic, target_topic=target_topic, limit=100, delay=delay)
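
# A minimal sketch of how this might be wired into the Streamlit UI
# (hypothetical widget labels; the deployed Space may collect inputs differently):
# start = st.text_input("Starting topic", value="soulja boy")
# target = st.text_input("Target topic", value="game")
# if st.button("Play the Wiki Game"):
#     play_wiki_game(starting_topic=start, target_topic=target, limit=100, delay=0)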