import requests
from bs4 import BeautifulSoup
import time
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import jellyfish
import re
import streamlit as st
# Load the pre-trained Universal Sentence Encoder
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
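# Quick sanity check (a minimal sketch, commented out so it does not run on app startup):
# the encoder maps any list of strings to 512-dimensional vectors, so inner-product
# similarity between embeddings is well defined.
# example_vectors = embed(["wikipedia", "encyclopedia"])
# print(example_vectors.shape)  # expected: (2, 512)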
def calculate_jaro_similarity(str1, str2):
    # jellyfish.jaro_distance, despite its name, returns a Jaro similarity score in [0, 1]
    # (1.0 means the strings are identical)
    jaro_similarity = jellyfish.jaro_distance(str1, str2)
    return jaro_similarity
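# Illustrative examples (commented out): close strings clear the 0.9 threshold used in the
# game loop below, unrelated ones do not.
# calculate_jaro_similarity("game", "games")   # roughly 0.93
# calculate_jaro_similarity("game", "piano")   # roughly 0.48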
def most_similar_sentence(target_topic, labels_list):
    # Embed the target topic and all candidate labels, then pick the label whose
    # embedding has the largest inner product with the target embedding.
    context_embedding = embed([target_topic])[0]
    sentence_embeddings = embed(labels_list)
    similarities = np.inner(context_embedding, sentence_embeddings)
    most_similar_index = np.argmax(similarities)
    return labels_list[most_similar_index], similarities[most_similar_index], most_similar_index
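# Illustrative example (commented out): given semantically related candidates, the most
# similar label, its score, and its index are returned.
# most_similar_sentence("game", ["video game", "piano", "cooking"])
# would be expected to return something like ("video game", <score>, 0).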
def get_wikipedia_page(query):
    # Accept either a bare topic name or a full Wikipedia URL
    if "wikipedia" not in query:
        response = requests.get(f"https://en.wikipedia.org/wiki/{query}")
    else:
        response = requests.get(query)
    return response.text
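# Illustrative example (commented out) -- both forms fetch the same page source:
# get_wikipedia_page("Piano")
# get_wikipedia_page("https://en.wikipedia.org/wiki/Piano")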
def get_topic_context(page_source):
    # Use the first non-empty paragraph of the article as a context sentence
    soup = BeautifulSoup(page_source, 'html.parser')
    first_paragraph = soup.select_one("div.mw-parser-output > p:not(.mw-empty-elt)").text
    # Strip citation markers like [1] and keep only the first sentence
    context_sentence = re.sub(r'\[.*?\]', '', first_paragraph)
    context_sentence = context_sentence.split(". ")[0].strip().replace("\n", "")
    # Collapse any runs of whitespace left behind by the markup
    while "  " in context_sentence:
        context_sentence = context_sentence.replace("  ", " ")
    return context_sentence
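# Illustrative example (commented out):
# get_topic_context(get_wikipedia_page("Piano"))
# returns the first sentence of the article's lead paragraph, with citation markers removed.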
def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100, delay: int = 0):
    topic = starting_topic
    num_pages = 0
    used_topics = []
    used_links = []
    prev_links_texts = []  # backup of the previous page's links in case the current page fails
    start_time = time.time()

    st.write("-" * 150)
    st.write(f"\nStarting!\n")
    st.write("-" * 150)

    page_source = get_wikipedia_page(starting_topic)
    used_links.append(f"https://en.wikipedia.org/wiki/{starting_topic}")

    placeholder = st.empty()  # placeholder for the running log of statements
    old_statements = ""  # store old statements

    while True:
        num_pages += 1

        if num_pages > 1:
            # load the page of the most recently chosen topic
            page_source = get_wikipedia_page(used_links[-1])
            # keep the previous page's remaining links as a backup in case the new page fails,
            # dropping the link that was just followed
            links_texts.pop(loc_idx)
            prev_links_texts = links_texts

        try:
            context_sentence = get_topic_context(page_source)
        except Exception:
            context_sentence = "Context could not be found from webpage"

        # collect candidate article links from the current page
        links_texts = []
        soup = BeautifulSoup(page_source, 'html.parser')
        links = soup.find_all('a')

        for link in links:
            link_url = link.get('href')
            if link_url and link_url.startswith("/wiki/"):
                link_url = "https://en.wikipedia.org" + link_url
                link_text = link.text.strip()
                # skip empty anchors, already-visited pages, namespace pages (those containing ':'), and the Main Page
                if link_text and topic.lower() not in link_url.lower() and link_url not in used_links and link_text not in used_topics:
                    if "en.wikipedia.org/wiki/" in link_url and ":" not in "".join(link_url.split("/")[1:]) and "Main_Page" != str(link_url.split("/")[-1]):
                        links_texts.append((link_url, link_text))

        if len(links_texts) == 0 and num_pages > 1:  # if no usable links were found, fall back to the previous page's links
            links_texts = prev_links_texts

        labels_list = [text for link, text in links_texts]

        try:
            best_label, best_score, loc_idx = most_similar_sentence(target_topic=target_topic, labels_list=labels_list)
        except Exception:
            # fall back to the previous page's links, without the page that just errored
            links_texts = prev_links_texts
            labels_list = [text for link, text in links_texts]
            best_label, best_score, loc_idx = most_similar_sentence(target_topic=target_topic, labels_list=labels_list)

        new_statement = f"\nPage: {num_pages}\nCurrent topic: '{topic.title()}'\nCurrent URL: 'https://en.wikipedia.org/wiki/{topic.replace(' ', '_')}'\nCurrent Topic Context: '{context_sentence}'\nNext topic: '{best_label.title()}'. Semantic similarity to '{target_topic.title()}': {round((best_score * 100), 2)}%"
        old_statements = new_statement + "\n" + old_statements
        placeholder.text(old_statements)

        next_link, topic = links_texts[loc_idx]
        used_links.append(next_link)
        used_topics.append(topic)

        if target_topic.lower() == topic.lower() or calculate_jaro_similarity(target_topic.lower(), topic.lower()) > 0.9 or best_score > 0.90:
            new_statement = "\n" + "-" * 150 + f"\nFrom '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages, {round(time.time() - start_time, 2)} seconds!\nStarting topic: '{starting_topic.title()}': {used_links[0].replace(' ', '_')}\nTarget topic: '{target_topic.title()}': '{used_links[-1].replace(' ', '_')}'\n" + "-" * 150
            old_statements = new_statement + "\n" + old_statements
            placeholder.text(old_statements)
            break

        if num_pages == limit:
            new_statement = "\n" + "-" * 150 + f"\nUnfortunately, the model couldn't get from '{starting_topic.title()}' to '{target_topic.title()}' in {num_pages} pages or less.\nIn {round(time.time() - start_time, 2)} seconds, it got from '{starting_topic.title()}': '{used_links[0].replace(' ', '_')}' to '{used_topics[-1].title()}': '{used_links[-1].replace(' ', '_')}'\n\nTry a different combination to see if it can do it!\n" + "-" * 150
            old_statements = new_statement + "\n" + old_statements
            placeholder.text(old_statements)
            break

        # optional delay between page fetches, if applicable
        time.sleep(delay)
# starting_topic = "soulja boy"
# target_topic = "game"
# delay = 0
# play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 100, delay = delay)
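# A minimal sketch (commented out, not part of the original app) of how the game could be
# wired to Streamlit inputs; the widget labels and defaults here are assumptions, not the
# app's actual UI:
# start = st.text_input("Starting topic", value="soulja boy")
# target = st.text_input("Target topic", value="game")
# if st.button("Play"):
#     play_wiki_game(starting_topic=start, target_topic=target, limit=100, delay=0)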