import requests
from bs4 import BeautifulSoup
import time
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import jellyfish
import re
import streamlit as st
# Load the pre-trained Universal Sentence Encoder
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
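
# Quick sanity check of the encoder (illustrative, not part of the game logic):
# USE v4 maps each input sentence to a 512-dimensional vector.
#
#   vecs = embed(["puppy", "dog", "calculus"])
#   vecs.shape  # TensorShape([3, 512])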

def calculate_jaro_similarity(str1, str2):
    jaro_similarity = jellyfish.jaro_distance(str1, str2)
    return jaro_similarity
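
# Hedged, illustrative values for the fuzzy matcher: Jaro scores live in [0, 1],
# and the classic "martha"/"marhta" pair scores ~0.944 despite the transposition,
# which is why the > 0.9 threshold below works as a near-match test on titles.
#
#   calculate_jaro_similarity("martha", "marhta")  # ~0.944
#   calculate_jaro_similarity("apple", "zebra")    # 0.0 (no matching characters)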

def most_similar_sentence(target_topic, labels_list):
    # Embed the target and all candidate labels, then rank by inner-product similarity
    context_embedding = embed([target_topic])[0]
    sentence_embeddings = embed(labels_list)
    similarities = np.inner(context_embedding, sentence_embeddings)
    most_similar_index = np.argmax(similarities)
    return labels_list[most_similar_index], similarities[most_similar_index], most_similar_index
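
# Illustrative call (hypothetical labels; exact scores depend on the model):
#
#   best, score, idx = most_similar_sentence("basketball", ["tennis", "NBA", "chemistry"])
#   # 'NBA' would typically rank first; idx indexes back into the input list, so the
#   # caller can recover the paired URL from links_texts below.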

def get_wikipedia_page(query):
    # Accept either a bare topic name or a full Wikipedia URL
    if "wikipedia" not in query:
        response = requests.get(f"https://en.wikipedia.org/wiki/{query}")
    else:
        response = requests.get(query)
    return response.text
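
# Both call styles fetch the same article (sketch, assuming network access):
#
#   html_a = get_wikipedia_page("Python (programming language)")
#   html_b = get_wikipedia_page("https://en.wikipedia.org/wiki/Python_(programming_language)")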

def get_topic_context(page_source):
    # Use the first non-empty paragraph of the article as its context sentence
    soup = BeautifulSoup(page_source, 'html.parser')
    first_paragraph = soup.select_one("div.mw-parser-output > p:not(.mw-empty-elt)").text
    context_sentence = re.sub(r'\[.*?\]', '', first_paragraph)  # strip bracketed citation markers like [1]
    context_sentence = context_sentence.split(". ")[0].strip().replace("\n", "")
    while "  " in context_sentence:
        context_sentence = context_sentence.replace("  ", " ")  # collapse runs of spaces
    return context_sentence
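
# Illustrative result: for the "Python (programming language)" article this would
# return roughly its first sentence ("Python is a high-level, general-purpose
# programming language"), with bracketed citations and extra whitespace removed.
#
#   context = get_topic_context(get_wikipedia_page("Python (programming language)"))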

def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100):
    topic = starting_topic
    num_pages = 0
    used_topics = []
    used_links = []
    prev_links_texts = []  # backup of the previous page's links (see dead-end handling below)

    start_time = time.time()

    st.write("-" * 150)
    st.write("\nStarting!\n")
    st.write("-" * 150)

    page_source = get_wikipedia_page(starting_topic)
    used_links.append(f"https://en.wikipedia.org/wiki/{starting_topic}")
    while True:
        num_pages += 1

        if num_pages > 1:
            # Load the page for the topic chosen on the previous iteration
            page_source = get_wikipedia_page(used_links[-1])
            # Drop the link just followed, then keep the rest of the previous page's
            # links as a backup in case the new page fails or yields no usable links
            links_texts.pop(loc_idx)
            prev_links_texts = links_texts
        try:
            context_sentence = get_topic_context(page_source)
        except Exception:
            context_sentence = "Context could not be found from webpage"

        links_texts = []

        soup = BeautifulSoup(page_source, 'html.parser')
        links = soup.find_all('a')
        for link in links:
            link_url = link.get('href')
            if link_url and link_url.startswith("/wiki/"):
                link_url = "https://en.wikipedia.org" + link_url
                link_text = link.text.strip()
                if link_text and topic.lower() not in link_url.lower() and link_url not in used_links and link_text not in used_topics:
                    # Skip special namespaces (e.g. "File:", "Category:") and the Main Page
                    if "en.wikipedia.org/wiki/" in link_url and ":" not in "".join(link_url.split("/")[1:]) and "Main_Page" != str(link_url.split("/")[-1]):
                        links_texts.append((link_url, link_text))
        # st.write(f'links_texts length: {len(links_texts)}')
        if len(links_texts) == 0 and num_pages > 1:  # dead end: fall back to the previous page's links
            links_texts = prev_links_texts

        labels_list = [text for link, text in links_texts]
        # st.write(f'labels_list length: {len(labels_list)}')

        try:
            best_label, best_score, loc_idx = most_similar_sentence(target_topic=target_topic, labels_list=labels_list)
        except Exception:
            # Retry with the previous page's links (the link that led to the failing page was already removed)
            links_texts = prev_links_texts
            labels_list = [text for link, text in links_texts]
            best_label, best_score, loc_idx = most_similar_sentence(target_topic=target_topic, labels_list=labels_list)
st.write(f"\nPage: {num_pages}")
st.write(f"Current topic: '{topic.title()}'")
st.write(f"Current URL: 'https://en.wikipedia.org/wiki/{topic}'")
st.write(f"Current Topic Context: '{context_sentence}'")
st.write(f"Next topic: '{best_label.title()}'. Semantic similarity to '{target_topic.title()}': {round((best_score * 100), 2)}%")
next_link, topic = links_texts[loc_idx]
if target_topic.lower() == topic.lower() or calculate_jaro_similarity(target_topic.lower(), topic.lower()) > 0.9 or best_score > float(0.90):
st.write("\n" + "-" * 150)
st.write(f"\nFrom '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages, {round(time.time() - start_time, 2)} seconds!")
st.write(f"Starting topic: '{starting_topic.title()}': {used_links[0].replace(' ', '_')}")
st.write(f"Target topic: '{target_topic.title()}': '{used_links[-1].replace(' ', '_')}'\n")
st.write("-" * 150)
break
        if num_pages == limit:
            st.write("\n" + "-" * 150)
            st.write(f"\nUnfortunately, the model couldn't get from '{starting_topic.title()}' to '{target_topic.title()}' in {num_pages} pages or fewer.")
            st.write(f"In {round(time.time() - start_time, 2)} seconds, it got from '{starting_topic.title()}': '{used_links[0].replace(' ', '_')}' to '{used_topics[-1].title()}': '{used_links[-1].replace(' ', '_')}'")
            st.write("\nTry a different combination to see if it can do it!\n")
            st.write("-" * 150)
            break

        used_links.append(next_link)
        used_topics.append(topic)
# starting_topic = "soulja boy"
# target_topic = "game"
# play_wiki_game(starting_topic=starting_topic, target_topic=target_topic, limit=100)
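
# A minimal Streamlit front end for the game might look like this sketch
# (hypothetical widget labels; the live Space may wire inputs differently):
#
#   starting_topic = st.text_input("Starting topic", value="soulja boy")
#   target_topic = st.text_input("Target topic", value="game")
#   if st.button("Play"):
#       play_wiki_game(starting_topic=starting_topic, target_topic=target_topic, limit=100)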