st.write(f"Current URL: 'https://en.wikipedia.org/wiki/{topic}'") # st.write(f"Current Topic Context: '{context_sentence}'") # st.write(f"Next topic: '{best_label.title()}'. Semantic similarity to '{target_topic.title()}': {round((best_score * 100), 2)}%") # next_link, topic = links_texts[loc_idx] # if target_topic.lower() == topic.lower() or calculate_jaro_similarity(target_topic.lower(), topic.lower()) > 0.9 or best_score > float(0.90): # st.write("\n" + "-" * 150) # st.write(f"\nFrom '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages, {round(time.time() - start_time, 2)} seconds!") # st.write(f"Starting topic: '{starting_topic.title()}': {used_links[0].replace(' ', '_')}") # st.write(f"Target topic: '{target_topic.title()}': '{used_links[-1].replace(' ', '_')}'\n") # st.write("-" * 150) # break # if num_pages == limit: # st.write("\n" + "-" * 150) # st.write(f"\nUnfortunately, the model couldn't get from '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages or less.") # st.write(f"In {round(time.time() - start_time, 2)} seconds, it got from '{starting_topic.title()}': '{used_links[0].replace(' ', '_')}', to '{used_topics[-1].title()}': '{used_links[-1].replace(' ', '_')}'") # st.write(f"\nTry a different combination to see if it can do it!\n") # st.write("-" * 150) # break # used_links.append(next_link) # used_topics.append(topic) # # starting_topic = "soulja boy" # # target_topic = "game" # # play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 100) import requests from bs4 import BeautifulSoup import time import tensorflow as tf import tensorflow_hub as hub import numpy as np import jellyfish import re import streamlit as st # Load the pre-trained Universal Sentence Encoder embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4") def calculate_jaro_similarity(str1, str2): jaro_similarity = jellyfish.jaro_distance(str1, str2) return jaro_similarity def most_similar_sentence(target_topic, labels_list): context_embedding = embed([target_topic])[0] sentence_embeddings = embed(labels_list) similarities = np.inner(context_embedding, sentence_embeddings) most_similar_index = np.argmax(similarities) return labels_list[most_similar_index], similarities[most_similar_index], most_similar_index def get_wikipedia_page(query): if "wikipedia" not in query: response = requests.get(f"https://en.wikipedia.org/wiki/{query}") else: response = requests.get(query) return response.text def get_topic_context(page_source): soup = BeautifulSoup(page_source, 'html.parser') first_paragraph = soup.select_one("div.mw-parser-output > p:not(.mw-empty-elt)").text context_sentence = re.sub(r'\[.*?\]', '', first_paragraph) context_sentence = context_sentence.split(". 
")[0].strip().replace("\n", "") while " " in context_sentence: context_sentence = context_sentence.replace(" ", " ") return context_sentence def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100, delay: int = 0): topic = starting_topic num_pages = 0 used_topics = [] used_links = [] start_time = time.time() st.write("-" * 150) st.write(f"\nStarting!\n") st.write("-" * 150) page_source = get_wikipedia_page(starting_topic) used_links.append(f"https://en.wikipedia.org/wiki/{starting_topic}") placeholder = st.empty() # Placeholder for your most recent statement old_statements = "" # Store old statements while True: num_pages += 1 if num_pages > 1: # load url to new page of next topic page_source = get_wikipedia_page(used_links[-1]) # create backup list of links and texts from previous page in case new current page fails prev_links_texts = links_texts.pop(loc_idx) # removes the previously used topic try: context_sentence = get_topic_context(page_source) except Exception as e: context_sentence = "Context could not be found from webpage" links_texts = [] soup = BeautifulSoup(page_source, 'html.parser') links = soup.find_all('a') for link in links: link_url = link.get('href') if link_url and link_url.startswith("/wiki/"): link_url = "https://en.wikipedia.org" + link_url link_text = link.text.strip() if link_text and topic.lower() not in link_url.lower() and link_url not in used_links and link_text not in used_topics: if "en.wikipedia.org/wiki/" in link_url and ":" not in "".join(link_url.split("/")[1:]) and "Main_Page" != str(link_url.split("/")[-1]): links_texts.append((link_url, link_text)) # st.write(f'links_texts length: {len(links_texts)}') prev_links_texts = [] if len(links_texts) == 0 and num_pages > 1: # if no links links_texts = prev_links_texts labels_list = [text for link, text in links_texts] # st.write(f'labels_list length: {len(labels_list)}') try: best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_topic, labels_list = labels_list) except Exception as e: best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_topic, labels_list = prev_links_texts) # previous page links without chosen error page next_link, topic = links_texts[loc_idx] new_statement = f"\nPage: {num_pages}\nCurrent topic: '{topic.title()}'\nCurrent URL: 'https://en.wikipedia.org/wiki/{topic.replace(' ', '_')}'\nCurrent Topic Context: '{context_sentence}'\nNext topic: '{best_label.title()}'. 
Semantic similarity to '{target_topic.title()}': {round((best_score * 100), 2)}%" old_statements = new_statement + "\n" + old_statements placeholder.text(old_statements) used_links.append(next_link) used_topics.append(topic) if target_topic.lower() == topic.lower() or calculate_jaro_similarity(target_topic.lower(), topic.lower()) > 0.9 or best_score > float(0.90): new_statement = "\n" + "-" * 150 + f"\nFrom '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages, {round(time.time() - start_time, 2)} seconds!\nStarting topic: '{starting_topic.title()}': {used_links[0].replace(' ', '_')}\nTarget topic: '{target_topic.title()}': '{used_links[-1].replace(' ', '_')}'\n" + "-" * 150 old_statements = new_statement + "\n" + old_statements placeholder.text(old_statements) break if num_pages == limit: new_statement = "\n" + "-" * 150 + f"\nUnfortunately, the model couldn't get from '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages or less.\nIn {round(time.time() - start_time, 2)} seconds, it got from '{starting_topic.title()}': '{used_links[0].replace(' ', '_')}', to '{used_topics[-1].title()}': '{used_links[-1].replace(' ', '_')}'\n\nTry a different combination to see if it can do it!\n" + "-" * 150 old_statements = new_statement + "\n" + old_statements placeholder.text(old_statements) break # delay things, if applicable time.sleep(delay) # starting_topic = "soulja boy" # target_topic = "game" # delay = 0 # play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 100, delay = delay)
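# Minimal Streamlit front-end sketch (an illustrative assumption about how this script
# might be wired into a UI, not part of the original game logic; widget labels and
# defaults are placeholders). Uncomment to let users pick the topics themselves:
# start = st.text_input("Starting topic", value="soulja boy")
# target = st.text_input("Target topic", value="game")
# max_pages = st.number_input("Page limit", min_value=1, max_value=500, value=100)
# if st.button("Play the Wiki game"):
#     play_wiki_game(starting_topic=start, target_topic=target, limit=int(max_pages))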