"""Play the "Wikipedia game" with a headless browser.

Starting from one article, repeatedly follow the link whose text is most
semantically similar to a target topic (scored with the Universal Sentence
Encoder) until the target is reached or a page limit is hit.
"""

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import time

# !pip install tensorflow tensorflow-hub
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np

# !pip install jellyfish
import jellyfish

# Load the pre-trained Universal Sentence Encoder
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

WIKIPEDIA_HOME = "https://www.wikipedia.org/"
WIKIPEDIA_EN_BASE = "https://en.wikipedia.org"
SEPARATOR = "-" * 150


def calculate_jaro_similarity(str1, str2):
    """Return the Jaro similarity of two strings as computed by jellyfish.

    Newer jellyfish releases expose this metric as ``jaro_similarity``;
    older ones only provide ``jaro_distance``. Prefer the new name and
    fall back to the old one so both generations of the library work.
    """
    jaro = getattr(jellyfish, "jaro_similarity", None)
    if jaro is None:
        jaro = jellyfish.jaro_distance
    return jaro(str1, str2)


def most_similar_sentence(target_topic, labels_list):
    """Pick the label that is semantically closest to ``target_topic``.

    Args:
        target_topic: Text to compare every label against.
        labels_list: Non-empty list of candidate label strings.

    Returns:
        A ``(label, score, index)`` tuple: the best-matching label, its
        inner-product score against the target embedding, and its position
        in ``labels_list``.

    Raises:
        ValueError: If ``labels_list`` is empty.
    """
    if not labels_list:
        raise ValueError("labels_list must contain at least one label")

    # Encode the target and all candidate labels
    context_embedding = embed([target_topic])[0]
    sentence_embeddings = embed(labels_list)

    # Inner product of the target embedding with every label embedding
    similarities = np.inner(context_embedding, sentence_embeddings)

    # Index of the highest-scoring label
    most_similar_index = int(np.argmax(similarities))

    return (
        labels_list[most_similar_index],
        similarities[most_similar_index],
        most_similar_index,
    )


def search_wikipedia(query, driver):
    """Submit ``query`` through the search box on Wikipedia's portal page.

    Args:
        query: Text typed into the search box.
        driver: Selenium WebDriver used for navigation.

    Returns:
        The same ``driver``, after the search has been submitted.
    """
    # Go to Wikipedia's main page
    driver.get(WIKIPEDIA_HOME)

    # Find the search bar using its name
    search_bar = driver.find_element(By.NAME, "search")

    # Send the query to the search bar and hit Enter
    search_bar.send_keys(query)
    search_bar.send_keys(Keys.RETURN)

    return driver


def get_topic_context(driver):
    """Return the text of the current article's first paragraph up to ". ".

    The lookup uses ``find_element``, so a page without a matching paragraph
    propagates Selenium's lookup error to the caller.
    """
    # Find the first non-empty paragraph of the main article
    first_paragraph = driver.find_element(
        By.CSS_SELECTOR, "div.mw-parser-output > p:not(.mw-empty-elt)"
    ).text

    return first_paragraph.split(". ")[0]


def _collect_candidate_links(page_source, current_url, topic, used_links, used_topics):
    """Return ``(url, text)`` pairs for the article links worth following."""
    current_url_suffix = str(current_url).split("/")[-1]
    topic_lower = topic.lower()
    soup = BeautifulSoup(page_source, "html.parser")

    candidates = []
    for anchor in soup.find_all("a"):
        href = anchor.get("href")
        if not href or not href.startswith("/wiki/"):
            continue  # not an internal article link

        link_url = WIKIPEDIA_EN_BASE + href
        link_text = anchor.text.strip()
        if not link_text or current_url_suffix in link_url:
            continue  # no visible text, or points back at the current page
        if link_url in used_links or link_text in used_topics:
            continue  # already visited
        if topic_lower in link_url.lower():
            continue  # duplicate of the current topic
        # A colon marks namespaced, non-content pages (Help:, File:, ...)
        if ":" in href or href.split("/")[-1] == "Main_Page":
            continue

        candidates.append((link_url, link_text))

    return candidates


def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100):
    """Navigate from ``starting_topic`` toward ``target_topic``, printing progress.

    On every page the link whose text scores highest against the target is
    chosen as the next hop. The game stops when the chosen link text equals
    the target, has a Jaro similarity above 0.9 to it, or scores above 0.90;
    when ``limit`` pages have been visited; or when a page offers no
    unvisited candidate links.

    Args:
        starting_topic: Query used to find the first article.
        target_topic: Topic the game tries to reach.
        limit: Maximum number of pages to visit before giving up.
    """
    ##### Setup Chrome options
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--headless")  # Ensure GUI is off
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=chrome_options)

    # Always release the browser, even if a page lookup or navigation fails.
    try:
        topic = starting_topic
        num_pages = 0
        used_topics = []
        used_links = []
        next_link = None
        target_lower = target_topic.lower()

        start_time = time.time()

        ### BEGIN ###
        print(SEPARATOR)
        print("\nStarting!\n")
        print(SEPARATOR)

        driver = search_wikipedia(starting_topic, driver)
        used_links.append(driver.current_url)

        while True:
            # increment the page tracking by 1 for each new page
            num_pages += 1

            # if not the first page, navigate to the new page
            if next_link is not None:
                driver.get(next_link)

            context_sentence = get_topic_context(driver)
            current_url = driver.current_url

            # Parse the HTML already fetched by Selenium with BeautifulSoup
            links_texts = _collect_candidate_links(
                driver.page_source, current_url, topic, used_links, used_topics
            )

            if not links_texts:
                # Dead end: nothing left to score, so stop instead of crashing.
                print("\n" + SEPARATOR)
                print(f"\nNo unvisited article links were found on '{current_url}'; stopping.\n")
                print(SEPARATOR)
                break

            best_label, best_score, loc_idx = most_similar_sentence(
                target_topic=target_topic,
                labels_list=[text for _, text in links_texts],
            )

            print(f"\nPage: {num_pages}")
            print(f"Current topic: '{topic.title()}'")
            print(f"Current URL: '{current_url}'")
            print(f"Current Topic Context: '{context_sentence}'")
            print(f"Next topic: '{best_label.title()}'. Semantic similarity to '{target_topic.title()}': {round((best_score * 100), 2)}%")

            next_link, topic = links_texts[loc_idx]
            topic_lower = topic.lower()

            # Stop if the link text is identical, at least 90% the same
            # spelling, or semantically very close to the target.
            if (
                target_lower == topic_lower
                or calculate_jaro_similarity(target_lower, topic_lower) > 0.9
                or best_score > 0.90
            ):
                print("\n" + SEPARATOR)
                print(f"\nFrom '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages, {round(time.time() - start_time, 2)} seconds!")
                print(f"Starting topic: '{starting_topic.title()}': '{used_links[0]}'")
                # The matched link is the target; used_links[-1] is only the
                # page that links to it.
                print(f"Target topic: '{target_topic.title()}': '{next_link}'\n")
                print(SEPARATOR)
                break

            ##### ADD DRAMATIC DELAY HERE #####
            # time.sleep(0.5)
            # time.sleep(10)

            # ">=" so that a non-positive limit still terminates the loop
            if num_pages >= limit:
                print("\n" + SEPARATOR)
                print(f"\nUnfortunately, the model couldn't get from '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages or less.")
                print(f"In {round(time.time() - start_time, 2)} seconds, it got from '{starting_topic.title()}': '{used_links[0]}', to '{target_topic.title()}': '{used_links[-1]}'")
                print("\nTry a different combination to see if it can do it!\n")
                print(SEPARATOR)
                break

            used_links.append(next_link)
            used_topics.append(topic)
    finally:
        driver.quit()


###### Example

# starting_topic = "soulja boy"
# target_topic = "test"
# play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 50)