# Wiki-Game / wiki_game_local.py
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import time
# !pip install tensorflow tensorflow-hub
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
# !pip install jellyfish
import jellyfish
# Load the pre-trained Universal Sentence Encoder
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
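# The encoder maps each input string to a fixed-length vector; for
# universal-sentence-encoder/4 this is a 512-dimensional embedding. A quick
# sanity-check sketch (illustrative only, not part of the game logic):
#
#     vectors = embed(["dog", "puppy", "quantum physics"])
#     print(vectors.shape)                      # expected: (3, 512)
#     sims = np.inner(vectors[0], vectors[1:])  # "dog" vs. the other labels
#     print(sims)                               # "puppy" should score higher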
def calculate_jaro_similarity(str1, str2):
    # Jaro similarity in [0, 1]; note that newer jellyfish releases rename this
    # function to jellyfish.jaro_similarity
    jaro_similarity = jellyfish.jaro_distance(str1, str2)
    return jaro_similarity
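# For intuition, Jaro similarity rewards near-identical spellings: the classic
# "martha" / "marhta" pair scores roughly 0.94, so the 0.9 cutoff used below in
# play_wiki_game() treats it as a match, while unrelated words fall well under it.
#
#     calculate_jaro_similarity("martha", "marhta")   # ~0.94
#     calculate_jaro_similarity("martha", "granite")  # much lower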
def most_similar_sentence(target_topic, labels_list):
    # Encode the target topic and all candidate labels in the list
    context_embedding = embed([target_topic])[0]
    sentence_embeddings = embed(labels_list)

    # Calculate similarities between the target topic and each label
    # (the USE embeddings are approximately normalized, so the inner product
    # behaves like a cosine similarity)
    similarities = np.inner(context_embedding, sentence_embeddings)

    # Find the index of the most similar label
    most_similar_index = np.argmax(similarities)

    return labels_list[most_similar_index], similarities[most_similar_index], most_similar_index
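# Minimal sketch of how play_wiki_game() uses this helper (hypothetical labels):
#
#     label, score, idx = most_similar_sentence(
#         target_topic = "jazz",
#         labels_list  = ["classical music", "saxophone", "geology"],
#     )
#     # label -> the candidate judged closest to "jazz", score -> its similarity,
#     # idx -> its position in labels_list (used to recover the matching URL)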
def search_wikipedia(query, driver):
    # Go to Wikipedia's main page
    driver.get("https://www.wikipedia.org/")

    # Find the search bar using its name
    search_bar = driver.find_element(By.NAME, "search")

    # Send the query to the search bar and hit Enter
    search_bar.send_keys(query)
    search_bar.send_keys(Keys.RETURN)

    return driver
def get_topic_context(driver):
    # Find the first paragraph of the main article
    first_paragraph = driver.find_element(By.CSS_SELECTOR, "div.mw-parser-output > p:not(.mw-empty-elt)").text
    context_sentence = first_paragraph.split(". ")[0]
    # print(context_sentence)

    return context_sentence
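# The two helpers above are meant to be chained: search_wikipedia() lands the
# browser on an article, then get_topic_context() pulls that article's opening
# sentence. A rough standalone sketch (assumes a local chromedriver is available):
#
#     driver = webdriver.Chrome()
#     driver = search_wikipedia("Photosynthesis", driver)
#     print(get_topic_context(driver))  # first sentence of the article
#     driver.quit()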
def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100):
    ##### Setup Chrome options
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--headless") # Ensure GUI is off
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options = chrome_options)

    topic = starting_topic
    num_pages = 0
    used_topics = []
    used_links = []
    start_time = time.time()

    ### BEGIN ###
    print("-" * 150)
    print(f"\nStarting!\n")
    print("-" * 150)

    driver = search_wikipedia(starting_topic, driver)
    used_links.append(driver.current_url)

    while True:
        # increment the page tracking by 1 for each new page
        num_pages += 1

        # if not the first page, navigate to the new page
        if num_pages > 1:
            driver.get(next_link)

        context_sentence = get_topic_context(driver)
        links_texts = []
        current_url = driver.current_url
        current_url_suffix = str(current_url).split("/")[-1]

        ### Use BeautifulSoup on the Selenium page source for link extraction
        current_page = driver.page_source # html from the already-rendered Selenium page, so no extra request is needed
        soup = BeautifulSoup(current_page, 'html.parser')
        links = soup.find_all('a')

        # Iterate through the links and extract their URLs
        for link in links:
            link_url = link.get('href')
            if link_url and link_url.startswith("/wiki/"):
                link_url = "https://en.wikipedia.org" + link_url
                link_text = link.text.strip() # Get the text and remove leading/trailing spaces

                # make sure they are both not None
                if link_text and current_url_suffix not in link_url:
                    if link_url not in used_links and link_text not in used_topics:
                        # eliminates topic duplicates, non-wiki links, and wiki-help pages (non-content pages)
                        if topic.lower() not in link_url.lower() and "en.wikipedia.org/wiki/" in link_url and ":" not in "".join(link_url.split("/")[1:]) and "Main_Page" != str(link_url.split("/")[-1]):
                            links_texts.append((link_url, link_text))

        best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_topic, labels_list = [text for link, text in links_texts])

        print(f"\nPage: {num_pages}")
        print(f"Current topic: '{topic.title()}'")
        print(f"Current URL: '{current_url}'")
        print(f"Current Topic Context: '{context_sentence}'")
        print(f"Next topic: '{best_label.title()}'. Semantic similarity to '{target_topic.title()}': {round((best_score * 100), 2)}%")

        next_link, topic = links_texts[loc_idx]
        # print(next_link)

        # if target_topic.lower() in topic.lower():# or best_score > float(0.85):
        if target_topic.lower() == topic.lower() or calculate_jaro_similarity(target_topic.lower(), topic.lower()) > 0.9 or best_score > float(0.90): # topic text is identical, at least ~90% the same spelling, or semantically very close
            print("\n" + "-" * 150)
            print(f"\nFrom '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages, {round(time.time() - start_time, 2)} seconds!")
            print(f"Starting topic: '{starting_topic.title()}': '{used_links[0]}'")
            print(f"Target topic: '{target_topic.title()}': '{next_link}'\n")
            print("-" * 150)
            break

        ##### ADD DRAMATIC DELAY HERE #####
        # time.sleep(0.5)
        # time.sleep(10)

        if num_pages == limit:
            print("\n" + "-" * 150)
            print(f"\nUnfortunately, the model couldn't get from '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages or less.")
            print(f"In {round(time.time() - start_time, 2)} seconds, it got from '{starting_topic.title()}': '{used_links[0]}', to '{target_topic.title()}': '{used_links[-1]}'")
            print(f"\nTry a different combination to see if it can do it!\n")
            print("-" * 150)
            break

        used_links.append(next_link)
        used_topics.append(topic)

    driver.quit()
###### Example
# starting_topic = "soulja boy"
# target_topic = "test"
# play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 50)
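# One way to exercise the game from the command line; a minimal sketch, assuming
# Chrome and a matching chromedriver are installed locally. The topics below are
# just the commented example values above and can be swapped for any others.
if __name__ == "__main__":
    play_wiki_game(starting_topic = "soulja boy", target_topic = "test", limit = 50)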