# Wiki-Game / wiki_game_st_bs4.py
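"""Wiki Game solver.

Given a starting Wikipedia article and a target topic, the crawler repeatedly follows
the hyperlink whose anchor text is most semantically similar (per the Universal
Sentence Encoder) to the target topic, until the target is reached or a page limit is
hit. Progress is streamed to a Streamlit placeholder as the crawl runs.
"""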
import requests
from bs4 import BeautifulSoup
import time
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import jellyfish
import re
import streamlit as st
# Load the pre-trained Universal Sentence Encoder
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
def calculate_jaro_similarity(str1, str2):
    """Return the Jaro string similarity between two strings (1.0 = identical)."""
    jaro_similarity = jellyfish.jaro_distance(str1, str2)
    return jaro_similarity
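# Illustrative check (value computed from the Jaro formula, not from a run):
# calculate_jaro_similarity("game", "games") is roughly 0.93, which is why the 0.9
# threshold used below treats close spelling variants of the target as a win.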
def most_similar_sentence(target_topic, labels_list):
    """Return the label most semantically similar to target_topic, its similarity score, and its index."""
    context_embedding = embed([target_topic])[0]
    sentence_embeddings = embed(labels_list)
    similarities = np.inner(context_embedding, sentence_embeddings)
    most_similar_index = np.argmax(similarities)
    return labels_list[most_similar_index], similarities[most_similar_index], most_similar_index
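# Illustrative call (the exact score depends on the Universal Sentence Encoder):
# most_similar_sentence("music", ["Basketball", "Jazz", "Physics"]) should return
# ("Jazz", <score>, 1), since "Jazz" is the label closest in meaning to "music".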
def get_wikipedia_page(query):
    """Fetch a Wikipedia article's HTML from either a bare topic name or a full Wikipedia URL."""
    if "wikipedia" not in query:
        response = requests.get(f"https://en.wikipedia.org/wiki/{query}")
    else:
        response = requests.get(query)
    return response.text
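# Both input forms are handled, e.g. get_wikipedia_page("Guitar") and
# get_wikipedia_page("https://en.wikipedia.org/wiki/Guitar") fetch the same article HTML.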
def get_topic_context(page_source):
    """Extract the first sentence of the article's lead paragraph as a short context string."""
    soup = BeautifulSoup(page_source, 'html.parser')
    first_paragraph = soup.select_one("div.mw-parser-output > p:not(.mw-empty-elt)").text
    context_sentence = re.sub(r'\[.*?\]', '', first_paragraph)  # drop footnote markers like [1]
    context_sentence = context_sentence.split(". ")[0].strip().replace("\n", "")
    # collapse any runs of double spaces left behind by the substitutions above
    while "  " in context_sentence:
        context_sentence = context_sentence.replace("  ", " ")
    return context_sentence
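# Illustrative behaviour: given the HTML of a typical article, this yields its opening
# "X is a ..." sentence with bracketed footnote markers and extra whitespace removed.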
def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100, delay: int = 0):
    """Crawl from starting_topic towards target_topic by greedily following the most
    semantically similar link on each page, streaming progress to a Streamlit placeholder."""
    topic = starting_topic
    num_pages = 0
    used_topics = []
    used_links = []
    prev_links_texts = []  # fallback links from the previous page, in case the current page yields none
    start_time = time.time()

    st.write("-" * 150)
    st.write("\nStarting!\n")
    st.write("-" * 150)

    page_source = get_wikipedia_page(starting_topic)
    used_links.append(f"https://en.wikipedia.org/wiki/{starting_topic}")

    placeholder = st.empty()  # placeholder for the running log of moves
    old_statements = ""  # accumulated log text, newest entry first

    while True:
        num_pages += 1

        if num_pages > 1:
            # load the page for the topic chosen on the previous iteration
            page_source = get_wikipedia_page(used_links[-1])
            # keep the previous page's remaining links as a backup in case the new page fails
            links_texts.pop(loc_idx)  # drop the link that was just followed
            prev_links_texts = links_texts

        try:
            context_sentence = get_topic_context(page_source)
        except Exception:
            context_sentence = "Context could not be found from webpage"

        # collect candidate article links from the current page
        links_texts = []
        soup = BeautifulSoup(page_source, 'html.parser')
        links = soup.find_all('a')

        for link in links:
            link_url = link.get('href')
            if link_url and link_url.startswith("/wiki/"):
                link_url = "https://en.wikipedia.org" + link_url
                link_text = link.text.strip()
                if link_text and topic.lower() not in link_url.lower() and link_url not in used_links and link_text not in used_topics:
                    # skip namespaced pages (File:, Category:, Help:, ...) and the Main Page
                    if "en.wikipedia.org/wiki/" in link_url and ":" not in "".join(link_url.split("/")[1:]) and "Main_Page" != str(link_url.split("/")[-1]):
                        links_texts.append((link_url, link_text))

        if len(links_texts) == 0 and num_pages > 1:  # no usable links: fall back to the previous page's links
            links_texts = prev_links_texts

        labels_list = [text for link, text in links_texts]

        try:
            best_label, best_score, loc_idx = most_similar_sentence(target_topic=target_topic, labels_list=labels_list)
        except Exception:
            # previous page's links, without the link that led to the failing page
            links_texts = prev_links_texts
            labels_list = [text for link, text in prev_links_texts]
            best_label, best_score, loc_idx = most_similar_sentence(target_topic=target_topic, labels_list=labels_list)

        new_statement = f"\nPage: {num_pages}\nCurrent topic: '{topic.title()}'\nCurrent URL: 'https://en.wikipedia.org/wiki/{topic.replace(' ', '_')}'\nCurrent Topic Context: '{context_sentence}'\nNext topic: '{best_label.title()}'. Semantic similarity to '{target_topic.title()}': {round((best_score * 100), 2)}%"
        old_statements = new_statement + "\n" + old_statements
        placeholder.text(old_statements)

        next_link, topic = links_texts[loc_idx]
        used_links.append(next_link)
        used_topics.append(topic)

        # success: exact match, near-identical spelling, or very high semantic similarity
        if target_topic.lower() == topic.lower() or calculate_jaro_similarity(target_topic.lower(), topic.lower()) > 0.9 or best_score > 0.90:
            new_statement = "\n" + "-" * 150 + f"\nFrom '{starting_topic.title()}' to '{target_topic.title()}' in {num_pages} pages, {round(time.time() - start_time, 2)} seconds!\nStarting topic: '{starting_topic.title()}': {used_links[0].replace(' ', '_')}\nTarget topic: '{target_topic.title()}': '{used_links[-1].replace(' ', '_')}'\n" + "-" * 150
            old_statements = new_statement + "\n" + old_statements
            placeholder.text(old_statements)
            break

        if num_pages == limit:
            new_statement = "\n" + "-" * 150 + f"\nUnfortunately, the model couldn't get from '{starting_topic.title()}' to '{target_topic.title()}' in {num_pages} pages or fewer.\nIn {round(time.time() - start_time, 2)} seconds, it got from '{starting_topic.title()}': '{used_links[0].replace(' ', '_')}' to '{used_topics[-1].title()}': '{used_links[-1].replace(' ', '_')}'\n\nTry a different combination to see if it can do it!\n" + "-" * 150
            old_statements = new_statement + "\n" + old_statements
            placeholder.text(old_statements)
            break

        # pause between requests, if a delay was given
        time.sleep(delay)
# starting_topic = "soulja boy"
# target_topic = "game"
# delay = 0
# play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 100, delay = delay)
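# A minimal sketch of driving this from a Streamlit front end. This is illustrative only:
# the Space's actual app.py may differ, and the widget labels and defaults below are assumptions.
#
# start = st.text_input("Starting topic", value="soulja boy")
# target = st.text_input("Target topic", value="game")
# if st.button("Play"):
#     play_wiki_game(starting_topic=start, target_topic=target, limit=100, delay=0)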