# import requests
# from bs4 import BeautifulSoup
# import time
# import tensorflow as tf
# import tensorflow_hub as hub
# import numpy as np
# import jellyfish
# import re
# import streamlit as st

# # Load the pre-trained Universal Sentence Encoder
# embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# def calculate_jaro_similarity(str1, str2):
#     jaro_similarity = jellyfish.jaro_distance(str1, str2)
#     return jaro_similarity

# def most_similar_sentence(target_topic, labels_list):
#     context_embedding = embed([target_topic])[0]
#     sentence_embeddings = embed(labels_list)
#     similarities = np.inner(context_embedding, sentence_embeddings)
#     most_similar_index = np.argmax(similarities)
#     return labels_list[most_similar_index], similarities[most_similar_index], most_similar_index

# def get_wikipedia_page(query):
#     if "wikipedia" not in query:
#         response = requests.get(f"https://en.wikipedia.org/wiki/{query}")
#     else:
#         response = requests.get(query)
#     return response.text

# def get_topic_context(page_source):
#     soup = BeautifulSoup(page_source, 'html.parser')
#     first_paragraph = soup.select_one("div.mw-parser-output > p:not(.mw-empty-elt)").text
#     context_sentence = re.sub(r'\[.*?\]', '', first_paragraph)
#     context_sentence = context_sentence.split(". ")[0].strip().replace("\n", "")
#     while "  " in context_sentence:
#         context_sentence = context_sentence.replace("  ", " ")
#     return context_sentence

# def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100):
#     topic = starting_topic
#     num_pages = 0
#     used_topics = []
#     used_links = []
#     start_time = time.time()

#     st.write("-" * 150)
#     st.write(f"\nStarting!\n")
#     st.write("-" * 150)

#     page_source = get_wikipedia_page(starting_topic)
#     used_links.append(f"https://en.wikipedia.org/wiki/{starting_topic}")

#     while True:
#         num_pages += 1

#         if num_pages > 1:
#             # load url to new page of next topic
#             page_source = get_wikipedia_page(used_links[-1])

#             # create backup list of links and texts from previous page in case new current page fails
#             prev_links_texts = links_texts.pop(loc_idx) # removes the previously used topic

#         try:
#             context_sentence = get_topic_context(page_source)
#         except Exception as e:
#             context_sentence = "Context could not be found from webpage"

#         links_texts = []

#         soup = BeautifulSoup(page_source, 'html.parser')
#         links = soup.find_all('a')

#         for link in links:
#             link_url = link.get('href')
#             if link_url and link_url.startswith("/wiki/"):
#                 link_url = "https://en.wikipedia.org" + link_url
#                 link_text = link.text.strip()

#                 if link_text and topic.lower() not in link_url.lower() and link_url not in used_links and link_text not in used_topics:
#                     if "en.wikipedia.org/wiki/" in link_url and ":" not in "".join(link_url.split("/")[1:]) and "Main_Page" != str(link_url.split("/")[-1]):
#                         links_texts.append((link_url, link_text))

#         # st.write(f'links_texts length: {len(links_texts)}')
#         prev_links_texts = []
#         if len(links_texts) == 0 and num_pages > 1: # if no links
#             links_texts = prev_links_texts

#         labels_list = [text for link, text in links_texts]
#         # st.write(f'labels_list length: {len(labels_list)}')

#         try:
#             best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_topic, labels_list = labels_list)
#         except Exception as e:
#             best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_topic, labels_list = prev_links_texts) # previous page links without chosen error page

#         st.write(f"\nPage: {num_pages}")
#         st.write(f"Current topic: '{topic.title()}'")
#         st.write(f"Current URL: 'https://en.wikipedia.org/wiki/{topic}'")
#         st.write(f"Current Topic Context: '{context_sentence}'")
#         st.write(f"Next topic: '{best_label.title()}'. Semantic similarity to '{target_topic.title()}': {round((best_score * 100), 2)}%")
        
#         next_link, topic = links_texts[loc_idx]

#         if target_topic.lower() == topic.lower() or calculate_jaro_similarity(target_topic.lower(), topic.lower()) > 0.9 or best_score > float(0.90):
#             st.write("\n" + "-" * 150)
#             st.write(f"\nFrom '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages, {round(time.time() - start_time, 2)} seconds!")
#             st.write(f"Starting topic: '{starting_topic.title()}': {used_links[0].replace(' ', '_')}")
#             st.write(f"Target topic: '{target_topic.title()}': '{used_links[-1].replace(' ', '_')}'\n")
#             st.write("-" * 150)
#             break

#         if num_pages == limit:
#             st.write("\n" + "-" * 150)
#             st.write(f"\nUnfortunately, the model couldn't get from '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages or less.")
#             st.write(f"In {round(time.time() - start_time, 2)} seconds, it got from '{starting_topic.title()}': '{used_links[0].replace(' ', '_')}', to '{used_topics[-1].title()}': '{used_links[-1].replace(' ', '_')}'")
#             st.write(f"\nTry a different combination to see if it can do it!\n")
#             st.write("-" * 150)            
#             break

#         used_links.append(next_link)
#         used_topics.append(topic)

# # starting_topic = "soulja boy"
# # target_topic = "game"
# # play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 100)

import requests
from bs4 import BeautifulSoup
import time
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import jellyfish
import re
import streamlit as st

# Load the pre-trained Universal Sentence Encoder
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
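# Optional tweak (a sketch, not part of the original app): under Streamlit every rerun re-executes this
# module, so the encoder load above could be wrapped in a cached helper to avoid repeated reloads.
# The helper name below is hypothetical.
# @st.cache_resource
# def load_encoder():
#     return hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
# embed = load_encoder()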

def calculate_jaro_similarity(str1, str2):
    # Jaro similarity between the two strings, in [0, 1]; higher means more alike.
    # Note: jellyfish.jaro_distance is deprecated in newer jellyfish releases in favor of jellyfish.jaro_similarity.
    jaro_similarity = jellyfish.jaro_distance(str1, str2)
    return jaro_similarity

def most_similar_sentence(target_topic, labels_list):
    # Embed the target topic and every candidate label with the Universal Sentence Encoder, then
    # return the label whose embedding has the largest inner product with the target's, along with its score and index.
    context_embedding = embed([target_topic])[0]
    sentence_embeddings = embed(labels_list)
    similarities = np.inner(context_embedding, sentence_embeddings)
    most_similar_index = np.argmax(similarities)
    return labels_list[most_similar_index], similarities[most_similar_index], most_similar_index
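# Illustrative example (not part of the original script), using hypothetical link texts:
# best_label, best_score, best_idx = most_similar_sentence("Basketball", ["Rapper", "Sport", "Pottery"])
# best_label is the entry whose USE embedding has the largest inner product with the embedding of
# "Basketball" (most likely "Sport" here), best_score that similarity, and best_idx its position in the list.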

def get_wikipedia_page(query):
    # Accept either a bare topic (turned into an English Wikipedia URL) or a full Wikipedia URL.
    if "wikipedia" not in query:
        response = requests.get(f"https://en.wikipedia.org/wiki/{query}")
    else:
        response = requests.get(query)
    return response.text

def get_topic_context(page_source):
    # Take the first non-empty paragraph of the article, strip footnote markers like [1],
    # and return its first sentence with newlines removed and whitespace collapsed.
    soup = BeautifulSoup(page_source, 'html.parser')
    first_paragraph = soup.select_one("div.mw-parser-output > p:not(.mw-empty-elt)").text
    context_sentence = re.sub(r'\[.*?\]', '', first_paragraph)
    context_sentence = context_sentence.split(". ")[0].strip().replace("\n", "")
    while "  " in context_sentence:
        context_sentence = context_sentence.replace("  ", " ")
    return context_sentence

def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100, delay: int = 0):
    topic = starting_topic
    num_pages = 0
    used_topics = []
    used_links = []
    start_time = time.time()

    st.write("-" * 150)
    st.write(f"\nStarting!\n")
    st.write("-" * 150)

    page_source = get_wikipedia_page(starting_topic)
    used_links.append(f"https://en.wikipedia.org/wiki/{starting_topic}")

    placeholder = st.empty()  # placeholder that gets rewritten with the running log of progress statements
    old_statements = ""  # accumulated log of earlier statements
    prev_links_texts = []  # backup of the previous page's candidate links, used if the current page yields none

    while True:
        num_pages += 1

        if num_pages > 1:
            # load the page for the next topic chosen on the previous iteration
            page_source = get_wikipedia_page(used_links[-1])

            # back up the previous page's links (minus the one just followed) in case the new page yields nothing usable
            links_texts.pop(loc_idx)  # remove the link that was just followed
            prev_links_texts = links_texts

        try:
            context_sentence = get_topic_context(page_source)
        except Exception as e:
            context_sentence = "Context could not be found from webpage"

        links_texts = []

        soup = BeautifulSoup(page_source, 'html.parser')
        links = soup.find_all('a')

        for link in links:
            link_url = link.get('href')
            if link_url and link_url.startswith("/wiki/"):
                link_url = "https://en.wikipedia.org" + link_url
                link_text = link.text.strip()

                if link_text and topic.lower() not in link_url.lower() and link_url not in used_links and link_text not in used_topics:
                    if "en.wikipedia.org/wiki/" in link_url and ":" not in "".join(link_url.split("/")[1:]) and "Main_Page" != str(link_url.split("/")[-1]):
                        links_texts.append((link_url, link_text))

        # st.write(f'links_texts length: {len(links_texts)}')
        if len(links_texts) == 0 and num_pages > 1:  # no usable links on this page, so fall back to the previous page's links
            links_texts = prev_links_texts

        labels_list = [text for link, text in links_texts]
        # st.write(f'labels_list length: {len(labels_list)}')

        try:
            best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_topic, labels_list = labels_list)
        except Exception as e:
            # fall back to the previous page's links (the failed page is already excluded), passing only the link texts
            best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_topic, labels_list = [text for link, text in prev_links_texts])
            links_texts = prev_links_texts  # so loc_idx below indexes the list that was actually ranked

        next_link, topic = links_texts[loc_idx]

        new_statement = f"\nPage: {num_pages}\nCurrent topic: '{topic.title()}'\nCurrent URL: 'https://en.wikipedia.org/wiki/{topic.replace(' ', '_')}'\nCurrent Topic Context: '{context_sentence}'\nNext topic: '{best_label.title()}'. Semantic similarity to '{target_topic.title()}': {round((best_score * 100), 2)}%"
        old_statements = new_statement + "\n" + old_statements
        placeholder.text(old_statements)

        used_links.append(next_link)
        used_topics.append(topic)

        if target_topic.lower() == topic.lower() or calculate_jaro_similarity(target_topic.lower(), topic.lower()) > 0.9 or best_score > 0.90:
            new_statement = "\n" + "-" * 150 + f"\nFrom '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages, {round(time.time() - start_time, 2)} seconds!\nStarting topic: '{starting_topic.title()}': {used_links[0].replace(' ', '_')}\nTarget topic: '{target_topic.title()}': '{used_links[-1].replace(' ', '_')}'\n" + "-" * 150
            old_statements = new_statement + "\n" + old_statements
            placeholder.text(old_statements)
            break

        if num_pages == limit:
            new_statement = "\n" + "-" * 150 + f"\nUnfortunately, the model couldn't get from '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages or less.\nIn {round(time.time() - start_time, 2)} seconds, it got from '{starting_topic.title()}': '{used_links[0].replace(' ', '_')}', to '{used_topics[-1].title()}': '{used_links[-1].replace(' ', '_')}'\n\nTry a different combination to see if it can do it!\n" + "-" * 150
            old_statements = new_statement + "\n" + old_statements
            placeholder.text(old_statements)            
            break

        # delay things, if applicable
        time.sleep(delay)
        
# starting_topic = "soulja boy"
# target_topic = "game"
# delay = 0
# play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 100, delay = delay)
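
# A minimal Streamlit front end for the game could look like the sketch below; it is hypothetical and
# not part of the original app, and the widget labels and defaults are illustrative only.
# starting_topic = st.text_input("Starting topic", value = "soulja boy")
# target_topic = st.text_input("Target topic", value = "game")
# if st.button("Play the Wiki game"):
#     play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 100, delay = 0)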