Spaces:

kmaurinjones
/

Wiki-Game

Sleeping

File size: 5,644 Bytes

bf84cfc
 
 
 
 
 
 
0c34ee8
bf84cfc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8f7c025
 
 
 
bf84cfc
 
 
 
 
0c34ee8
 
 
 
bf84cfc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0c34ee8
8f7c025
bf84cfc
0c34ee8
 
 
2e462a5
 
 
0c34ee8
2e462a5
bf84cfc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0c34ee8
8f7c025
bb120ba
0c34ee8
 
 
 
 
 
 
 
 
bf84cfc
 
 
 
 
 
 
 
 
 
 
 
0c34ee8
 
bf84cfc
 
 
 
 
 
0c34ee8
bf84cfc
 
 
 
 
 
 
 
8f7c025

import requests
from bs4 import BeautifulSoup
import time
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import jellyfish
import re
import streamlit as st

# Load the pre-trained Universal Sentence Encoder
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

def calculate_jaro_similarity(str1, str2):
    jaro_similarity = jellyfish.jaro_distance(str1, str2)
    return jaro_similarity

def most_similar_sentence(target_topic, labels_list):
    context_embedding = embed([target_topic])[0]
    sentence_embeddings = embed(labels_list)
    similarities = np.inner(context_embedding, sentence_embeddings)
    most_similar_index = np.argmax(similarities)
    return labels_list[most_similar_index], similarities[most_similar_index], most_similar_index

def get_wikipedia_page(query):
    if "wikipedia" not in query:
        response = requests.get(f"https://en.wikipedia.org/wiki/{query}")
    else:
        response = requests.get(query)
    return response.text

def get_topic_context(page_source):
    soup = BeautifulSoup(page_source, 'html.parser')
    first_paragraph = soup.select_one("div.mw-parser-output > p:not(.mw-empty-elt)").text
    context_sentence = re.sub(r'\[.*?\]', '', first_paragraph)
    context_sentence = context_sentence.split(". ")[0].strip().replace("\n", "")
    while "  " in context_sentence:
        context_sentence = context_sentence.replace("  ", " ")
    return context_sentence

def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100):
    topic = starting_topic
    num_pages = 0
    used_topics = []
    used_links = []
    start_time = time.time()

    st.write("-" * 150)
    st.write(f"\nStarting!\n")
    st.write("-" * 150)

    page_source = get_wikipedia_page(starting_topic)
    used_links.append(f"https://en.wikipedia.org/wiki/{starting_topic}")

    while True:
        num_pages += 1

        if num_pages > 1:
            # load url to new page of next topic
            page_source = get_wikipedia_page(used_links[-1])

            # create backup list of links and texts from previous page in case new current page fails
            prev_links_texts = links_texts.pop(loc_idx) # removes the previously used topic

        try:
            context_sentence = get_topic_context(page_source)
        except Exception as e:
            context_sentence = "Context could not be found from webpage"

        links_texts = []

        soup = BeautifulSoup(page_source, 'html.parser')
        links = soup.find_all('a')

        for link in links:
            link_url = link.get('href')
            if link_url and link_url.startswith("/wiki/"):
                link_url = "https://en.wikipedia.org" + link_url
                link_text = link.text.strip()

                if link_text and topic.lower() not in link_url.lower() and link_url not in used_links and link_text not in used_topics:
                    if "en.wikipedia.org/wiki/" in link_url and ":" not in "".join(link_url.split("/")[1:]) and "Main_Page" != str(link_url.split("/")[-1]):
                        links_texts.append((link_url, link_text))

        # st.write(f'links_texts length: {len(links_texts)}')
        prev_links_texts = []
        if len(links_texts) == 0 and num_pages > 1: # if no links
            links_texts = prev_links_texts

        labels_list = [text for link, text in links_texts]
        # st.write(f'labels_list length: {len(labels_list)}')

        try:
            best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_topic, labels_list = labels_list)
        except Exception as e:
            best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_topic, labels_list = prev_links_texts) # previous page links without chosen error page

        st.write(f"\nPage: {num_pages}")
        st.write(f"Current topic: '{topic.title()}'")
        st.write(f"Current URL: 'https://en.wikipedia.org/wiki/{topic}'")
        st.write(f"Current Topic Context: '{context_sentence}'")
        st.write(f"Next topic: '{best_label.title()}'. Semantic similarity to '{target_topic.title()}': {round((best_score * 100), 2)}%")
        
        next_link, topic = links_texts[loc_idx]

        if target_topic.lower() == topic.lower() or calculate_jaro_similarity(target_topic.lower(), topic.lower()) > 0.9 or best_score > float(0.90):
            st.write("\n" + "-" * 150)
            st.write(f"\nFrom '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages, {round(time.time() - start_time, 2)} seconds!")
            st.write(f"Starting topic: '{starting_topic.title()}': {used_links[0].replace(' ', '_')}")
            st.write(f"Target topic: '{target_topic.title()}': '{used_links[-1].replace(' ', '_')}'\n")
            st.write("-" * 150)
            break

        if num_pages == limit:
            st.write("\n" + "-" * 150)
            st.write(f"\nUnfortunately, the model couldn't get from '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages or less.")
            st.write(f"In {round(time.time() - start_time, 2)} seconds, it got from '{starting_topic.title()}': '{used_links[0].replace(' ', '_')}', to '{used_topics[-1].title()}': '{used_links[-1].replace(' ', '_')}'")
            st.write(f"\nTry a different combination to see if it can do it!\n")
            st.write("-" * 150)            
            break

        used_links.append(next_link)
        used_topics.append(topic)

# starting_topic = "soulja boy"
# target_topic = "game"
# play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 100)