KAI MAURIN-JONES committed
Commit 0c34ee8 · 1 Parent(s): 2e462a5
app updated
Files changed: wiki_game_st_bs4.py (+28 -6)
wiki_game_st_bs4.py
CHANGED
@@ -5,6 +5,7 @@ import tensorflow as tf
 import tensorflow_hub as hub
 import numpy as np
 import jellyfish
+import re
 import streamlit as st
 
 # Load the pre-trained Universal Sentence Encoder
@@ -28,7 +29,10 @@ def get_wikipedia_page(query):
 def get_topic_context(page_source):
     soup = BeautifulSoup(page_source, 'html.parser')
     first_paragraph = soup.select_one("div.mw-parser-output > p:not(.mw-empty-elt)").text
-    context_sentence =
+    context_sentence = re.sub(r'\[.*?\]', '', first_paragraph)
+    context_sentence = context_sentence.split(". ")[0].strip().replace("\n", "")
+    while "  " in context_sentence:
+        context_sentence = context_sentence.replace("  ", " ")
     return context_sentence
 
 def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100):
@@ -49,12 +53,16 @@ def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100):
         num_pages += 1
 
         if num_pages > 1:
+            # load url to new page of next topic
            page_source = get_wikipedia_page(topic)
 
+            # create backup list of links and texts from previous page in case new current page fails
+            prev_links_texts = links_texts.pop(loc_idx) # removes the previously used topic
+
         try:
             context_sentence = get_topic_context(page_source)
         except Exception as e:
-
+            context_sentence = "Context could not be found from webpage"
 
         links_texts = []
 
@@ -71,7 +79,17 @@ def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100):
             if "en.wikipedia.org/wiki/" in link_url and ":" not in "".join(link_url.split("/")[1:]) and "Main_Page" != str(link_url.split("/")[-1]):
                 links_texts.append((link_url, link_text))
 
-
+        # st.write(f'links_texts length: {len(links_texts)}')
+        if len(links_texts) == 0: # if no links
+            links_texts = prev_links_texts
+
+        labels_list = [text for link, text in links_texts]
+        # st.write(f'labels_list length: {len(labels_list)}')
+
+        try:
+            best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_topic, labels_list = labels_list)
+        except Exception as e:
+            best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_topic, labels_list = prev_links_texts) # previous page links without chosen error page
 
         st.write(f"\nPage: {num_pages}")
         st.write(f"Current topic: '{topic.title()}'")
@@ -84,15 +102,15 @@ def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100):
         if target_topic.lower() == topic.lower() or calculate_jaro_similarity(target_topic.lower(), topic.lower()) > 0.9 or best_score > float(0.90):
             st.write("\n" + "-" * 150)
             st.write(f"\nFrom '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages, {round(time.time() - start_time, 2)} seconds!")
-            st.write(f"Starting topic: '{starting_topic.title()}': '
-            st.write(f"Target topic: '{target_topic.title()}': '{used_links[-1]}'\n")
+            st.write(f"Starting topic: '{starting_topic.title()}': {used_links[0].replace(' ', '_')}")
+            st.write(f"Target topic: '{target_topic.title()}': '{used_links[-1].replace(' ', '_')}'\n")
             st.write("-" * 150)
             break
 
         if num_pages == limit:
             st.write("\n" + "-" * 150)
             st.write(f"\nUnfortunately, the model couldn't get from '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages or less.")
-            st.write(f"In {round(time.time() - start_time, 2)} seconds, it got from '{starting_topic.title()}': '{used_links[0]}', to '{
+            st.write(f"In {round(time.time() - start_time, 2)} seconds, it got from '{starting_topic.title()}': '{used_links[0].replace(' ', '_')}', to '{used_topics[-1].title()}': '{used_links[-1].replace(' ', '_')}'")
             st.write(f"\nTry a different combination to see if it can do it!\n")
             st.write("-" * 150)
             break
@@ -100,6 +118,10 @@ def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100):
         used_links.append(next_link)
         used_topics.append(topic)
 
+starting_topic = "soulja boy"
+target_topic = "game"
+play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 100)
+
 # starting_topic = "soulja boy"
 # target_topic = "test"
 # play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 50)
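
Note: get_wikipedia_page is called in the diff above but not defined in it. A minimal sketch of what such a helper could look like, assuming it fetches raw article HTML with requests (the real helper may resolve the query differently, for example via search):

import requests

def get_wikipedia_page(query):
    # Hypothetical sketch: build the article URL directly from the topic name.
    url = "https://en.wikipedia.org/wiki/" + query.strip().replace(" ", "_")
    response = requests.get(url, timeout=10)
    response.raise_for_status()  # surface HTTP errors (e.g. missing pages) to the caller
    return response.text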
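
The new cleanup in get_topic_context strips citation markers, keeps only the first sentence, and collapses repeated spaces. On an illustrative lead paragraph (sample text, not fetched live), it behaves like this:

import re

first_paragraph = (
    "Soulja Boy (born July 28, 1990)[1][2] is an American rapper.  "
    "He rose to prominence in 2007.\n"
)

context_sentence = re.sub(r'\[.*?\]', '', first_paragraph)                    # drop [1]-style citation markers
context_sentence = context_sentence.split(". ")[0].strip().replace("\n", "")  # keep the first sentence only
while "  " in context_sentence:                                               # collapse runs of spaces
    context_sentence = context_sentence.replace("  ", " ")

print(context_sentence)  # Soulja Boy (born July 28, 1990) is an American rapper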
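
most_similar_sentence is also referenced but not defined in this diff. A minimal sketch of how it could rank candidate link texts against the target topic with the Universal Sentence Encoder; the TF-Hub URL and the cosine-similarity scoring are assumptions, only the (target_topic, labels_list) -> (best_label, best_score, loc_idx) shape is taken from the calls above:

import numpy as np
import tensorflow_hub as hub

# Assumed model handle; the file's comment says it loads the pre-trained
# Universal Sentence Encoder, but the exact TF-Hub URL is not shown here.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

def most_similar_sentence(target_topic, labels_list):
    # Embed the target topic and every candidate link text in one batch
    vectors = embed([target_topic] + list(labels_list)).numpy()
    target_vec, label_vecs = vectors[0], vectors[1:]

    # Cosine similarity between the target and each candidate
    norms = np.linalg.norm(label_vecs, axis=1) * np.linalg.norm(target_vec)
    scores = label_vecs @ target_vec / np.clip(norms, 1e-10, None)

    best_idx = int(np.argmax(scores))
    return labels_list[best_idx], float(scores[best_idx]), best_idx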
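
calculate_jaro_similarity backs the win condition above; given the jellyfish import, it is presumably a thin wrapper like the sketch below. The exact wrapper is an assumption; recent jellyfish releases expose jaro_similarity, while older ones call it jaro_distance:

import jellyfish

def calculate_jaro_similarity(str1, str2):
    # Jaro similarity in [0, 1]; 1.0 means the strings are identical.
    return jellyfish.jaro_similarity(str1, str2)

print(calculate_jaro_similarity("game", "game"))        # 1.0, an exact match
print(calculate_jaro_similarity("game", "video game"))  # well below the 0.9 cutoff used above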