KAI MAURIN-JONES committed
Commit 0c34ee8 · 1 Parent(s): 2e462a5
app updated
Files changed: wiki_game_st_bs4.py (+28 -6)
wiki_game_st_bs4.py
CHANGED
@@ -5,6 +5,7 @@ import tensorflow as tf
 import tensorflow_hub as hub
 import numpy as np
 import jellyfish
+import re
 import streamlit as st
 
 # Load the pre-trained Universal Sentence Encoder
@@ -28,7 +29,10 @@ def get_wikipedia_page(query):
 def get_topic_context(page_source):
     soup = BeautifulSoup(page_source, 'html.parser')
     first_paragraph = soup.select_one("div.mw-parser-output > p:not(.mw-empty-elt)").text
-    context_sentence =
+    context_sentence = re.sub(r'\[.*?\]', '', first_paragraph)
+    context_sentence = context_sentence.split(". ")[0].strip().replace("\n", "")
+    while "  " in context_sentence:
+        context_sentence = context_sentence.replace("  ", " ")
     return context_sentence
 
 def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100):
@@ -49,12 +53,16 @@ def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100):
         num_pages += 1
 
         if num_pages > 1:
+            # load url to new page of next topic
            page_source = get_wikipedia_page(topic)
 
+            # create backup list of links and texts from previous page in case new current page fails
+            prev_links_texts = links_texts.pop(loc_idx) # removes the previously used topic
+
         try:
             context_sentence = get_topic_context(page_source)
         except Exception as e:
-
+            context_sentence = "Context could not be found from webpage"
 
         links_texts = []
 
@@ -71,7 +79,17 @@ def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100):
             if "en.wikipedia.org/wiki/" in link_url and ":" not in "".join(link_url.split("/")[1:]) and "Main_Page" != str(link_url.split("/")[-1]):
                 links_texts.append((link_url, link_text))
 
-
+        # st.write(f'links_texts length: {len(links_texts)}')
+        if len(links_texts) == 0: # if no links
+            links_texts = prev_links_texts
+
+        labels_list = [text for link, text in links_texts]
+        # st.write(f'labels_list length: {len(labels_list)}')
+
+        try:
+            best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_topic, labels_list = labels_list)
+        except Exception as e:
+            best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_topic, labels_list = prev_links_texts) # previous page links without chosen error page
 
         st.write(f"\nPage: {num_pages}")
         st.write(f"Current topic: '{topic.title()}'")
@@ -84,15 +102,15 @@ def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100):
         if target_topic.lower() == topic.lower() or calculate_jaro_similarity(target_topic.lower(), topic.lower()) > 0.9 or best_score > float(0.90):
             st.write("\n" + "-" * 150)
             st.write(f"\nFrom '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages, {round(time.time() - start_time, 2)} seconds!")
-            st.write(f"Starting topic: '{starting_topic.title()}': '
-            st.write(f"Target topic: '{target_topic.title()}': '{used_links[-1]}'\n")
+            st.write(f"Starting topic: '{starting_topic.title()}': {used_links[0].replace(' ', '_')}")
+            st.write(f"Target topic: '{target_topic.title()}': '{used_links[-1].replace(' ', '_')}'\n")
             st.write("-" * 150)
             break
 
         if num_pages == limit:
             st.write("\n" + "-" * 150)
             st.write(f"\nUnfortunately, the model couldn't get from '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages or less.")
-            st.write(f"In {round(time.time() - start_time, 2)} seconds, it got from '{starting_topic.title()}': '{used_links[0]}', to '{
+            st.write(f"In {round(time.time() - start_time, 2)} seconds, it got from '{starting_topic.title()}': '{used_links[0].replace(' ', '_')}', to '{used_topics[-1].title()}': '{used_links[-1].replace(' ', '_')}'")
             st.write(f"\nTry a different combination to see if it can do it!\n")
             st.write("-" * 150)
             break
@@ -100,6 +118,10 @@ def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100):
         used_links.append(next_link)
         used_topics.append(topic)
 
+starting_topic = "soulja boy"
+target_topic = "game"
+play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 100)
+
 # starting_topic = "soulja boy"
 # target_topic = "test"
 # play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 50)
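
Note: get_wikipedia_page is called in the diff above but not defined in it. A minimal sketch of what such a helper could look like, assuming it fetches raw article HTML with requests (the real helper may resolve the query differently, for example via search):

import requests

def get_wikipedia_page(query):
    # Hypothetical sketch: build the article URL directly from the topic name.
    url = "https://en.wikipedia.org/wiki/" + query.strip().replace(" ", "_")
    response = requests.get(url, timeout=10)
    response.raise_for_status()  # surface HTTP errors (e.g. missing pages) to the caller
    return response.text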
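
The new cleanup in get_topic_context strips citation markers, keeps only the first sentence, and collapses repeated spaces. On an illustrative lead paragraph (sample text, not fetched live), it behaves like this:

import re

first_paragraph = (
    "Soulja Boy (born July 28, 1990)[1][2] is an American rapper.  "
    "He rose to prominence in 2007.\n"
)

context_sentence = re.sub(r'\[.*?\]', '', first_paragraph)                    # drop [1]-style citation markers
context_sentence = context_sentence.split(". ")[0].strip().replace("\n", "")  # keep the first sentence only
while "  " in context_sentence:                                               # collapse runs of spaces
    context_sentence = context_sentence.replace("  ", " ")

print(context_sentence)  # Soulja Boy (born July 28, 1990) is an American rapper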
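
most_similar_sentence is also referenced but not defined in this diff. A minimal sketch of how it could rank candidate link texts against the target topic with the Universal Sentence Encoder; the TF-Hub URL and the cosine-similarity scoring are assumptions, only the (target_topic, labels_list) -> (best_label, best_score, loc_idx) shape is taken from the calls above:

import numpy as np
import tensorflow_hub as hub

# Assumed model handle; the file's comment says it loads the pre-trained
# Universal Sentence Encoder, but the exact TF-Hub URL is not shown here.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

def most_similar_sentence(target_topic, labels_list):
    # Embed the target topic and every candidate link text in one batch
    vectors = embed([target_topic] + list(labels_list)).numpy()
    target_vec, label_vecs = vectors[0], vectors[1:]

    # Cosine similarity between the target and each candidate
    norms = np.linalg.norm(label_vecs, axis=1) * np.linalg.norm(target_vec)
    scores = label_vecs @ target_vec / np.clip(norms, 1e-10, None)

    best_idx = int(np.argmax(scores))
    return labels_list[best_idx], float(scores[best_idx]), best_idx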
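
calculate_jaro_similarity backs the win condition above; given the jellyfish import, it is presumably a thin wrapper like the sketch below. The exact wrapper is an assumption; recent jellyfish releases expose jaro_similarity, while older ones call it jaro_distance:

import jellyfish

def calculate_jaro_similarity(str1, str2):
    # Jaro similarity in [0, 1]; 1.0 means the strings are identical.
    return jellyfish.jaro_similarity(str1, str2)

print(calculate_jaro_similarity("game", "game"))        # 1.0, an exact match
print(calculate_jaro_similarity("game", "video game"))  # well below the 0.9 cutoff used above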