KAI MAURIN-JONES committed
Commit 0c34ee8
1 Parent(s): 2e462a5

app updated

Files changed (1)
  1. wiki_game_st_bs4.py +28 -6
wiki_game_st_bs4.py CHANGED
@@ -5,6 +5,7 @@ import tensorflow as tf
 import tensorflow_hub as hub
 import numpy as np
 import jellyfish
+import re
 import streamlit as st
 
 # Load the pre-trained Universal Sentence Encoder
@@ -28,7 +29,10 @@ def get_wikipedia_page(query):
 def get_topic_context(page_source):
     soup = BeautifulSoup(page_source, 'html.parser')
     first_paragraph = soup.select_one("div.mw-parser-output > p:not(.mw-empty-elt)").text
-    context_sentence = first_paragraph.split(". ")[0]
+    context_sentence = re.sub(r'\[.*?\]', '', first_paragraph)
+    context_sentence = context_sentence.split(". ")[0].strip().replace("\n", "")
+    while "  " in context_sentence:
+        context_sentence = context_sentence.replace("  ", " ")
     return context_sentence
 
 def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100):
@@ -49,12 +53,16 @@ def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100):
         num_pages += 1
 
         if num_pages > 1:
+            # load url to new page of next topic
             page_source = get_wikipedia_page(topic)
 
+            # create backup list of links and texts from previous page in case new current page fails
+            prev_links_texts = links_texts.pop(loc_idx) # removes the previously used topic
+
             try:
                 context_sentence = get_topic_context(page_source)
             except Exception as e:
-                print("Context could not be found from webpage")
+                context_sentence = "Context could not be found from webpage"
 
         links_texts = []
 
@@ -71,7 +79,17 @@ def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100):
             if "en.wikipedia.org/wiki/" in link_url and ":" not in "".join(link_url.split("/")[1:]) and "Main_Page" != str(link_url.split("/")[-1]):
                 links_texts.append((link_url, link_text))
 
-        best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_topic, labels_list = [text for link, text in links_texts])
+        # st.write(f'links_texts length: {len(links_texts)}')
+        if len(links_texts) == 0: # if no links
+            links_texts = prev_links_texts
+
+        labels_list = [text for link, text in links_texts]
+        # st.write(f'labels_list length: {len(labels_list)}')
+
+        try:
+            best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_topic, labels_list = labels_list)
+        except Exception as e:
+            best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_topic, labels_list = prev_links_texts) # previous page links without chosen error page
 
         st.write(f"\nPage: {num_pages}")
         st.write(f"Current topic: '{topic.title()}'")
@@ -84,15 +102,15 @@ def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100):
         if target_topic.lower() == topic.lower() or calculate_jaro_similarity(target_topic.lower(), topic.lower()) > 0.9 or best_score > float(0.90):
             st.write("\n" + "-" * 150)
             st.write(f"\nFrom '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages, {round(time.time() - start_time, 2)} seconds!")
-            st.write(f"Starting topic: '{starting_topic.title()}': 'https://en.wikipedia.org/wiki/{starting_topic}'")
-            st.write(f"Target topic: '{target_topic.title()}': '{used_links[-1]}'\n")
+            st.write(f"Starting topic: '{starting_topic.title()}': {used_links[0].replace(' ', '_')}")
+            st.write(f"Target topic: '{target_topic.title()}': '{used_links[-1].replace(' ', '_')}'\n")
             st.write("-" * 150)
             break
 
         if num_pages == limit:
             st.write("\n" + "-" * 150)
             st.write(f"\nUnfortunately, the model couldn't get from '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages or less.")
-            st.write(f"In {round(time.time() - start_time, 2)} seconds, it got from '{starting_topic.title()}': '{used_links[0]}', to '{target_topic.title()}': '{used_links[-1]}'")
+            st.write(f"In {round(time.time() - start_time, 2)} seconds, it got from '{starting_topic.title()}': '{used_links[0].replace(' ', '_')}', to '{used_topics[-1].title()}': '{used_links[-1].replace(' ', '_')}'")
             st.write(f"\nTry a different combination to see if it can do it!\n")
             st.write("-" * 150)
             break
@@ -100,6 +118,10 @@ def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100):
         used_links.append(next_link)
         used_topics.append(topic)
 
+starting_topic = "soulja boy"
+target_topic = "game"
+play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 100)
+
 # starting_topic = "soulja boy"
 # target_topic = "test"
 # play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 50)
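A note on the get_topic_context() change above: the new cleanup can be exercised on its own. Below is a minimal sketch of the added behavior; the first_sentence() wrapper name and the sample paragraph are illustrative only, not part of the repo.

import re

def first_sentence(first_paragraph: str) -> str:
    # Same steps the commit adds to get_topic_context(): strip [n]-style
    # citation markers, keep only the first sentence, drop newlines,
    # and collapse runs of spaces.
    context_sentence = re.sub(r'\[.*?\]', '', first_paragraph)
    context_sentence = context_sentence.split(". ")[0].strip().replace("\n", "")
    while "  " in context_sentence:
        context_sentence = context_sentence.replace("  ", " ")
    return context_sentence

sample = "Soulja Boy[2] is an  American rapper.[3] He rose to fame in 2007."
print(first_sentence(sample))  # -> Soulja Boy is an American rapper

This matters because the sentence is presumably surfaced as context for the current topic, and raw Wikipedia paragraphs carry citation brackets and irregular spacing that would clutter the Streamlit output.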
 
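most_similar_sentence() itself is not part of this diff. For context, here is a plausible minimal implementation consistent with how it is called above (it must return the best-matching label, its score, and that label's index). This is a sketch assuming Universal Sentence Encoder embeddings compared by inner product; the repo's actual version may differ.

import numpy as np
import tensorflow_hub as hub

# The script loads this model family at the top of wiki_game_st_bs4.py
# ("Load the pre-trained Universal Sentence Encoder").
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

def most_similar_sentence(target_topic: str, labels_list: list):
    # Embed the target topic and every candidate link text in one batch.
    embeddings = embed([target_topic] + list(labels_list)).numpy()
    target, labels = embeddings[0], embeddings[1:]
    # USE vectors are close to unit length, so the inner product acts
    # as a rough cosine-similarity score.
    scores = np.inner(labels, target)
    loc_idx = int(np.argmax(scores))
    return labels_list[loc_idx], float(scores[loc_idx]), loc_idx

Note that best_score is compared against 0.90 in the win condition above, so whatever similarity the real most_similar_sentence() returns has to live on a comparable 0-to-1 scale.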