KAI MAURIN-JONES committed on
Commit bf84cfc • Parent: 657dfe4

app updated

app.py CHANGED
@@ -1,5 +1,5 @@
 import streamlit as st
-from wiki_game_st import *
+from wiki_game_st_bs4 import *
 
 # Set the title of the app
 st.title("Wiki Game (BETA)")
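
For context, here is a minimal sketch of how app.py plausibly wires the star-imported play_wiki_game into the Streamlit UI. Only the import swap appears in the hunk above, so the text inputs, the button, and their labels are assumptions, not part of this commit:

import streamlit as st
from wiki_game_st_bs4 import *

# Set the title of the app
st.title("Wiki Game (BETA)")

# Hypothetical UI below this point (not shown in the hunk above)
starting_topic = st.text_input("Starting topic", value="Soulja Boy")
target_topic = st.text_input("Target topic", value="Test")

if st.button("Play"):
    # play_wiki_game comes from the star import of wiki_game_st_bs4
    play_wiki_game(starting_topic=starting_topic, target_topic=target_topic, limit=50)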
wiki_game_st_bs4.py ADDED
@@ -0,0 +1,101 @@
+import requests
+from bs4 import BeautifulSoup
+import time
+import tensorflow as tf
+import tensorflow_hub as hub
+import numpy as np
+import jellyfish
+import streamlit as st
+
+# Load the pre-trained Universal Sentence Encoder
+embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
+
+def calculate_jaro_similarity(str1, str2):
+    # Jaro similarity between two strings, in [0, 1]; newer jellyfish
+    # releases expose this as jellyfish.jaro_similarity
+    jaro_similarity = jellyfish.jaro_distance(str1, str2)
+    return jaro_similarity
+
+def most_similar_sentence(target_topic, labels_list):
+    # Embed the target topic and all candidate labels, then rank the
+    # candidates by inner product (cosine-like similarity for USE vectors)
+    context_embedding = embed([target_topic])[0]
+    sentence_embeddings = embed(labels_list)
+    similarities = np.inner(context_embedding, sentence_embeddings)
+    most_similar_index = np.argmax(similarities)
+    return labels_list[most_similar_index], similarities[most_similar_index], most_similar_index
+
+def get_wikipedia_page(query):
+    # Fetch the raw HTML of a Wikipedia article
+    response = requests.get(f"https://en.wikipedia.org/wiki/{query}")
+    return response.text
+
+def get_topic_context(page_source):
+    # Use the first sentence of the article's first non-empty paragraph
+    # as a short context string for display
+    soup = BeautifulSoup(page_source, 'html.parser')
+    first_paragraph = soup.select_one("div.mw-parser-output > p:not(.mw-empty-elt)").text
+    context_sentence = first_paragraph.split(". ")[0]
+    return context_sentence
+
+def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100):
+    topic = starting_topic
+    num_pages = 0
+    used_topics = []
+    used_links = []
+    start_time = time.time()
+
+    st.write("-" * 150)
+    st.write("\nStarting!\n")
+    st.write("-" * 150)
+
+    page_source = get_wikipedia_page(starting_topic)
+    used_links.append(f"https://en.wikipedia.org/wiki/{starting_topic}")
+
+    while True:
+        num_pages += 1
+
+        # The starting page was already fetched above
+        if num_pages > 1:
+            page_source = get_wikipedia_page(topic)
+
+        context_sentence = get_topic_context(page_source)
+        links_texts = []
+
+        soup = BeautifulSoup(page_source, 'html.parser')
+        links = soup.find_all('a')
+
+        for link in links:
+            link_url = link.get('href')
+            if link_url and link_url.startswith("/wiki/"):
+                link_url = "https://en.wikipedia.org" + link_url
+                link_text = link.text.strip()
+
+                # Skip self-links, already-visited pages, namespace pages
+                # (e.g. "Category:", "File:"), and the Main Page
+                if link_text and topic.lower() not in link_url.lower() and link_url not in used_links and link_text not in used_topics:
+                    if "en.wikipedia.org/wiki/" in link_url and ":" not in "".join(link_url.split("/")[1:]) and link_url.split("/")[-1] != "Main_Page":
+                        links_texts.append((link_url, link_text))
+
+        # Guard against dead-end pages with no usable links
+        if not links_texts:
+            st.write(f"No usable links found on '{topic.title()}'. Stopping.")
+            break
+
+        best_label, best_score, loc_idx = most_similar_sentence(target_topic=target_topic, labels_list=[text for link, text in links_texts])
+
+        st.write(f"\nPage: {num_pages}")
+        st.write(f"Current topic: '{topic.title()}'")
+        st.write(f"Current URL: 'https://en.wikipedia.org/wiki/{topic}'")
+        st.write(f"Current Topic Context: '{context_sentence}'")
+        st.write(f"Next topic: '{best_label.title()}'. Semantic similarity to '{target_topic.title()}': {round((best_score * 100), 2)}%")
+
+        next_link, topic = links_texts[loc_idx]
+
+        # Stop when the chosen topic matches the target exactly, is a close
+        # string match (Jaro > 0.9), or is semantically close enough
+        if target_topic.lower() == topic.lower() or calculate_jaro_similarity(target_topic.lower(), topic.lower()) > 0.9 or best_score > 0.90:
+            st.write("\n" + "-" * 150)
+            st.write(f"\nFrom '{starting_topic.title()}' to '{target_topic.title()}' in {num_pages} pages, {round(time.time() - start_time, 2)} seconds!")
+            st.write(f"Starting topic: '{starting_topic.title()}': 'https://en.wikipedia.org/wiki/{starting_topic}'")
+            st.write(f"Target topic: '{target_topic.title()}': '{used_links[-1]}'\n")
+            st.write("-" * 150)
+            break
+
+        if num_pages == limit:
+            st.write("\n" + "-" * 150)
+            st.write(f"\nUnfortunately, the model couldn't get from '{starting_topic.title()}' to '{target_topic.title()}' in {num_pages} pages or fewer.")
+            st.write(f"In {round(time.time() - start_time, 2)} seconds, it got from '{starting_topic.title()}': 'https://en.wikipedia.org/wiki/{starting_topic}', to '{target_topic.title()}': '{used_links[-1]}'")
+            st.write("\nTry a different combination to see if it can do it!\n")
+            st.write("-" * 150)
+            break
+
+        used_links.append(next_link)
+        used_topics.append(topic)
+
+# starting_topic = "soulja boy"
+# target_topic = "test"
+# play_wiki_game(starting_topic=starting_topic, target_topic=target_topic, limit=50)
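
To make the link-ranking step in most_similar_sentence concrete, here is a small standalone sketch, assuming tensorflow_hub and the Universal Sentence Encoder are available; the topic strings are made-up examples, not from this commit:

import numpy as np
import tensorflow_hub as hub
import jellyfish

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

target = "Computer science"                   # made-up target topic
candidates = ["Algorithm", "Botany", "Jazz"]  # made-up candidate link texts

target_vec = embed([target])[0]     # one 512-dim vector
candidate_vecs = embed(candidates)  # one 512-dim vector per candidate

# USE vectors are near unit length, so the inner product behaves like
# cosine similarity; argmax picks the link the game follows next
scores = np.inner(target_vec, candidate_vecs)
best = int(np.argmax(scores))
print(candidates[best], float(scores[best]))

# The stopping rule also accepts close string matches via Jaro similarity,
# e.g. jaro_distance("test", "tests") is about 0.93, above the 0.9 threshold
print(jellyfish.jaro_distance("test", "tests"))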
wiki_game_st.py → wiki_game_st_sel.py RENAMED
File without changes