Removed relative imports
app.py CHANGED
@@ -8,8 +8,155 @@ __all__ = ['categories', 'k', 'min_words', 'max_words', 'ignore_text', 'ignore_c
 import warnings
 warnings.filterwarnings('ignore')
 from fastai.text.all import *
-from .data import *
 import gradio as gr
+import requests
+from bs4 import BeautifulSoup
+#import enchant
+import re
+import random
+from collections import Counter
+import hashlib
+import pickle
+
+
+# %% ../nbs/01_data.ipynb 8
+class Webpage:
+    def __init__(self, url):
+        self.url = url
+        self.hash = self.get_hash_str()
+        self.requested = False
+        self.page_text = ""
+        self.html = ""
+        self.links = []
+        self.text = []
+        self.cleaned_text = []
+        self.most_common_words = []
+
+    def get_page(self, headers, min_size, max_size):
+        r = requests.get(self.url, stream=True, headers=headers)
+        content_length = int(r.headers.get('Content-Length', 0))
+        data = []
+        length = 0
+
+        if content_length > max_size:
+            return None
+
+        for chunk in r.iter_content(1024):
+            data.append(chunk)
+            length += len(chunk)
+            if length > max_size:
+                return None
+        r._content = b''.join(data)
+        if len(r.text) < min_size: return None
+        return r.text
+
+    def get_page_html(self, min_size=1000, max_size=2000000):
+        user_agents = [
+            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
+            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
+            'Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148',
+            'Mozilla/5.0 (Linux; Android 11; SM-G960U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.72 Mobile Safari/537.36'
+        ]
+        user_agent = random.choice(user_agents)
+        headers = {'User-Agent': user_agent}
+        self.page_text = self.get_page(headers, min_size, max_size)
+        self.html = BeautifulSoup(self.page_text, "html.parser")
+        self.requested = True
+
+    def get_hash_str(self, inp=""):
+        return hashlib.sha3_256((self.url+inp).encode()).hexdigest()
+
+    def get_html_anchors(self, keyword="http"):
+        for anchor in self.html.findAll('a'):
+            link = anchor.get('href')
+            if link == None or link == "":
+                continue
+            if keyword in link:
+                self.links.append(link)
+
+    def get_html_text(self, tags=["p"]):
+        for tag in tags:
+            for p in self.html.findAll(tag):
+                p_text = p.getText().strip()
+                if p_text == None or p_text == '':
+                    continue
+                self.text.append(p_text)
+
+    def clean_html_text(self, max_words, enchant_dict="", ignore=[], rx="[^a-zA-Z ]+", min_word_len=2):
+        all_text = ' '.join(self.text).lower()
+        regex_text = re.sub(rx,'',all_text).strip()
+        split = regex_text.split()
+        split = [word for word in split if word not in ignore]
+        #if enchant_dict != "": d = enchant.Dict(enchant_dict)
+        for word in split:
+            if len(self.cleaned_text) >= max_words: break
+            if len(word) >= min_word_len:
+                if enchant_dict == "":
+                    self.cleaned_text.append(word)
+                #elif d.check(word):
+                #    self.cleaned_text.append(word)
+
+    def k_common_words(self, k=10, ignore=[]):
+        if self.cleaned_text == "":
+            text = self.text
+        else:
+            text = self.cleaned_text
+        all_text = ' '.join(text).lower()
+        split = all_text.split()
+        split_ignore = [word for word in split if word not in ignore]
+        counts = Counter(split_ignore)
+        k_most_common = counts.most_common(k)
+        self.most_common_words = k_most_common
+
+    def save_text(self, path, fname):
+        file = open(path+fname, 'wb')
+        pickle.dump(self.text, file)
+        file.close()
+
+    def load_text(self, path, fname):
+        file = open(path+fname, 'rb')
+        self.text = pickle.load(file)
+        file.close()
+
+    def save_links(self, path, fname):
+        file = open(path+fname, 'wb')
+        pickle.dump(self.links, file)
+        file.close()
+
+    def load_links(self, path, fname):
+        file = open(path+fname, 'rb')
+        self.links = pickle.load(file)
+        file.close()
+
+# %% ../nbs/01_data.ipynb 14
+def get_page_all(url, k, max_words, ignore_text, ignore_common, path = None):
+    page = Webpage(url)
+    fname_text = page.hash+'.text'
+    fname_links = page.hash+'.links'
+    if path == None:
+        page.get_page_html()
+        page.get_html_text(tags=["p","h1","h2","h3","span"])
+        page.get_html_anchors()
+    else:
+        if os.path.isfile(path+fname_text):
+            page.load_text(path, fname_text)
+        else:
+            page.get_page_html()
+            page.get_html_text(tags=["p","h1","h2","h3","span"])
+            page.save_text(path, fname_text)
+
+        if os.path.isfile(path+fname_links):
+            page.load_links(path, fname_links)
+        else:
+            if page.html == "": page.get_page_html()
+            page.get_html_anchors()
+            page.save_links(path, fname_links)
+
+    if page.text is not None:
+        page.clean_html_text(max_words, ignore=ignore_text, rx="[^a-zA-Z ]+")
+        page.k_common_words(k=k, ignore=ignore_common)
+    return page
 
 # %% ../nbs/02_app_gradio.ipynb 6
 categories = ('pseudoscience','science')
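For context, a minimal usage sketch of the scraping helpers now inlined in app.py; the module import, URL, and parameter values below are illustrative assumptions, not part of the commit.

# Minimal usage sketch of the inlined helpers (assumed values throughout).
from app import get_page_all  # assumes app.py can be imported as a plain module

page = get_page_all(
    url="https://example.com",      # illustrative URL
    k=5,                            # keep the 5 most common words
    max_words=200,                  # cap on cleaned words
    ignore_text=["the", "and"],     # words dropped during cleaning
    ignore_common=["a", "of"],      # words dropped from the frequency count
    path=None,                      # None fetches live instead of loading cached pickles
)
if page.cleaned_text:
    print(len(page.cleaned_text), "cleaned words")
    print(page.most_common_words)   # list of (word, count) pairs

Passing a directory for path would instead cache (and later reuse) the scraped text and links as pickles keyed by the SHA3-256 hash of the URL.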
data.py DELETED
@@ -1,182 +0,0 @@
-# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_data.ipynb.
-
-# %% auto 0
-__all__ = ['Webpage', 'get_page_all', 'get_all_links']
-
-# %% ../nbs/01_data.ipynb 4
-import warnings
-warnings.filterwarnings('ignore')
-import requests
-from bs4 import BeautifulSoup
-#import enchant
-import re
-import random
-from collections import Counter
-from fastai.text.all import *
-import hashlib
-import pickle
-
-# %% ../nbs/01_data.ipynb 8
-class Webpage:
-    def __init__(self, url):
-        self.url = url
-        self.hash = self.get_hash_str()
-        self.requested = False
-        self.page_text = ""
-        self.html = ""
-        self.links = []
-        self.text = []
-        self.cleaned_text = []
-        self.most_common_words = []
-
-    def get_page(self, headers, min_size, max_size):
-        r = requests.get(self.url, stream=True, headers=headers)
-        content_length = int(r.headers.get('Content-Length', 0))
-        data = []
-        length = 0
-
-        if content_length > max_size:
-            return None
-
-        for chunk in r.iter_content(1024):
-            data.append(chunk)
-            length += len(chunk)
-            if length > max_size:
-                return None
-        r._content = b''.join(data)
-        if len(r.text) < min_size: return None
-        return r.text
-
-    def get_page_html(self, min_size=1000, max_size=2000000):
-        user_agents = [
-            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
-            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
-            'Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148',
-            'Mozilla/5.0 (Linux; Android 11; SM-G960U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.72 Mobile Safari/537.36'
-        ]
-        user_agent = random.choice(user_agents)
-        headers = {'User-Agent': user_agent}
-        self.page_text = self.get_page(headers, min_size, max_size)
-        self.html = BeautifulSoup(self.page_text, "html.parser")
-        self.requested = True
-
-    def get_hash_str(self, inp=""):
-        return hashlib.sha3_256((self.url+inp).encode()).hexdigest()
-
-    def get_html_anchors(self, keyword="http"):
-        for anchor in self.html.findAll('a'):
-            link = anchor.get('href')
-            if link == None or link == "":
-                continue
-            if keyword in link:
-                self.links.append(link)
-
-    def get_html_text(self, tags=["p"]):
-        for tag in tags:
-            for p in self.html.findAll(tag):
-                p_text = p.getText().strip()
-                if p_text == None or p_text == '':
-                    continue
-                self.text.append(p_text)
-
-    def clean_html_text(self, max_words, enchant_dict="", ignore=[], rx="[^a-zA-Z ]+", min_word_len=2):
-        all_text = ' '.join(self.text).lower()
-        regex_text = re.sub(rx,'',all_text).strip()
-        split = regex_text.split()
-        split = [word for word in split if word not in ignore]
-        #if enchant_dict != "": d = enchant.Dict(enchant_dict)
-        for word in split:
-            if len(self.cleaned_text) >= max_words: break
-            if len(word) >= min_word_len:
-                if enchant_dict == "":
-                    self.cleaned_text.append(word)
-                #elif d.check(word):
-                #    self.cleaned_text.append(word)
-
-    def k_common_words(self, k=10, ignore=[]):
-        if self.cleaned_text == "":
-            text = self.text
-        else:
-            text = self.cleaned_text
-        all_text = ' '.join(text).lower()
-        split = all_text.split()
-        split_ignore = [word for word in split if word not in ignore]
-        counts = Counter(split_ignore)
-        k_most_common = counts.most_common(k)
-        self.most_common_words = k_most_common
-
-    def save_text(self, path, fname):
-        file = open(path+fname, 'wb')
-        pickle.dump(self.text, file)
-        file.close()
-
-    def load_text(self, path, fname):
-        file = open(path+fname, 'rb')
-        self.text = pickle.load(file)
-        file.close()
-
-    def save_links(self, path, fname):
-        file = open(path+fname, 'wb')
-        pickle.dump(self.links, file)
-        file.close()
-
-    def load_links(self, path, fname):
-        file = open(path+fname, 'rb')
-        self.links = pickle.load(file)
-        file.close()
-
-# %% ../nbs/01_data.ipynb 14
-def get_page_all(url, k, max_words, ignore_text, ignore_common, path = None):
-    page = Webpage(url)
-    fname_text = page.hash+'.text'
-    fname_links = page.hash+'.links'
-    if path == None:
-        page.get_page_html()
-        page.get_html_text(tags=["p","h1","h2","h3","span"])
-        page.get_html_anchors()
-    else:
-        if os.path.isfile(path+fname_text):
-            page.load_text(path, fname_text)
-        else:
-            page.get_page_html()
-            page.get_html_text(tags=["p","h1","h2","h3","span"])
-            page.save_text(path, fname_text)
-
-        if os.path.isfile(path+fname_links):
-            page.load_links(path, fname_links)
-        else:
-            if page.html == "": page.get_page_html()
-            page.get_html_anchors()
-            page.save_links(path, fname_links)
-
-    if page.text is not None:
-        page.clean_html_text(max_words, ignore=ignore_text, rx="[^a-zA-Z ]+")
-        page.k_common_words(k=k, ignore=ignore_common)
-    return page
-
-def get_all_links(url, dict, k, min_words=20, max_words=500, ignore_text=[], ignore_common=[], ignore_filenames=[".mp3",".jpg",".png"], max_links="", path=None):
-    primary_page = get_page_all(url, k, max_words, ignore_text, ignore_common, path)
-    if primary_page.cleaned_text is not []:
-        dict[url] = [primary_page.cleaned_text, primary_page.most_common_words]
-        if max_links == "" or max_links > len(primary_page.links): max_links=len(primary_page.links)
-
-        for count, link in enumerate(primary_page.links[:max_links]):
-            if all(x not in link for x in ignore_filenames):
-                try:
-                    page = get_page_all(link, k, max_words, ignore_text, ignore_common, path)
-                    if page.cleaned_text is not []:
-                        if len(page.cleaned_text) < min_words: continue
-                        if [page.cleaned_text, page.most_common_words] in dict.values(): continue
-                        dict[link] = [page.cleaned_text, page.most_common_words]
-                except:
-                    pass
-            if link in dict:
-                res = str(len(dict[link][0]))+" words | "+str(dict[link][1][:3])
-            else:
-                res = "Rejected"
-            progress_message = "%s link %4d/%4d | %s = %s %s" % (url, count, len(primary_page.links), link, res, 200*' ')
-            sys.stdout.write("\r" + progress_message)
-            sys.stdout.flush()
-    else:
-        print(url,"returned None, Skipping...")