Test app file
app.py
ADDED
@@ -0,0 +1,40 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_app_gradio.ipynb.

# %% auto 0
__all__ = ['categories', 'k', 'min_words', 'max_words', 'ignore_text', 'ignore_common', 'learn', 'text', 'label', 'examples',
           'intf', 'predict']

# %% ../nbs/02_app_gradio.ipynb 4
import warnings
warnings.filterwarnings('ignore')
from fastai.text.all import *
from .data import *
import gradio as gr

# %% ../nbs/02_app_gradio.ipynb 6
categories = ('pseudoscience','science')
k = 30
min_words = 20
max_words = 450
ignore_text = ['the', 'of', 'to', 'and', 'a', 'in', 'it', 'that', 'for', 'on']
ignore_common = ignore_text
learn = load_learner('models/2022.12.01 Model v1 88pct', cpu=False)

def predict(url):
    page = get_page_all(url, k, max_words, ignore_text, ignore_common)
    length = len(page.cleaned_text)
    if length < min_words:
        return "ERROR: Returned "+str(length)+" words"
    else:
        text = ' '.join(page.cleaned_text)
        with learn.no_bar(), learn.no_logging():
            pred,idx,probs = learn.predict(text)
        return dict(zip(categories, map(float,probs)))

# %% ../nbs/02_app_gradio.ipynb 8
text = gr.inputs.Textbox(1)
label = gr.outputs.Label()
examples = ['https://www.theskepticsguide.org/about','https://www.foxnews.com/opinion']

intf = gr.Interface(fn=predict, inputs=text, outputs=label, examples=examples)
intf.launch()
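
As a quick sanity check, the predict function above can be exercised without launching the Gradio interface. The sketch below is illustrative only: it assumes the exported package (this app.py, its sibling data.py, and the model file 'models/2022.12.01 Model v1 88pct') is importable under the hypothetical name "app", and it reuses one of the example URLs from the file.

# Hypothetical local check of predict(); "app" is an assumed import path, not part of the commit.
from app import predict

result = predict('https://www.theskepticsguide.org/about')
if isinstance(result, dict):
    # On success predict() returns a dict mapping each category to a probability,
    # e.g. {'pseudoscience': ..., 'science': ...}
    print(max(result, key=result.get), result)
else:
    # Pages that yield fewer than min_words cleaned words return an error string instead.
    print(result)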
data.py
ADDED
@@ -0,0 +1,182 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_data.ipynb.

# %% auto 0
__all__ = ['Webpage', 'get_page_all', 'get_all_links']

# %% ../nbs/01_data.ipynb 4
import warnings
warnings.filterwarnings('ignore')
import requests
from bs4 import BeautifulSoup
import enchant
import re
import random
from collections import Counter
from fastai.text.all import *
import hashlib
import pickle

# %% ../nbs/01_data.ipynb 8
class Webpage:
    def __init__(self, url):
        self.url = url
        self.hash = self.get_hash_str()
        self.requested = False
        self.page_text = ""
        self.html = ""
        self.links = []
        self.text = []
        self.cleaned_text = []
        self.most_common_words = []

    def get_page(self, headers, min_size, max_size):
        r = requests.get(self.url, stream=True, headers=headers)
        content_length = int(r.headers.get('Content-Length', 0))
        data = []
        length = 0

        if content_length > max_size:
            return None

        for chunk in r.iter_content(1024):
            data.append(chunk)
            length += len(chunk)
            if length > max_size:
                return None
        r._content = b''.join(data)
        if len(r.text) < min_size: return None
        return r.text

    def get_page_html(self, min_size=1000, max_size=2000000):
        user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
            'Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148',
            'Mozilla/5.0 (Linux; Android 11; SM-G960U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.72 Mobile Safari/537.36'
        ]
        user_agent = random.choice(user_agents)
        headers = {'User-Agent': user_agent}
        self.page_text = self.get_page(headers, min_size, max_size)
        self.html = BeautifulSoup(self.page_text, "html.parser")
        self.requested = True

    def get_hash_str(self, inp=""):
        return hashlib.sha3_256((self.url+inp).encode()).hexdigest()

    def get_html_anchors(self, keyword="http"):
        for anchor in self.html.findAll('a'):
            link = anchor.get('href')
            if link == None or link == "":
                continue
            if keyword in link:
                self.links.append(link)

    def get_html_text(self, tags=["p"]):
        for tag in tags:
            for p in self.html.findAll(tag):
                p_text = p.getText().strip()
                if p_text == None or p_text == '':
                    continue
                self.text.append(p_text)

    def clean_html_text(self, max_words, enchant_dict="en_US", ignore=[], rx="[^a-zA-Z ]+", min_word_len=2):
        all_text = ' '.join(self.text).lower()
        regex_text = re.sub(rx,'',all_text).strip()
        split = regex_text.split()
        split = [word for word in split if word not in ignore]
        if enchant_dict != "": d = enchant.Dict(enchant_dict)
        for word in split:
            if len(self.cleaned_text) >= max_words: break
            if len(word) >= min_word_len:
                if enchant_dict == "":
                    self.cleaned_text.append(word)
                elif d.check(word):
                    self.cleaned_text.append(word)

    def k_common_words(self, k=10, ignore=[]):
        if self.cleaned_text == "":
            text = self.text
        else:
            text = self.cleaned_text
        all_text = ' '.join(text).lower()
        split = all_text.split()
        split_ignore = [word for word in split if word not in ignore]
        counts = Counter(split_ignore)
        k_most_common = counts.most_common(k)
        self.most_common_words = k_most_common

    def save_text(self, path, fname):
        file = open(path+fname, 'wb')
        pickle.dump(self.text, file)
        file.close()

    def load_text(self, path, fname):
        file = open(path+fname, 'rb')
        self.text = pickle.load(file)
        file.close()

    def save_links(self, path, fname):
        file = open(path+fname, 'wb')
        pickle.dump(self.links, file)
        file.close()

    def load_links(self, path, fname):
        file = open(path+fname, 'rb')
        self.links = pickle.load(file)
        file.close()

# %% ../nbs/01_data.ipynb 14
def get_page_all(url, k, max_words, ignore_text, ignore_common, path = None):
    page = Webpage(url)
    fname_text = page.hash+'.text'
    fname_links = page.hash+'.links'
    if path == None:
        page.get_page_html()
        page.get_html_text(tags=["p","h1","h2","h3","span"])
        page.get_html_anchors()
    else:
        if os.path.isfile(path+fname_text):
            page.load_text(path, fname_text)
        else:
            page.get_page_html()
            page.get_html_text(tags=["p","h1","h2","h3","span"])
            page.save_text(path, fname_text)

        if os.path.isfile(path+fname_links):
            page.load_links(path, fname_links)
        else:
            if page.html == "": page.get_page_html()
            page.get_html_anchors()
            page.save_links(path, fname_links)

    if page.text is not None:
        page.clean_html_text(max_words, ignore=ignore_text, rx="[^a-zA-Z ]+")
        page.k_common_words(k=k, ignore=ignore_common)
    return page

def get_all_links(url, dict, k, min_words=20, max_words=500, ignore_text=[], ignore_common=[], ignore_filenames=[".mp3",".jpg",".png"], max_links="", path=None):
    primary_page = get_page_all(url, k, max_words, ignore_text, ignore_common, path)
    if primary_page.cleaned_text is not []:
        dict[url] = [primary_page.cleaned_text, primary_page.most_common_words]
        if max_links == "" or max_links > len(primary_page.links): max_links=len(primary_page.links)

        for count, link in enumerate(primary_page.links[:max_links]):
            if all(x not in link for x in ignore_filenames):
                try:
                    page = get_page_all(link, k, max_words, ignore_text, ignore_common, path)
                    if page.cleaned_text is not []:
                        if len(page.cleaned_text) < min_words: continue
                        if [page.cleaned_text, page.most_common_words] in dict.values(): continue
                        dict[link] = [page.cleaned_text, page.most_common_words]
                except:
                    pass
                if link in dict:
                    res = str(len(dict[link][0]))+" words | "+str(dict[link][1][:3])
                else:
                    res = "Rejected"
                progress_message = "%s link %4d/%4d | %s = %s %s" % (url, count, len(primary_page.links), link, res, 200*' ')
                sys.stdout.write("\r" + progress_message)
                sys.stdout.flush()
    else:
        print(url,"returned None, Skipping...")
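
For context, a minimal sketch of how the helpers above fit together, assuming pyenchant's en_US dictionary is installed. The URL is reused from the examples in app.py; the stop-word list, k, and max_links values here are illustrative, not the training configuration.

# Hypothetical driver for get_page_all / get_all_links (illustrative arguments).
stop_words = ['the', 'of', 'to', 'and', 'a']

# Scrape and clean a single page, without on-disk caching (path=None).
page = get_page_all('https://www.theskepticsguide.org/about', k=10,
                    max_words=450, ignore_text=stop_words, ignore_common=stop_words)
print(len(page.cleaned_text), 'cleaned words |', page.most_common_words[:3])

# Crawl a few of the page's outbound links as well;
# results maps url -> [cleaned_text, most_common_words].
results = {}
get_all_links('https://www.theskepticsguide.org/about', results, k=10,
              min_words=20, max_words=450,
              ignore_text=stop_words, ignore_common=stop_words,
              max_links=5, path=None)
for url, (words, common) in results.items():
    print(url, '|', len(words), 'words |', common[:3])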