sbavery committed
Commit
1e588f4
1 Parent(s): 237973e

Test app file

Files changed (2)
  1. app.py +40 -0
  2. data.py +182 -0
app.py ADDED
@@ -0,0 +1,40 @@
+ # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_app_gradio.ipynb.
+
+ # %% auto 0
+ __all__ = ['categories', 'k', 'min_words', 'max_words', 'ignore_text', 'ignore_common', 'learn', 'text', 'label', 'examples',
+            'intf', 'predict']
+
+ # %% ../nbs/02_app_gradio.ipynb 4
+ import warnings
+ warnings.filterwarnings('ignore')
+ from fastai.text.all import *
+ from .data import *
+ import gradio as gr
+
+ # %% ../nbs/02_app_gradio.ipynb 6
+ categories = ('pseudoscience','science')
+ k = 30
+ min_words = 20
+ max_words = 450
+ ignore_text = ['the', 'of', 'to', 'and', 'a', 'in', 'it', 'that', 'for', 'on']
+ ignore_common = ignore_text
+ learn = load_learner('models/2022.12.01 Model v1 88pct', cpu=False)
+
+ def predict(url):
+     page = get_page_all(url, k, max_words, ignore_text, ignore_common)
+     length = len(page.cleaned_text)
+     if length < min_words:
+         return "ERROR: Returned "+str(length)+" words"
+     else:
+         text = ' '.join(page.cleaned_text)
+         with learn.no_bar(), learn.no_logging():
+             pred,idx,probs = learn.predict(text)
+         return dict(zip(categories, map(float,probs)))
+
+ # %% ../nbs/02_app_gradio.ipynb 8
+ text = gr.inputs.Textbox(1)
+ label = gr.outputs.Label()
+ examples = ['https://www.theskepticsguide.org/about','https://www.foxnews.com/opinion']
+
+ intf = gr.Interface(fn=predict, inputs=text, outputs=label, examples=examples)
+ intf.launch()
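
For reference, the prediction flow that app.py wires into Gradio can be exercised outside the UI. The sketch below is not part of the commit: it assumes the exported learner really is at models/2022.12.01 Model v1 88pct, that data.py (added below) is importable as a top-level module, and that the example URL is reachable.

# Hypothetical smoke test of the predict() flow, run without launching Gradio (editor's sketch, not committed code).
# Assumes fastai is installed, the model file is present, and data.py sits alongside this script.
from fastai.text.all import load_learner
from data import get_page_all

learn = load_learner('models/2022.12.01 Model v1 88pct', cpu=True)   # cpu=True when no GPU is available
page = get_page_all('https://www.theskepticsguide.org/about',        # illustrative URL from the examples list
                    k=30, max_words=450, ignore_text=[], ignore_common=[])
text = ' '.join(page.cleaned_text)
pred, idx, probs = learn.predict(text)                               # fastai text classifier returns (decoded, index, probabilities)
print(dict(zip(('pseudoscience', 'science'), map(float, probs))))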
data.py ADDED
@@ -0,0 +1,182 @@
+ # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_data.ipynb.
+
+ # %% auto 0
+ __all__ = ['Webpage', 'get_page_all', 'get_all_links']
+
+ # %% ../nbs/01_data.ipynb 4
+ import warnings
+ warnings.filterwarnings('ignore')
+ import requests
+ from bs4 import BeautifulSoup
+ import enchant
+ import re
+ import random
+ from collections import Counter
+ from fastai.text.all import *
+ import hashlib
+ import pickle
+
+ # %% ../nbs/01_data.ipynb 8
+ class Webpage:
+     def __init__(self, url):
+         self.url = url
+         self.hash = self.get_hash_str()
+         self.requested = False
+         self.page_text = ""
+         self.html = ""
+         self.links = []
+         self.text = []
+         self.cleaned_text = []
+         self.most_common_words = []
+
+     def get_page(self, headers, min_size, max_size):
+         r = requests.get(self.url, stream=True, headers=headers)
+         content_length = int(r.headers.get('Content-Length', 0))
+         data = []
+         length = 0
+
+         if content_length > max_size:
+             return None
+
+         for chunk in r.iter_content(1024):
+             data.append(chunk)
+             length += len(chunk)
+             if length > max_size:
+                 return None
+         r._content = b''.join(data)
+         if len(r.text) < min_size: return None
+         return r.text
+
+     def get_page_html(self, min_size=1000, max_size=2000000):
+         user_agents = [
+             'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+             'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
+             'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
+             'Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148',
+             'Mozilla/5.0 (Linux; Android 11; SM-G960U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.72 Mobile Safari/537.36'
+         ]
+         user_agent = random.choice(user_agents)
+         headers = {'User-Agent': user_agent}
+         self.page_text = self.get_page(headers, min_size, max_size)
+         self.html = BeautifulSoup(self.page_text, "html.parser")
+         self.requested = True
+
+     def get_hash_str(self, inp=""):
+         return hashlib.sha3_256((self.url+inp).encode()).hexdigest()
+
+     def get_html_anchors(self, keyword="http"):
+         for anchor in self.html.findAll('a'):
+             link = anchor.get('href')
+             if link == None or link == "":
+                 continue
+             if keyword in link:
+                 self.links.append(link)
+
+     def get_html_text(self, tags=["p"]):
+         for tag in tags:
+             for p in self.html.findAll(tag):
+                 p_text = p.getText().strip()
+                 if p_text == None or p_text == '':
+                     continue
+                 self.text.append(p_text)
+
+     def clean_html_text(self, max_words, enchant_dict="en_US", ignore=[], rx="[^a-zA-Z ]+", min_word_len=2):
+         all_text = ' '.join(self.text).lower()
+         regex_text = re.sub(rx,'',all_text).strip()
+         split = regex_text.split()
+         split = [word for word in split if word not in ignore]
+         if enchant_dict != "": d = enchant.Dict(enchant_dict)
+         for word in split:
+             if len(self.cleaned_text) >= max_words: break
+             if len(word) >= min_word_len:
+                 if enchant_dict == "":
+                     self.cleaned_text.append(word)
+                 elif d.check(word):
+                     self.cleaned_text.append(word)
+
+     def k_common_words(self, k=10, ignore=[]):
+         if self.cleaned_text == "":
+             text = self.text
+         else:
+             text = self.cleaned_text
+         all_text = ' '.join(text).lower()
+         split = all_text.split()
+         split_ignore = [word for word in split if word not in ignore]
+         counts = Counter(split_ignore)
+         k_most_common = counts.most_common(k)
+         self.most_common_words = k_most_common
+
+     def save_text(self, path, fname):
+         file = open(path+fname, 'wb')
+         pickle.dump(self.text, file)
+         file.close()
+
+     def load_text(self, path, fname):
+         file = open(path+fname, 'rb')
+         self.text = pickle.load(file)
+         file.close()
+
+     def save_links(self, path, fname):
+         file = open(path+fname, 'wb')
+         pickle.dump(self.links, file)
+         file.close()
+
+     def load_links(self, path, fname):
+         file = open(path+fname, 'rb')
+         self.links = pickle.load(file)
+         file.close()
+
+ # %% ../nbs/01_data.ipynb 14
+ def get_page_all(url, k, max_words, ignore_text, ignore_common, path = None):
+     page = Webpage(url)
+     fname_text = page.hash+'.text'
+     fname_links = page.hash+'.links'
+     if path == None:
+         page.get_page_html()
+         page.get_html_text(tags=["p","h1","h2","h3","span"])
+         page.get_html_anchors()
+     else:
+         if os.path.isfile(path+fname_text):
+             page.load_text(path, fname_text)
+         else:
+             page.get_page_html()
+             page.get_html_text(tags=["p","h1","h2","h3","span"])
+             page.save_text(path, fname_text)
+
+         if os.path.isfile(path+fname_links):
+             page.load_links(path, fname_links)
+         else:
+             if page.html == "": page.get_page_html()
+             page.get_html_anchors()
+             page.save_links(path, fname_links)
+
+     if page.text is not None:
+         page.clean_html_text(max_words, ignore=ignore_text, rx="[^a-zA-Z ]+")
+         page.k_common_words(k=k, ignore=ignore_common)
+     return page
+
+ def get_all_links(url, dict, k, min_words=20, max_words=500, ignore_text=[], ignore_common=[], ignore_filenames=[".mp3",".jpg",".png"], max_links="", path=None):
+     primary_page = get_page_all(url, k, max_words, ignore_text, ignore_common, path)
+     if primary_page.cleaned_text is not []:
+         dict[url] = [primary_page.cleaned_text, primary_page.most_common_words]
+         if max_links == "" or max_links > len(primary_page.links): max_links=len(primary_page.links)
+
+         for count, link in enumerate(primary_page.links[:max_links]):
+             if all(x not in link for x in ignore_filenames):
+                 try:
+                     page = get_page_all(link, k, max_words, ignore_text, ignore_common, path)
+                     if page.cleaned_text is not []:
+                         if len(page.cleaned_text) < min_words: continue
+                         if [page.cleaned_text, page.most_common_words] in dict.values(): continue
+                         dict[link] = [page.cleaned_text, page.most_common_words]
+                 except:
+                     pass
+             if link in dict:
+                 res = str(len(dict[link][0]))+" words | "+str(dict[link][1][:3])
+             else:
+                 res = "Rejected"
+             progress_message = "%s link %4d/%4d | %s = %s %s" % (url, count, len(primary_page.links), link, res, 200*' ')
+             sys.stdout.write("\r" + progress_message)
+             sys.stdout.flush()
+     else:
+         print(url,"returned None, Skipping...")
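
The Webpage class above bundles fetching, parsing, cleaning, and word counting. As a hypothetical walk-through (editor's sketch, not part of the commit), the same pipeline can be driven step by step; it assumes requests, beautifulsoup4, pyenchant, and fastai are installed and uses an illustrative URL.

# Hypothetical step-by-step use of the Webpage helpers from data.py (not committed code).
from data import Webpage

page = Webpage("https://www.theskepticsguide.org/about")    # illustrative URL
page.get_page_html()                                        # fetch with a random User-Agent, parse with BeautifulSoup
page.get_html_text(tags=["p", "h1", "h2", "h3", "span"])    # collect visible text from common tags
page.get_html_anchors()                                     # gather outbound links containing "http"
page.clean_html_text(max_words=450,
                     ignore=['the', 'of', 'to'])            # lowercase, strip punctuation, keep dictionary words
page.k_common_words(k=10)                                   # tally the most frequent cleaned words
print(len(page.cleaned_text), page.most_common_words[:3], len(page.links))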