sbavery committed on
Commit
33df71b
1 Parent(s): bfef0ca

Removed relative imports
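For context, a minimal sketch (not part of the commit) of the failure this sidesteps: app.py on a Space is executed as a top-level script, and a script has no parent package, so a package-relative import cannot resolve. Error text below is approximate.

# Before this commit, app.py relied on a package-relative import:
#     from .data import *
# Run directly (e.g. `python app.py`), that typically raises something like:
#     ImportError: attempted relative import with no known parent package
# The commit avoids the issue by inlining the Webpage class and
# get_page_all() from data.py into app.py and deleting data.py.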

Files changed (2)
  1. app.py +148 -1
  2. data.py +0 -182
app.py CHANGED
@@ -8,8 +8,155 @@ __all__ = ['categories', 'k', 'min_words', 'max_words', 'ignore_text', 'ignore_c
  import warnings
  warnings.filterwarnings('ignore')
  from fastai.text.all import *
- from .data import *
  import gradio as gr
+ import requests
+ from bs4 import BeautifulSoup
+ #import enchant
+ import re
+ import random
+ from collections import Counter
+ import hashlib
+ import pickle
+
+
+ # %% ../nbs/01_data.ipynb 8
+ class Webpage:
+     def __init__(self, url):
+         self.url = url
+         self.hash = self.get_hash_str()
+         self.requested = False
+         self.page_text = ""
+         self.html = ""
+         self.links = []
+         self.text = []
+         self.cleaned_text = []
+         self.most_common_words = []
+
+     def get_page(self, headers, min_size, max_size):
+         r = requests.get(self.url, stream=True, headers=headers)
+         content_length = int(r.headers.get('Content-Length', 0))
+         data = []
+         length = 0
+
+         if content_length > max_size:
+             return None
+
+         for chunk in r.iter_content(1024):
+             data.append(chunk)
+             length += len(chunk)
+             if length > max_size:
+                 return None
+         r._content = b''.join(data)
+         if len(r.text) < min_size: return None
+         return r.text
+
+     def get_page_html(self, min_size=1000, max_size=2000000):
+         user_agents = [
+             'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+             'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
+             'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
+             'Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148',
+             'Mozilla/5.0 (Linux; Android 11; SM-G960U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.72 Mobile Safari/537.36'
+         ]
+         user_agent = random.choice(user_agents)
+         headers = {'User-Agent': user_agent}
+         self.page_text = self.get_page(headers, min_size, max_size)
+         self.html = BeautifulSoup(self.page_text, "html.parser")
+         self.requested = True
+
+     def get_hash_str(self, inp=""):
+         return hashlib.sha3_256((self.url+inp).encode()).hexdigest()
+
+     def get_html_anchors(self, keyword="http"):
+         for anchor in self.html.findAll('a'):
+             link = anchor.get('href')
+             if link == None or link == "":
+                 continue
+             if keyword in link:
+                 self.links.append(link)
+
+     def get_html_text(self, tags=["p"]):
+         for tag in tags:
+             for p in self.html.findAll(tag):
+                 p_text = p.getText().strip()
+                 if p_text == None or p_text == '':
+                     continue
+                 self.text.append(p_text)
+
+     def clean_html_text(self, max_words, enchant_dict="", ignore=[], rx="[^a-zA-Z ]+", min_word_len=2):
+         all_text = ' '.join(self.text).lower()
+         regex_text = re.sub(rx,'',all_text).strip()
+         split = regex_text.split()
+         split = [word for word in split if word not in ignore]
+         #if enchant_dict != "": d = enchant.Dict(enchant_dict)
+         for word in split:
+             if len(self.cleaned_text) >= max_words: break
+             if len(word) >= min_word_len:
+                 if enchant_dict == "":
+                     self.cleaned_text.append(word)
+                 #elif d.check(word):
+                 #    self.cleaned_text.append(word)
+
+     def k_common_words(self, k=10, ignore=[]):
+         if self.cleaned_text == "":
+             text = self.text
+         else:
+             text = self.cleaned_text
+         all_text = ' '.join(text).lower()
+         split = all_text.split()
+         split_ignore = [word for word in split if word not in ignore]
+         counts = Counter(split_ignore)
+         k_most_common = counts.most_common(k)
+         self.most_common_words = k_most_common
+
+     def save_text(self, path, fname):
+         file = open(path+fname, 'wb')
+         pickle.dump(self.text, file)
+         file.close()
+
+     def load_text(self, path, fname):
+         file = open(path+fname, 'rb')
+         self.text = pickle.load(file)
+         file.close()
+
+     def save_links(self, path, fname):
+         file = open(path+fname, 'wb')
+         pickle.dump(self.links, file)
+         file.close()
+
+     def load_links(self, path, fname):
+         file = open(path+fname, 'rb')
+         self.links = pickle.load(file)
+         file.close()
+
+ # %% ../nbs/01_data.ipynb 14
+ def get_page_all(url, k, max_words, ignore_text, ignore_common, path = None):
+     page = Webpage(url)
+     fname_text = page.hash+'.text'
+     fname_links = page.hash+'.links'
+     if path == None:
+         page.get_page_html()
+         page.get_html_text(tags=["p","h1","h2","h3","span"])
+         page.get_html_anchors()
+     else:
+         if os.path.isfile(path+fname_text):
+             page.load_text(path, fname_text)
+         else:
+             page.get_page_html()
+             page.get_html_text(tags=["p","h1","h2","h3","span"])
+             page.save_text(path, fname_text)
+
+         if os.path.isfile(path+fname_links):
+             page.load_links(path, fname_links)
+         else:
+             if page.html == "": page.get_page_html()
+             page.get_html_anchors()
+             page.save_links(path, fname_links)
+
+     if page.text is not None:
+         page.clean_html_text(max_words, ignore=ignore_text, rx="[^a-zA-Z ]+")
+         page.k_common_words(k=k, ignore=ignore_common)
+     return page
 
  # %% ../nbs/02_app_gradio.ipynb 6
  categories = ('pseudoscience','science')
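A minimal smoke test of the inlined helpers, using only the API visible in the diff above; the URL and parameter values are placeholders, not part of the commit, and it assumes network access and a page larger than the default min_size.

# Hypothetical usage, with the inlined definitions from app.py in scope.
page = get_page_all(
    url="https://example.com",   # placeholder URL
    k=10,                        # number of most-common words to keep
    max_words=500,               # cap on cleaned words per page
    ignore_text=[],              # words dropped during cleaning
    ignore_common=[],            # words excluded from the frequency count
    path=None,                   # None = fetch live, skip the pickle cache
)
print(page.cleaned_text[:20], page.most_common_words)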
data.py DELETED
@@ -1,182 +0,0 @@
- # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_data.ipynb.
-
- # %% auto 0
- __all__ = ['Webpage', 'get_page_all', 'get_all_links']
-
- # %% ../nbs/01_data.ipynb 4
- import warnings
- warnings.filterwarnings('ignore')
- import requests
- from bs4 import BeautifulSoup
- #import enchant
- import re
- import random
- from collections import Counter
- from fastai.text.all import *
- import hashlib
- import pickle
-
- # %% ../nbs/01_data.ipynb 8
- class Webpage:
-     def __init__(self, url):
-         self.url = url
-         self.hash = self.get_hash_str()
-         self.requested = False
-         self.page_text = ""
-         self.html = ""
-         self.links = []
-         self.text = []
-         self.cleaned_text = []
-         self.most_common_words = []
-
-     def get_page(self, headers, min_size, max_size):
-         r = requests.get(self.url, stream=True, headers=headers)
-         content_length = int(r.headers.get('Content-Length', 0))
-         data = []
-         length = 0
-
-         if content_length > max_size:
-             return None
-
-         for chunk in r.iter_content(1024):
-             data.append(chunk)
-             length += len(chunk)
-             if length > max_size:
-                 return None
-         r._content = b''.join(data)
-         if len(r.text) < min_size: return None
-         return r.text
-
-     def get_page_html(self, min_size=1000, max_size=2000000):
-         user_agents = [
-             'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-             'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
-             'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
-             'Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148',
-             'Mozilla/5.0 (Linux; Android 11; SM-G960U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.72 Mobile Safari/537.36'
-         ]
-         user_agent = random.choice(user_agents)
-         headers = {'User-Agent': user_agent}
-         self.page_text = self.get_page(headers, min_size, max_size)
-         self.html = BeautifulSoup(self.page_text, "html.parser")
-         self.requested = True
-
-     def get_hash_str(self, inp=""):
-         return hashlib.sha3_256((self.url+inp).encode()).hexdigest()
-
-     def get_html_anchors(self, keyword="http"):
-         for anchor in self.html.findAll('a'):
-             link = anchor.get('href')
-             if link == None or link == "":
-                 continue
-             if keyword in link:
-                 self.links.append(link)
-
-     def get_html_text(self, tags=["p"]):
-         for tag in tags:
-             for p in self.html.findAll(tag):
-                 p_text = p.getText().strip()
-                 if p_text == None or p_text == '':
-                     continue
-                 self.text.append(p_text)
-
-     def clean_html_text(self, max_words, enchant_dict="", ignore=[], rx="[^a-zA-Z ]+", min_word_len=2):
-         all_text = ' '.join(self.text).lower()
-         regex_text = re.sub(rx,'',all_text).strip()
-         split = regex_text.split()
-         split = [word for word in split if word not in ignore]
-         #if enchant_dict != "": d = enchant.Dict(enchant_dict)
-         for word in split:
-             if len(self.cleaned_text) >= max_words: break
-             if len(word) >= min_word_len:
-                 if enchant_dict == "":
-                     self.cleaned_text.append(word)
-                 #elif d.check(word):
-                 #    self.cleaned_text.append(word)
-
-     def k_common_words(self, k=10, ignore=[]):
-         if self.cleaned_text == "":
-             text = self.text
-         else:
-             text = self.cleaned_text
-         all_text = ' '.join(text).lower()
-         split = all_text.split()
-         split_ignore = [word for word in split if word not in ignore]
-         counts = Counter(split_ignore)
-         k_most_common = counts.most_common(k)
-         self.most_common_words = k_most_common
-
-     def save_text(self, path, fname):
-         file = open(path+fname, 'wb')
-         pickle.dump(self.text, file)
-         file.close()
-
-     def load_text(self, path, fname):
-         file = open(path+fname, 'rb')
-         self.text = pickle.load(file)
-         file.close()
-
-     def save_links(self, path, fname):
-         file = open(path+fname, 'wb')
-         pickle.dump(self.links, file)
-         file.close()
-
-     def load_links(self, path, fname):
-         file = open(path+fname, 'rb')
-         self.links = pickle.load(file)
-         file.close()
-
- # %% ../nbs/01_data.ipynb 14
- def get_page_all(url, k, max_words, ignore_text, ignore_common, path = None):
-     page = Webpage(url)
-     fname_text = page.hash+'.text'
-     fname_links = page.hash+'.links'
-     if path == None:
-         page.get_page_html()
-         page.get_html_text(tags=["p","h1","h2","h3","span"])
-         page.get_html_anchors()
-     else:
-         if os.path.isfile(path+fname_text):
-             page.load_text(path, fname_text)
-         else:
-             page.get_page_html()
-             page.get_html_text(tags=["p","h1","h2","h3","span"])
-             page.save_text(path, fname_text)
-
-         if os.path.isfile(path+fname_links):
-             page.load_links(path, fname_links)
-         else:
-             if page.html == "": page.get_page_html()
-             page.get_html_anchors()
-             page.save_links(path, fname_links)
-
-     if page.text is not None:
-         page.clean_html_text(max_words, ignore=ignore_text, rx="[^a-zA-Z ]+")
-         page.k_common_words(k=k, ignore=ignore_common)
-     return page
-
- def get_all_links(url, dict, k, min_words=20, max_words=500, ignore_text=[], ignore_common=[], ignore_filenames=[".mp3",".jpg",".png"], max_links="", path=None):
-     primary_page = get_page_all(url, k, max_words, ignore_text, ignore_common, path)
-     if primary_page.cleaned_text is not []:
-         dict[url] = [primary_page.cleaned_text, primary_page.most_common_words]
-         if max_links == "" or max_links > len(primary_page.links): max_links=len(primary_page.links)
-
-         for count, link in enumerate(primary_page.links[:max_links]):
-             if all(x not in link for x in ignore_filenames):
-                 try:
-                     page = get_page_all(link, k, max_words, ignore_text, ignore_common, path)
-                     if page.cleaned_text is not []:
-                         if len(page.cleaned_text) < min_words: continue
-                         if [page.cleaned_text, page.most_common_words] in dict.values(): continue
-                         dict[link] = [page.cleaned_text, page.most_common_words]
-                 except:
-                     pass
-             if link in dict:
-                 res = str(len(dict[link][0]))+" words | "+str(dict[link][1][:3])
-             else:
-                 res = "Rejected"
-             progress_message = "%s link %4d/%4d | %s = %s %s" % (url, count, len(primary_page.links), link, res, 200*' ')
-             sys.stdout.write("\r" + progress_message)
-             sys.stdout.flush()
-     else:
-         print(url,"returned None, Skipping...")