Spaces:
Sleeping
Sleeping
Adding wordcloud and enchant
Browse files
- app.py +58 -10
- packages.txt +1 -0
- requirements.txt +2 -1
app.py
CHANGED
@@ -11,12 +11,13 @@ from fastai.text.all import *
|
|
11 |
import gradio as gr
|
12 |
import requests
|
13 |
from bs4 import BeautifulSoup
|
14 |
-
|
15 |
import re
|
16 |
import random
|
17 |
from collections import Counter
|
18 |
import hashlib
|
19 |
import pickle
|
|
|
20 |
|
21 |
|
22 |
# %% ../nbs/01_data.ipynb 8
|
@@ -83,19 +84,19 @@ class Webpage:
|
|
83 |
continue
|
84 |
self.text.append(p_text)
|
85 |
|
86 |
-
def clean_html_text(self, max_words, enchant_dict="", ignore=[], rx="[^a-zA-Z ]+", min_word_len=2):
|
87 |
all_text = ' '.join(self.text).lower()
|
88 |
regex_text = re.sub(rx,'',all_text).strip()
|
89 |
split = regex_text.split()
|
90 |
split = [word for word in split if word not in ignore]
|
91 |
-
|
92 |
for word in split:
|
93 |
if len(self.cleaned_text) >= max_words: break
|
94 |
if len(word) >= min_word_len:
|
95 |
if enchant_dict == "":
|
96 |
self.cleaned_text.append(word)
|
97 |
-
|
98 |
-
|
99 |
|
100 |
def k_common_words(self, k=10, ignore=[]):
|
101 |
if self.cleaned_text == "":
|
@@ -176,12 +177,59 @@ def predict(url):
|
|
176 |
text = ' '.join(page.cleaned_text)
|
177 |
with learn.no_bar(), learn.no_logging():
|
178 |
pred,idx,probs = learn.predict(text)
|
179 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
180 |
|
181 |
# %% ../nbs/02_app_gradio.ipynb 8
|
182 |
-
text = gr.inputs.Textbox(1)
|
183 |
-
label = gr.outputs.Label()
|
184 |
examples = ['https://www.theskepticsguide.org/about','https://www.foxnews.com/opinion']
|
185 |
|
186 |
-
|
187 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
import gradio as gr
|
12 |
import requests
|
13 |
from bs4 import BeautifulSoup
|
14 |
+
import enchant
|
15 |
import re
|
16 |
import random
|
17 |
from collections import Counter
|
18 |
import hashlib
|
19 |
import pickle
|
20 |
+
from wordcloud import WordCloud
|
21 |
|
22 |
|
23 |
# %% ../nbs/01_data.ipynb 8
|
|
|
84 |
continue
|
85 |
self.text.append(p_text)
|
86 |
|
87 |
+
def clean_html_text(self, max_words, enchant_dict="en_US", ignore=None, rx="[^a-zA-Z ]+", min_word_len=2):
    """Normalize the scraped page text into ``self.cleaned_text``.

    Joins ``self.text``, lowercases it, strips characters matching *rx*,
    drops words listed in *ignore* or shorter than *min_word_len*, and
    keeps at most *max_words* words.  When *enchant_dict* names a
    dictionary (default ``"en_US"``), only words that dictionary
    recognizes are kept; pass ``""`` to skip the dictionary check.

    NOTE(review): assumes self.text is a list of strings and
    self.cleaned_text is a pre-existing list -- confirm in __init__.
    """
    # `ignore=None` replaces the original mutable default `ignore=[]`
    # (shared across calls); a set makes the per-word membership test
    # O(1) instead of O(len(ignore)).
    ignored = set() if ignore is None else set(ignore)
    all_text = ' '.join(self.text).lower()
    regex_text = re.sub(rx, '', all_text).strip()
    words = [w for w in regex_text.split() if w not in ignored]
    # Build the enchant dictionary once, outside the loop, and only
    # when a dictionary check was actually requested.
    checker = enchant.Dict(enchant_dict) if enchant_dict != "" else None
    for word in words:
        # Stop as soon as the word budget is exhausted.
        if len(self.cleaned_text) >= max_words:
            break
        if len(word) < min_word_len:
            continue
        if checker is None or checker.check(word):
            self.cleaned_text.append(word)
|
100 |
|
101 |
def k_common_words(self, k=10, ignore=[]):
|
102 |
if self.cleaned_text == "":
|
|
|
177 |
text = ' '.join(page.cleaned_text)
|
178 |
with learn.no_bar(), learn.no_logging():
|
179 |
pred,idx,probs = learn.predict(text)
|
180 |
+
wordcloud = WordCloud(width = 800, height = 800,
|
181 |
+
background_color ='white',
|
182 |
+
min_font_size = 10).generate(text)
|
183 |
+
|
184 |
+
# plot the WordCloud image
|
185 |
+
fig = plt.figure(figsize = (8, 8), facecolor = None)
|
186 |
+
plt.imshow(wordcloud)
|
187 |
+
plt.axis("off")
|
188 |
+
plt.tight_layout(pad = 0)
|
189 |
+
return (dict(zip(categories, map(float,probs))), fig)
|
190 |
|
191 |
# %% ../nbs/02_app_gradio.ipynb 8
|
|
|
|
|
192 |
examples = ['https://www.theskepticsguide.org/about','https://www.foxnews.com/opinion']

# Primary training sources are surfaced in the UI so users can judge
# the model's training data for themselves.
pseudo_sources = ["http://www.ageofautism.com/",
                  "http://www.naturalnews.com",
                  "https://foodbabe.com/starthere/",
                  "http://www.chopra.com",
                  "https://www.mercola.com/",
                  "https://www.history.com/",
                  "https://doctoroz.com/",
                  "https://www.disclose.tv/",
                  "https://nationalreport.net/",
                  "https://heartland.org/",
                  "https://www.dailymail.co.uk/",
                  "https://www.motherjones.com/"]

science_sources = ["https://sciencebasedmedicine.org/",
                   "https://www.hopkinsmedicine.org/gim/research/method/ebm.html",
                   "https://www.bbc.com/news/science_and_environment",
                   "https://www.nature.com/",
                   "https://www.science.org/",
                   "https://www.snopes.com/top/",
                   "https://quackwatch.org/",
                   "https://www.skepdic.com/",
                   "http://scibabe.com/",
                   "http://pandasthumb.org/",
                   "https://skepticalscience.com/",
                   "https://www.cdc.gov/",
                   "https://apnews.com/"]

with gr.Blocks() as blocks:
    gr.Markdown("# Pseudometer")
    gr.Markdown("Prototype machine learning pseudoscience detector for websites!")
    text = gr.Textbox(label="Input URL (http format):")
    # gr.outputs.Label() is the deprecated pre-Blocks alias; gr.Label()
    # is the current equivalent and matches the gr.Textbox usage above.
    label = gr.Label()
    btn = gr.Button("Analyze!")
    with gr.Accordion("Pseudoscience Primary Training Sources"):
        gr.Markdown(', '.join(pseudo_sources))
    with gr.Accordion("Science Primary Training Sources"):
        gr.Markdown(', '.join(science_sources))
    example = gr.Examples(examples=examples, inputs=text)

    # predict returns (label-probabilities dict, wordcloud figure).
    btn.click(fn=predict, inputs=text, outputs=[label, gr.Plot(label="Wordcloud")])

blocks.launch()
|
packages.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
libenchant-dev
|
requirements.txt
CHANGED
@@ -5,4 +5,5 @@ beautifulsoup4
|
|
5 |
pandas
|
6 |
matplotlib
|
7 |
pyenchant
|
8 |
-
gradio
|
|
|
|
5 |
pandas
|
6 |
matplotlib
|
7 |
pyenchant
|
8 |
+
gradio
|
9 |
+
wordcloud
|