# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_app_gradio.ipynb.
# %% auto 0
__all__ = ['categories', 'k', 'min_words', 'max_words', 'ignore_text', 'ignore_common', 'learn', 'text', 'label', 'examples',
'intf', 'predict']
# %% ../nbs/02_app_gradio.ipynb 4
import warnings
warnings.filterwarnings('ignore')
from fastai.text.all import *
import gradio as gr
import requests
from bs4 import BeautifulSoup
import enchant
import re
import random
from collections import Counter
import hashlib
import pickle
from wordcloud import WordCloud
import os                        # used by get_page_all's cached-file checks (also provided by fastai's star import)
import matplotlib.pyplot as plt  # used to build the word-cloud figure in predict (also provided by fastai's star import)
# %% ../nbs/01_data.ipynb 8
class Webpage:
    "Fetch a web page and extract, clean, and summarize its text and links."
    def __init__(self, url):
        self.url = url
        self.hash = self.get_hash_str()
        self.requested = False
        self.page_text = ""
        self.html = ""
        self.links = []
        self.text = []
        self.cleaned_text = []
        self.most_common_words = []
    def get_page(self, headers, min_size, max_size):
        r = requests.get(self.url, stream=True, headers=headers)
        content_length = int(r.headers.get('Content-Length', 0))
        data = []
        length = 0
        if content_length > max_size:
            return None
        for chunk in r.iter_content(1024):
            data.append(chunk)
            length += len(chunk)
            if length > max_size:
                return None
        r._content = b''.join(data)
        if len(r.text) < min_size: return None
        return r.text
    def get_page_html(self, min_size=1000, max_size=2000000):
        user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
            'Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148',
            'Mozilla/5.0 (Linux; Android 11; SM-G960U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.72 Mobile Safari/537.36'
        ]
        user_agent = random.choice(user_agents)
        headers = {'User-Agent': user_agent}
        self.page_text = self.get_page(headers, min_size, max_size)
        # get_page returns None for pages outside the size bounds; parse an empty string instead of crashing
        self.html = BeautifulSoup(self.page_text or "", "html.parser")
        self.requested = True
    def get_hash_str(self, inp=""):
        return hashlib.sha3_256((self.url+inp).encode()).hexdigest()
    def get_html_anchors(self, keyword="http"):
        for anchor in self.html.find_all('a'):
            link = anchor.get('href')
            if not link:
                continue
            if keyword in link:
                self.links.append(link)
    def get_html_text(self, tags=["p"]):
        for tag in tags:
            for p in self.html.find_all(tag):
                p_text = p.getText().strip()
                if not p_text:
                    continue
                self.text.append(p_text)
    def clean_html_text(self, max_words, enchant_dict="en_US", ignore=[], rx="[^a-zA-Z ]+", min_word_len=2):
        all_text = ' '.join(self.text).lower()
        regex_text = re.sub(rx, '', all_text).strip()
        split = regex_text.split()
        split = [word for word in split if word not in ignore]
        if enchant_dict != "": d = enchant.Dict(enchant_dict)
        for word in split:
            # stop once max_words cleaned words have been collected
            if len(self.cleaned_text) >= max_words: break
            if len(word) >= min_word_len:
                if enchant_dict == "":
                    self.cleaned_text.append(word)
                # otherwise keep only words the enchant dictionary recognizes
                elif d.check(word):
                    self.cleaned_text.append(word)
    def k_common_words(self, k=10, ignore=[]):
        # fall back to the raw text when no cleaned text is available
        if not self.cleaned_text:
            text = self.text
        else:
            text = self.cleaned_text
        all_text = ' '.join(text).lower()
        split = all_text.split()
        split_ignore = [word for word in split if word not in ignore]
        counts = Counter(split_ignore)
        self.most_common_words = counts.most_common(k)
    def save_text(self, path, fname):
        with open(path+fname, 'wb') as file:
            pickle.dump(self.text, file)
    def load_text(self, path, fname):
        with open(path+fname, 'rb') as file:
            self.text = pickle.load(file)
    def save_links(self, path, fname):
        with open(path+fname, 'wb') as file:
            pickle.dump(self.links, file)
    def load_links(self, path, fname):
        with open(path+fname, 'rb') as file:
            self.links = pickle.load(file)
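# Typical standalone use of Webpage (illustrative only, not executed by this app;
# the URL below is a placeholder):
#   page = Webpage("https://example.com")
#   page.get_page_html()
#   page.get_html_text(tags=["p"])
#   page.clean_html_text(max_words=450)
#   page.k_common_words(k=10)
#   page.most_common_words  # -> [('word', count), ...]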
# %% ../nbs/01_data.ipynb 14
def get_page_all(url, k, max_words, ignore_text, ignore_common, path=None):
    page = Webpage(url)
    fname_text = page.hash+'.text'
    fname_links = page.hash+'.links'
    if path is None:
        # no cache directory: always fetch the page
        page.get_page_html()
        page.get_html_text(tags=["p","h1","h2","h3","span"])
        page.get_html_anchors()
    else:
        # cache text and links on disk, keyed by the URL hash
        if os.path.isfile(path+fname_text):
            page.load_text(path, fname_text)
        else:
            page.get_page_html()
            page.get_html_text(tags=["p","h1","h2","h3","span"])
            page.save_text(path, fname_text)
        if os.path.isfile(path+fname_links):
            page.load_links(path, fname_links)
        else:
            if page.html == "": page.get_page_html()
            page.get_html_anchors()
            page.save_links(path, fname_links)
    if page.text is not None:
        page.clean_html_text(max_words, ignore=ignore_text, rx="[^a-zA-Z ]+")
        page.k_common_words(k=k, ignore=ignore_common)
    return page
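# Illustrative call (not executed here; path is optional and enables on-disk caching):
#   page = get_page_all("https://example.com", k=10, max_words=450,
#                       ignore_text=[], ignore_common=[], path=None)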
# %% ../nbs/02_app_gradio.ipynb 6
categories = ('pseudoscience','science')
k = 30           # number of most-common words to compute per page
min_words = 20   # minimum cleaned word count required before predicting
max_words = 450  # maximum number of cleaned words fed to the classifier
ignore_text = ['the', 'of', 'to', 'and', 'a', 'in', 'it', 'that', 'for', 'on']  # stop words dropped while cleaning
ignore_common = ignore_text                                                     # also ignored when counting common words
learn = load_learner('model.pkl', cpu=True)
def predict(url):
    page = get_page_all(url, k, max_words, ignore_text, ignore_common)
    length = len(page.cleaned_text)
    if length < min_words:
        # the Analyze button is wired to two outputs, so return a value for each
        return "ERROR: Returned "+str(length)+" words", None
    else:
        text = ' '.join(page.cleaned_text)
        with learn.no_bar(), learn.no_logging():
            pred,idx,probs = learn.predict(text)
        wordcloud = WordCloud(width=800, height=800,
                              background_color='white',
                              min_font_size=10).generate(text)
        # plot the WordCloud image
        fig = plt.figure(figsize=(8, 8), facecolor=None)
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.tight_layout(pad=0)
        return (dict(zip(categories, map(float, probs))), fig)
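# predict returns (probabilities per category, word-cloud Figure); Gradio maps these
# onto the Label and Plot outputs wired to the "Analyze!" button below.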
# %% ../nbs/02_app_gradio.ipynb 8
examples = ['https://www.theskepticsguide.org/about','https://www.foxnews.com/opinion']
pseudo_sources = ["http://www.ageofautism.com/",
"http://www.naturalnews.com",
"https://foodbabe.com/starthere/",
"http://www.chopra.com",
"https://www.mercola.com/",
"https://www.history.com/",
"https://doctoroz.com/",
"https://www.disclose.tv/",
"https://nationalreport.net/",
"https://heartland.org/",
"https://www.dailymail.co.uk/",
"https://www.motherjones.com/"]
science_sources = ["https://sciencebasedmedicine.org/",
"https://www.hopkinsmedicine.org/gim/research/method/ebm.html",
"https://www.bbc.com/news/science_and_environment",
"https://www.nature.com/",
"https://www.science.org/",
"https://www.snopes.com/top/",
"https://quackwatch.org/",
"https://www.skepdic.com/",
"http://scibabe.com/",
"http://pandasthumb.org/",
"https://skepticalscience.com/",
"https://www.cdc.gov/",
"https://apnews.com/"]
with gr.Blocks() as blocks:
    gr.Markdown("# Pseudometer")
    gr.Markdown("Prototype machine learning pseudoscience detector for websites!")
    text = gr.Textbox(label="Input URL (http format):")
    label = gr.Label()
    btn = gr.Button("Analyze!")
    with gr.Accordion("Pseudoscience Primary Training Sources"):
        gr.Markdown(', '.join(pseudo_sources))
    with gr.Accordion("Science Primary Training Sources"):
        gr.Markdown(', '.join(science_sources))
    example = gr.Examples(examples=examples, inputs=text)
    plot = gr.Plot(label="Wordcloud")
    btn.click(fn=predict, inputs=text, outputs=[label, plot])
blocks.launch()
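# Note: blocks.launch() uses Gradio's defaults, which is what a Hugging Face Space expects;
# when running locally, passing share=True would additionally create a temporary public link.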