# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_app_gradio.ipynb.

# %% auto 0
__all__ = ['categories', 'k', 'min_words', 'max_words', 'ignore_text', 'ignore_common', 'learn', 'text', 'label',
           'examples', 'predict']

# %% ../nbs/02_app_gradio.ipynb 4
import warnings
warnings.filterwarnings('ignore')

from fastai.text.all import *
import gradio as gr
import requests
from bs4 import BeautifulSoup
import enchant
import os
import re
import random
from collections import Counter
import hashlib
import pickle
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# %% ../nbs/01_data.ipynb 8
class Webpage:
    """A fetched web page: raw HTML, extracted text, links, and word statistics."""

    def __init__(self, url):
        self.url = url
        self.hash = self.get_hash_str()
        self.requested = False
        self.page_text = ""
        self.html = ""
        self.links = []
        self.text = []
        self.cleaned_text = []
        self.most_common_words = []

    def get_page(self, headers, min_size, max_size):
        # Stream the response so oversized pages can be abandoned early.
        r = requests.get(self.url, stream=True, headers=headers)
        content_length = int(r.headers.get('Content-Length', 0))
        data = []
        length = 0
        if content_length > max_size:
            return None
        for chunk in r.iter_content(1024):
            data.append(chunk)
            length += len(chunk)
            if length > max_size:
                return None
        r._content = b''.join(data)
        if len(r.text) < min_size:
            return None
        return r.text

    def get_page_html(self, min_size=1000, max_size=2000000):
        # Rotate through a few common user agents to avoid trivial blocking.
        user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
            'Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148',
            'Mozilla/5.0 (Linux; Android 11; SM-G960U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.72 Mobile Safari/537.36'
        ]
        user_agent = random.choice(user_agents)
        headers = {'User-Agent': user_agent}
        self.page_text = self.get_page(headers, min_size, max_size)
        self.html = BeautifulSoup(self.page_text, "html.parser")
        self.requested = True

    def get_hash_str(self, inp=""):
        return hashlib.sha3_256((self.url + inp).encode()).hexdigest()

    def get_html_anchors(self, keyword="http"):
        for anchor in self.html.findAll('a'):
            link = anchor.get('href')
            if link is None or link == "":
                continue
            if keyword in link:
                self.links.append(link)

    def get_html_text(self, tags=["p"]):
        for tag in tags:
            for p in self.html.findAll(tag):
                p_text = p.getText().strip()
                if p_text == '':
                    continue
                self.text.append(p_text)

    def clean_html_text(self, max_words, enchant_dict="en_US", ignore=[], rx="[^a-zA-Z ]+", min_word_len=2):
        # Lowercase, strip non-letters, drop ignored words, then keep up to
        # `max_words` dictionary words of at least `min_word_len` characters.
        all_text = ' '.join(self.text).lower()
        regex_text = re.sub(rx, '', all_text).strip()
        split = regex_text.split()
        split = [word for word in split if word not in ignore]
        if enchant_dict != "":
            d = enchant.Dict(enchant_dict)
        for word in split:
            if len(self.cleaned_text) >= max_words:
                break
            if len(word) >= min_word_len:
                if enchant_dict == "":
                    self.cleaned_text.append(word)
                elif d.check(word):
                    self.cleaned_text.append(word)

    def k_common_words(self, k=10, ignore=[]):
        # Fall back to the raw extracted text if no cleaned text is available.
        text = self.cleaned_text if self.cleaned_text else self.text
        all_text = ' '.join(text).lower()
        split = all_text.split()
        split_ignore = [word for word in split if word not in ignore]
        counts = Counter(split_ignore)
        self.most_common_words = counts.most_common(k)

    def save_text(self, path, fname):
        with open(path + fname, 'wb') as file:
            pickle.dump(self.text, file)

    def load_text(self, path, fname):
        with open(path + fname, 'rb') as file:
            self.text = pickle.load(file)

    def save_links(self, path, fname):
        with open(path + fname, 'wb') as file:
            pickle.dump(self.links, file)

    def load_links(self, path, fname):
        with open(path + fname, 'rb') as file:
            self.links = pickle.load(file)

# %% ../nbs/01_data.ipynb 14
def get_page_all(url, k, max_words, ignore_text, ignore_common, path=None):
    # Fetch the page (or load it from the cache at `path`), extract text and
    # links, then clean the text and compute the k most common words.
    page = Webpage(url)
    fname_text = page.hash + '.text'
    fname_links = page.hash + '.links'
    if path is None:
        page.get_page_html()
        page.get_html_text(tags=["p", "h1", "h2", "h3", "span"])
        page.get_html_anchors()
    else:
        if os.path.isfile(path + fname_text):
            page.load_text(path, fname_text)
        else:
            page.get_page_html()
            page.get_html_text(tags=["p", "h1", "h2", "h3", "span"])
            page.save_text(path, fname_text)
        if os.path.isfile(path + fname_links):
            page.load_links(path, fname_links)
        else:
            if page.html == "":
                page.get_page_html()
            page.get_html_anchors()
            page.save_links(path, fname_links)
    if page.text is not None:
        page.clean_html_text(max_words, ignore=ignore_text, rx="[^a-zA-Z ]+")
        page.k_common_words(k=k, ignore=ignore_common)
    return page

# %% ../nbs/02_app_gradio.ipynb 6
categories = ('pseudoscience', 'science')
k = 30
min_words = 20
max_words = 450
ignore_text = ['the', 'of', 'to', 'and', 'a', 'in', 'it', 'that', 'for', 'on']
ignore_common = ignore_text
learn = load_learner('model.pkl', cpu=True)

def predict(url):
    page = get_page_all(url, k, max_words, ignore_text, ignore_common)
    length = len(page.cleaned_text)
    if length < min_words:
        # Not enough usable text: surface the error in the Label output (two
        # outputs are expected, so return None for the plot).
        return ("ERROR: Returned " + str(length) + " words", None)
    else:
        text = ' '.join(page.cleaned_text)
        with learn.no_bar(), learn.no_logging():
            pred, idx, probs = learn.predict(text)
        wordcloud = WordCloud(width=800, height=800, background_color='white',
                              min_font_size=10).generate(text)
        # Plot the WordCloud image
        fig = plt.figure(figsize=(8, 8), facecolor=None)
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.tight_layout(pad=0)
        return (dict(zip(categories, map(float, probs))), fig)

# %% ../nbs/02_app_gradio.ipynb 8
examples = ['https://www.theskepticsguide.org/about', 'https://www.foxnews.com/opinion']
pseudo_sources = ["http://www.ageofautism.com/", "http://www.naturalnews.com", "https://foodbabe.com/starthere/",
                  "http://www.chopra.com", "https://www.mercola.com/", "https://www.history.com/",
                  "https://doctoroz.com/", "https://www.disclose.tv/", "https://nationalreport.net/",
                  "https://heartland.org/", "https://www.dailymail.co.uk/", "https://www.motherjones.com/"]
science_sources = ["https://sciencebasedmedicine.org/", "https://www.hopkinsmedicine.org/gim/research/method/ebm.html",
                   "https://www.bbc.com/news/science_and_environment", "https://www.nature.com/",
                   "https://www.science.org/", "https://www.snopes.com/top/", "https://quackwatch.org/",
                   "https://www.skepdic.com/", "http://scibabe.com/", "http://pandasthumb.org/",
                   "https://skepticalscience.com/", "https://www.cdc.gov/", "https://apnews.com/"]

with gr.Blocks() as blocks:
    gr.Markdown("# Pseudometer")
    gr.Markdown("Prototype machine learning pseudoscience detector for websites!")
    text = gr.Textbox(label="Input URL (http format):")
    label = gr.Label()
    btn = gr.Button("Analyze!")
    with gr.Accordion("Pseudoscience Primary Training Sources"):
        gr.Markdown(', '.join(pseudo_sources))
    with gr.Accordion("Science Primary Training Sources"):
        gr.Markdown(', '.join(science_sources))
    example = gr.Examples(examples=examples, inputs=text)
    btn.click(fn=predict, inputs=text, outputs=[label, gr.Plot(label="Wordcloud")])

blocks.launch()
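
# The lines below are an illustrative sketch, not part of the exported notebook:
# calling `predict` directly, e.g. for a quick smoke test without launching the UI.
# It assumes `model.pkl` is present and that the URL (taken from `examples`) is reachable.
#
# probs, fig = predict('https://www.theskepticsguide.org/about')
# print(probs)  # dict mapping 'pseudoscience' / 'science' to predicted probabilities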