sbavery committed on
Commit
4e90ce6
1 Parent(s): 33df71b

Adding wordcloud and enchant

Browse files
Files changed (3) hide show
  1. app.py +58 -10
  2. packages.txt +1 -0
  3. requirements.txt +2 -1
app.py CHANGED
@@ -11,12 +11,13 @@ from fastai.text.all import *
11
  import gradio as gr
12
  import requests
13
  from bs4 import BeautifulSoup
14
- #import enchant
15
  import re
16
  import random
17
  from collections import Counter
18
  import hashlib
19
  import pickle
 
20
 
21
 
22
  # %% ../nbs/01_data.ipynb 8
@@ -83,19 +84,19 @@ class Webpage:
83
  continue
84
  self.text.append(p_text)
85
 
86
- def clean_html_text(self, max_words, enchant_dict="", ignore=[], rx="[^a-zA-Z ]+", min_word_len=2):
87
  all_text = ' '.join(self.text).lower()
88
  regex_text = re.sub(rx,'',all_text).strip()
89
  split = regex_text.split()
90
  split = [word for word in split if word not in ignore]
91
- #if enchant_dict != "": d = enchant.Dict(enchant_dict)
92
  for word in split:
93
  if len(self.cleaned_text) >= max_words: break
94
  if len(word) >= min_word_len:
95
  if enchant_dict == "":
96
  self.cleaned_text.append(word)
97
- #elif d.check(word):
98
- # self.cleaned_text.append(word)
99
 
100
  def k_common_words(self, k=10, ignore=[]):
101
  if self.cleaned_text == "":
@@ -176,12 +177,59 @@ def predict(url):
176
  text = ' '.join(page.cleaned_text)
177
  with learn.no_bar(), learn.no_logging():
178
  pred,idx,probs = learn.predict(text)
179
- return dict(zip(categories, map(float,probs)))
 
 
 
 
 
 
 
 
 
180
 
181
  # %% ../nbs/02_app_gradio.ipynb 8
182
- text = gr.inputs.Textbox(1)
183
- label = gr.outputs.Label()
184
  examples = ['https://www.theskepticsguide.org/about','https://www.foxnews.com/opinion']
185
 
186
- intf = gr.Interface(fn=predict, inputs=text, outputs=label, examples=examples)
187
- intf.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  import gradio as gr
12
  import requests
13
  from bs4 import BeautifulSoup
14
+ import enchant
15
  import re
16
  import random
17
  from collections import Counter
18
  import hashlib
19
  import pickle
20
+ from wordcloud import WordCloud
21
 
22
 
23
  # %% ../nbs/01_data.ipynb 8
 
84
  continue
85
  self.text.append(p_text)
86
 
87
+ def clean_html_text(self, max_words, enchant_dict="en_US", ignore=[], rx="[^a-zA-Z ]+", min_word_len=2):
88
  all_text = ' '.join(self.text).lower()
89
  regex_text = re.sub(rx,'',all_text).strip()
90
  split = regex_text.split()
91
  split = [word for word in split if word not in ignore]
92
+ if enchant_dict != "": d = enchant.Dict(enchant_dict)
93
  for word in split:
94
  if len(self.cleaned_text) >= max_words: break
95
  if len(word) >= min_word_len:
96
  if enchant_dict == "":
97
  self.cleaned_text.append(word)
98
+ elif d.check(word):
99
+ self.cleaned_text.append(word)
100
 
101
  def k_common_words(self, k=10, ignore=[]):
102
  if self.cleaned_text == "":
 
177
  text = ' '.join(page.cleaned_text)
178
  with learn.no_bar(), learn.no_logging():
179
  pred,idx,probs = learn.predict(text)
180
+ wordcloud = WordCloud(width = 800, height = 800,
181
+ background_color ='white',
182
+ min_font_size = 10).generate(text)
183
+
184
+ # plot the WordCloud image
185
+ fig = plt.figure(figsize = (8, 8), facecolor = None)
186
+ plt.imshow(wordcloud)
187
+ plt.axis("off")
188
+ plt.tight_layout(pad = 0)
189
+ return (dict(zip(categories, map(float,probs))), fig)
190
 
191
  # %% ../nbs/02_app_gradio.ipynb 8
 
 
192
  examples = ['https://www.theskepticsguide.org/about','https://www.foxnews.com/opinion']
193
 
194
+ pseudo_sources = ["http://www.ageofautism.com/",
195
+ "http://www.naturalnews.com",
196
+ "https://foodbabe.com/starthere/",
197
+ "http://www.chopra.com",
198
+ "https://www.mercola.com/",
199
+ "https://www.history.com/",
200
+ "https://doctoroz.com/",
201
+ "https://www.disclose.tv/",
202
+ "https://nationalreport.net/",
203
+ "https://heartland.org/",
204
+ "https://www.dailymail.co.uk/",
205
+ "https://www.motherjones.com/"]
206
+
207
+ science_sources = ["https://sciencebasedmedicine.org/",
208
+ "https://www.hopkinsmedicine.org/gim/research/method/ebm.html",
209
+ "https://www.bbc.com/news/science_and_environment",
210
+ "https://www.nature.com/",
211
+ "https://www.science.org/",
212
+ "https://www.snopes.com/top/",
213
+ "https://quackwatch.org/",
214
+ "https://www.skepdic.com/",
215
+ "http://scibabe.com/",
216
+ "http://pandasthumb.org/",
217
+ "https://skepticalscience.com/",
218
+ "https://www.cdc.gov/",
219
+ "https://apnews.com/"]
220
+
221
+ with gr.Blocks() as blocks:
222
+ gr.Markdown("# Pseudometer")
223
+ gr.Markdown("Prototype machine learning pseudoscience detector for websites!")
224
+ text = gr.Textbox(label="Input URL (http format):")
225
+ label = gr.outputs.Label()
226
+ btn = gr.Button("Analyze!")
227
+ with gr.Accordion("Pseudoscience Primary Training Sources"):
228
+ gr.Markdown(', '.join(pseudo_sources))
229
+ with gr.Accordion("Science Primary Training Sources"):
230
+ gr.Markdown(', '.join(science_sources))
231
+ example = gr.Examples(examples=examples, inputs=text)
232
+
233
+ btn.click(fn=predict, inputs=text, outputs=[label, gr.Plot(label="Wordcloud")])
234
+
235
+ blocks.launch()
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ libenchant-dev
requirements.txt CHANGED
@@ -5,4 +5,5 @@ beautifulsoup4
5
  pandas
6
  matplotlib
7
  pyenchant
8
- gradio
 
 
5
  pandas
6
  matplotlib
7
  pyenchant
8
+ gradio
9
+ wordcloud