sbavery committed on
Commit
bfef0ca
1 Parent(s): 1e588f4

Adding model

Browse files
Files changed (4) hide show
  1. app.py +1 -1
  2. data.py +5 -5
  3. model.pkl +3 -0
  4. requirements.txt +8 -0
app.py CHANGED
@@ -18,7 +18,7 @@ min_words = 20
18
  max_words = 450
19
  ignore_text = ['the', 'of', 'to', 'and', 'a', 'in', 'it', 'that', 'for', 'on']
20
  ignore_common = ignore_text
21
- learn = load_learner('models/2022.12.01 Model v1 88pct', cpu=False)
22
 
23
  def predict(url):
24
  page = get_page_all(url, k, max_words, ignore_text, ignore_common)
 
18
  max_words = 450
19
  ignore_text = ['the', 'of', 'to', 'and', 'a', 'in', 'it', 'that', 'for', 'on']
20
  ignore_common = ignore_text
21
+ learn = load_learner('model.pkl', cpu=True)
22
 
23
  def predict(url):
24
  page = get_page_all(url, k, max_words, ignore_text, ignore_common)
data.py CHANGED
@@ -8,7 +8,7 @@ import warnings
8
  warnings.filterwarnings('ignore')
9
  import requests
10
  from bs4 import BeautifulSoup
11
- import enchant
12
  import re
13
  import random
14
  from collections import Counter
@@ -80,19 +80,19 @@ class Webpage:
80
  continue
81
  self.text.append(p_text)
82
 
83
- def clean_html_text(self, max_words, enchant_dict="en_US", ignore=[], rx="[^a-zA-Z ]+", min_word_len=2):
84
  all_text = ' '.join(self.text).lower()
85
  regex_text = re.sub(rx,'',all_text).strip()
86
  split = regex_text.split()
87
  split = [word for word in split if word not in ignore]
88
- if enchant_dict != "": d = enchant.Dict(enchant_dict)
89
  for word in split:
90
  if len(self.cleaned_text) >= max_words: break
91
  if len(word) >= min_word_len:
92
  if enchant_dict == "":
93
  self.cleaned_text.append(word)
94
- elif d.check(word):
95
- self.cleaned_text.append(word)
96
 
97
  def k_common_words(self, k=10, ignore=[]):
98
  if self.cleaned_text == "":
 
8
  warnings.filterwarnings('ignore')
9
  import requests
10
  from bs4 import BeautifulSoup
11
+ #import enchant
12
  import re
13
  import random
14
  from collections import Counter
 
80
  continue
81
  self.text.append(p_text)
82
 
83
+ def clean_html_text(self, max_words, enchant_dict="", ignore=[], rx="[^a-zA-Z ]+", min_word_len=2):
84
  all_text = ' '.join(self.text).lower()
85
  regex_text = re.sub(rx,'',all_text).strip()
86
  split = regex_text.split()
87
  split = [word for word in split if word not in ignore]
88
+ #if enchant_dict != "": d = enchant.Dict(enchant_dict)
89
  for word in split:
90
  if len(self.cleaned_text) >= max_words: break
91
  if len(word) >= min_word_len:
92
  if enchant_dict == "":
93
  self.cleaned_text.append(word)
94
+ #elif d.check(word):
95
+ # self.cleaned_text.append(word)
96
 
97
  def k_common_words(self, k=10, ignore=[]):
98
  if self.cleaned_text == "":
model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1ebd7381d4b2e78a8a44d5f453a115c71e05a2684f7f5751f21c1119223f6ec
3
+ size 143791219
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ fastcore
2
+ fastai
3
+ requests
4
+ beautifulsoup4
5
+ pandas
6
+ matplotlib
7
+ pyenchant
8
+ gradio