Spaces:
Sleeping
Sleeping
Adding model
Browse files
app.py
CHANGED
@@ -18,7 +18,7 @@ min_words = 20
|
|
18 |
max_words = 450
|
19 |
ignore_text = ['the', 'of', 'to', 'and', 'a', 'in', 'it', 'that', 'for', 'on']
|
20 |
ignore_common = ignore_text
|
21 |
-
learn = load_learner('
|
22 |
|
23 |
def predict(url):
|
24 |
page = get_page_all(url, k, max_words, ignore_text, ignore_common)
|
|
|
18 |
max_words = 450
|
19 |
ignore_text = ['the', 'of', 'to', 'and', 'a', 'in', 'it', 'that', 'for', 'on']
|
20 |
ignore_common = ignore_text
|
21 |
+
learn = load_learner('model.pkl', cpu=True)
|
22 |
|
23 |
def predict(url):
|
24 |
page = get_page_all(url, k, max_words, ignore_text, ignore_common)
|
data.py
CHANGED
@@ -8,7 +8,7 @@ import warnings
|
|
8 |
warnings.filterwarnings('ignore')
|
9 |
import requests
|
10 |
from bs4 import BeautifulSoup
|
11 |
-
import enchant
|
12 |
import re
|
13 |
import random
|
14 |
from collections import Counter
|
@@ -80,19 +80,19 @@ class Webpage:
|
|
80 |
continue
|
81 |
self.text.append(p_text)
|
82 |
|
83 |
-
def clean_html_text(self, max_words, enchant_dict="
|
84 |
all_text = ' '.join(self.text).lower()
|
85 |
regex_text = re.sub(rx,'',all_text).strip()
|
86 |
split = regex_text.split()
|
87 |
split = [word for word in split if word not in ignore]
|
88 |
-
if enchant_dict != "": d = enchant.Dict(enchant_dict)
|
89 |
for word in split:
|
90 |
if len(self.cleaned_text) >= max_words: break
|
91 |
if len(word) >= min_word_len:
|
92 |
if enchant_dict == "":
|
93 |
self.cleaned_text.append(word)
|
94 |
-
elif d.check(word):
|
95 |
-
self.cleaned_text.append(word)
|
96 |
|
97 |
def k_common_words(self, k=10, ignore=[]):
|
98 |
if self.cleaned_text == "":
|
|
|
8 |
warnings.filterwarnings('ignore')
|
9 |
import requests
|
10 |
from bs4 import BeautifulSoup
|
11 |
+
#import enchant
|
12 |
import re
|
13 |
import random
|
14 |
from collections import Counter
|
|
|
80 |
continue
|
81 |
self.text.append(p_text)
|
82 |
|
83 |
+
def clean_html_text(self, max_words, enchant_dict="", ignore=[], rx="[^a-zA-Z ]+", min_word_len=2):
|
84 |
all_text = ' '.join(self.text).lower()
|
85 |
regex_text = re.sub(rx,'',all_text).strip()
|
86 |
split = regex_text.split()
|
87 |
split = [word for word in split if word not in ignore]
|
88 |
+
#if enchant_dict != "": d = enchant.Dict(enchant_dict)
|
89 |
for word in split:
|
90 |
if len(self.cleaned_text) >= max_words: break
|
91 |
if len(word) >= min_word_len:
|
92 |
if enchant_dict == "":
|
93 |
self.cleaned_text.append(word)
|
94 |
+
#elif d.check(word):
|
95 |
+
# self.cleaned_text.append(word)
|
96 |
|
97 |
def k_common_words(self, k=10, ignore=[]):
|
98 |
if self.cleaned_text == "":
|
model.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b1ebd7381d4b2e78a8a44d5f453a115c71e05a2684f7f5751f21c1119223f6ec
|
3 |
+
size 143791219
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
fastcore
|
2 |
+
fastai
|
3 |
+
requests
|
4 |
+
beautifulsoup4
|
5 |
+
pandas
|
6 |
+
matplotlib
|
7 |
+
pyenchant
|
8 |
+
gradio
|