Spaces:

sbavery
/

pseudometer

Sleeping

sbavery committed on Dec 2, 2022

Commit

bfef0ca

•

1 Parent(s): 1e588f4

Adding model

Files changed (4) hide show

app.py CHANGED Viewed

@@ -18,7 +18,7 @@ min_words = 20
 max_words = 450
 ignore_text = ['the', 'of', 'to', 'and', 'a', 'in', 'it', 'that', 'for', 'on']
 ignore_common = ignore_text
-learn = load_learner('models/2022.12.01 Model v1 88pct', cpu=False)
 def predict(url):
     page = get_page_all(url, k, max_words, ignore_text, ignore_common)

 max_words = 450
 ignore_text = ['the', 'of', 'to', 'and', 'a', 'in', 'it', 'that', 'for', 'on']
 ignore_common = ignore_text
+learn = load_learner('model.pkl', cpu=True)
 def predict(url):
     page = get_page_all(url, k, max_words, ignore_text, ignore_common)

data.py CHANGED Viewed

@@ -8,7 +8,7 @@ import warnings
 warnings.filterwarnings('ignore')
 import requests
 from bs4 import BeautifulSoup
-import enchant
 import re
 import random
 from collections import Counter
@@ -80,19 +80,19 @@ class Webpage:
                     continue
                 self.text.append(p_text)
-    def clean_html_text(self, max_words, enchant_dict="en_US", ignore=[], rx="[^a-zA-Z ]+", min_word_len=2):
         all_text = ' '.join(self.text).lower()
         regex_text = re.sub(rx,'',all_text).strip()
         split = regex_text.split()
         split = [word for word in split if word not in ignore]
-        if enchant_dict != "": d = enchant.Dict(enchant_dict)
         for word in split:
             if len(self.cleaned_text) >= max_words: break
             if len(word) >= min_word_len:
                 if enchant_dict == "":
                     self.cleaned_text.append(word)
-                elif d.check(word):
-                    self.cleaned_text.append(word)
     def k_common_words(self, k=10, ignore=[]):
         if self.cleaned_text == "":

 warnings.filterwarnings('ignore')
 import requests
 from bs4 import BeautifulSoup
+#import enchant
 import re
 import random
 from collections import Counter
                     continue
                 self.text.append(p_text)
+    def clean_html_text(self, max_words, enchant_dict="", ignore=[], rx="[^a-zA-Z ]+", min_word_len=2):
         all_text = ' '.join(self.text).lower()
         regex_text = re.sub(rx,'',all_text).strip()
         split = regex_text.split()
         split = [word for word in split if word not in ignore]
+        #if enchant_dict != "": d = enchant.Dict(enchant_dict)
         for word in split:
             if len(self.cleaned_text) >= max_words: break
             if len(word) >= min_word_len:
                 if enchant_dict == "":
                     self.cleaned_text.append(word)
+                #elif d.check(word):
+                #    self.cleaned_text.append(word)
     def k_common_words(self, k=10, ignore=[]):
         if self.cleaned_text == "":

model.pkl ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:b1ebd7381d4b2e78a8a44d5f453a115c71e05a2684f7f5751f21c1119223f6ec
+size 143791219

requirements.txt ADDED Viewed

+fastcore
+fastai
+requests
+beautifulsoup4
+pandas
+matplotlib
+pyenchant
+gradio