Update tfidf.py
tfidf.py (changed)
@@ -8,10 +8,13 @@ import os
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize
 from nltk.stem import WordNetLemmatizer
+from huggingface_hub import hf_hub_download
+import joblib


 class TfidfWikiGuesser:
     def __init__(self, wikidump = 'resources/wiki_text_16.json') -> None:
+        print("init TfidfWikieGuesser")
         self.tfidf = None
         self.corpus = None
         self.titles = None
@@ -21,15 +24,28 @@ class TfidfWikiGuesser:
         #model_file = "processed_tfidf_wiki_16_model.pkl"
         # full_model_path = model_file
         full_model_path = os.path.join("./models", model_file)
+        print(full_model_path)

-        if os.path.exists(full_model_path):
-            print("Loading model from pickle...")
-            self.load_from_pkl(full_model_path)
-        else:
-            if wikidump:
-                print("No pre-trained model found, loading data from dump...")
-                self.load_model(wikidump)
-                self.save_model(full_model_path)
+
+
+        REPO_ID = "nes470/pipeline-as-repo"
+        FILENAME = "processed_tfidf_wiki_page_text_model.pkl"
+
+        model = joblib.load(
+            hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
+        )
+
+        print("loading from hugginface pkl file")
+        self.load_from_pk_direct(model)
+
+        # if os.path.exists(full_model_path):
+        #     print("Loading model from pickle...")
+        #     self.load_from_pkl(full_model_path)
+        # else:
+        #     if wikidump:
+        #         print("No pre-trained model found, loading data from dump...")
+        #         self.load_model(wikidump)
+        #         self.save_model(full_model_path)
         # self.load_model(wikidump)

     def load_model(self, wikidump):
@@ -99,4 +115,11 @@ class TfidfWikiGuesser:
         self.vectorizer = data['vectorizer']
         self.tfidf = data['tfidf_matrix']
         self.titles = data['titles']
-        # self.corpus = data['corpus']
+        # self.corpus = data['corpus']
+
+    def load_from_pk_direct(self, pkl):
+        #data = pickle.load(pkl)
+        data = pkl
+        self.vectorizer = data['vectorizer']
+        self.tfidf = data['tfidf_matrix']
+        self.titles = data['titles']
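For readers unfamiliar with the new loading path: the constructor no longer reads a pickle under ./models; it fetches the pre-trained TF-IDF artifacts from the Hugging Face Hub and unpickles them with joblib. Below is a minimal standalone sketch of that pattern, using the repo ID, filename, and dict keys that appear in the diff; the comments describing what each key holds are assumptions, not confirmed by this commit.

from huggingface_hub import hf_hub_download
import joblib

REPO_ID = "nes470/pipeline-as-repo"
FILENAME = "processed_tfidf_wiki_page_text_model.pkl"

# hf_hub_download fetches the file into the local Hugging Face cache and
# returns its path; joblib.load then unpickles the saved model components.
local_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
data = joblib.load(local_path)

vectorizer = data["vectorizer"]      # presumably a fitted TF-IDF vectorizer
tfidf_matrix = data["tfidf_matrix"]  # presumably the document-term matrix
titles = data["titles"]              # presumably page titles aligned with the matrix rows

Note that full_model_path is still computed and printed in __init__, but with the local-pickle branch commented out it no longer affects which model gets loaded.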
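A hedged usage sketch of the class after this change, assuming tfidf.py is importable as a module named tfidf; the wikidump argument is still accepted but, with the old branches commented out, no longer drives loading.

from tfidf import TfidfWikiGuesser

# Instantiating the guesser now triggers the Hub download and load_from_pk_direct.
guesser = TfidfWikiGuesser()

# The loaded artifacts are exposed as attributes set by load_from_pk_direct.
print(type(guesser.vectorizer))
print(guesser.tfidf.shape)   # assumes the stored tfidf_matrix exposes .shape (e.g. a scipy sparse matrix)
print(len(guesser.titles))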